1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
27 #include <boost/scoped_ptr.hpp>
28 #include <boost/range/adaptor/reversed.hpp>
30 #ifdef HAVE_SYS_PARAM_H
31 #include <sys/param.h>
34 #ifdef HAVE_SYS_MOUNT_H
35 #include <sys/mount.h>
39 #include "osd/scrub_machine.h"
40 #include "osd/pg_scrubber.h"
42 #include "include/types.h"
43 #include "include/compat.h"
44 #include "include/random.h"
49 #include "osdc/Objecter.h"
51 #include "common/errno.h"
52 #include "common/ceph_argparse.h"
53 #include "common/ceph_releases.h"
54 #include "common/ceph_time.h"
55 #include "common/version.h"
56 #include "common/async/blocked_completion.h"
57 #include "common/pick_address.h"
58 #include "common/blkdev.h"
59 #include "common/numa.h"
61 #include "os/ObjectStore.h"
63 #include "os/FuseStore.h"
66 #include "PrimaryLogPG.h"
68 #include "msg/Messenger.h"
69 #include "msg/Message.h"
71 #include "mon/MonClient.h"
73 #include "messages/MLog.h"
75 #include "messages/MGenericMessage.h"
76 #include "messages/MOSDPing.h"
77 #include "messages/MOSDFailure.h"
78 #include "messages/MOSDMarkMeDown.h"
79 #include "messages/MOSDMarkMeDead.h"
80 #include "messages/MOSDFull.h"
81 #include "messages/MOSDOp.h"
82 #include "messages/MOSDOpReply.h"
83 #include "messages/MOSDBackoff.h"
84 #include "messages/MOSDBeacon.h"
85 #include "messages/MOSDRepOp.h"
86 #include "messages/MOSDRepOpReply.h"
87 #include "messages/MOSDBoot.h"
88 #include "messages/MOSDPGTemp.h"
89 #include "messages/MOSDPGReadyToMerge.h"
91 #include "messages/MOSDMap.h"
92 #include "messages/MMonGetOSDMap.h"
93 #include "messages/MOSDPGNotify.h"
94 #include "messages/MOSDPGNotify2.h"
95 #include "messages/MOSDPGQuery.h"
96 #include "messages/MOSDPGQuery2.h"
97 #include "messages/MOSDPGLog.h"
98 #include "messages/MOSDPGRemove.h"
99 #include "messages/MOSDPGInfo.h"
100 #include "messages/MOSDPGInfo2.h"
101 #include "messages/MOSDPGCreate.h"
102 #include "messages/MOSDPGCreate2.h"
103 #include "messages/MBackfillReserve.h"
104 #include "messages/MRecoveryReserve.h"
105 #include "messages/MOSDForceRecovery.h"
106 #include "messages/MOSDECSubOpWrite.h"
107 #include "messages/MOSDECSubOpWriteReply.h"
108 #include "messages/MOSDECSubOpRead.h"
109 #include "messages/MOSDECSubOpReadReply.h"
110 #include "messages/MOSDPGCreated.h"
111 #include "messages/MOSDPGUpdateLogMissing.h"
112 #include "messages/MOSDPGUpdateLogMissingReply.h"
114 #include "messages/MOSDPeeringOp.h"
116 #include "messages/MOSDAlive.h"
118 #include "messages/MOSDScrub.h"
119 #include "messages/MOSDScrub2.h"
120 #include "messages/MOSDRepScrub.h"
122 #include "messages/MCommand.h"
123 #include "messages/MCommandReply.h"
125 #include "messages/MPGStats.h"
127 #include "messages/MWatchNotify.h"
128 #include "messages/MOSDPGPush.h"
129 #include "messages/MOSDPGPushReply.h"
130 #include "messages/MOSDPGPull.h"
132 #include "messages/MMonGetPurgedSnaps.h"
133 #include "messages/MMonGetPurgedSnapsReply.h"
135 #include "common/perf_counters.h"
136 #include "common/Timer.h"
137 #include "common/LogClient.h"
138 #include "common/AsyncReserver.h"
139 #include "common/HeartbeatMap.h"
140 #include "common/admin_socket.h"
141 #include "common/ceph_context.h"
143 #include "global/signal_handler.h"
144 #include "global/pidfile.h"
146 #include "include/color.h"
147 #include "perfglue/cpu_profiler.h"
148 #include "perfglue/heap_profiler.h"
150 #include "osd/ClassHandler.h"
151 #include "osd/OpRequest.h"
153 #include "auth/AuthAuthorizeHandler.h"
154 #include "auth/RotatingKeyRing.h"
156 #include "objclass/objclass.h"
158 #include "common/cmdparse.h"
159 #include "include/str_list.h"
160 #include "include/util.h"
162 #include "include/ceph_assert.h"
163 #include "common/config.h"
164 #include "common/EventTrace.h"
166 #include "json_spirit/json_spirit_reader.h"
167 #include "json_spirit/json_spirit_writer.h"
170 #define TRACEPOINT_DEFINE
171 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
172 #include "tracing/osd.h"
173 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
174 #undef TRACEPOINT_DEFINE
176 #define tracepoint(...)
179 #include "common/tracer.h"
182 #define dout_context cct
183 #define dout_subsys ceph_subsys_osd
185 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
189 using std::lock_guard
;
190 using std::make_pair
;
191 using std::make_tuple
;
192 using std::make_unique
;
195 using std::ostringstream
;
199 using std::stringstream
;
200 using std::to_string
;
201 using std::unique_ptr
;
204 using ceph::bufferlist
;
205 using ceph::bufferptr
;
208 using ceph::fixed_u_to_string
;
209 using ceph::Formatter
;
210 using ceph::heartbeat_handle_d
;
211 using ceph::make_mutex
;
213 using namespace ceph::osd::scheduler
;
214 using TOPNSPC::common::cmd_getval
;
216 static ostream
& _prefix(std::ostream
* _dout
, int whoami
, epoch_t epoch
) {
217 return *_dout
<< "osd." << whoami
<< " " << epoch
<< " ";
220 //Initial features in new superblock.
221 //Features here are also automatically upgraded
222 CompatSet
OSD::get_osd_initial_compat_set() {
223 CompatSet::FeatureSet ceph_osd_feature_compat
;
224 CompatSet::FeatureSet ceph_osd_feature_ro_compat
;
225 CompatSet::FeatureSet ceph_osd_feature_incompat
;
226 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE
);
227 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO
);
228 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC
);
229 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC
);
230 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES
);
231 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL
);
232 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO
);
233 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO
);
234 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG
);
235 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER
);
236 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS
);
237 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA
);
238 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING
);
239 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO
);
240 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES
);
241 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2
);
242 return CompatSet(ceph_osd_feature_compat
, ceph_osd_feature_ro_compat
,
243 ceph_osd_feature_incompat
);
246 //Features are added here that this OSD supports.
247 CompatSet
OSD::get_osd_compat_set() {
248 CompatSet compat
= get_osd_initial_compat_set();
249 //Any features here can be set in code, but not in initial superblock
250 compat
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
);
// OSDService constructor: binds the service back to its owning OSD and
// initializes messengers, timers, the objecter, reservers and map caches
// from the OSD's cct configuration.
// NOTE(review): this block appears truncated by extraction -- several
// initializer-list entries and the body's closing brace are missing.
// Confirm against the upstream file before editing; do not compile as-is.
254 OSDService::OSDService(OSD
*osd
, ceph::async::io_context_pool
& poolctx
) :
257 whoami(osd
->whoami
), store(osd
->store
),
258 log_client(osd
->log_client
), clog(osd
->clog
),
259 pg_recovery_stats(osd
->pg_recovery_stats
),
260 cluster_messenger(osd
->cluster_messenger
),
261 client_messenger(osd
->client_messenger
),
263 recoverystate_perf(osd
->recoverystate_perf
),
265 osd_max_object_size(cct
->_conf
, "osd_max_object_size"),
266 osd_skip_data_digest(cct
->_conf
, "osd_skip_data_digest"),
267 publish_lock
{ceph::make_mutex("OSDService::publish_lock")},
268 pre_publish_lock
{ceph::make_mutex("OSDService::pre_publish_lock")},
272 agent_valid_iterator(false),
274 flush_mode_high_count(0),
277 agent_stop_flag(false),
278 agent_timer(osd
->client_messenger
->cct
, agent_timer_lock
),
279 last_recalibrate(ceph_clock_now()),
280 promote_max_objects(0),
281 promote_max_bytes(0),
283 objecter(make_unique
<Objecter
>(osd
->client_messenger
->cct
,
284 osd
->objecter_messenger
,
285 osd
->monc
, poolctx
)),
286 m_objecter_finishers(cct
->_conf
->osd_objecter_finishers
),
287 watch_timer(osd
->client_messenger
->cct
, watch_lock
),
289 recovery_request_timer(cct
, recovery_request_lock
, false),
290 sleep_timer(cct
, sleep_lock
, false),
291 reserver_finisher(cct
),
292 local_reserver(cct
, &reserver_finisher
, cct
->_conf
->osd_max_backfills
,
293 cct
->_conf
->osd_min_recovery_priority
),
294 remote_reserver(cct
, &reserver_finisher
, cct
->_conf
->osd_max_backfills
,
295 cct
->_conf
->osd_min_recovery_priority
),
296 snap_reserver(cct
, &reserver_finisher
,
297 cct
->_conf
->osd_max_trimming_pgs
),
298 recovery_ops_active(0),
299 recovery_ops_reserved(0),
300 recovery_paused(false),
301 map_cache(cct
, cct
->_conf
->osd_map_cache_size
),
302 map_bl_cache(cct
->_conf
->osd_map_cache_size
),
303 map_bl_inc_cache(cct
->_conf
->osd_map_cache_size
),
305 cur_ratio(0), physical_ratio(0),
306 boot_epoch(0), up_epoch(0), bind_epoch(0)
// Body: create the configured number of objecter finisher threads,
// each named "objecter-finisher-<i>".
310 for (int i
= 0; i
< m_objecter_finishers
; i
++) {
312 str
<< "objecter-finisher-" << i
;
313 auto fin
= make_unique
<Finisher
>(osd
->client_messenger
->cct
, str
.str(), "finisher");
314 objecter_finishers
.push_back(std::move(fin
));
319 void OSDService::add_pgid(spg_t pgid
, PG
*pg
) {
320 std::lock_guard
l(pgid_lock
);
321 if (!pgid_tracker
.count(pgid
)) {
324 pgid_tracker
[pgid
]++;
326 void OSDService::remove_pgid(spg_t pgid
, PG
*pg
)
328 std::lock_guard
l(pgid_lock
);
329 ceph_assert(pgid_tracker
.count(pgid
));
330 ceph_assert(pgid_tracker
[pgid
] > 0);
331 pgid_tracker
[pgid
]--;
332 if (pgid_tracker
[pgid
] == 0) {
333 pgid_tracker
.erase(pgid
);
334 live_pgs
.erase(pgid
);
337 void OSDService::dump_live_pgids()
339 std::lock_guard
l(pgid_lock
);
340 derr
<< "live pgids:" << dendl
;
341 for (map
<spg_t
, int>::const_iterator i
= pgid_tracker
.cbegin();
342 i
!= pgid_tracker
.cend();
344 derr
<< "\t" << *i
<< dendl
;
345 live_pgs
[i
->first
]->dump_live_ids();
351 ceph::signedspan
OSDService::get_mnow()
353 return ceph::mono_clock::now() - osd
->startup_time
;
// Walk the recorded pg_num history for pgid's pool between old_map and
// new_map, recording (pg, epoch) pairs for every split child into
// *split_children and every merge source/target into *merge_pgs.
// NOTE(review): this block appears truncated by extraction -- the first
// signature parameters (old_map/new_map/pgid), several loop-control and
// closing-brace lines are missing.  Confirm against the upstream file
// before editing; do not compile as-is.
356 void OSDService::identify_splits_and_merges(
360 set
<pair
<spg_t
,epoch_t
>> *split_children
,
361 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
363 if (!old_map
->have_pg_pool(pgid
.pool())) {
366 int old_pgnum
= old_map
->get_pg_num(pgid
.pool());
367 auto p
= osd
->pg_num_history
.pg_nums
.find(pgid
.pool());
368 if (p
== osd
->pg_num_history
.pg_nums
.end()) {
371 dout(20) << __func__
<< " " << pgid
<< " e" << old_map
->get_epoch()
372 << " to e" << new_map
->get_epoch()
373 << " pg_nums " << p
->second
<< dendl
;
// Breadth-first scan: start from pgid and re-queue any fabricated
// parents discovered along the way.
375 queue
.push_back(pgid
);
377 while (!queue
.empty()) {
378 auto cur
= queue
.front();
381 unsigned pgnum
= old_pgnum
;
382 for (auto q
= p
->second
.lower_bound(old_map
->get_epoch());
383 q
!= p
->second
.end() &&
384 q
->first
<= new_map
->get_epoch();
386 if (pgnum
< q
->second
) {
388 if (cur
.ps() < pgnum
) {
390 if (cur
.is_split(pgnum
, q
->second
, &children
)) {
391 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
392 << " pg_num " << pgnum
<< " -> " << q
->second
393 << " children " << children
<< dendl
;
394 for (auto i
: children
) {
395 split_children
->insert(make_pair(i
, q
->first
));
400 } else if (cur
.ps() < q
->second
) {
401 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
402 << " pg_num " << pgnum
<< " -> " << q
->second
403 << " is a child" << dendl
;
404 // normally we'd capture this from the parent, but it's
405 // possible the parent doesn't exist yet (it will be
406 // fabricated to allow an intervening merge). note this PG
407 // as a split child here to be sure we catch it.
408 split_children
->insert(make_pair(cur
, q
->first
));
410 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
411 << " pg_num " << pgnum
<< " -> " << q
->second
412 << " is post-split, skipping" << dendl
;
414 } else if (merge_pgs
) {
416 if (cur
.ps() >= q
->second
) {
417 if (cur
.ps() < pgnum
) {
419 if (cur
.is_merge_source(pgnum
, q
->second
, &parent
)) {
421 parent
.is_split(q
->second
, pgnum
, &children
);
422 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
423 << " pg_num " << pgnum
<< " -> " << q
->second
424 << " is merge source, target " << parent
425 << ", source(s) " << children
<< dendl
;
426 merge_pgs
->insert(make_pair(parent
, q
->first
));
427 if (!did
.count(parent
)) {
428 // queue (and re-scan) parent in case it might not exist yet
429 // and there are some future splits pending on it
430 queue
.push_back(parent
);
432 for (auto c
: children
) {
433 merge_pgs
->insert(make_pair(c
, q
->first
));
439 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
440 << " pg_num " << pgnum
<< " -> " << q
->second
441 << " is beyond old pgnum, skipping" << dendl
;
445 if (cur
.is_split(q
->second
, pgnum
, &children
)) {
446 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
447 << " pg_num " << pgnum
<< " -> " << q
->second
448 << " is merge target, source " << children
<< dendl
;
449 for (auto c
: children
) {
450 merge_pgs
->insert(make_pair(c
, q
->first
));
454 merge_pgs
->insert(make_pair(cur
, q
->first
));
463 void OSDService::need_heartbeat_peer_update()
465 osd
->need_heartbeat_peer_update();
468 HeartbeatStampsRef
OSDService::get_hb_stamps(unsigned peer
)
470 std::lock_guard
l(hb_stamp_lock
);
471 if (peer
>= hb_stamps
.size()) {
472 hb_stamps
.resize(peer
+ 1);
474 if (!hb_stamps
[peer
]) {
475 hb_stamps
[peer
] = ceph::make_ref
<HeartbeatStamps
>(peer
);
477 return hb_stamps
[peer
];
// Queue a RenewLease peering event for the given PG at the given epoch.
// NOTE(review): this block appears truncated by extraction -- the event
// payload passed to make_shared<PGPeeringEvent> and the closing of the
// call are missing.  Confirm against the upstream file before editing.
480 void OSDService::queue_renew_lease(epoch_t epoch
, spg_t spgid
)
482 osd
->enqueue_peering_evt(
485 std::make_shared
<PGPeeringEvent
>(
490 void OSDService::start_shutdown()
493 std::lock_guard
l(agent_timer_lock
);
494 agent_timer
.shutdown();
498 std::lock_guard
l(sleep_lock
);
499 sleep_timer
.shutdown();
503 std::lock_guard
l(recovery_request_lock
);
504 recovery_request_timer
.shutdown();
508 void OSDService::shutdown_reserver()
510 reserver_finisher
.wait_for_empty();
511 reserver_finisher
.stop();
// Final OSDService shutdown: suspend the mono timer, stop the watch
// timer, shut down the objecter and its finishers, and drop the cached
// osdmaps.
// NOTE(review): this block appears truncated by extraction -- the body of
// the objecter_finishers loop and several closing braces are missing.
// Confirm against the upstream file before editing.
514 void OSDService::shutdown()
516 mono_timer
.suspend();
519 std::lock_guard
l(watch_lock
);
520 watch_timer
.shutdown();
523 objecter
->shutdown();
524 for (auto& f
: objecter_finishers
) {
// Publishing an empty OSDMapRef releases the current/next map references.
529 publish_map(OSDMapRef());
530 next_osdmap
= OSDMapRef();
// Start the service's background machinery: reserver finisher, objecter
// finishers, objecter client registration, the tier agent thread, and an
// optional initial recovery delay.
// NOTE(review): this block appears truncated by extraction -- the
// finisher-loop body and timer initialization lines are missing.
// Confirm against the upstream file before editing.
533 void OSDService::init()
535 reserver_finisher
.start();
536 for (auto& f
: objecter_finishers
) {
539 objecter
->set_client_incarnation(0);
541 // deprioritize objecter in daemonperf output
542 objecter
->get_logger()->set_prio_adjust(-3);
548 agent_thread
.create("osd_srv_agent");
// Honor osd_recovery_delay_start by deferring recovery at startup.
550 if (cct
->_conf
->osd_recovery_delay_start
)
551 defer_recovery(cct
->_conf
->osd_recovery_delay_start
);
554 void OSDService::final_init()
556 objecter
->start(osdmap
.get());
// Called when a new osdmap becomes active: wake the tiering agent unless
// the map forbids it (NOTIERAGENT flag).
// NOTE(review): this block appears truncated by extraction -- the full
// wake condition and surrounding braces are missing.  Confirm against the
// upstream file before editing.
559 void OSDService::activate_map()
561 // wake/unwake the tiering agent
562 std::lock_guard l
{agent_lock
};
564 !osdmap
->test_flag(CEPH_OSDMAP_NOTIERAGENT
) &&
566 agent_cond
.notify_all();
569 void OSDService::request_osdmap_update(epoch_t e
)
571 osd
->osdmap_subscribe(e
, false);
575 class AgentTimeoutCB
: public Context
{
578 explicit AgentTimeoutCB(PGRef _pg
) : pg(_pg
) {}
579 void finish(int) override
{
580 pg
->agent_choose_mode_restart();
// Main loop of the tier-agent thread: repeatedly pick the highest-priority
// tier in agent_queue and let one of its PGs do flush/evict work, sleeping
// on agent_cond when there is nothing to do or no quota, and arming an
// AgentTimeoutCB retry timer when a PG reports no work.
// NOTE(review): this block appears truncated by extraction -- loop-control
// lines (continue/brace lines, queue-position advancement, re-lock) are
// missing.  Confirm against the upstream file before editing.
584 void OSDService::agent_entry()
586 dout(10) << __func__
<< " start" << dendl
;
587 std::unique_lock agent_locker
{agent_lock
};
589 while (!agent_stop_flag
) {
590 if (agent_queue
.empty()) {
591 dout(20) << __func__
<< " empty queue" << dendl
;
592 agent_cond
.wait(agent_locker
);
// agent_queue is keyed by priority level; rbegin() is the top tier.
595 uint64_t level
= agent_queue
.rbegin()->first
;
596 set
<PGRef
>& top
= agent_queue
.rbegin()->second
;
598 << " tiers " << agent_queue
.size()
599 << ", top is " << level
600 << " with pgs " << top
.size()
601 << ", ops " << agent_ops
<< "/"
602 << cct
->_conf
->osd_agent_max_ops
603 << (agent_active
? " active" : " NOT ACTIVE")
605 dout(20) << __func__
<< " oids " << agent_oids
<< dendl
;
// Quota: high-speed flush mode uses osd_agent_max_ops, otherwise the
// low-speed limit applies.
606 int max
= cct
->_conf
->osd_agent_max_ops
- agent_ops
;
607 int agent_flush_quota
= max
;
608 if (!flush_mode_high_count
)
609 agent_flush_quota
= cct
->_conf
->osd_agent_max_low_ops
- agent_ops
;
610 if (agent_flush_quota
<= 0 || top
.empty() || !agent_active
) {
611 agent_cond
.wait(agent_locker
);
615 if (!agent_valid_iterator
|| agent_queue_pos
== top
.end()) {
616 agent_queue_pos
= top
.begin();
617 agent_valid_iterator
= true;
619 PGRef pg
= *agent_queue_pos
;
620 dout(10) << "high_count " << flush_mode_high_count
621 << " agent_ops " << agent_ops
622 << " flush_quota " << agent_flush_quota
<< dendl
;
// agent_work() is called without agent_lock held.
623 agent_locker
.unlock();
624 if (!pg
->agent_work(max
, agent_flush_quota
)) {
625 dout(10) << __func__
<< " " << pg
->pg_id
626 << " no agent_work, delay for " << cct
->_conf
->osd_agent_delay_time
627 << " seconds" << dendl
;
629 logger
->inc(l_osd_tier_delay
);
630 // Queue a timer to call agent_choose_mode for this pg in 5 seconds
631 std::lock_guard timer_locker
{agent_timer_lock
};
632 Context
*cb
= new AgentTimeoutCB(pg
);
633 agent_timer
.add_event_after(cct
->_conf
->osd_agent_delay_time
, cb
);
637 dout(10) << __func__
<< " finish" << dendl
;
// Stop the tier-agent thread.  By this point all agent ops must have
// completed and all PGs must have been dequeued; otherwise we abort.
// NOTE(review): this block appears truncated by extraction -- the scope
// braces around the locked section and the thread join that follows it
// are missing.  Confirm against the upstream file before editing.
640 void OSDService::agent_stop()
643 std::lock_guard
l(agent_lock
);
645 // By this time all ops should be cancelled
646 ceph_assert(agent_ops
== 0);
647 // By this time all PGs are shutdown and dequeued
648 if (!agent_queue
.empty()) {
649 set
<PGRef
>& top
= agent_queue
.rbegin()->second
;
650 derr
<< "agent queue not empty, for example " << (*top
.begin())->get_pgid() << dendl
;
651 ceph_abort_msg("agent queue not empty");
// Signal the agent loop (agent_entry) to exit its wait and terminate.
654 agent_stop_flag
= true;
655 agent_cond
.notify_all();
660 // -------------------------------------
// Periodically recalibrate the cache-tier promotion probability
// (promote_probability_millis, in thousandths) so that observed promote
// rates track the configured object/sec and bytes/sec targets, then set
// hard per-interval caps to mitigate promotion stampedes.
// NOTE(review): this block appears truncated by extraction -- the
// declaration of new_prob/actual/ratio and several branch bodies are
// missing.  Confirm against the upstream file before editing.
662 void OSDService::promote_throttle_recalibrate()
664 utime_t now
= ceph_clock_now();
665 double dur
= now
- last_recalibrate
;
666 last_recalibrate
= now
;
667 unsigned prob
= promote_probability_millis
;
669 uint64_t target_obj_sec
= cct
->_conf
->osd_tier_promote_max_objects_sec
;
670 uint64_t target_bytes_sec
= cct
->_conf
->osd_tier_promote_max_bytes_sec
;
672 unsigned min_prob
= 1;
674 uint64_t attempts
, obj
, bytes
;
675 promote_counter
.sample_and_attenuate(&attempts
, &obj
, &bytes
);
676 dout(10) << __func__
<< " " << attempts
<< " attempts, promoted "
677 << obj
<< " objects and " << byte_u_t(bytes
) << "; target "
678 << target_obj_sec
<< " obj/sec or "
679 << byte_u_t(target_bytes_sec
) << "/sec"
682 // calculate what the probability *should* be, given the targets
684 if (attempts
&& dur
> 0) {
685 uint64_t avg_size
= 1;
687 avg_size
= std::max
<uint64_t>(bytes
/ obj
, 1);
688 unsigned po
= (double)target_obj_sec
* dur
* 1000.0 / (double)attempts
;
689 unsigned pb
= (double)target_bytes_sec
/ (double)avg_size
* dur
* 1000.0
691 dout(20) << __func__
<< " po " << po
<< " pb " << pb
<< " avg_size "
692 << avg_size
<< dendl
;
693 if (target_obj_sec
&& target_bytes_sec
)
694 new_prob
= std::min(po
, pb
);
695 else if (target_obj_sec
)
697 else if (target_bytes_sec
)
704 dout(20) << __func__
<< " new_prob " << new_prob
<< dendl
;
706 // correct for persistent skew between target rate and actual rate, adjust
709 if (attempts
&& obj
) {
710 actual
= obj
* 1000 / attempts
;
711 ratio
= (double)actual
/ (double)prob
;
712 new_prob
= (double)new_prob
/ ratio
;
714 new_prob
= std::max(new_prob
, min_prob
);
715 new_prob
= std::min(new_prob
, 1000u);
// Smooth toward the new probability and clamp to [min_prob, 1000].
718 prob
= (prob
+ new_prob
) / 2;
719 prob
= std::max(prob
, min_prob
);
720 prob
= std::min(prob
, 1000u);
721 dout(10) << __func__
<< " actual " << actual
722 << ", actual/prob ratio " << ratio
723 << ", adjusted new_prob " << new_prob
724 << ", prob " << promote_probability_millis
<< " -> " << prob
726 promote_probability_millis
= prob
;
728 // set hard limits for this interval to mitigate stampedes
729 promote_max_objects
= target_obj_sec
* osd
->OSD_TICK_INTERVAL
* 2;
730 promote_max_bytes
= target_bytes_sec
* osd
->OSD_TICK_INTERVAL
* 2;
733 // -------------------------------------
735 float OSDService::get_failsafe_full_ratio()
737 float full_ratio
= cct
->_conf
->osd_failsafe_full_ratio
;
738 if (full_ratio
> 1.0) full_ratio
/= 100.0;
742 OSDService::s_names
OSDService::recalc_full_state(float ratio
, float pratio
, string
&inject
)
744 // The OSDMap ratios take precendence. So if the failsafe is .95 and
745 // the admin sets the cluster full to .96, the failsafe moves up to .96
746 // too. (Not that having failsafe == full is ideal, but it's better than
747 // dropping writes before the clusters appears full.)
748 OSDMapRef osdmap
= get_osdmap();
749 if (!osdmap
|| osdmap
->get_epoch() == 0) {
752 float nearfull_ratio
= osdmap
->get_nearfull_ratio();
753 float backfillfull_ratio
= std::max(osdmap
->get_backfillfull_ratio(), nearfull_ratio
);
754 float full_ratio
= std::max(osdmap
->get_full_ratio(), backfillfull_ratio
);
755 float failsafe_ratio
= std::max(get_failsafe_full_ratio(), full_ratio
);
757 if (osdmap
->require_osd_release
< ceph_release_t::luminous
) {
758 // use the failsafe for nearfull and full; the mon isn't using the
759 // flags anyway because we're mid-upgrade.
760 full_ratio
= failsafe_ratio
;
761 backfillfull_ratio
= failsafe_ratio
;
762 nearfull_ratio
= failsafe_ratio
;
763 } else if (full_ratio
<= 0 ||
764 backfillfull_ratio
<= 0 ||
765 nearfull_ratio
<= 0) {
766 derr
<< __func__
<< " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl
;
767 // use failsafe flag. ick. the monitor did something wrong or the user
768 // did something stupid.
769 full_ratio
= failsafe_ratio
;
770 backfillfull_ratio
= failsafe_ratio
;
771 nearfull_ratio
= failsafe_ratio
;
774 if (injectfull_state
> NONE
&& injectfull
) {
775 inject
= "(Injected)";
776 return injectfull_state
;
777 } else if (pratio
> failsafe_ratio
) {
779 } else if (ratio
> full_ratio
) {
781 } else if (ratio
> backfillfull_ratio
) {
783 } else if (pratio
> nearfull_ratio
) {
// Update the cached fullness state from the latest usage ratios, logging
// to the cluster log when the FAILSAFE state is engaged or disengaged.
// NOTE(review): this block appears truncated by extraction -- the local
// declarations (inject string, new_state) and the cur_ratio assignment
// are missing.  Confirm against the upstream file before editing.
789 void OSDService::check_full_status(float ratio
, float pratio
)
791 std::lock_guard
l(full_status_lock
);
794 physical_ratio
= pratio
;
798 new_state
= recalc_full_state(ratio
, pratio
, inject
);
800 dout(20) << __func__
<< " cur ratio " << ratio
801 << ", physical ratio " << pratio
802 << ", new state " << get_full_state_name(new_state
)
// Log state transitions; failsafe engage/disengage goes to clog->error().
807 if (cur_state
!= new_state
) {
808 dout(10) << __func__
<< " " << get_full_state_name(cur_state
)
809 << " -> " << get_full_state_name(new_state
) << dendl
;
810 if (new_state
== FAILSAFE
) {
811 clog
->error() << "full status failsafe engaged, dropping updates, now "
812 << (int)roundf(ratio
* 100) << "% full";
813 } else if (cur_state
== FAILSAFE
) {
814 clog
->error() << "full status failsafe disengaged, no longer dropping "
815 << "updates, now " << (int)roundf(ratio
* 100) << "% full";
817 cur_state
= new_state
;
// Decide whether the fullness flags recorded for this OSD in the osdmap
// disagree with our locally computed fullness, i.e. whether the monitor
// needs to be told about a change.
// NOTE(review): this block appears truncated by extraction -- the
// cur/want state assignments and the final comparison/return are missing.
// Confirm against the upstream file before editing.
821 bool OSDService::need_fullness_update()
823 OSDMapRef osdmap
= get_osdmap();
825 if (osdmap
->exists(whoami
)) {
826 if (osdmap
->get_state(whoami
) & CEPH_OSD_FULL
) {
828 } else if (osdmap
->get_state(whoami
) & CEPH_OSD_BACKFILLFULL
) {
830 } else if (osdmap
->get_state(whoami
) & CEPH_OSD_NEARFULL
) {
837 else if (is_backfillfull())
839 else if (is_nearfull())
// Test hook: report the injected fullness state when set.  injectfull
// holds either a remaining count of injections or -1 for "always".
// NOTE(review): this block appears truncated by extraction -- the counter
// decrement and the return statements are missing.  Confirm against the
// upstream file before editing.
844 bool OSDService::_check_inject_full(DoutPrefixProvider
*dpp
, s_names type
) const
846 if (injectfull
&& injectfull_state
>= type
) {
847 // injectfull is either a count of the number of times to return failsafe full
848 // or if -1 then always return full
851 ldpp_dout(dpp
, 10) << __func__
<< " Injected " << get_full_state_name(type
) << " OSD ("
852 << (injectfull
< 0 ? "set" : std::to_string(injectfull
)) << ")"
859 bool OSDService::_check_full(DoutPrefixProvider
*dpp
, s_names type
) const
861 std::lock_guard
l(full_status_lock
);
863 if (_check_inject_full(dpp
, type
))
866 if (cur_state
>= type
)
867 ldpp_dout(dpp
, 10) << __func__
<< " current usage is " << cur_ratio
868 << " physical " << physical_ratio
<< dendl
;
870 return cur_state
>= type
;
// Like _check_full(), but evaluates fullness for a hypothetical usage:
// adjusted_stat with adjust_used additional bytes consumed.  Used to ask
// "would we be (backfill)full if we took on this much more data?".
// NOTE(review): this block appears truncated by extraction -- the
// inject-branch body, the pratio/notused declarations and surrounding
// braces are missing.  Confirm against the upstream file before editing.
873 bool OSDService::_tentative_full(DoutPrefixProvider
*dpp
, s_names type
, uint64_t adjust_used
, osd_stat_t adjusted_stat
)
875 ldpp_dout(dpp
, 20) << __func__
<< " type " << get_full_state_name(type
) << " adjust_used " << (adjust_used
>> 10) << "KiB" << dendl
;
877 std::lock_guard
l(full_status_lock
);
878 if (_check_inject_full(dpp
, type
)) {
884 float ratio
= compute_adjusted_ratio(adjusted_stat
, &pratio
, adjust_used
);
887 s_names tentative_state
= recalc_full_state(ratio
, pratio
, notused
);
889 if (tentative_state
>= type
)
890 ldpp_dout(dpp
, 10) << __func__
<< " tentative usage is " << ratio
<< dendl
;
892 return tentative_state
>= type
;
895 bool OSDService::check_failsafe_full(DoutPrefixProvider
*dpp
) const
897 return _check_full(dpp
, FAILSAFE
);
900 bool OSDService::check_full(DoutPrefixProvider
*dpp
) const
902 return _check_full(dpp
, FULL
);
905 bool OSDService::tentative_backfill_full(DoutPrefixProvider
*dpp
, uint64_t adjust_used
, osd_stat_t stats
)
907 return _tentative_full(dpp
, BACKFILLFULL
, adjust_used
, stats
);
910 bool OSDService::check_backfill_full(DoutPrefixProvider
*dpp
) const
912 return _check_full(dpp
, BACKFILLFULL
);
915 bool OSDService::check_nearfull(DoutPrefixProvider
*dpp
) const
917 return _check_full(dpp
, NEARFULL
);
920 bool OSDService::is_failsafe_full() const
922 std::lock_guard
l(full_status_lock
);
923 return cur_state
== FAILSAFE
;
926 bool OSDService::is_full() const
928 std::lock_guard
l(full_status_lock
);
929 return cur_state
>= FULL
;
932 bool OSDService::is_backfillfull() const
934 std::lock_guard
l(full_status_lock
);
935 return cur_state
>= BACKFILLFULL
;
938 bool OSDService::is_nearfull() const
940 std::lock_guard
l(full_status_lock
);
941 return cur_state
>= NEARFULL
;
944 void OSDService::set_injectfull(s_names type
, int64_t count
)
946 std::lock_guard
l(full_status_lock
);
947 injectfull_state
= type
;
// Record the store's statfs result (and any objectstore alerts) into
// osd_stat, updating the perf counters, with an optional fake-statfs mode
// for testing on shared partitions.
// NOTE(review): this block appears truncated by extraction -- the PG
// iteration that sums total_num_bytes, the else-branch of the avail
// computation, and several closing braces are missing.  Confirm against
// the upstream file before editing.
951 void OSDService::set_statfs(const struct store_statfs_t
&stbuf
,
952 osd_alert_list_t
& alerts
)
954 uint64_t bytes
= stbuf
.total
;
955 uint64_t avail
= stbuf
.available
;
956 uint64_t used
= stbuf
.get_used_raw();
958 // For testing fake statfs values so it doesn't matter if all
959 // OSDs are using the same partition.
960 if (cct
->_conf
->fake_statfs_for_testing
) {
961 uint64_t total_num_bytes
= 0;
965 total_num_bytes
+= p
->get_stats_num_bytes();
967 bytes
= cct
->_conf
->fake_statfs_for_testing
;
968 if (total_num_bytes
< bytes
)
969 avail
= bytes
- total_num_bytes
;
972 dout(0) << __func__
<< " fake total " << cct
->_conf
->fake_statfs_for_testing
973 << " adjust available " << avail
975 used
= bytes
- avail
;
// Publish raw figures to the perf counters.
978 logger
->set(l_osd_stat_bytes
, bytes
);
979 logger
->set(l_osd_stat_bytes_used
, used
);
980 logger
->set(l_osd_stat_bytes_avail
, avail
);
982 std::lock_guard
l(stat_lock
);
983 osd_stat
.statfs
= stbuf
;
984 osd_stat
.os_alerts
.clear();
985 osd_stat
.os_alerts
[whoami
].swap(alerts
);
986 if (cct
->_conf
->fake_statfs_for_testing
) {
987 osd_stat
.statfs
.total
= bytes
;
988 osd_stat
.statfs
.available
= avail
;
989 // For testing don't want used to go negative, so clear reserved
990 osd_stat
.statfs
.internally_reserved
= 0;
// Refresh osd_stat with the latest heartbeat peers, op-queue age
// histogram and PG count, and expire at most one stale heartbeat ping
// entry per call (per osd_mon_heartbeat_stat_stale).
// NOTE(review): this block appears truncated by extraction -- part of the
// parameter list (after hb_peers), loop braces and the return statement
// are missing.  Confirm against the upstream file before editing.
994 osd_stat_t
OSDService::set_osd_stat(vector
<int>& hb_peers
,
997 utime_t now
= ceph_clock_now();
998 auto stale_time
= g_conf().get_val
<int64_t>("osd_mon_heartbeat_stat_stale");
999 std::lock_guard
l(stat_lock
);
1000 osd_stat
.hb_peers
.swap(hb_peers
);
1001 osd
->op_tracker
.get_age_ms_histogram(&osd_stat
.op_queue_age_hist
);
1002 osd_stat
.num_pgs
= num_pgs
;
1003 // Clean entries that aren't updated
1004 // This is called often enough that we can just remove 1 at a time
1005 for (auto i
: osd_stat
.hb_pingtime
) {
1006 if (i
.second
.last_update
== 0)
1008 if (stale_time
&& now
.sec() - i
.second
.last_update
> stale_time
) {
1009 dout(20) << __func__
<< " time out heartbeat for osd " << i
.first
1010 << " last_update " << i
.second
.last_update
<< dendl
;
1011 osd_stat
.hb_pingtime
.erase(i
.first
);
1018 void OSDService::inc_osd_stat_repaired()
1020 std::lock_guard
l(stat_lock
);
1021 osd_stat
.num_shards_repaired
++;
// Compute the usage ratio after hypothetically consuming adjust_used
// additional bytes and after letting every PG account for its pending
// backfill data; also reports the unadjusted physical ratio via *pratio.
// NOTE(review): this block appears truncated by extraction -- the *pratio
// assignment, the else keyword before available = 0, and the vector<PGRef>
// declaration are missing.  Confirm against the upstream file before
// editing.
1025 float OSDService::compute_adjusted_ratio(osd_stat_t new_stat
, float *pratio
,
1026 uint64_t adjust_used
)
1029 ((float)new_stat
.statfs
.get_used_raw()) / ((float)new_stat
.statfs
.total
);
// Deduct the hypothetical usage from available space (saturating at 0).
1032 dout(20) << __func__
<< " Before kb_used() " << new_stat
.statfs
.kb_used() << dendl
;
1033 if (new_stat
.statfs
.available
> adjust_used
)
1034 new_stat
.statfs
.available
-= adjust_used
;
1036 new_stat
.statfs
.available
= 0;
1037 dout(20) << __func__
<< " After kb_used() " << new_stat
.statfs
.kb_used() << dendl
;
1040 // Check all pgs and adjust kb_used to include all pending backfill data
1041 int backfill_adjusted
= 0;
1043 osd
->_get_pgs(&pgs
);
1044 for (auto p
: pgs
) {
1045 backfill_adjusted
+= p
->pg_stat_adjust(&new_stat
);
1047 if (backfill_adjusted
) {
1048 dout(20) << __func__
<< " backfill adjusted " << new_stat
<< dendl
;
1050 return ((float)new_stat
.statfs
.get_used_raw()) / ((float)new_stat
.statfs
.total
);
1053 void OSDService::send_message_osd_cluster(int peer
, Message
*m
, epoch_t from_epoch
)
1055 OSDMapRef next_map
= get_nextmap_reserved();
1056 // service map is always newer/newest
1057 ceph_assert(from_epoch
<= next_map
->get_epoch());
1059 if (next_map
->is_down(peer
) ||
1060 next_map
->get_info(peer
).up_from
> from_epoch
) {
1062 release_map(next_map
);
1065 ConnectionRef peer_con
;
1066 if (peer
== whoami
) {
1067 peer_con
= osd
->cluster_messenger
->get_loopback_connection();
1069 peer_con
= osd
->cluster_messenger
->connect_to_osd(
1070 next_map
->get_cluster_addrs(peer
), false, true);
1072 maybe_share_map(peer_con
.get(), next_map
);
1073 peer_con
->send_message(m
);
1074 release_map(next_map
);
1077 void OSDService::send_message_osd_cluster(std::vector
<std::pair
<int, Message
*>>& messages
, epoch_t from_epoch
)
1079 OSDMapRef next_map
= get_nextmap_reserved();
1080 // service map is always newer/newest
1081 ceph_assert(from_epoch
<= next_map
->get_epoch());
1083 for (auto& iter
: messages
) {
1084 if (next_map
->is_down(iter
.first
) ||
1085 next_map
->get_info(iter
.first
).up_from
> from_epoch
) {
1089 ConnectionRef peer_con
;
1090 if (iter
.first
== whoami
) {
1091 peer_con
= osd
->cluster_messenger
->get_loopback_connection();
1093 peer_con
= osd
->cluster_messenger
->connect_to_osd(
1094 next_map
->get_cluster_addrs(iter
.first
), false, true);
1096 maybe_share_map(peer_con
.get(), next_map
);
1097 peer_con
->send_message(iter
.second
);
1099 release_map(next_map
);
1101 ConnectionRef
OSDService::get_con_osd_cluster(int peer
, epoch_t from_epoch
)
1103 OSDMapRef next_map
= get_nextmap_reserved();
1104 // service map is always newer/newest
1105 ceph_assert(from_epoch
<= next_map
->get_epoch());
1107 if (next_map
->is_down(peer
) ||
1108 next_map
->get_info(peer
).up_from
> from_epoch
) {
1109 release_map(next_map
);
1113 if (peer
== whoami
) {
1114 con
= osd
->cluster_messenger
->get_loopback_connection();
1116 con
= osd
->cluster_messenger
->connect_to_osd(
1117 next_map
->get_cluster_addrs(peer
), false, true);
1119 release_map(next_map
);
1123 pair
<ConnectionRef
,ConnectionRef
> OSDService::get_con_osd_hb(int peer
, epoch_t from_epoch
)
1125 OSDMapRef next_map
= get_nextmap_reserved();
1126 // service map is always newer/newest
1127 ceph_assert(from_epoch
<= next_map
->get_epoch());
1129 pair
<ConnectionRef
,ConnectionRef
> ret
;
1130 if (next_map
->is_down(peer
) ||
1131 next_map
->get_info(peer
).up_from
> from_epoch
) {
1132 release_map(next_map
);
1135 ret
.first
= osd
->hb_back_client_messenger
->connect_to_osd(
1136 next_map
->get_hb_back_addrs(peer
));
1137 ret
.second
= osd
->hb_front_client_messenger
->connect_to_osd(
1138 next_map
->get_hb_front_addrs(peer
));
1139 release_map(next_map
);
1143 entity_name_t
OSDService::get_cluster_msgr_name() const
1145 return cluster_messenger
->get_myname();
// Record a desired pg_temp mapping for pgid; it is only (re)queued when
// it differs from what is already pending with the monitor.
// NOTE(review): this block appears truncated by extraction -- the final
// parameter (presumably the 'forced' flag referenced below), part of the
// comparison condition and closing braces are missing.  Confirm against
// the upstream file before editing.
1148 void OSDService::queue_want_pg_temp(pg_t pgid
,
1149 const vector
<int>& want
,
1152 std::lock_guard
l(pg_temp_lock
);
1153 auto p
= pg_temp_pending
.find(pgid
);
1154 if (p
== pg_temp_pending
.end() ||
1155 p
->second
.acting
!= want
||
1157 pg_temp_wanted
[pgid
] = {want
, forced
};
1161 void OSDService::remove_want_pg_temp(pg_t pgid
)
1163 std::lock_guard
l(pg_temp_lock
);
1164 pg_temp_wanted
.erase(pgid
);
1165 pg_temp_pending
.erase(pgid
);
1168 void OSDService::_sent_pg_temp()
1170 #ifdef HAVE_STDLIB_MAP_SPLICING
1171 pg_temp_pending
.merge(pg_temp_wanted
);
1173 pg_temp_pending
.insert(make_move_iterator(begin(pg_temp_wanted
)),
1174 make_move_iterator(end(pg_temp_wanted
)));
1176 pg_temp_wanted
.clear();
1179 void OSDService::requeue_pg_temp()
1181 std::lock_guard
l(pg_temp_lock
);
1182 // wanted overrides pending. note that remove_want_pg_temp
1183 // clears the item out of both.
1184 unsigned old_wanted
= pg_temp_wanted
.size();
1185 unsigned old_pending
= pg_temp_pending
.size();
1187 pg_temp_wanted
.swap(pg_temp_pending
);
1188 dout(10) << __func__
<< " " << old_wanted
<< " + " << old_pending
<< " -> "
1189 << pg_temp_wanted
.size() << dendl
;
1192 std::ostream
& operator<<(std::ostream
& out
,
1193 const OSDService::pg_temp_t
& pg_temp
)
1195 out
<< pg_temp
.acting
;
1196 if (pg_temp
.forced
) {
1202 void OSDService::send_pg_temp()
1204 std::lock_guard
l(pg_temp_lock
);
1205 if (pg_temp_wanted
.empty())
1207 dout(10) << "send_pg_temp " << pg_temp_wanted
<< dendl
;
1208 MOSDPGTemp
*ms
[2] = {nullptr, nullptr};
1209 for (auto& [pgid
, pg_temp
] : pg_temp_wanted
) {
1210 auto& m
= ms
[pg_temp
.forced
];
1212 m
= new MOSDPGTemp(osdmap
->get_epoch());
1213 m
->forced
= pg_temp
.forced
;
1215 m
->pg_temp
.emplace(pgid
, pg_temp
.acting
);
1219 monc
->send_mon_message(m
);
1225 void OSDService::send_pg_created(pg_t pgid
)
1227 std::lock_guard
l(pg_created_lock
);
1228 dout(20) << __func__
<< dendl
;
1229 auto o
= get_osdmap();
1230 if (o
->require_osd_release
>= ceph_release_t::luminous
) {
1231 pg_created
.insert(pgid
);
1232 monc
->send_mon_message(new MOSDPGCreated(pgid
));
1236 void OSDService::send_pg_created()
1238 std::lock_guard
l(pg_created_lock
);
1239 dout(20) << __func__
<< dendl
;
1240 auto o
= get_osdmap();
1241 if (o
->require_osd_release
>= ceph_release_t::luminous
) {
1242 for (auto pgid
: pg_created
) {
1243 monc
->send_mon_message(new MOSDPGCreated(pgid
));
1248 void OSDService::prune_pg_created()
1250 std::lock_guard
l(pg_created_lock
);
1251 dout(20) << __func__
<< dendl
;
1252 auto o
= get_osdmap();
1253 auto i
= pg_created
.begin();
1254 while (i
!= pg_created
.end()) {
1255 auto p
= o
->get_pg_pool(i
->pool());
1256 if (!p
|| !p
->has_flag(pg_pool_t::FLAG_CREATING
)) {
1257 dout(20) << __func__
<< " pruning " << *i
<< dendl
;
1258 i
= pg_created
.erase(i
);
1260 dout(20) << __func__
<< " keeping " << *i
<< dendl
;
1267 // --------------------------------------
1270 bool OSDService::can_inc_scrubs()
1272 bool can_inc
= false;
1273 std::lock_guard
l(sched_scrub_lock
);
1275 if (scrubs_local
+ scrubs_remote
< cct
->_conf
->osd_max_scrubs
) {
1276 dout(20) << __func__
<< " == true " << scrubs_local
<< " local + " << scrubs_remote
1277 << " remote < max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1280 dout(20) << __func__
<< " == false " << scrubs_local
<< " local + " << scrubs_remote
1281 << " remote >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1287 bool OSDService::inc_scrubs_local()
1289 bool result
= false;
1290 std::lock_guard l
{sched_scrub_lock
};
1291 if (scrubs_local
+ scrubs_remote
< cct
->_conf
->osd_max_scrubs
) {
1292 dout(20) << __func__
<< " " << scrubs_local
<< " -> " << (scrubs_local
+1)
1293 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", remote " << scrubs_remote
<< ")" << dendl
;
1297 dout(20) << __func__
<< " " << scrubs_local
<< " local + " << scrubs_remote
<< " remote >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1302 void OSDService::dec_scrubs_local()
1304 std::lock_guard l
{sched_scrub_lock
};
1305 dout(20) << __func__
<< " " << scrubs_local
<< " -> " << (scrubs_local
-1)
1306 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", remote " << scrubs_remote
<< ")" << dendl
;
1308 ceph_assert(scrubs_local
>= 0);
1311 bool OSDService::inc_scrubs_remote()
1313 bool result
= false;
1314 std::lock_guard l
{sched_scrub_lock
};
1315 if (scrubs_local
+ scrubs_remote
< cct
->_conf
->osd_max_scrubs
) {
1316 dout(20) << __func__
<< " " << scrubs_remote
<< " -> " << (scrubs_remote
+1)
1317 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", local " << scrubs_local
<< ")" << dendl
;
1321 dout(20) << __func__
<< " " << scrubs_local
<< " local + " << scrubs_remote
<< " remote >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1326 void OSDService::dec_scrubs_remote()
1328 std::lock_guard l
{sched_scrub_lock
};
1329 dout(20) << __func__
<< " " << scrubs_remote
<< " -> " << (scrubs_remote
-1)
1330 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", local " << scrubs_local
<< ")" << dendl
;
1332 ceph_assert(scrubs_remote
>= 0);
1335 void OSDService::dump_scrub_reservations(Formatter
*f
)
1337 std::lock_guard l
{sched_scrub_lock
};
1338 f
->dump_int("scrubs_local", scrubs_local
);
1339 f
->dump_int("scrubs_remote", scrubs_remote
);
1340 f
->dump_int("osd_max_scrubs", cct
->_conf
->osd_max_scrubs
);
1343 void OSDService::retrieve_epochs(epoch_t
*_boot_epoch
, epoch_t
*_up_epoch
,
1344 epoch_t
*_bind_epoch
) const
1346 std::lock_guard
l(epoch_lock
);
1348 *_boot_epoch
= boot_epoch
;
1350 *_up_epoch
= up_epoch
;
1352 *_bind_epoch
= bind_epoch
;
1355 void OSDService::set_epochs(const epoch_t
*_boot_epoch
, const epoch_t
*_up_epoch
,
1356 const epoch_t
*_bind_epoch
)
1358 std::lock_guard
l(epoch_lock
);
1360 ceph_assert(*_boot_epoch
== 0 || *_boot_epoch
>= boot_epoch
);
1361 boot_epoch
= *_boot_epoch
;
1364 ceph_assert(*_up_epoch
== 0 || *_up_epoch
>= up_epoch
);
1365 up_epoch
= *_up_epoch
;
1368 ceph_assert(*_bind_epoch
== 0 || *_bind_epoch
>= bind_epoch
);
1369 bind_epoch
= *_bind_epoch
;
1373 bool OSDService::prepare_to_stop()
1375 std::unique_lock
l(is_stopping_lock
);
1376 if (get_state() != NOT_STOPPING
)
1379 OSDMapRef osdmap
= get_osdmap();
1380 if (osdmap
&& osdmap
->is_up(whoami
)) {
1381 dout(0) << __func__
<< " telling mon we are shutting down" << dendl
;
1382 set_state(PREPARING_TO_STOP
);
1383 monc
->send_mon_message(
1387 osdmap
->get_addrs(whoami
),
1388 osdmap
->get_epoch(),
1391 const auto timeout
= ceph::make_timespan(cct
->_conf
->osd_mon_shutdown_timeout
);
1392 is_stopping_cond
.wait_for(l
, timeout
,
1393 [this] { return get_state() == STOPPING
; });
1395 dout(0) << __func__
<< " starting shutdown" << dendl
;
1396 set_state(STOPPING
);
1400 void OSDService::got_stop_ack()
1402 std::scoped_lock
l(is_stopping_lock
);
1403 if (get_state() == PREPARING_TO_STOP
) {
1404 dout(0) << __func__
<< " starting shutdown" << dendl
;
1405 set_state(STOPPING
);
1406 is_stopping_cond
.notify_all();
1408 dout(10) << __func__
<< " ignoring msg" << dendl
;
1412 MOSDMap
*OSDService::build_incremental_map_msg(epoch_t since
, epoch_t to
,
1413 OSDSuperblock
& sblock
)
1415 MOSDMap
*m
= new MOSDMap(monc
->get_fsid(),
1416 osdmap
->get_encoding_features());
1417 m
->oldest_map
= max_oldest_map
;
1418 m
->newest_map
= sblock
.newest_map
;
1420 int max
= cct
->_conf
->osd_map_message_max
;
1421 ssize_t max_bytes
= cct
->_conf
->osd_map_message_max_bytes
;
1423 if (since
< m
->oldest_map
) {
1424 // we don't have the next map the target wants, so start with a
1427 dout(10) << __func__
<< " oldest map " << max_oldest_map
<< " > since "
1428 << since
<< ", starting with full map" << dendl
;
1429 since
= m
->oldest_map
;
1430 if (!get_map_bl(since
, bl
)) {
1431 derr
<< __func__
<< " missing full map " << since
<< dendl
;
1435 max_bytes
-= bl
.length();
1436 m
->maps
[since
] = std::move(bl
);
1438 for (epoch_t e
= since
+ 1; e
<= to
; ++e
) {
1440 if (get_inc_map_bl(e
, bl
)) {
1441 m
->incremental_maps
[e
] = std::move(bl
);
1443 dout(10) << __func__
<< " missing incremental map " << e
<< dendl
;
1444 if (!get_map_bl(e
, bl
)) {
1445 derr
<< __func__
<< " also missing full map " << e
<< dendl
;
1448 m
->maps
[e
] = std::move(bl
);
1451 max_bytes
-= bl
.length();
1452 if (max
<= 0 || max_bytes
<= 0) {
1459 if (!m
->maps
.empty() ||
1460 !m
->incremental_maps
.empty()) {
1461 // send what we have so far
1466 if (get_inc_map_bl(m
->newest_map
, bl
)) {
1467 m
->incremental_maps
[m
->newest_map
] = std::move(bl
);
1469 derr
<< __func__
<< " unable to load latest map " << m
->newest_map
<< dendl
;
1470 if (!get_map_bl(m
->newest_map
, bl
)) {
1471 derr
<< __func__
<< " unable to load latest full map " << m
->newest_map
1475 m
->maps
[m
->newest_map
] = std::move(bl
);
1480 void OSDService::send_map(MOSDMap
*m
, Connection
*con
)
1482 con
->send_message(m
);
1485 void OSDService::send_incremental_map(epoch_t since
, Connection
*con
,
1486 const OSDMapRef
& osdmap
)
1488 epoch_t to
= osdmap
->get_epoch();
1489 dout(10) << "send_incremental_map " << since
<< " -> " << to
1490 << " to " << con
<< " " << con
->get_peer_addr() << dendl
;
1494 OSDSuperblock
sblock(get_superblock());
1495 if (since
< sblock
.oldest_map
) {
1496 // just send latest full map
1497 MOSDMap
*m
= new MOSDMap(monc
->get_fsid(),
1498 osdmap
->get_encoding_features());
1499 m
->oldest_map
= max_oldest_map
;
1500 m
->newest_map
= sblock
.newest_map
;
1501 get_map_bl(to
, m
->maps
[to
]);
1506 if (to
> since
&& (int64_t)(to
- since
) > cct
->_conf
->osd_map_share_max_epochs
) {
1507 dout(10) << " " << (to
- since
) << " > max " << cct
->_conf
->osd_map_share_max_epochs
1508 << ", only sending most recent" << dendl
;
1509 since
= to
- cct
->_conf
->osd_map_share_max_epochs
;
1512 m
= build_incremental_map_msg(since
, to
, sblock
);
1517 bool OSDService::_get_map_bl(epoch_t e
, bufferlist
& bl
)
1519 bool found
= map_bl_cache
.lookup(e
, &bl
);
1521 logger
->inc(l_osd_map_bl_cache_hit
);
1524 logger
->inc(l_osd_map_bl_cache_miss
);
1525 found
= store
->read(meta_ch
,
1526 OSD::get_osdmap_pobject_name(e
), 0, 0, bl
,
1527 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
) >= 0;
1534 bool OSDService::get_inc_map_bl(epoch_t e
, bufferlist
& bl
)
1536 std::lock_guard
l(map_cache_lock
);
1537 bool found
= map_bl_inc_cache
.lookup(e
, &bl
);
1539 logger
->inc(l_osd_map_bl_cache_hit
);
1542 logger
->inc(l_osd_map_bl_cache_miss
);
1543 found
= store
->read(meta_ch
,
1544 OSD::get_inc_osdmap_pobject_name(e
), 0, 0, bl
,
1545 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
) >= 0;
1547 _add_map_inc_bl(e
, bl
);
1552 void OSDService::_add_map_bl(epoch_t e
, bufferlist
& bl
)
1554 dout(10) << "add_map_bl " << e
<< " " << bl
.length() << " bytes" << dendl
;
1555 // cache a contiguous buffer
1556 if (bl
.get_num_buffers() > 1) {
1559 bl
.try_assign_to_mempool(mempool::mempool_osd_mapbl
);
1560 map_bl_cache
.add(e
, bl
);
1563 void OSDService::_add_map_inc_bl(epoch_t e
, bufferlist
& bl
)
1565 dout(10) << "add_map_inc_bl " << e
<< " " << bl
.length() << " bytes" << dendl
;
1566 // cache a contiguous buffer
1567 if (bl
.get_num_buffers() > 1) {
1570 bl
.try_assign_to_mempool(mempool::mempool_osd_mapbl
);
1571 map_bl_inc_cache
.add(e
, bl
);
1574 OSDMapRef
OSDService::_add_map(OSDMap
*o
)
1576 epoch_t e
= o
->get_epoch();
1578 if (cct
->_conf
->osd_map_dedup
) {
1579 // Dedup against an existing map at a nearby epoch
1580 OSDMapRef for_dedup
= map_cache
.lower_bound(e
);
1582 OSDMap::dedup(for_dedup
.get(), o
);
1586 OSDMapRef l
= map_cache
.add(e
, o
, &existed
);
1593 OSDMapRef
OSDService::try_get_map(epoch_t epoch
)
1595 std::lock_guard
l(map_cache_lock
);
1596 OSDMapRef retval
= map_cache
.lookup(epoch
);
1598 dout(30) << "get_map " << epoch
<< " -cached" << dendl
;
1599 logger
->inc(l_osd_map_cache_hit
);
1603 logger
->inc(l_osd_map_cache_miss
);
1604 epoch_t lb
= map_cache
.cached_key_lower_bound();
1606 dout(30) << "get_map " << epoch
<< " - miss, below lower bound" << dendl
;
1607 logger
->inc(l_osd_map_cache_miss_low
);
1608 logger
->inc(l_osd_map_cache_miss_low_avg
, lb
- epoch
);
1612 OSDMap
*map
= new OSDMap
;
1614 dout(20) << "get_map " << epoch
<< " - loading and decoding " << map
<< dendl
;
1616 if (!_get_map_bl(epoch
, bl
) || bl
.length() == 0) {
1617 derr
<< "failed to load OSD map for epoch " << epoch
<< ", got " << bl
.length() << " bytes" << dendl
;
1623 dout(20) << "get_map " << epoch
<< " - return initial " << map
<< dendl
;
1625 return _add_map(map
);
1631 void OSDService::reply_op_error(OpRequestRef op
, int err
)
1633 reply_op_error(op
, err
, eversion_t(), 0, {});
1636 void OSDService::reply_op_error(OpRequestRef op
, int err
, eversion_t v
,
1638 vector
<pg_log_op_return_item_t
> op_returns
)
1640 auto m
= op
->get_req
<MOSDOp
>();
1641 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1643 flags
= m
->get_flags() & (CEPH_OSD_FLAG_ACK
|CEPH_OSD_FLAG_ONDISK
);
1645 MOSDOpReply
*reply
= new MOSDOpReply(m
, err
, osdmap
->get_epoch(), flags
,
1646 !m
->has_flag(CEPH_OSD_FLAG_RETURNVEC
));
1647 reply
->set_reply_versions(v
, uv
);
1648 reply
->set_op_returns(op_returns
);
1649 m
->get_connection()->send_message(reply
);
1652 void OSDService::handle_misdirected_op(PG
*pg
, OpRequestRef op
)
1654 if (!cct
->_conf
->osd_debug_misdirected_ops
) {
1658 auto m
= op
->get_req
<MOSDOp
>();
1659 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1661 ceph_assert(m
->get_map_epoch() >= pg
->get_history().same_primary_since
);
1663 if (pg
->is_ec_pg()) {
1665 * OSD recomputes op target based on current OSDMap. With an EC pg, we
1666 * can get this result:
1667 * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
1668 * [CRUSH_ITEM_NONE, 2, 3]/3
1669 * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
1671 * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
1673 * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
1676 * We can't compute the op target based on the sending map epoch due to
1677 * splitting. The simplest thing is to detect such cases here and drop
1678 * them without an error (the client will resend anyway).
1680 ceph_assert(m
->get_map_epoch() <= superblock
.newest_map
);
1681 OSDMapRef opmap
= try_get_map(m
->get_map_epoch());
1683 dout(7) << __func__
<< ": " << *pg
<< " no longer have map for "
1684 << m
->get_map_epoch() << ", dropping" << dendl
;
1687 pg_t _pgid
= m
->get_raw_pg();
1689 if ((m
->get_flags() & CEPH_OSD_FLAG_PGOP
) == 0)
1690 _pgid
= opmap
->raw_pg_to_pg(_pgid
);
1691 if (opmap
->get_primary_shard(_pgid
, &pgid
) &&
1692 pgid
.shard
!= pg
->pg_id
.shard
) {
1693 dout(7) << __func__
<< ": " << *pg
<< " primary changed since "
1694 << m
->get_map_epoch() << ", dropping" << dendl
;
1699 dout(7) << *pg
<< " misdirected op in " << m
->get_map_epoch() << dendl
;
1700 clog
->warn() << m
->get_source_inst() << " misdirected " << m
->get_reqid()
1701 << " pg " << m
->get_raw_pg()
1702 << " to osd." << whoami
1703 << " not " << pg
->get_acting()
1704 << " in e" << m
->get_map_epoch() << "/" << osdmap
->get_epoch();
1707 void OSDService::enqueue_back(OpSchedulerItem
&& qi
)
1709 osd
->op_shardedwq
.queue(std::move(qi
));
1712 void OSDService::enqueue_front(OpSchedulerItem
&& qi
)
1714 osd
->op_shardedwq
.queue_front(std::move(qi
));
1717 void OSDService::queue_recovery_context(
1719 GenContext
<ThreadPool::TPHandle
&> *c
)
1721 epoch_t e
= get_osdmap_epoch();
1724 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1725 new PGRecoveryContext(pg
->get_pgid(), c
, e
)),
1726 cct
->_conf
->osd_recovery_cost
,
1727 cct
->_conf
->osd_recovery_priority
,
1733 void OSDService::queue_for_snap_trim(PG
*pg
)
1735 dout(10) << "queueing " << *pg
<< " for snaptrim" << dendl
;
1738 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1739 new PGSnapTrim(pg
->get_pgid(), pg
->get_osdmap_epoch())),
1740 cct
->_conf
->osd_snap_trim_cost
,
1741 cct
->_conf
->osd_snap_trim_priority
,
1744 pg
->get_osdmap_epoch()));
1747 template <class MSG_TYPE
>
1748 void OSDService::queue_scrub_event_msg(PG
* pg
,
1749 Scrub::scrub_prio_t with_priority
,
1750 unsigned int qu_priority
)
1752 const auto epoch
= pg
->get_osdmap_epoch();
1753 auto msg
= new MSG_TYPE(pg
->get_pgid(), epoch
);
1754 dout(15) << "queue a scrub event (" << *msg
<< ") for " << *pg
<< ". Epoch: " << epoch
<< dendl
;
1756 enqueue_back(OpSchedulerItem(
1757 unique_ptr
<OpSchedulerItem::OpQueueable
>(msg
), cct
->_conf
->osd_scrub_cost
,
1758 pg
->scrub_requeue_priority(with_priority
, qu_priority
), ceph_clock_now(), 0, epoch
));
1761 template <class MSG_TYPE
>
1762 void OSDService::queue_scrub_event_msg(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1764 const auto epoch
= pg
->get_osdmap_epoch();
1765 auto msg
= new MSG_TYPE(pg
->get_pgid(), epoch
);
1766 dout(15) << "queue a scrub event (" << *msg
<< ") for " << *pg
<< ". Epoch: " << epoch
<< dendl
;
1768 enqueue_back(OpSchedulerItem(
1769 unique_ptr
<OpSchedulerItem::OpQueueable
>(msg
), cct
->_conf
->osd_scrub_cost
,
1770 pg
->scrub_requeue_priority(with_priority
), ceph_clock_now(), 0, epoch
));
1773 void OSDService::queue_for_scrub(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1775 queue_scrub_event_msg
<PGScrub
>(pg
, with_priority
);
1778 void OSDService::queue_scrub_after_repair(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1780 queue_scrub_event_msg
<PGScrubAfterRepair
>(pg
, with_priority
);
1783 void OSDService::queue_for_rep_scrub(PG
* pg
,
1784 Scrub::scrub_prio_t with_priority
,
1785 unsigned int qu_priority
)
1787 queue_scrub_event_msg
<PGRepScrub
>(pg
, with_priority
, qu_priority
);
1790 void OSDService::queue_for_rep_scrub_resched(PG
* pg
,
1791 Scrub::scrub_prio_t with_priority
,
1792 unsigned int qu_priority
)
1794 // Resulting scrub event: 'SchedReplica'
1795 queue_scrub_event_msg
<PGRepScrubResched
>(pg
, with_priority
, qu_priority
);
1798 void OSDService::queue_for_scrub_granted(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1800 // Resulting scrub event: 'RemotesReserved'
1801 queue_scrub_event_msg
<PGScrubResourcesOK
>(pg
, with_priority
);
1804 void OSDService::queue_for_scrub_denied(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1806 // Resulting scrub event: 'ReservationFailure'
1807 queue_scrub_event_msg
<PGScrubDenied
>(pg
, with_priority
);
1810 void OSDService::queue_for_scrub_resched(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1812 // Resulting scrub event: 'InternalSchedScrub'
1813 queue_scrub_event_msg
<PGScrubResched
>(pg
, with_priority
);
1816 void OSDService::queue_scrub_pushes_update(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1818 // Resulting scrub event: 'ActivePushesUpd'
1819 queue_scrub_event_msg
<PGScrubPushesUpdate
>(pg
, with_priority
);
1822 void OSDService::queue_scrub_applied_update(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1824 queue_scrub_event_msg
<PGScrubAppliedUpdate
>(pg
, with_priority
);
1827 void OSDService::queue_scrub_unblocking(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1829 // Resulting scrub event: 'Unblocked'
1830 queue_scrub_event_msg
<PGScrubUnblocked
>(pg
, with_priority
);
1833 void OSDService::queue_scrub_digest_update(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1835 // Resulting scrub event: 'DigestUpdate'
1836 queue_scrub_event_msg
<PGScrubDigestUpdate
>(pg
, with_priority
);
1839 void OSDService::queue_scrub_got_repl_maps(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1841 // Resulting scrub event: 'GotReplicas'
1842 queue_scrub_event_msg
<PGScrubGotReplMaps
>(pg
, with_priority
);
1845 void OSDService::queue_scrub_replica_pushes(PG
*pg
, Scrub::scrub_prio_t with_priority
)
1847 // Resulting scrub event: 'ReplicaPushesUpd'
1848 queue_scrub_event_msg
<PGScrubReplicaPushes
>(pg
, with_priority
);
1851 void OSDService::queue_for_pg_delete(spg_t pgid
, epoch_t e
)
1853 dout(10) << __func__
<< " on " << pgid
<< " e " << e
<< dendl
;
1856 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1857 new PGDelete(pgid
, e
)),
1858 cct
->_conf
->osd_pg_delete_cost
,
1859 cct
->_conf
->osd_pg_delete_priority
,
1865 bool OSDService::try_finish_pg_delete(PG
*pg
, unsigned old_pg_num
)
1867 return osd
->try_finish_pg_delete(pg
, old_pg_num
);
1872 void OSDService::set_ready_to_merge_source(PG
*pg
, eversion_t version
)
1874 std::lock_guard
l(merge_lock
);
1875 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1876 ready_to_merge_source
[pg
->pg_id
.pgid
] = version
;
1877 assert(not_ready_to_merge_source
.count(pg
->pg_id
.pgid
) == 0);
1878 _send_ready_to_merge();
1881 void OSDService::set_ready_to_merge_target(PG
*pg
,
1883 epoch_t last_epoch_started
,
1884 epoch_t last_epoch_clean
)
1886 std::lock_guard
l(merge_lock
);
1887 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1888 ready_to_merge_target
.insert(make_pair(pg
->pg_id
.pgid
,
1891 last_epoch_clean
)));
1892 assert(not_ready_to_merge_target
.count(pg
->pg_id
.pgid
) == 0);
1893 _send_ready_to_merge();
1896 void OSDService::set_not_ready_to_merge_source(pg_t source
)
1898 std::lock_guard
l(merge_lock
);
1899 dout(10) << __func__
<< " " << source
<< dendl
;
1900 not_ready_to_merge_source
.insert(source
);
1901 assert(ready_to_merge_source
.count(source
) == 0);
1902 _send_ready_to_merge();
1905 void OSDService::set_not_ready_to_merge_target(pg_t target
, pg_t source
)
1907 std::lock_guard
l(merge_lock
);
1908 dout(10) << __func__
<< " " << target
<< " source " << source
<< dendl
;
1909 not_ready_to_merge_target
[target
] = source
;
1910 assert(ready_to_merge_target
.count(target
) == 0);
1911 _send_ready_to_merge();
1914 void OSDService::send_ready_to_merge()
1916 std::lock_guard
l(merge_lock
);
1917 _send_ready_to_merge();
1920 void OSDService::_send_ready_to_merge()
1922 dout(20) << __func__
1923 << " ready_to_merge_source " << ready_to_merge_source
1924 << " not_ready_to_merge_source " << not_ready_to_merge_source
1925 << " ready_to_merge_target " << ready_to_merge_target
1926 << " not_ready_to_merge_target " << not_ready_to_merge_target
1927 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1929 for (auto src
: not_ready_to_merge_source
) {
1930 if (sent_ready_to_merge_source
.count(src
) == 0) {
1931 monc
->send_mon_message(new MOSDPGReadyToMerge(
1935 osdmap
->get_epoch()));
1936 sent_ready_to_merge_source
.insert(src
);
1939 for (auto p
: not_ready_to_merge_target
) {
1940 if (sent_ready_to_merge_source
.count(p
.second
) == 0) {
1941 monc
->send_mon_message(new MOSDPGReadyToMerge(
1945 osdmap
->get_epoch()));
1946 sent_ready_to_merge_source
.insert(p
.second
);
1949 for (auto src
: ready_to_merge_source
) {
1950 if (not_ready_to_merge_source
.count(src
.first
) ||
1951 not_ready_to_merge_target
.count(src
.first
.get_parent())) {
1954 auto p
= ready_to_merge_target
.find(src
.first
.get_parent());
1955 if (p
!= ready_to_merge_target
.end() &&
1956 sent_ready_to_merge_source
.count(src
.first
) == 0) {
1957 monc
->send_mon_message(new MOSDPGReadyToMerge(
1958 src
.first
, // source pgid
1959 src
.second
, // src version
1960 std::get
<0>(p
->second
), // target version
1961 std::get
<1>(p
->second
), // PG's last_epoch_started
1962 std::get
<2>(p
->second
), // PG's last_epoch_clean
1964 osdmap
->get_epoch()));
1965 sent_ready_to_merge_source
.insert(src
.first
);
1970 void OSDService::clear_ready_to_merge(PG
*pg
)
1972 std::lock_guard
l(merge_lock
);
1973 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1974 ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
1975 ready_to_merge_target
.erase(pg
->pg_id
.pgid
);
1976 not_ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
1977 not_ready_to_merge_target
.erase(pg
->pg_id
.pgid
);
1978 sent_ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
1981 void OSDService::clear_sent_ready_to_merge()
1983 std::lock_guard
l(merge_lock
);
1984 sent_ready_to_merge_source
.clear();
1987 void OSDService::prune_sent_ready_to_merge(const OSDMapRef
& osdmap
)
1989 std::lock_guard
l(merge_lock
);
1990 auto i
= sent_ready_to_merge_source
.begin();
1991 while (i
!= sent_ready_to_merge_source
.end()) {
1992 if (!osdmap
->pg_exists(*i
)) {
1993 dout(10) << __func__
<< " " << *i
<< dendl
;
1994 i
= sent_ready_to_merge_source
.erase(i
);
2003 void OSDService::_queue_for_recovery(
2004 std::pair
<epoch_t
, PGRef
> p
,
2005 uint64_t reserved_pushes
)
2007 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock
));
2010 unique_ptr
<OpSchedulerItem::OpQueueable
>(
2012 p
.second
->get_pgid(), p
.first
, reserved_pushes
)),
2013 cct
->_conf
->osd_recovery_cost
,
2014 cct
->_conf
->osd_recovery_priority
,
2020 // ====================================================================
2024 #define dout_prefix *_dout
2026 // Commands shared between OSD's console and admin console:
2027 namespace ceph::osd_cmds
{
2029 int heap(CephContext
& cct
, const cmdmap_t
& cmdmap
, Formatter
& f
, std::ostream
& os
);
2031 } // namespace ceph::osd_cmds
2033 int OSD::mkfs(CephContext
*cct
, ObjectStore
*store
, uuid_d fsid
, int whoami
, string osdspec_affinity
)
2039 ObjectStore::CollectionHandle ch
;
2041 // if we are fed a uuid for this osd, use it.
2042 store
->set_fsid(cct
->_conf
->osd_uuid
);
2044 ret
= store
->mkfs();
2046 derr
<< "OSD::mkfs: ObjectStore::mkfs failed with error "
2047 << cpp_strerror(ret
) << dendl
;
2051 store
->set_cache_shards(1); // doesn't matter for mkfs!
2053 ret
= store
->mount();
2055 derr
<< "OSD::mkfs: couldn't mount ObjectStore: error "
2056 << cpp_strerror(ret
) << dendl
;
2060 ch
= store
->open_collection(coll_t::meta());
2062 ret
= store
->read(ch
, OSD_SUPERBLOCK_GOBJECT
, 0, 0, sbbl
);
2064 derr
<< "OSD::mkfs: have meta collection but no superblock" << dendl
;
2067 /* if we already have superblock, check content of superblock */
2068 dout(0) << " have superblock" << dendl
;
2069 auto p
= sbbl
.cbegin();
2071 if (whoami
!= sb
.whoami
) {
2072 derr
<< "provided osd id " << whoami
<< " != superblock's " << sb
.whoami
2077 if (fsid
!= sb
.cluster_fsid
) {
2078 derr
<< "provided cluster fsid " << fsid
2079 << " != superblock's " << sb
.cluster_fsid
<< dendl
;
2084 // create superblock
2085 sb
.cluster_fsid
= fsid
;
2086 sb
.osd_fsid
= store
->get_fsid();
2088 sb
.compat_features
= get_osd_initial_compat_set();
2093 ObjectStore::CollectionHandle ch
= store
->create_new_collection(
2095 ObjectStore::Transaction t
;
2096 t
.create_collection(coll_t::meta(), 0);
2097 t
.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT
, 0, bl
.length(), bl
);
2098 ret
= store
->queue_transaction(ch
, std::move(t
));
2100 derr
<< "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
2101 << "queue_transaction returned " << cpp_strerror(ret
) << dendl
;
2106 ret
= write_meta(cct
, store
, sb
.cluster_fsid
, sb
.osd_fsid
, whoami
, osdspec_affinity
);
2108 derr
<< "OSD::mkfs: failed to write fsid file: error "
2109 << cpp_strerror(ret
) << dendl
;
2123 int OSD::write_meta(CephContext
*cct
, ObjectStore
*store
, uuid_d
& cluster_fsid
, uuid_d
& osd_fsid
, int whoami
, string
& osdspec_affinity
)
2128 snprintf(val
, sizeof(val
), "%s", CEPH_OSD_ONDISK_MAGIC
);
2129 r
= store
->write_meta("magic", val
);
2133 snprintf(val
, sizeof(val
), "%d", whoami
);
2134 r
= store
->write_meta("whoami", val
);
2138 cluster_fsid
.print(val
);
2139 r
= store
->write_meta("ceph_fsid", val
);
2143 string key
= cct
->_conf
.get_val
<string
>("key");
2145 r
= store
->write_meta("osd_key", key
);
2149 string keyfile
= cct
->_conf
.get_val
<string
>("keyfile");
2150 if (!keyfile
.empty()) {
2153 r
= keybl
.read_file(keyfile
.c_str(), &err
);
2155 derr
<< __func__
<< " failed to read keyfile " << keyfile
<< ": "
2156 << err
<< ": " << cpp_strerror(r
) << dendl
;
2159 r
= store
->write_meta("osd_key", keybl
.to_str());
2164 if (!osdspec_affinity
.empty()) {
2165 r
= store
->write_meta("osdspec_affinity", osdspec_affinity
.c_str());
2170 r
= store
->write_meta("ready", "ready");
2177 int OSD::peek_meta(ObjectStore
*store
,
2179 uuid_d
*cluster_fsid
,
2182 ceph_release_t
*require_osd_release
)
2186 int r
= store
->read_meta("magic", &val
);
2191 r
= store
->read_meta("whoami", &val
);
2194 *whoami
= atoi(val
.c_str());
2196 r
= store
->read_meta("ceph_fsid", &val
);
2199 r
= cluster_fsid
->parse(val
.c_str());
2203 r
= store
->read_meta("fsid", &val
);
2205 *osd_fsid
= uuid_d();
2207 r
= osd_fsid
->parse(val
.c_str());
2212 r
= store
->read_meta("require_osd_release", &val
);
2214 *require_osd_release
= ceph_release_from_name(val
);
2222 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
// NOTE(review): this span is extraction-garbled text — logical lines are
// split across physical lines, the original file's line numbers (2226…) are
// fused into the content, and several initializer-list entries (e.g. the
// MonClient* parameter referenced as `mc`, and the member initialized by the
// make_timespan timeout pair) were dropped by the extraction.  Left
// byte-identical; recover the real text from version control.
// --- OSD constructor: wires messengers, perf counters, op tracker and
// --- per-shard work queues; `mc` is presumably the MonClient* parameter
// --- lost by the extraction — TODO confirm against VCS.
2226 OSD::OSD(CephContext
*cct_
, ObjectStore
*store_
,
2228 Messenger
*internal_messenger
,
2229 Messenger
*external_messenger
,
2230 Messenger
*hb_client_front
,
2231 Messenger
*hb_client_back
,
2232 Messenger
*hb_front_serverm
,
2233 Messenger
*hb_back_serverm
,
2234 Messenger
*osdc_messenger
,
2236 const std::string
&dev
, const std::string
&jdev
,
2237 ceph::async::io_context_pool
& poolctx
) :
// --- member initializer list (entries missing between fused line numbers) ---
2239 tick_timer(cct
, osd_lock
),
2240 tick_timer_without_osd_lock(cct
, tick_timer_lock
),
2241 gss_ktfile_client(cct
->_conf
.get_val
<std::string
>("gss_ktab_client_file")),
2242 cluster_messenger(internal_messenger
),
2243 client_messenger(external_messenger
),
2244 objecter_messenger(osdc_messenger
),
2246 mgrc(cct_
, client_messenger
, &mc
->monmap
),
2247 logger(create_logger()),
2248 recoverystate_perf(create_recoverystate_perf()),
2250 log_client(cct
, client_messenger
, &mc
->monmap
, LogClient::NO_FLAGS
),
2251 clog(log_client
.create_channel()),
2253 dev_path(dev
), journal_path(jdev
),
2254 store_is_rotational(store
->is_rotational()),
2255 trace_endpoint("0.0.0.0", 0, "osd"),
2257 m_osd_pg_epoch_max_lag_factor(cct
->_conf
.get_val
<double>(
2258 "osd_pg_epoch_max_lag_factor")),
2259 osd_compat(get_osd_compat_set()),
2260 osd_op_tp(cct
, "OSD::osd_op_tp", "tp_osd_tp",
2261 get_num_op_threads()),
2262 heartbeat_stop(false),
2263 heartbeat_need_update(true),
2264 hb_front_client_messenger(hb_client_front
),
2265 hb_back_client_messenger(hb_client_back
),
2266 hb_front_server_messenger(hb_front_serverm
),
2267 hb_back_server_messenger(hb_back_serverm
),
2269 heartbeat_thread(this),
2270 heartbeat_dispatcher(this),
2271 op_tracker(cct
, cct
->_conf
->osd_enable_op_tracker
,
2272 cct
->_conf
->osd_num_op_tracker_shard
),
2273 test_ops_hook(NULL
),
// --- these two timeouts presumably initialize the sharded op wq member
// --- whose name was dropped by the extraction — TODO confirm.
2276 ceph::make_timespan(cct
->_conf
->osd_op_thread_timeout
),
2277 ceph::make_timespan(cct
->_conf
->osd_op_thread_suicide_timeout
),
2279 last_pg_create_epoch(0),
2282 requested_full_first(0),
2283 requested_full_last(0),
2284 service(this, poolctx
)
// --- constructor body ---
2287 if (!gss_ktfile_client
.empty()) {
2288 // Assert we can export environment variable
2290 The default client keytab is used, if it is present and readable,
2291 to automatically obtain initial credentials for GSSAPI client
2292 applications. The principal name of the first entry in the client
2293 keytab is used by default when obtaining initial credentials.
2294 1. The KRB5_CLIENT_KTNAME environment variable.
2295 2. The default_client_keytab_name profile variable in [libdefaults].
2296 3. The hardcoded default, DEFCKTNAME.
2298 const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
2299 gss_ktfile_client
.c_str(), 1));
2300 ceph_assert(set_result
== 0);
2303 monc
->set_messenger(client_messenger
);
// configure op tracker complaint/history thresholds from config
2304 op_tracker
.set_complaint_and_threshold(cct
->_conf
->osd_op_complaint_time
,
2305 cct
->_conf
->osd_op_log_threshold
);
2306 op_tracker
.set_history_size_and_duration(cct
->_conf
->osd_op_history_size
,
2307 cct
->_conf
->osd_op_history_duration
);
2308 op_tracker
.set_history_slow_op_size_and_threshold(cct
->_conf
->osd_op_history_slow_op_size
,
2309 cct
->_conf
->osd_op_history_slow_op_threshold
);
2310 ObjectCleanRegions::set_max_num_intervals(cct
->_conf
->osd_object_clean_region_max_num_intervals
);
// name the trace endpoint after this OSD id
2312 std::stringstream ss
;
2313 ss
<< "osd." << whoami
;
2314 trace_endpoint
.copy_name(ss
.str());
2317 // initialize shards
2318 num_shards
= get_num_op_shards();
2319 for (uint32_t i
= 0; i
< num_shards
; i
++) {
2320 OSDShard
*one_shard
= new OSDShard(
2324 shards
.push_back(one_shard
);
2327 // override some config options if mclock is enabled on all the shards
2328 maybe_override_options_for_qos();
// --- destructor fragment (OSD::~OSD header lost by the extraction):
// --- tears down shards and unregisters/deletes perf counters.
2333 while (!shards
.empty()) {
2334 delete shards
.back();
2337 cct
->get_perfcounters_collection()->remove(recoverystate_perf
);
2338 cct
->get_perfcounters_collection()->remove(logger
);
2339 delete recoverystate_perf
;
;
2344 double OSD::get_tick_interval() const
2346 // vary +/- 5% to avoid scrub scheduling livelocks
2347 constexpr auto delta
= 0.05;
2348 return (OSD_TICK_INTERVAL
*
2349 ceph::util::generate_random_number(1.0 - delta
, 1.0 + delta
));
2352 void OSD::handle_signal(int signum
)
2354 ceph_assert(signum
== SIGINT
|| signum
== SIGTERM
);
2355 derr
<< "*** Got signal " << sig_str(signum
) << " ***" << dendl
;
2361 std::lock_guard
lock(osd_lock
);
2365 if (store
->test_mount_in_use()) {
2366 derr
<< "OSD::pre_init: object store '" << dev_path
<< "' is "
2367 << "currently in use. (Is ceph-osd already running?)" << dendl
;
2371 cct
->_conf
.add_observer(this);
2375 int OSD::set_numa_affinity()
2377 // storage numa node
2378 int store_node
= -1;
2379 store
->get_numa_node(&store_node
, nullptr, nullptr);
2380 if (store_node
>= 0) {
2381 dout(1) << __func__
<< " storage numa node " << store_node
<< dendl
;
2384 // check network numa node(s)
2385 int front_node
= -1, back_node
= -1;
2386 string front_iface
= pick_iface(
2388 client_messenger
->get_myaddrs().front().get_sockaddr_storage());
2389 string back_iface
= pick_iface(
2391 cluster_messenger
->get_myaddrs().front().get_sockaddr_storage());
2392 int r
= get_iface_numa_node(front_iface
, &front_node
);
2393 if (r
>= 0 && front_node
>= 0) {
2394 dout(1) << __func__
<< " public network " << front_iface
<< " numa node "
2395 << front_node
<< dendl
;
2396 r
= get_iface_numa_node(back_iface
, &back_node
);
2397 if (r
>= 0 && back_node
>= 0) {
2398 dout(1) << __func__
<< " cluster network " << back_iface
<< " numa node "
2399 << back_node
<< dendl
;
2400 if (front_node
== back_node
&&
2401 front_node
== store_node
) {
2402 dout(1) << " objectstore and network numa nodes all match" << dendl
;
2403 if (g_conf().get_val
<bool>("osd_numa_auto_affinity")) {
2404 numa_node
= front_node
;
2406 } else if (front_node
!= back_node
) {
2407 dout(1) << __func__
<< " public and cluster network numa nodes do not match"
2410 dout(1) << __func__
<< " objectstore and network numa nodes do not match"
2413 } else if (back_node
== -2) {
2414 dout(1) << __func__
<< " cluster network " << back_iface
2415 << " ports numa nodes do not match" << dendl
;
2417 derr
<< __func__
<< " unable to identify cluster interface '" << back_iface
2418 << "' numa node: " << cpp_strerror(r
) << dendl
;
2420 } else if (front_node
== -2) {
2421 dout(1) << __func__
<< " public network " << front_iface
2422 << " ports numa nodes do not match" << dendl
;
2424 derr
<< __func__
<< " unable to identify public interface '" << front_iface
2425 << "' numa node: " << cpp_strerror(r
) << dendl
;
2427 if (int node
= g_conf().get_val
<int64_t>("osd_numa_node"); node
>= 0) {
2428 // this takes precedence over the automagic logic above
2431 if (numa_node
>= 0) {
2432 int r
= get_numa_node_cpu_set(numa_node
, &numa_cpu_set_size
, &numa_cpu_set
);
2434 dout(1) << __func__
<< " unable to determine numa node " << numa_node
2435 << " CPUs" << dendl
;
2438 dout(1) << __func__
<< " setting numa affinity to node " << numa_node
2440 << cpu_set_to_str_list(numa_cpu_set_size
, &numa_cpu_set
)
2442 r
= set_cpu_affinity_all_threads(numa_cpu_set_size
, &numa_cpu_set
);
2445 derr
<< __func__
<< " failed to set numa affinity: " << cpp_strerror(r
)
2451 dout(1) << __func__
<< " not setting numa affinity" << dendl
;
2458 class OSDSocketHook
: public AdminSocketHook
{
2461 explicit OSDSocketHook(OSD
*o
) : osd(o
) {}
2462 int call(std::string_view prefix
, const cmdmap_t
& cmdmap
,
2465 bufferlist
& out
) override
{
2466 ceph_abort("should use async hook");
2469 std::string_view prefix
,
2470 const cmdmap_t
& cmdmap
,
2472 const bufferlist
& inbl
,
2473 std::function
<void(int,const std::string
&,bufferlist
&)> on_finish
) override
{
2475 osd
->asok_command(prefix
, cmdmap
, f
, inbl
, on_finish
);
2476 } catch (const TOPNSPC::common::bad_cmd_get
& e
) {
2478 on_finish(-EINVAL
, e
.what(), empty
);
2483 std::set
<int64_t> OSD::get_mapped_pools()
2485 std::set
<int64_t> pools
;
2486 std::vector
<spg_t
> pgids
;
2488 for (const auto &pgid
: pgids
) {
2489 pools
.insert(pgid
.pool());
2494 void OSD::asok_command(
2495 std::string_view prefix
, const cmdmap_t
& cmdmap
,
2497 const bufferlist
& inbl
,
2498 std::function
<void(int,const std::string
&,bufferlist
&)> on_finish
)
2501 stringstream ss
; // stderr error message stream
2502 bufferlist outbl
; // if empty at end, we'll dump formatter as output
2504 // --- PG commands are routed here to PG::do_command ---
2505 if (prefix
== "pg" ||
2506 prefix
== "query" ||
2507 prefix
== "mark_unfound_lost" ||
2508 prefix
== "list_unfound" ||
2509 prefix
== "scrub" ||
2510 prefix
== "deep_scrub"
2514 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
2515 ss
<< "no pgid specified";
2519 if (!pgid
.parse(pgidstr
.c_str())) {
2520 ss
<< "couldn't parse pgid '" << pgidstr
<< "'";
2526 if (get_osdmap()->get_primary_shard(pgid
, &pcand
) &&
2527 (pg
= _lookup_lock_pg(pcand
))) {
2528 if (pg
->is_primary()) {
2529 cmdmap_t new_cmdmap
= cmdmap
;
2531 pg
->do_command(prefix
, new_cmdmap
, inbl
, on_finish
);
2533 return; // the pg handler calls on_finish directly
2534 } catch (const TOPNSPC::common::bad_cmd_get
& e
) {
2541 ss
<< "not primary for pgid " << pgid
;
2542 // do not reply; they will get newer maps and realize they
2549 ss
<< "i don't have pgid " << pgid
;
2554 // --- OSD commands follow ---
2556 else if (prefix
== "status") {
2557 lock_guard
l(osd_lock
);
2558 f
->open_object_section("status");
2559 f
->dump_stream("cluster_fsid") << superblock
.cluster_fsid
;
2560 f
->dump_stream("osd_fsid") << superblock
.osd_fsid
;
2561 f
->dump_unsigned("whoami", superblock
.whoami
);
2562 f
->dump_string("state", get_state_name(get_state()));
2563 f
->dump_unsigned("oldest_map", superblock
.oldest_map
);
2564 f
->dump_unsigned("newest_map", superblock
.newest_map
);
2565 f
->dump_unsigned("num_pgs", num_pgs
);
2567 } else if (prefix
== "flush_journal") {
2568 store
->flush_journal();
2569 } else if (prefix
== "dump_ops_in_flight" ||
2571 prefix
== "dump_blocked_ops" ||
2572 prefix
== "dump_historic_ops" ||
2573 prefix
== "dump_historic_ops_by_duration" ||
2574 prefix
== "dump_historic_slow_ops") {
2576 const string error_str
= "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2577 even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2578 will start to track new ops received afterwards.";
2580 set
<string
> filters
;
2581 vector
<string
> filter_str
;
2582 if (cmd_getval(cmdmap
, "filterstr", filter_str
)) {
2583 copy(filter_str
.begin(), filter_str
.end(),
2584 inserter(filters
, filters
.end()));
2587 if (prefix
== "dump_ops_in_flight" ||
2589 if (!op_tracker
.dump_ops_in_flight(f
, false, filters
)) {
2595 if (prefix
== "dump_blocked_ops") {
2596 if (!op_tracker
.dump_ops_in_flight(f
, true, filters
)) {
2602 if (prefix
== "dump_historic_ops") {
2603 if (!op_tracker
.dump_historic_ops(f
, false, filters
)) {
2609 if (prefix
== "dump_historic_ops_by_duration") {
2610 if (!op_tracker
.dump_historic_ops(f
, true, filters
)) {
2616 if (prefix
== "dump_historic_slow_ops") {
2617 if (!op_tracker
.dump_historic_slow_ops(f
, filters
)) {
2623 } else if (prefix
== "dump_op_pq_state") {
2624 f
->open_object_section("pq");
2625 op_shardedwq
.dump(f
);
2627 } else if (prefix
== "dump_blocklist") {
2628 list
<pair
<entity_addr_t
,utime_t
> > bl
;
2629 OSDMapRef curmap
= service
.get_osdmap();
2631 f
->open_array_section("blocklist");
2632 curmap
->get_blocklist(&bl
);
2633 for (list
<pair
<entity_addr_t
,utime_t
> >::iterator it
= bl
.begin();
2634 it
!= bl
.end(); ++it
) {
2635 f
->open_object_section("entry");
2636 f
->open_object_section("entity_addr_t");
2638 f
->close_section(); //entity_addr_t
2639 it
->second
.localtime(f
->dump_stream("expire_time"));
2640 f
->close_section(); //entry
2642 f
->close_section(); //blocklist
2643 } else if (prefix
== "dump_watchers") {
2644 list
<obj_watch_item_t
> watchers
;
2648 for (auto& pg
: pgs
) {
2649 list
<obj_watch_item_t
> pg_watchers
;
2650 pg
->get_watchers(&pg_watchers
);
2651 watchers
.splice(watchers
.end(), pg_watchers
);
2654 f
->open_array_section("watchers");
2655 for (list
<obj_watch_item_t
>::iterator it
= watchers
.begin();
2656 it
!= watchers
.end(); ++it
) {
2658 f
->open_object_section("watch");
2660 f
->dump_string("namespace", it
->obj
.nspace
);
2661 f
->dump_string("object", it
->obj
.oid
.name
);
2663 f
->open_object_section("entity_name");
2664 it
->wi
.name
.dump(f
);
2665 f
->close_section(); //entity_name_t
2667 f
->dump_unsigned("cookie", it
->wi
.cookie
);
2668 f
->dump_unsigned("timeout", it
->wi
.timeout_seconds
);
2670 f
->open_object_section("entity_addr_t");
2671 it
->wi
.addr
.dump(f
);
2672 f
->close_section(); //entity_addr_t
2674 f
->close_section(); //watch
2677 f
->close_section(); //watchers
2678 } else if (prefix
== "dump_recovery_reservations") {
2679 f
->open_object_section("reservations");
2680 f
->open_object_section("local_reservations");
2681 service
.local_reserver
.dump(f
);
2683 f
->open_object_section("remote_reservations");
2684 service
.remote_reserver
.dump(f
);
2687 } else if (prefix
== "dump_scrub_reservations") {
2688 f
->open_object_section("scrub_reservations");
2689 service
.dump_scrub_reservations(f
);
2691 } else if (prefix
== "get_latest_osdmap") {
2692 get_latest_osdmap();
2693 } else if (prefix
== "set_heap_property") {
2697 bool success
= false;
2698 if (!cmd_getval(cmdmap
, "property", property
)) {
2699 error
= "unable to get property";
2701 } else if (!cmd_getval(cmdmap
, "value", value
)) {
2702 error
= "unable to get value";
2704 } else if (value
< 0) {
2705 error
= "negative value not allowed";
2707 } else if (!ceph_heap_set_numeric_property(property
.c_str(), (size_t)value
)) {
2708 error
= "invalid property";
2713 f
->open_object_section("result");
2714 f
->dump_string("error", error
);
2715 f
->dump_bool("success", success
);
2717 } else if (prefix
== "get_heap_property") {
2721 bool success
= false;
2722 if (!cmd_getval(cmdmap
, "property", property
)) {
2723 error
= "unable to get property";
2725 } else if (!ceph_heap_get_numeric_property(property
.c_str(), &value
)) {
2726 error
= "invalid property";
2731 f
->open_object_section("result");
2732 f
->dump_string("error", error
);
2733 f
->dump_bool("success", success
);
2734 f
->dump_int("value", value
);
2736 } else if (prefix
== "dump_objectstore_kv_stats") {
2737 store
->get_db_statistics(f
);
2738 } else if (prefix
== "dump_scrubs") {
2739 service
.dumps_scrub(f
);
2740 } else if (prefix
== "calc_objectstore_db_histogram") {
2741 store
->generate_db_histogram(f
);
2742 } else if (prefix
== "flush_store_cache") {
2743 store
->flush_cache(&ss
);
2744 } else if (prefix
== "dump_pgstate_history") {
2745 f
->open_object_section("pgstate_history");
2746 f
->open_array_section("pgs");
2749 for (auto& pg
: pgs
) {
2750 f
->open_object_section("pg");
2751 f
->dump_stream("pg") << pg
->pg_id
;
2752 f
->dump_string("currently", pg
->get_current_state());
2753 pg
->dump_pgstate_history(f
);
2758 } else if (prefix
== "compact") {
2759 dout(1) << "triggering manual compaction" << dendl
;
2760 auto start
= ceph::coarse_mono_clock::now();
2762 auto end
= ceph::coarse_mono_clock::now();
2763 double duration
= std::chrono::duration
<double>(end
-start
).count();
2764 dout(1) << "finished manual compaction in "
2766 << " seconds" << dendl
;
2767 f
->open_object_section("compact_result");
2768 f
->dump_float("elapsed_time", duration
);
2770 } else if (prefix
== "get_mapped_pools") {
2771 f
->open_array_section("mapped_pools");
2772 set
<int64_t> poollist
= get_mapped_pools();
2773 for (auto pool
: poollist
) {
2774 f
->dump_int("pool_id", pool
);
2777 } else if (prefix
== "smart") {
2779 cmd_getval(cmdmap
, "devid", devid
);
2781 probe_smart(devid
, out
);
2782 outbl
.append(out
.str());
2783 } else if (prefix
== "list_devices") {
2784 set
<string
> devnames
;
2785 store
->get_devices(&devnames
);
2786 f
->open_array_section("list_devices");
2787 for (auto dev
: devnames
) {
2788 if (dev
.find("dm-") == 0) {
2792 f
->open_object_section("device");
2793 f
->dump_string("device", "/dev/" + dev
);
2794 f
->dump_string("device_id", get_device_id(dev
, &err
));
2798 } else if (prefix
== "send_beacon") {
2799 lock_guard
l(osd_lock
);
2801 send_beacon(ceph::coarse_mono_clock::now());
2805 else if (prefix
== "cluster_log") {
2807 cmd_getval(cmdmap
, "message", msg
);
2810 ss
<< "ignoring empty log message";
2813 string message
= msg
.front();
2814 for (vector
<string
>::iterator a
= ++msg
.begin(); a
!= msg
.end(); ++a
)
2815 message
+= " " + *a
;
2817 cmd_getval(cmdmap
, "level", lvl
);
2818 clog_type level
= string_to_clog_type(lvl
);
2821 ss
<< "unknown level '" << lvl
<< "'";
2824 clog
->do_log(level
, message
);
2827 else if (prefix
== "bench") {
2830 int64_t osize
, onum
;
2831 // default count 1G, size 4MB
2832 cmd_getval(cmdmap
, "count", count
, (int64_t)1 << 30);
2833 cmd_getval(cmdmap
, "size", bsize
, (int64_t)4 << 20);
2834 cmd_getval(cmdmap
, "object_size", osize
, (int64_t)0);
2835 cmd_getval(cmdmap
, "object_num", onum
, (int64_t)0);
2837 uint32_t duration
= cct
->_conf
->osd_bench_duration
;
2839 if (bsize
> (int64_t) cct
->_conf
->osd_bench_max_block_size
) {
2840 // let us limit the block size because the next checks rely on it
2841 // having a sane value. If we allow any block size to be set things
2842 // can still go sideways.
2843 ss
<< "block 'size' values are capped at "
2844 << byte_u_t(cct
->_conf
->osd_bench_max_block_size
) << ". If you wish to use"
2845 << " a higher value, please adjust 'osd_bench_max_block_size'";
2848 } else if (bsize
< (int64_t) (1 << 20)) {
2849 // entering the realm of small block sizes.
2850 // limit the count to a sane value, assuming a configurable amount of
2851 // IOPS and duration, so that the OSD doesn't get hung up on this,
2852 // preventing timeouts from going off
2854 bsize
* duration
* cct
->_conf
->osd_bench_small_size_max_iops
;
2855 if (count
> max_count
) {
2856 ss
<< "'count' values greater than " << max_count
2857 << " for a block size of " << byte_u_t(bsize
) << ", assuming "
2858 << cct
->_conf
->osd_bench_small_size_max_iops
<< " IOPS,"
2859 << " for " << duration
<< " seconds,"
2860 << " can cause ill effects on osd. "
2861 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
2862 << " value if you wish to use a higher 'count'.";
2867 // 1MB block sizes are big enough so that we get more stuff done.
2868 // However, to avoid the osd from getting hung on this and having
2869 // timers being triggered, we are going to limit the count assuming
2870 // a configurable throughput and duration.
2871 // NOTE: max_count is the total amount of bytes that we believe we
2872 // will be able to write during 'duration' for the given
2873 // throughput. The block size hardly impacts this unless it's
2874 // way too big. Given we already check how big the block size
2875 // is, it's safe to assume everything will check out.
2877 cct
->_conf
->osd_bench_large_size_max_throughput
* duration
;
2878 if (count
> max_count
) {
2879 ss
<< "'count' values greater than " << max_count
2880 << " for a block size of " << byte_u_t(bsize
) << ", assuming "
2881 << byte_u_t(cct
->_conf
->osd_bench_large_size_max_throughput
) << "/s,"
2882 << " for " << duration
<< " seconds,"
2883 << " can cause ill effects on osd. "
2884 << " Please adjust 'osd_bench_large_size_max_throughput'"
2885 << " with a higher value if you wish to use a higher 'count'.";
2891 if (osize
&& bsize
> osize
)
2894 dout(1) << " bench count " << count
2895 << " bsize " << byte_u_t(bsize
) << dendl
;
2897 ObjectStore::Transaction cleanupt
;
2899 if (osize
&& onum
) {
2901 bufferptr
bp(osize
);
2903 bl
.push_back(std::move(bp
));
2904 bl
.rebuild_page_aligned();
2905 for (int i
=0; i
<onum
; ++i
) {
2907 snprintf(nm
, sizeof(nm
), "disk_bw_test_%d", i
);
2909 hobject_t
soid(sobject_t(oid
, 0));
2910 ObjectStore::Transaction t
;
2911 t
.write(coll_t(), ghobject_t(soid
), 0, osize
, bl
);
2912 store
->queue_transaction(service
.meta_ch
, std::move(t
), NULL
);
2913 cleanupt
.remove(coll_t(), ghobject_t(soid
));
2918 bufferptr
bp(bsize
);
2920 bl
.push_back(std::move(bp
));
2921 bl
.rebuild_page_aligned();
2925 if (!service
.meta_ch
->flush_commit(&waiter
)) {
2930 utime_t start
= ceph_clock_now();
2931 for (int64_t pos
= 0; pos
< count
; pos
+= bsize
) {
2933 unsigned offset
= 0;
2934 if (onum
&& osize
) {
2935 snprintf(nm
, sizeof(nm
), "disk_bw_test_%d", (int)(rand() % onum
));
2936 offset
= rand() % (osize
/ bsize
) * bsize
;
2938 snprintf(nm
, sizeof(nm
), "disk_bw_test_%lld", (long long)pos
);
2941 hobject_t
soid(sobject_t(oid
, 0));
2942 ObjectStore::Transaction t
;
2943 t
.write(coll_t::meta(), ghobject_t(soid
), offset
, bsize
, bl
);
2944 store
->queue_transaction(service
.meta_ch
, std::move(t
), NULL
);
2945 if (!onum
|| !osize
)
2946 cleanupt
.remove(coll_t::meta(), ghobject_t(soid
));
2951 if (!service
.meta_ch
->flush_commit(&waiter
)) {
2955 utime_t end
= ceph_clock_now();
2958 store
->queue_transaction(service
.meta_ch
, std::move(cleanupt
), NULL
);
2961 if (!service
.meta_ch
->flush_commit(&waiter
)) {
2966 double elapsed
= end
- start
;
2967 double rate
= count
/ elapsed
;
2968 double iops
= rate
/ bsize
;
2969 f
->open_object_section("osd_bench_results");
2970 f
->dump_int("bytes_written", count
);
2971 f
->dump_int("blocksize", bsize
);
2972 f
->dump_float("elapsed_sec", elapsed
);
2973 f
->dump_float("bytes_per_sec", rate
);
2974 f
->dump_float("iops", iops
);
2978 else if (prefix
== "flush_pg_stats") {
2979 mgrc
.send_pgstats();
2980 f
->dump_unsigned("stat_seq", service
.get_osd_stat_seq());
2983 else if (prefix
== "heap") {
2984 ret
= ceph::osd_cmds::heap(*cct
, cmdmap
, *f
, ss
);
2987 else if (prefix
== "debug dump_missing") {
2988 f
->open_array_section("pgs");
2991 for (auto& pg
: pgs
) {
2992 string s
= stringify(pg
->pg_id
);
2993 f
->open_array_section(s
.c_str());
2995 pg
->dump_missing(f
);
3002 else if (prefix
== "debug kick_recovery_wq") {
3004 cmd_getval(cmdmap
, "delay", delay
);
3007 ret
= cct
->_conf
.set_val("osd_recovery_delay_start", oss
.str().c_str());
3009 ss
<< "kick_recovery_wq: error setting "
3010 << "osd_recovery_delay_start to '" << delay
<< "': error "
3014 cct
->_conf
.apply_changes(nullptr);
3015 ss
<< "kicking recovery queue. set osd_recovery_delay_start "
3016 << "to " << cct
->_conf
->osd_recovery_delay_start
;
3019 else if (prefix
== "cpu_profiler") {
3022 cmd_getval(cmdmap
, "arg", arg
);
3023 vector
<string
> argvec
;
3024 get_str_vec(arg
, argvec
);
3025 cpu_profiler_handle_command(argvec
, ds
);
3026 outbl
.append(ds
.str());
3029 else if (prefix
== "dump_pg_recovery_stats") {
3030 lock_guard
l(osd_lock
);
3031 pg_recovery_stats
.dump_formatted(f
);
3034 else if (prefix
== "reset_pg_recovery_stats") {
3035 lock_guard
l(osd_lock
);
3036 pg_recovery_stats
.reset();
3039 else if (prefix
== "perf histogram dump") {
3041 std::string counter
;
3042 cmd_getval(cmdmap
, "logger", logger
);
3043 cmd_getval(cmdmap
, "counter", counter
);
3044 cct
->get_perfcounters_collection()->dump_formatted_histograms(
3045 f
, false, logger
, counter
);
3048 else if (prefix
== "cache drop") {
3049 lock_guard
l(osd_lock
);
3050 dout(20) << "clearing all caches" << dendl
;
3051 // Clear the objectstore's cache - onode and buffer for Bluestore,
3052 // system's pagecache for Filestore
3053 ret
= store
->flush_cache(&ss
);
3055 ss
<< "Error flushing objectstore cache: " << cpp_strerror(ret
);
3058 // Clear the objectcontext cache (per PG)
3061 for (auto& pg
: pgs
) {
3066 else if (prefix
== "cache status") {
3067 lock_guard
l(osd_lock
);
3068 int obj_ctx_count
= 0;
3071 for (auto& pg
: pgs
) {
3072 obj_ctx_count
+= pg
->get_cache_obj_count();
3074 f
->open_object_section("cache_status");
3075 f
->dump_int("object_ctx", obj_ctx_count
);
3076 store
->dump_cache_stats(f
);
3080 else if (prefix
== "scrub_purged_snaps") {
3081 lock_guard
l(osd_lock
);
3082 scrub_purged_snaps();
3085 else if (prefix
== "dump_osd_network") {
3086 lock_guard
l(osd_lock
);
3088 if (!(cmd_getval(cmdmap
, "value", value
))) {
3089 // Convert milliseconds to microseconds
3090 value
= static_cast<double>(g_conf().get_val
<double>(
3091 "mon_warn_on_slow_ping_time")) * 1000;
3093 double ratio
= g_conf().get_val
<double>("mon_warn_on_slow_ping_ratio");
3094 value
= g_conf().get_val
<int64_t>("osd_heartbeat_grace");
3095 value
*= 1000000 * ratio
; // Seconds of grace to microseconds at ratio
3098 // Convert user input to microseconds
3101 if (value
< 0) value
= 0;
3103 struct osd_ping_time_t
{
3107 std::array
<uint32_t,3> times
;
3108 std::array
<uint32_t,3> min
;
3109 std::array
<uint32_t,3> max
;
3111 uint32_t last_update
;
3113 bool operator<(const osd_ping_time_t
& rhs
) const {
3114 if (pingtime
< rhs
.pingtime
)
3116 if (pingtime
> rhs
.pingtime
)
3126 set
<osd_ping_time_t
> sorted
;
3127 // Get pingtimes under lock and not on the stack
3128 map
<int, osd_stat_t::Interfaces
> *pingtimes
= new map
<int, osd_stat_t::Interfaces
>;
3129 service
.get_hb_pingtime(pingtimes
);
3130 for (auto j
: *pingtimes
) {
3131 if (j
.second
.last_update
== 0)
3133 osd_ping_time_t item
;
3134 item
.pingtime
= std::max(j
.second
.back_pingtime
[0], j
.second
.back_pingtime
[1]);
3135 item
.pingtime
= std::max(item
.pingtime
, j
.second
.back_pingtime
[2]);
3136 if (item
.pingtime
>= value
) {
3138 item
.times
[0] = j
.second
.back_pingtime
[0];
3139 item
.times
[1] = j
.second
.back_pingtime
[1];
3140 item
.times
[2] = j
.second
.back_pingtime
[2];
3141 item
.min
[0] = j
.second
.back_min
[0];
3142 item
.min
[1] = j
.second
.back_min
[1];
3143 item
.min
[2] = j
.second
.back_min
[2];
3144 item
.max
[0] = j
.second
.back_max
[0];
3145 item
.max
[1] = j
.second
.back_max
[1];
3146 item
.max
[2] = j
.second
.back_max
[2];
3147 item
.last
= j
.second
.back_last
;
3149 item
.last_update
= j
.second
.last_update
;
3150 sorted
.emplace(item
);
3152 if (j
.second
.front_last
== 0)
3154 item
.pingtime
= std::max(j
.second
.front_pingtime
[0], j
.second
.front_pingtime
[1]);
3155 item
.pingtime
= std::max(item
.pingtime
, j
.second
.front_pingtime
[2]);
3156 if (item
.pingtime
>= value
) {
3158 item
.times
[0] = j
.second
.front_pingtime
[0];
3159 item
.times
[1] = j
.second
.front_pingtime
[1];
3160 item
.times
[2] = j
.second
.front_pingtime
[2];
3161 item
.min
[0] = j
.second
.front_min
[0];
3162 item
.min
[1] = j
.second
.front_min
[1];
3163 item
.min
[2] = j
.second
.front_min
[2];
3164 item
.max
[0] = j
.second
.front_max
[0];
3165 item
.max
[1] = j
.second
.front_max
[1];
3166 item
.max
[2] = j
.second
.front_max
[2];
3167 item
.last
= j
.second
.front_last
;
3168 item
.last_update
= j
.second
.last_update
;
3170 sorted
.emplace(item
);
3175 // Network ping times (1min 5min 15min)
3176 f
->open_object_section("network_ping_times");
3177 f
->dump_int("threshold", value
/ 1000);
3178 f
->open_array_section("entries");
3179 for (auto &sitem
: boost::adaptors::reverse(sorted
)) {
3180 ceph_assert(sitem
.pingtime
>= value
);
3181 f
->open_object_section("entry");
3183 const time_t lu(sitem
.last_update
);
3185 string
lustr(ctime_r(&lu
, buffer
));
3186 lustr
.pop_back(); // Remove trailing \n
3187 auto stale
= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_stale");
3188 f
->dump_string("last update", lustr
);
3189 f
->dump_bool("stale", ceph_clock_now().sec() - sitem
.last_update
> stale
);
3190 f
->dump_int("from osd", whoami
);
3191 f
->dump_int("to osd", sitem
.to
);
3192 f
->dump_string("interface", (sitem
.back
? "back" : "front"));
3193 f
->open_object_section("average");
3194 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.times
[0],3).c_str());
3195 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.times
[1],3).c_str());
3196 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.times
[2],3).c_str());
3197 f
->close_section(); // average
3198 f
->open_object_section("min");
3199 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.max
[0],3).c_str());
3200 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.max
[1],3).c_str());
3201 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.max
[2],3).c_str());
3202 f
->close_section(); // min
3203 f
->open_object_section("max");
3204 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.max
[0],3).c_str());
3205 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.max
[1],3).c_str());
3206 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.max
[2],3).c_str());
3207 f
->close_section(); // max
3208 f
->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem
.last
,3).c_str());
3209 f
->close_section(); // entry
3211 f
->close_section(); // entries
3212 f
->close_section(); // network_ping_times
3214 ceph_abort_msg("broken asok registration");
3218 on_finish(ret
, ss
.str(), outbl
);
3221 class TestOpsSocketHook
: public AdminSocketHook
{
3222 OSDService
*service
;
3225 TestOpsSocketHook(OSDService
*s
, ObjectStore
*st
) : service(s
), store(st
) {}
3226 int call(std::string_view command
, const cmdmap_t
& cmdmap
,
3228 std::ostream
& errss
,
3229 bufferlist
& out
) override
{
3233 test_ops(service
, store
, command
, cmdmap
, outss
);
3235 } catch (const TOPNSPC::common::bad_cmd_get
& e
) {
3241 void test_ops(OSDService
*service
, ObjectStore
*store
,
3242 std::string_view command
, const cmdmap_t
& cmdmap
, ostream
&ss
);
3246 class OSD::C_Tick
: public Context
{
3249 explicit C_Tick(OSD
*o
) : osd(o
) {}
3250 void finish(int r
) override
{
3255 class OSD::C_Tick_WithoutOSDLock
: public Context
{
3258 explicit C_Tick_WithoutOSDLock(OSD
*o
) : osd(o
) {}
3259 void finish(int r
) override
{
3260 osd
->tick_without_osd_lock();
3264 int OSD::enable_disable_fuse(bool stop
)
3268 string mntpath
= cct
->_conf
->osd_data
+ "/fuse";
3269 if (fuse_store
&& (stop
|| !cct
->_conf
->osd_objectstore_fuse
)) {
3270 dout(1) << __func__
<< " disabling" << dendl
;
3274 r
= ::rmdir(mntpath
.c_str());
3277 derr
<< __func__
<< " failed to rmdir " << mntpath
<< ": "
3278 << cpp_strerror(r
) << dendl
;
3283 if (!fuse_store
&& cct
->_conf
->osd_objectstore_fuse
) {
3284 dout(1) << __func__
<< " enabling" << dendl
;
3285 r
= ::mkdir(mntpath
.c_str(), 0700);
3288 if (r
< 0 && r
!= -EEXIST
) {
3289 derr
<< __func__
<< " unable to create " << mntpath
<< ": "
3290 << cpp_strerror(r
) << dendl
;
3293 fuse_store
= new FuseStore(store
, mntpath
);
3294 r
= fuse_store
->start();
3296 derr
<< __func__
<< " unable to start fuse: " << cpp_strerror(r
) << dendl
;
3302 #endif // HAVE_LIBFUSE
3306 size_t OSD::get_num_cache_shards()
3308 return cct
->_conf
.get_val
<Option::size_t>("osd_num_cache_shards");
3311 int OSD::get_num_op_shards()
3313 if (cct
->_conf
->osd_op_num_shards
)
3314 return cct
->_conf
->osd_op_num_shards
;
3315 if (store_is_rotational
)
3316 return cct
->_conf
->osd_op_num_shards_hdd
;
3318 return cct
->_conf
->osd_op_num_shards_ssd
;
3321 int OSD::get_num_op_threads()
3323 if (cct
->_conf
->osd_op_num_threads_per_shard
)
3324 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard
;
3325 if (store_is_rotational
)
3326 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard_hdd
;
3328 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard_ssd
;
3331 float OSD::get_osd_recovery_sleep()
3333 if (cct
->_conf
->osd_recovery_sleep
)
3334 return cct
->_conf
->osd_recovery_sleep
;
3335 if (!store_is_rotational
&& !journal_is_rotational
)
3336 return cct
->_conf
->osd_recovery_sleep_ssd
;
3337 else if (store_is_rotational
&& !journal_is_rotational
)
3338 return cct
->_conf
.get_val
<double>("osd_recovery_sleep_hybrid");
3340 return cct
->_conf
->osd_recovery_sleep_hdd
;
3343 float OSD::get_osd_delete_sleep()
3345 float osd_delete_sleep
= cct
->_conf
.get_val
<double>("osd_delete_sleep");
3346 if (osd_delete_sleep
> 0)
3347 return osd_delete_sleep
;
3348 if (!store_is_rotational
&& !journal_is_rotational
)
3349 return cct
->_conf
.get_val
<double>("osd_delete_sleep_ssd");
3350 if (store_is_rotational
&& !journal_is_rotational
)
3351 return cct
->_conf
.get_val
<double>("osd_delete_sleep_hybrid");
3352 return cct
->_conf
.get_val
<double>("osd_delete_sleep_hdd");
3355 int OSD::get_recovery_max_active()
3357 if (cct
->_conf
->osd_recovery_max_active
)
3358 return cct
->_conf
->osd_recovery_max_active
;
3359 if (store_is_rotational
)
3360 return cct
->_conf
->osd_recovery_max_active_hdd
;
3362 return cct
->_conf
->osd_recovery_max_active_ssd
;
3365 float OSD::get_osd_snap_trim_sleep()
3367 float osd_snap_trim_sleep
= cct
->_conf
.get_val
<double>("osd_snap_trim_sleep");
3368 if (osd_snap_trim_sleep
> 0)
3369 return osd_snap_trim_sleep
;
3370 if (!store_is_rotational
&& !journal_is_rotational
)
3371 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_ssd");
3372 if (store_is_rotational
&& !journal_is_rotational
)
3373 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_hybrid");
3374 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_hdd");
3380 CompatSet initial
, diff
;
3381 std::lock_guard
lock(osd_lock
);
3386 tick_timer_without_osd_lock
.init();
3387 service
.recovery_request_timer
.init();
3388 service
.sleep_timer
.init();
3390 boot_finisher
.start();
3394 store
->read_meta("require_osd_release", &val
);
3395 last_require_osd_release
= ceph_release_from_name(val
);
3399 dout(2) << "init " << dev_path
3400 << " (looks like " << (store_is_rotational
? "hdd" : "ssd") << ")"
3402 dout(2) << "journal " << journal_path
<< dendl
;
3403 ceph_assert(store
); // call pre_init() first!
3405 store
->set_cache_shards(get_num_cache_shards());
3407 int r
= store
->mount();
3409 derr
<< "OSD:init: unable to mount object store" << dendl
;
3412 journal_is_rotational
= store
->is_journal_rotational();
3413 dout(2) << "journal looks like " << (journal_is_rotational
? "hdd" : "ssd")
3416 enable_disable_fuse(false);
3418 dout(2) << "boot" << dendl
;
3420 service
.meta_ch
= store
->open_collection(coll_t::meta());
3422 // initialize the daily loadavg with current 15min loadavg
3424 if (getloadavg(loadavgs
, 3) == 3) {
3425 daily_loadavg
= loadavgs
[2];
3427 derr
<< "OSD::init() : couldn't read loadavgs\n" << dendl
;
3428 daily_loadavg
= 1.0;
3431 int rotating_auth_attempts
= 0;
3432 auto rotating_auth_timeout
=
3433 g_conf().get_val
<int64_t>("rotating_keys_bootstrap_timeout");
3435 // sanity check long object name handling
3438 l
.oid
.name
= string(cct
->_conf
->osd_max_object_name_len
, 'n');
3439 l
.set_key(string(cct
->_conf
->osd_max_object_name_len
, 'k'));
3440 l
.nspace
= string(cct
->_conf
->osd_max_object_namespace_len
, 's');
3441 r
= store
->validate_hobject_key(l
);
3443 derr
<< "backend (" << store
->get_type() << ") is unable to support max "
3444 << "object name[space] len" << dendl
;
3445 derr
<< " osd max object name len = "
3446 << cct
->_conf
->osd_max_object_name_len
<< dendl
;
3447 derr
<< " osd max object namespace len = "
3448 << cct
->_conf
->osd_max_object_namespace_len
<< dendl
;
3449 derr
<< cpp_strerror(r
) << dendl
;
3450 if (cct
->_conf
->osd_check_max_object_name_len_on_startup
) {
3453 derr
<< "osd_check_max_object_name_len_on_startup = false, starting anyway"
3456 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl
;
3461 r
= read_superblock();
3463 derr
<< "OSD::init() : unable to read osd superblock" << dendl
;
3468 if (osd_compat
.compare(superblock
.compat_features
) < 0) {
3469 derr
<< "The disk uses features unsupported by the executable." << dendl
;
3470 derr
<< " ondisk features " << superblock
.compat_features
<< dendl
;
3471 derr
<< " daemon features " << osd_compat
<< dendl
;
3473 if (osd_compat
.writeable(superblock
.compat_features
)) {
3474 CompatSet diff
= osd_compat
.unsupported(superblock
.compat_features
);
3475 derr
<< "it is still writeable, though. Missing features: " << diff
<< dendl
;
3480 CompatSet diff
= osd_compat
.unsupported(superblock
.compat_features
);
3481 derr
<< "Cannot write to disk! Missing features: " << diff
<< dendl
;
3487 assert_warn(whoami
== superblock
.whoami
);
3488 if (whoami
!= superblock
.whoami
) {
3489 derr
<< "OSD::init: superblock says osd"
3490 << superblock
.whoami
<< " but I am osd." << whoami
<< dendl
;
3495 startup_time
= ceph::mono_clock::now();
3497 // load up "current" osdmap
3498 assert_warn(!get_osdmap());
3500 derr
<< "OSD::init: unable to read current osdmap" << dendl
;
3504 osdmap
= get_map(superblock
.current_epoch
);
3507 // make sure we don't have legacy pgs deleting
3510 int r
= store
->list_collections(ls
);
3511 ceph_assert(r
>= 0);
3514 if (c
.is_pg(&pgid
) &&
3515 !osdmap
->have_pg_pool(pgid
.pool())) {
3516 ghobject_t oid
= make_final_pool_info_oid(pgid
.pool());
3517 if (!store
->exists(service
.meta_ch
, oid
)) {
3518 derr
<< __func__
<< " missing pg_pool_t for deleted pool "
3519 << pgid
.pool() << " for pg " << pgid
3520 << "; please downgrade to luminous and allow "
3521 << "pg deletion to complete before upgrading" << dendl
;
3528 initial
= get_osd_initial_compat_set();
3529 diff
= superblock
.compat_features
.unsupported(initial
);
3530 if (superblock
.compat_features
.merge(initial
)) {
3531 // Are we adding SNAPMAPPER2?
3532 if (diff
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2
)) {
3533 dout(1) << __func__
<< " upgrade snap_mapper (first start as octopus)"
3535 auto ch
= service
.meta_ch
;
3536 auto hoid
= make_snapmapper_oid();
3537 unsigned max
= cct
->_conf
->osd_target_transaction_size
;
3538 r
= SnapMapper::convert_legacy(cct
, store
, ch
, hoid
, max
);
3542 // We need to persist the new compat_set before we
3544 dout(5) << "Upgrading superblock adding: " << diff
<< dendl
;
3545 ObjectStore::Transaction t
;
3546 write_superblock(t
);
3547 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3552 // make sure snap mapper object exists
3553 if (!store
->exists(service
.meta_ch
, OSD::make_snapmapper_oid())) {
3554 dout(10) << "init creating/touching snapmapper object" << dendl
;
3555 ObjectStore::Transaction t
;
3556 t
.touch(coll_t::meta(), OSD::make_snapmapper_oid());
3557 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3561 if (!store
->exists(service
.meta_ch
, OSD::make_purged_snaps_oid())) {
3562 dout(10) << "init creating/touching purged_snaps object" << dendl
;
3563 ObjectStore::Transaction t
;
3564 t
.touch(coll_t::meta(), OSD::make_purged_snaps_oid());
3565 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3570 if (cct
->_conf
->osd_open_classes_on_start
) {
3571 int r
= ClassHandler::get_instance().open_all_classes();
3573 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r
) << dendl
;
3576 check_osdmap_features();
3579 epoch_t bind_epoch
= osdmap
->get_epoch();
3580 service
.set_epochs(NULL
, NULL
, &bind_epoch
);
3583 clear_temp_objects();
3585 // initialize osdmap references in sharded wq
3586 for (auto& shard
: shards
) {
3587 std::lock_guard
l(shard
->osdmap_lock
);
3588 shard
->shard_osdmap
= osdmap
;
3591 // load up pgs (as they previously existed)
3594 dout(2) << "superblock: I am osd." << superblock
.whoami
<< dendl
;
3596 if (cct
->_conf
.get_val
<bool>("osd_compact_on_start")) {
3597 dout(2) << "compacting object store's omap" << dendl
;
3603 struct store_statfs_t stbuf
;
3604 osd_alert_list_t alerts
;
3605 int r
= store
->statfs(&stbuf
, &alerts
);
3606 ceph_assert(r
== 0);
3607 service
.set_statfs(stbuf
, alerts
);
3610 // client_messenger's auth_client will be set up by monc->init() later.
3611 for (auto m
: { cluster_messenger
,
3613 hb_front_client_messenger
,
3614 hb_back_client_messenger
,
3615 hb_front_server_messenger
,
3616 hb_back_server_messenger
} ) {
3617 m
->set_auth_client(monc
);
3619 for (auto m
: { client_messenger
,
3621 hb_front_server_messenger
,
3622 hb_back_server_messenger
}) {
3623 m
->set_auth_server(monc
);
3625 monc
->set_handle_authentication_dispatcher(this);
3627 monc
->set_want_keys(CEPH_ENTITY_TYPE_MON
| CEPH_ENTITY_TYPE_OSD
3628 | CEPH_ENTITY_TYPE_MGR
);
3633 mgrc
.set_pgstats_cb([this]() { return collect_pg_stats(); });
3634 mgrc
.set_perf_metric_query_cb(
3635 [this](const ConfigPayload
&config_payload
) {
3636 set_perf_queries(config_payload
);
3639 return get_perf_reports();
3643 // tell monc about log_client so it will know about mon session resets
3644 monc
->set_log_client(&log_client
);
3645 update_log_config();
3648 client_messenger
->add_dispatcher_tail(&mgrc
);
3649 client_messenger
->add_dispatcher_tail(this);
3650 cluster_messenger
->add_dispatcher_head(this);
3652 hb_front_client_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3653 hb_back_client_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3654 hb_front_server_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3655 hb_back_server_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3657 objecter_messenger
->add_dispatcher_head(service
.objecter
.get());
3660 service
.publish_map(osdmap
);
3661 service
.publish_superblock(superblock
);
3662 service
.max_oldest_map
= superblock
.oldest_map
;
3664 for (auto& shard
: shards
) {
3665 // put PGs in a temporary set because we may modify pg_slots
3666 // unordered_map below.
3668 for (auto& i
: shard
->pg_slots
) {
3669 PGRef pg
= i
.second
->pg
;
3675 for (auto pg
: pgs
) {
3676 std::scoped_lock l
{*pg
};
3677 set
<pair
<spg_t
,epoch_t
>> new_children
;
3678 set
<pair
<spg_t
,epoch_t
>> merge_pgs
;
3679 service
.identify_splits_and_merges(pg
->get_osdmap(), osdmap
, pg
->pg_id
,
3680 &new_children
, &merge_pgs
);
3681 if (!new_children
.empty()) {
3682 for (auto shard
: shards
) {
3683 shard
->prime_splits(osdmap
, &new_children
);
3685 assert(new_children
.empty());
3687 if (!merge_pgs
.empty()) {
3688 for (auto shard
: shards
) {
3689 shard
->prime_merges(osdmap
, &merge_pgs
);
3691 assert(merge_pgs
.empty());
3698 // start the heartbeat
3699 heartbeat_thread
.create("osd_srv_heartbt");
3702 tick_timer
.add_event_after(get_tick_interval(),
3705 std::lock_guard
l(tick_timer_lock
);
3706 tick_timer_without_osd_lock
.add_event_after(get_tick_interval(),
3707 new C_Tick_WithoutOSDLock(this));
3712 r
= monc
->authenticate();
3714 derr
<< __func__
<< " authentication failed: " << cpp_strerror(r
)
3719 while (monc
->wait_auth_rotating(rotating_auth_timeout
) < 0) {
3720 derr
<< "unable to obtain rotating service keys; retrying" << dendl
;
3721 ++rotating_auth_attempts
;
3722 if (rotating_auth_attempts
> g_conf()->max_rotating_auth_attempts
) {
3723 derr
<< __func__
<< " wait_auth_rotating timed out" << dendl
;
3728 r
= update_crush_device_class();
3730 derr
<< __func__
<< " unable to update_crush_device_class: "
3731 << cpp_strerror(r
) << dendl
;
3735 r
= update_crush_location();
3737 derr
<< __func__
<< " unable to update_crush_location: "
3738 << cpp_strerror(r
) << dendl
;
3746 // start objecter *after* we have authenticated, so that we don't ignore
3747 // the OSDMaps it requests.
3748 service
.final_init();
3752 dout(10) << "ensuring pgs have consumed prior maps" << dendl
;
3755 dout(0) << "done with init, starting boot process" << dendl
;
3757 // subscribe to any pg creations
3758 monc
->sub_want("osd_pg_creates", last_pg_create_epoch
, 0);
3760 // MgrClient needs this (it doesn't have MonClient reference itself)
3761 monc
->sub_want("mgrmap", 0, 0);
3763 // we don't need to ask for an osdmap here; objecter will
3764 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
3773 enable_disable_fuse(true);
3780 void OSD::final_init()
3782 AdminSocket
*admin_socket
= cct
->get_admin_socket();
3783 asok_hook
= new OSDSocketHook(this);
3784 int r
= admin_socket
->register_command("status", asok_hook
,
3785 "high-level status of OSD");
3786 ceph_assert(r
== 0);
3787 r
= admin_socket
->register_command("flush_journal",
3789 "flush the journal to permanent store");
3790 ceph_assert(r
== 0);
3791 r
= admin_socket
->register_command("dump_ops_in_flight " \
3792 "name=filterstr,type=CephString,n=N,req=false",
3794 "show the ops currently in flight");
3795 ceph_assert(r
== 0);
3796 r
= admin_socket
->register_command("ops " \
3797 "name=filterstr,type=CephString,n=N,req=false",
3799 "show the ops currently in flight");
3800 ceph_assert(r
== 0);
3801 r
= admin_socket
->register_command("dump_blocked_ops " \
3802 "name=filterstr,type=CephString,n=N,req=false",
3804 "show the blocked ops currently in flight");
3805 ceph_assert(r
== 0);
3806 r
= admin_socket
->register_command("dump_historic_ops " \
3807 "name=filterstr,type=CephString,n=N,req=false",
3810 ceph_assert(r
== 0);
3811 r
= admin_socket
->register_command("dump_historic_slow_ops " \
3812 "name=filterstr,type=CephString,n=N,req=false",
3814 "show slowest recent ops");
3815 ceph_assert(r
== 0);
3816 r
= admin_socket
->register_command("dump_historic_ops_by_duration " \
3817 "name=filterstr,type=CephString,n=N,req=false",
3819 "show slowest recent ops, sorted by duration");
3820 ceph_assert(r
== 0);
3821 r
= admin_socket
->register_command("dump_op_pq_state",
3823 "dump op priority queue state");
3824 ceph_assert(r
== 0);
3825 r
= admin_socket
->register_command("dump_blocklist",
3827 "dump blocklisted clients and times");
3828 ceph_assert(r
== 0);
3829 r
= admin_socket
->register_command("dump_watchers",
3831 "show clients which have active watches,"
3832 " and on which objects");
3833 ceph_assert(r
== 0);
3834 r
= admin_socket
->register_command("dump_recovery_reservations",
3836 "show recovery reservations");
3837 ceph_assert(r
== 0);
3838 r
= admin_socket
->register_command("dump_scrub_reservations",
3840 "show scrub reservations");
3841 ceph_assert(r
== 0);
3842 r
= admin_socket
->register_command("get_latest_osdmap",
3844 "force osd to update the latest map from "
3846 ceph_assert(r
== 0);
3848 r
= admin_socket
->register_command("set_heap_property " \
3849 "name=property,type=CephString " \
3850 "name=value,type=CephInt",
3852 "update malloc extension heap property");
3853 ceph_assert(r
== 0);
3855 r
= admin_socket
->register_command("get_heap_property " \
3856 "name=property,type=CephString",
3858 "get malloc extension heap property");
3859 ceph_assert(r
== 0);
3861 r
= admin_socket
->register_command("dump_objectstore_kv_stats",
3863 "print statistics of kvdb which used by bluestore");
3864 ceph_assert(r
== 0);
3866 r
= admin_socket
->register_command("dump_scrubs",
3868 "print scheduled scrubs");
3869 ceph_assert(r
== 0);
3871 r
= admin_socket
->register_command("calc_objectstore_db_histogram",
3873 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
3874 ceph_assert(r
== 0);
3876 r
= admin_socket
->register_command("flush_store_cache",
3878 "Flush bluestore internal cache");
3879 ceph_assert(r
== 0);
3880 r
= admin_socket
->register_command("dump_pgstate_history",
3882 "show recent state history");
3883 ceph_assert(r
== 0);
3885 r
= admin_socket
->register_command("compact",
3887 "Commpact object store's omap."
3888 " WARNING: Compaction probably slows your requests");
3889 ceph_assert(r
== 0);
3891 r
= admin_socket
->register_command("get_mapped_pools",
3893 "dump pools whose PG(s) are mapped to this OSD.");
3895 ceph_assert(r
== 0);
3897 r
= admin_socket
->register_command("smart name=devid,type=CephString,req=false",
3899 "probe OSD devices for SMART data.");
3901 ceph_assert(r
== 0);
3903 r
= admin_socket
->register_command("list_devices",
3905 "list OSD devices.");
3906 r
= admin_socket
->register_command("send_beacon",
3908 "send OSD beacon to mon immediately");
3910 r
= admin_socket
->register_command(
3911 "dump_osd_network name=value,type=CephInt,req=false", asok_hook
,
3912 "Dump osd heartbeat network ping times");
3913 ceph_assert(r
== 0);
3915 test_ops_hook
= new TestOpsSocketHook(&(this->service
), this->store
);
3916 // Note: pools are CephString instead of CephPoolname because
3917 // these commands traditionally support both pool names and numbers
3918 r
= admin_socket
->register_command(
3920 "name=pool,type=CephString " \
3921 "name=objname,type=CephObjectname " \
3922 "name=key,type=CephString "\
3923 "name=val,type=CephString",
3926 ceph_assert(r
== 0);
3927 r
= admin_socket
->register_command(
3929 "name=pool,type=CephString " \
3930 "name=objname,type=CephObjectname " \
3931 "name=key,type=CephString",
3934 ceph_assert(r
== 0);
3935 r
= admin_socket
->register_command(
3937 "name=pool,type=CephString " \
3938 "name=objname,type=CephObjectname " \
3939 "name=header,type=CephString",
3942 ceph_assert(r
== 0);
3944 r
= admin_socket
->register_command(
3946 "name=pool,type=CephString " \
3947 "name=objname,type=CephObjectname",
3949 "output entire object map");
3950 ceph_assert(r
== 0);
3952 r
= admin_socket
->register_command(
3954 "name=pool,type=CephString " \
3955 "name=objname,type=CephObjectname " \
3956 "name=len,type=CephInt",
3958 "truncate object to length");
3959 ceph_assert(r
== 0);
3961 r
= admin_socket
->register_command(
3963 "name=pool,type=CephString " \
3964 "name=objname,type=CephObjectname " \
3965 "name=shardid,type=CephInt,req=false,range=0|255",
3967 "inject data error to an object");
3968 ceph_assert(r
== 0);
3970 r
= admin_socket
->register_command(
3972 "name=pool,type=CephString " \
3973 "name=objname,type=CephObjectname " \
3974 "name=shardid,type=CephInt,req=false,range=0|255",
3976 "inject metadata error to an object");
3977 ceph_assert(r
== 0);
3978 r
= admin_socket
->register_command(
3979 "set_recovery_delay " \
3980 "name=utime,type=CephInt,req=false",
3982 "Delay osd recovery by specified seconds");
3983 ceph_assert(r
== 0);
3984 r
= admin_socket
->register_command(
3986 "name=type,type=CephString,req=false " \
3987 "name=count,type=CephInt,req=false ",
3989 "Inject a full disk (optional count times)");
3990 ceph_assert(r
== 0);
3991 r
= admin_socket
->register_command(
3993 "name=count,type=CephInt,req=false " \
3994 "name=size,type=CephInt,req=false " \
3995 "name=object_size,type=CephInt,req=false " \
3996 "name=object_num,type=CephInt,req=false ",
3998 "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
3999 "(default count=1G default size=4MB). Results in log.");
4000 ceph_assert(r
== 0);
4001 r
= admin_socket
->register_command(
4003 "name=level,type=CephChoices,strings=error,warning,info,debug " \
4004 "name=message,type=CephString,n=N",
4006 "log a message to the cluster log");
4007 ceph_assert(r
== 0);
4008 r
= admin_socket
->register_command(
4012 ceph_assert(r
== 0);
4013 r
= admin_socket
->register_command(
4015 "name=heapcmd,type=CephChoices,strings=" \
4016 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
4017 "name=value,type=CephString,req=false",
4019 "show heap usage info (available only if compiled with tcmalloc)");
4020 ceph_assert(r
== 0);
4021 r
= admin_socket
->register_command(
4022 "debug dump_missing " \
4023 "name=filename,type=CephFilepath",
4025 "dump missing objects to a named file");
4026 ceph_assert(r
== 0);
4027 r
= admin_socket
->register_command(
4028 "debug kick_recovery_wq " \
4029 "name=delay,type=CephInt,range=0",
4031 "set osd_recovery_delay_start to <val>");
4032 ceph_assert(r
== 0);
4033 r
= admin_socket
->register_command(
4035 "name=arg,type=CephChoices,strings=status|flush",
4037 "run cpu profiling on daemon");
4038 ceph_assert(r
== 0);
4039 r
= admin_socket
->register_command(
4040 "dump_pg_recovery_stats",
4042 "dump pg recovery statistics");
4043 ceph_assert(r
== 0);
4044 r
= admin_socket
->register_command(
4045 "reset_pg_recovery_stats",
4047 "reset pg recovery statistics");
4048 ceph_assert(r
== 0);
4049 r
= admin_socket
->register_command(
4052 "Drop all OSD caches");
4053 ceph_assert(r
== 0);
4054 r
= admin_socket
->register_command(
4057 "Get OSD caches statistics");
4058 ceph_assert(r
== 0);
4059 r
= admin_socket
->register_command(
4060 "scrub_purged_snaps",
4062 "Scrub purged_snaps vs snapmapper index");
4063 ceph_assert(r
== 0);
4065 // -- pg commands --
4066 // old form: ceph pg <pgid> command ...
4067 r
= admin_socket
->register_command(
4069 "name=pgid,type=CephPgid " \
4070 "name=cmd,type=CephChoices,strings=query",
4073 ceph_assert(r
== 0);
4074 r
= admin_socket
->register_command(
4076 "name=pgid,type=CephPgid " \
4077 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
4078 "name=mulcmd,type=CephChoices,strings=revert|delete",
4081 ceph_assert(r
== 0);
4082 r
= admin_socket
->register_command(
4084 "name=pgid,type=CephPgid " \
4085 "name=cmd,type=CephChoices,strings=list_unfound " \
4086 "name=offset,type=CephString,req=false",
4089 ceph_assert(r
== 0);
4090 r
= admin_socket
->register_command(
4092 "name=pgid,type=CephPgid " \
4093 "name=cmd,type=CephChoices,strings=scrub " \
4094 "name=time,type=CephInt,req=false",
4097 ceph_assert(r
== 0);
4098 r
= admin_socket
->register_command(
4100 "name=pgid,type=CephPgid " \
4101 "name=cmd,type=CephChoices,strings=deep_scrub " \
4102 "name=time,type=CephInt,req=false",
4105 ceph_assert(r
== 0);
4106 // new form: tell <pgid> <cmd> for both cli and rest
4107 r
= admin_socket
->register_command(
4110 "show details of a specific pg");
4111 ceph_assert(r
== 0);
4112 r
= admin_socket
->register_command(
4113 "mark_unfound_lost " \
4114 "name=pgid,type=CephPgid,req=false " \
4115 "name=mulcmd,type=CephChoices,strings=revert|delete",
4117 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available");
4118 ceph_assert(r
== 0);
4119 r
= admin_socket
->register_command(
4121 "name=pgid,type=CephPgid,req=false " \
4122 "name=offset,type=CephString,req=false",
4124 "list unfound objects on this pg, perhaps starting at an offset given in JSON");
4125 ceph_assert(r
== 0);
4126 r
= admin_socket
->register_command(
4128 "name=pgid,type=CephPgid,req=false " \
4129 "name=time,type=CephInt,req=false",
4131 "Trigger a scheduled scrub ");
4132 ceph_assert(r
== 0);
4133 r
= admin_socket
->register_command(
4135 "name=pgid,type=CephPgid,req=false " \
4136 "name=time,type=CephInt,req=false",
4138 "Trigger a scheduled deep scrub ");
4139 ceph_assert(r
== 0);
4142 PerfCounters
* OSD::create_logger()
4144 PerfCounters
* logger
= build_osd_logger(cct
);
4145 cct
->get_perfcounters_collection()->add(logger
);
4149 PerfCounters
* OSD::create_recoverystate_perf()
4151 PerfCounters
* recoverystate_perf
= build_recoverystate_perf(cct
);
4152 cct
->get_perfcounters_collection()->add(recoverystate_perf
);
4153 return recoverystate_perf
;
4158 if (cct
->_conf
->osd_fast_shutdown
) {
4159 derr
<< "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl
;
4160 if (cct
->_conf
->osd_fast_shutdown_notify_mon
)
4161 service
.prepare_to_stop();
4166 if (!service
.prepare_to_stop())
4167 return 0; // already shutting down
4169 if (is_stopping()) {
4173 dout(0) << "shutdown" << dendl
;
4175 set_state(STATE_STOPPING
);
4178 if (cct
->_conf
.get_val
<bool>("osd_debug_shutdown")) {
4179 cct
->_conf
.set_val("debug_osd", "100");
4180 cct
->_conf
.set_val("debug_journal", "100");
4181 cct
->_conf
.set_val("debug_filestore", "100");
4182 cct
->_conf
.set_val("debug_bluestore", "100");
4183 cct
->_conf
.set_val("debug_ms", "100");
4184 cct
->_conf
.apply_changes(nullptr);
4187 // stop MgrClient earlier as it's more like an internal consumer of OSD
4190 service
.start_shutdown();
4192 // stop sending work to pgs. this just prevents any new work in _process
4193 // from racing with on_shutdown and potentially entering the pg after.
4194 op_shardedwq
.drain();
4200 for (auto pg
: pgs
) {
4205 // drain op queue again (in case PGs requeued something)
4206 op_shardedwq
.drain();
4208 finished
.clear(); // zap waiters (bleh, this is messy)
4209 waiting_for_osdmap
.clear();
4212 // unregister commands
4213 cct
->get_admin_socket()->unregister_commands(asok_hook
);
4217 cct
->get_admin_socket()->unregister_commands(test_ops_hook
);
4218 delete test_ops_hook
;
4219 test_ops_hook
= NULL
;
4224 std::lock_guard l
{heartbeat_lock
};
4225 heartbeat_stop
= true;
4226 heartbeat_cond
.notify_all();
4227 heartbeat_peers
.clear();
4229 heartbeat_thread
.join();
4231 hb_back_server_messenger
->mark_down_all();
4232 hb_front_server_messenger
->mark_down_all();
4233 hb_front_client_messenger
->mark_down_all();
4234 hb_back_client_messenger
->mark_down_all();
4238 dout(10) << "op sharded tp stopped" << dendl
;
4240 dout(10) << "stopping agent" << dendl
;
4241 service
.agent_stop();
4243 boot_finisher
.wait_for_empty();
4247 boot_finisher
.stop();
4248 reset_heartbeat_peers(true);
4250 tick_timer
.shutdown();
4253 std::lock_guard
l(tick_timer_lock
);
4254 tick_timer_without_osd_lock
.shutdown();
4257 // note unmount epoch
4258 dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl
;
4259 superblock
.mounted
= service
.get_boot_epoch();
4260 superblock
.clean_thru
= get_osdmap_epoch();
4261 ObjectStore::Transaction t
;
4262 write_superblock(t
);
4263 int r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
4265 derr
<< "OSD::shutdown: error writing superblock: "
4266 << cpp_strerror(r
) << dendl
;
4270 service
.shutdown_reserver();
4273 #ifdef PG_DEBUG_REFS
4274 service
.dump_live_pgids();
4278 _get_pgs(&pgs
, true);
4282 for (auto& pg
: pgs
) {
4283 if (pg
->is_deleted()) {
4286 dout(20) << " kicking pg " << pg
<< dendl
;
4288 if (pg
->get_num_ref() != 1) {
4289 derr
<< "pgid " << pg
->get_pgid() << " has ref count of "
4290 << pg
->get_num_ref() << dendl
;
4291 #ifdef PG_DEBUG_REFS
4292 pg
->dump_live_ids();
4294 if (cct
->_conf
->osd_shutdown_pgref_assert
) {
4302 #ifdef PG_DEBUG_REFS
4303 service
.dump_live_pgids();
4307 cct
->_conf
.remove_observer(this);
4310 service
.meta_ch
.reset();
4312 dout(10) << "syncing store" << dendl
;
4313 enable_disable_fuse(true);
4315 if (cct
->_conf
->osd_journal_flush_on_shutdown
) {
4316 dout(10) << "flushing journal" << dendl
;
4317 store
->flush_journal();
4323 std::unique_lock l
{map_lock
};
4324 set_osdmap(OSDMapRef());
4326 for (auto s
: shards
) {
4327 std::lock_guard
l(s
->osdmap_lock
);
4328 s
->shard_osdmap
= OSDMapRef();
4332 std::lock_guard
lock(osd_lock
);
4336 dout(10) << "Store synced" << dendl
;
4338 op_tracker
.on_shutdown();
4340 ClassHandler::get_instance().shutdown();
4341 client_messenger
->shutdown();
4342 cluster_messenger
->shutdown();
4343 hb_front_client_messenger
->shutdown();
4344 hb_back_client_messenger
->shutdown();
4345 objecter_messenger
->shutdown();
4346 hb_front_server_messenger
->shutdown();
4347 hb_back_server_messenger
->shutdown();
4352 int OSD::mon_cmd_maybe_osd_create(string
&cmd
)
4354 bool created
= false;
4356 dout(10) << __func__
<< " cmd: " << cmd
<< dendl
;
4357 vector
<string
> vcmd
{cmd
};
4361 monc
->start_mon_command(vcmd
, inbl
, NULL
, &outs
, &w
);
4364 if (r
== -ENOENT
&& !created
) {
4365 string newcmd
= "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami
)
4366 + ", \"uuid\": \"" + stringify(superblock
.osd_fsid
) + "\"}";
4367 vector
<string
> vnewcmd
{newcmd
};
4371 monc
->start_mon_command(vnewcmd
, inbl
, NULL
, &outs
, &w
);
4374 derr
<< __func__
<< " fail: osd does not exist and created failed: "
4375 << cpp_strerror(r
) << dendl
;
4381 derr
<< __func__
<< " fail: '" << outs
<< "': " << cpp_strerror(r
) << dendl
;
4390 int OSD::update_crush_location()
4392 if (!cct
->_conf
->osd_crush_update_on_start
) {
4393 dout(10) << __func__
<< " osd_crush_update_on_start = false" << dendl
;
4398 if (cct
->_conf
->osd_crush_initial_weight
>= 0) {
4399 snprintf(weight
, sizeof(weight
), "%.4lf", cct
->_conf
->osd_crush_initial_weight
);
4401 struct store_statfs_t st
;
4402 osd_alert_list_t alerts
;
4403 int r
= store
->statfs(&st
, &alerts
);
4405 derr
<< "statfs: " << cpp_strerror(r
) << dendl
;
4408 snprintf(weight
, sizeof(weight
), "%.4lf",
4411 double(1ull << 40 /* TB */)));
4414 dout(10) << __func__
<< " crush location is " << cct
->crush_location
<< dendl
;
4417 string("{\"prefix\": \"osd crush create-or-move\", ") +
4418 string("\"id\": ") + stringify(whoami
) + ", " +
4419 string("\"weight\":") + weight
+ ", " +
4420 string("\"args\": [") + stringify(cct
->crush_location
) + "]}";
4421 return mon_cmd_maybe_osd_create(cmd
);
4424 int OSD::update_crush_device_class()
4426 if (!cct
->_conf
->osd_class_update_on_start
) {
4427 dout(10) << __func__
<< " osd_class_update_on_start = false" << dendl
;
4431 string device_class
;
4432 int r
= store
->read_meta("crush_device_class", &device_class
);
4433 if (r
< 0 || device_class
.empty()) {
4434 device_class
= store
->get_default_device_class();
4437 if (device_class
.empty()) {
4438 dout(20) << __func__
<< " no device class stored locally" << dendl
;
4443 string("{\"prefix\": \"osd crush set-device-class\", ") +
4444 string("\"class\": \"") + device_class
+ string("\", ") +
4445 string("\"ids\": [\"") + stringify(whoami
) + string("\"]}");
4447 r
= mon_cmd_maybe_osd_create(cmd
);
4449 // good, already bound to a device-class
4456 void OSD::write_superblock(ObjectStore::Transaction
& t
)
4458 dout(10) << "write_superblock " << superblock
<< dendl
;
4460 //hack: at minimum it's using the baseline feature set
4461 if (!superblock
.compat_features
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE
))
4462 superblock
.compat_features
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE
);
4465 encode(superblock
, bl
);
4466 t
.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT
, 0, bl
.length(), bl
);
4469 int OSD::read_superblock()
4472 int r
= store
->read(service
.meta_ch
, OSD_SUPERBLOCK_GOBJECT
, 0, 0, bl
);
4476 auto p
= bl
.cbegin();
4477 decode(superblock
, p
);
4479 dout(10) << "read_superblock " << superblock
<< dendl
;
4484 void OSD::clear_temp_objects()
4486 dout(10) << __func__
<< dendl
;
4488 store
->list_collections(ls
);
4489 for (vector
<coll_t
>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
4491 if (!p
->is_pg(&pgid
))
4494 // list temp objects
4495 dout(20) << " clearing temps in " << *p
<< " pgid " << pgid
<< dendl
;
4497 vector
<ghobject_t
> temps
;
4500 vector
<ghobject_t
> objects
;
4501 auto ch
= store
->open_collection(*p
);
4503 store
->collection_list(ch
, next
, ghobject_t::get_max(),
4504 store
->get_ideal_list_max(),
4506 if (objects
.empty())
4508 vector
<ghobject_t
>::iterator q
;
4509 for (q
= objects
.begin(); q
!= objects
.end(); ++q
) {
4510 // Hammer set pool for temps to -1, so check for clean-up
4511 if (q
->hobj
.is_temp() || (q
->hobj
.pool
== -1)) {
4512 temps
.push_back(*q
);
4517 // If we saw a non-temp object and hit the break above we can
4518 // break out of the while loop too.
4519 if (q
!= objects
.end())
4522 if (!temps
.empty()) {
4523 ObjectStore::Transaction t
;
4525 for (vector
<ghobject_t
>::iterator q
= temps
.begin(); q
!= temps
.end(); ++q
) {
4526 dout(20) << " removing " << *p
<< " object " << *q
<< dendl
;
4528 if (++removed
> cct
->_conf
->osd_target_transaction_size
) {
4529 store
->queue_transaction(service
.meta_ch
, std::move(t
));
4530 t
= ObjectStore::Transaction();
4535 store
->queue_transaction(service
.meta_ch
, std::move(t
));
4541 void OSD::recursive_remove_collection(CephContext
* cct
,
4542 ObjectStore
*store
, spg_t pgid
,
4548 make_snapmapper_oid());
4550 ObjectStore::CollectionHandle ch
= store
->open_collection(tmp
);
4551 ObjectStore::Transaction t
;
4552 SnapMapper
mapper(cct
, &driver
, 0, 0, 0, pgid
.shard
);
4555 int max
= cct
->_conf
->osd_target_transaction_size
;
4556 vector
<ghobject_t
> objects
;
4557 objects
.reserve(max
);
4560 store
->collection_list(ch
, next
, ghobject_t::get_max(),
4561 max
, &objects
, &next
);
4562 generic_dout(10) << __func__
<< " " << objects
<< dendl
;
4563 if (objects
.empty())
4565 for (auto& p
: objects
) {
4566 OSDriver::OSTransaction
_t(driver
.get_transaction(&t
));
4567 int r
= mapper
.remove_oid(p
.hobj
, &_t
);
4568 if (r
!= 0 && r
!= -ENOENT
)
4572 int r
= store
->queue_transaction(ch
, std::move(t
));
4573 ceph_assert(r
== 0);
4574 t
= ObjectStore::Transaction();
4576 t
.remove_collection(tmp
);
4577 int r
= store
->queue_transaction(ch
, std::move(t
));
4578 ceph_assert(r
== 0);
4581 if (!ch
->flush_commit(&waiter
)) {
4587 // ======================================================
4591 OSDMapRef createmap
,
4594 dout(10) << __func__
<< " " << pgid
<< dendl
;
4596 map
<string
,string
> ec_profile
;
4598 if (createmap
->have_pg_pool(pgid
.pool())) {
4599 pi
= *createmap
->get_pg_pool(pgid
.pool());
4600 name
= createmap
->get_pool_name(pgid
.pool());
4601 if (pi
.is_erasure()) {
4602 ec_profile
= createmap
->get_erasure_code_profile(pi
.erasure_code_profile
);
4605 // pool was deleted; grab final pg_pool_t off disk.
4606 ghobject_t oid
= make_final_pool_info_oid(pgid
.pool());
4608 int r
= store
->read(service
.meta_ch
, oid
, 0, 0, bl
);
4610 derr
<< __func__
<< " missing pool " << pgid
.pool() << " tombstone"
4614 ceph_assert(r
>= 0);
4615 auto p
= bl
.cbegin();
4618 if (p
.end()) { // dev release v13.0.2 did not include ec_profile
4619 derr
<< __func__
<< " missing ec_profile from pool " << pgid
.pool()
4620 << " tombstone" << dendl
;
4623 decode(ec_profile
, p
);
4625 PGPool
pool(createmap
, pgid
.pool(), pi
, name
);
4627 if (pi
.type
== pg_pool_t::TYPE_REPLICATED
||
4628 pi
.type
== pg_pool_t::TYPE_ERASURE
)
4629 pg
= new PrimaryLogPG(&service
, createmap
, pool
, ec_profile
, pgid
);
4635 void OSD::_get_pgs(vector
<PGRef
> *v
, bool clear_too
)
4638 v
->reserve(get_num_pgs());
4639 for (auto& s
: shards
) {
4640 std::lock_guard
l(s
->shard_lock
);
4641 for (auto& j
: s
->pg_slots
) {
4643 !j
.second
->pg
->is_deleted()) {
4644 v
->push_back(j
.second
->pg
);
4646 s
->_detach_pg(j
.second
.get());
4653 void OSD::_get_pgids(vector
<spg_t
> *v
)
4656 v
->reserve(get_num_pgs());
4657 for (auto& s
: shards
) {
4658 std::lock_guard
l(s
->shard_lock
);
4659 for (auto& j
: s
->pg_slots
) {
4661 !j
.second
->pg
->is_deleted()) {
4662 v
->push_back(j
.first
);
4668 void OSD::register_pg(PGRef pg
)
4670 spg_t pgid
= pg
->get_pgid();
4671 uint32_t shard_index
= pgid
.hash_to_shard(num_shards
);
4672 auto sdata
= shards
[shard_index
];
4673 std::lock_guard
l(sdata
->shard_lock
);
4674 auto r
= sdata
->pg_slots
.emplace(pgid
, make_unique
<OSDShardPGSlot
>());
4675 ceph_assert(r
.second
);
4676 auto *slot
= r
.first
->second
.get();
4677 dout(20) << __func__
<< " " << pgid
<< " " << pg
<< dendl
;
4678 sdata
->_attach_pg(slot
, pg
.get());
4681 bool OSD::try_finish_pg_delete(PG
*pg
, unsigned old_pg_num
)
4683 auto sdata
= pg
->osd_shard
;
4686 std::lock_guard
l(sdata
->shard_lock
);
4687 auto p
= sdata
->pg_slots
.find(pg
->pg_id
);
4688 if (p
== sdata
->pg_slots
.end() ||
4690 dout(20) << __func__
<< " " << pg
->pg_id
<< " not found" << dendl
;
4693 if (p
->second
->waiting_for_merge_epoch
) {
4694 dout(20) << __func__
<< " " << pg
->pg_id
<< " waiting for merge" << dendl
;
4697 dout(20) << __func__
<< " " << pg
->pg_id
<< " " << pg
<< dendl
;
4698 sdata
->_detach_pg(p
->second
.get());
4701 for (auto shard
: shards
) {
4702 shard
->unprime_split_children(pg
->pg_id
, old_pg_num
);
4705 // update pg count now since we might not get an osdmap any time soon.
4706 if (pg
->is_primary())
4707 service
.logger
->dec(l_osd_pg_primary
);
4708 else if (pg
->is_nonprimary())
4709 service
.logger
->dec(l_osd_pg_replica
); // misnomver
4711 service
.logger
->dec(l_osd_pg_stray
);
4716 PGRef
OSD::_lookup_pg(spg_t pgid
)
4718 uint32_t shard_index
= pgid
.hash_to_shard(num_shards
);
4719 auto sdata
= shards
[shard_index
];
4720 std::lock_guard
l(sdata
->shard_lock
);
4721 auto p
= sdata
->pg_slots
.find(pgid
);
4722 if (p
== sdata
->pg_slots
.end()) {
4725 return p
->second
->pg
;
4728 PGRef
OSD::_lookup_lock_pg(spg_t pgid
)
4730 PGRef pg
= _lookup_pg(pgid
);
4735 if (!pg
->is_deleted()) {
4742 PGRef
OSD::lookup_lock_pg(spg_t pgid
)
4744 return _lookup_lock_pg(pgid
);
4747 void OSD::load_pgs()
4749 ceph_assert(ceph_mutex_is_locked(osd_lock
));
4750 dout(0) << "load_pgs" << dendl
;
4753 auto pghist
= make_pg_num_history_oid();
4755 int r
= store
->read(service
.meta_ch
, pghist
, 0, 0, bl
, 0);
4756 if (r
>= 0 && bl
.length() > 0) {
4757 auto p
= bl
.cbegin();
4758 decode(pg_num_history
, p
);
4760 dout(20) << __func__
<< " pg_num_history " << pg_num_history
<< dendl
;
4764 int r
= store
->list_collections(ls
);
4766 derr
<< "failed to list pgs: " << cpp_strerror(-r
) << dendl
;
4770 for (vector
<coll_t
>::iterator it
= ls
.begin();
4774 if (it
->is_temp(&pgid
) ||
4775 (it
->is_pg(&pgid
) && PG::_has_removal_flag(store
, pgid
))) {
4776 dout(10) << "load_pgs " << *it
4777 << " removing, legacy or flagged for removal pg" << dendl
;
4778 recursive_remove_collection(cct
, store
, pgid
, *it
);
4782 if (!it
->is_pg(&pgid
)) {
4783 dout(10) << "load_pgs ignoring unrecognized " << *it
<< dendl
;
4787 dout(10) << "pgid " << pgid
<< " coll " << coll_t(pgid
) << dendl
;
4788 epoch_t map_epoch
= 0;
4789 int r
= PG::peek_map_epoch(store
, pgid
, &map_epoch
);
4791 derr
<< __func__
<< " unable to peek at " << pgid
<< " metadata, skipping"
4797 if (map_epoch
> 0) {
4798 OSDMapRef pgosdmap
= service
.try_get_map(map_epoch
);
4800 if (!get_osdmap()->have_pg_pool(pgid
.pool())) {
4801 derr
<< __func__
<< ": could not find map for epoch " << map_epoch
4802 << " on pg " << pgid
<< ", but the pool is not present in the "
4803 << "current map, so this is probably a result of bug 10617. "
4804 << "Skipping the pg for now, you can use ceph-objectstore-tool "
4805 << "to clean it up later." << dendl
;
4808 derr
<< __func__
<< ": have pgid " << pgid
<< " at epoch "
4809 << map_epoch
<< ", but missing map. Crashing."
4811 ceph_abort_msg("Missing map in load_pgs");
4814 pg
= _make_pg(pgosdmap
, pgid
);
4816 pg
= _make_pg(get_osdmap(), pgid
);
4819 recursive_remove_collection(cct
, store
, pgid
, *it
);
4823 // there can be no waiters here, so we don't call _wake_pg_slot
4826 pg
->ch
= store
->open_collection(pg
->coll
);
4828 // read pg state, log
4829 pg
->read_state(store
);
4832 dout(10) << "load_pgs " << *it
<< " deleting dne" << dendl
;
4835 recursive_remove_collection(cct
, store
, pgid
, *it
);
4839 uint32_t shard_index
= pgid
.hash_to_shard(shards
.size());
4840 assert(NULL
!= shards
[shard_index
]);
4841 store
->set_collection_commit_queue(pg
->coll
, &(shards
[shard_index
]->context_queue
));
4844 pg
->reg_next_scrub();
4846 dout(10) << __func__
<< " loaded " << *pg
<< dendl
;
4852 dout(0) << __func__
<< " opened " << num
<< " pgs" << dendl
;
4856 PGRef
OSD::handle_pg_create_info(const OSDMapRef
& osdmap
,
4857 const PGCreateInfo
*info
)
4859 spg_t pgid
= info
->pgid
;
4861 if (maybe_wait_for_max_pg(osdmap
, pgid
, info
->by_mon
)) {
4862 dout(10) << __func__
<< " hit max pg, dropping" << dendl
;
4866 PeeringCtx rctx
= create_context();
4868 OSDMapRef startmap
= get_map(info
->epoch
);
4871 int64_t pool_id
= pgid
.pgid
.pool();
4872 const pg_pool_t
*pool
= osdmap
->get_pg_pool(pool_id
);
4874 dout(10) << __func__
<< " ignoring " << pgid
<< ", pool dne" << dendl
;
4877 if (osdmap
->require_osd_release
>= ceph_release_t::nautilus
&&
4878 !pool
->has_flag(pg_pool_t::FLAG_CREATING
)) {
4879 // this ensures we do not process old creating messages after the
4880 // pool's initial pgs have been created (and pg are subsequently
4881 // allowed to split or merge).
4882 dout(20) << __func__
<< " dropping " << pgid
4883 << "create, pool does not have CREATING flag set" << dendl
;
4888 int up_primary
, acting_primary
;
4889 vector
<int> up
, acting
;
4890 startmap
->pg_to_up_acting_osds(
4891 pgid
.pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
4893 const pg_pool_t
* pp
= startmap
->get_pg_pool(pgid
.pool());
4894 if (pp
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
) &&
4895 store
->get_type() != "bluestore") {
4896 clog
->warn() << "pg " << pgid
4897 << " is at risk of silent data corruption: "
4898 << "the pool allows ec overwrites but is not stored in "
4899 << "bluestore, so deep scrubbing will not detect bitrot";
4901 create_pg_collection(
4902 rctx
.transaction
, pgid
, pgid
.get_split_bits(pp
->get_pg_num()));
4903 init_pg_ondisk(rctx
.transaction
, pgid
, pp
);
4905 int role
= startmap
->calc_pg_role(pg_shard_t(whoami
, pgid
.shard
), acting
);
4907 PGRef pg
= _make_pg(startmap
, pgid
);
4908 pg
->ch
= store
->create_new_collection(pg
->coll
);
4911 uint32_t shard_index
= pgid
.hash_to_shard(shards
.size());
4912 assert(NULL
!= shards
[shard_index
]);
4913 store
->set_collection_commit_queue(pg
->coll
, &(shards
[shard_index
]->context_queue
));
4918 // we are holding the shard lock
4919 ceph_assert(!pg
->is_deleted());
4928 info
->past_intervals
,
4932 pg
->init_collection_pool_opts();
4934 if (pg
->is_primary()) {
4935 std::lock_guard locker
{m_perf_queries_lock
};
4936 pg
->set_dynamic_perf_stats_queries(m_perf_queries
);
4939 pg
->handle_initialize(rctx
);
4940 pg
->handle_activate_map(rctx
);
4942 dispatch_context(rctx
, pg
.get(), osdmap
, nullptr);
4944 dout(10) << __func__
<< " new pg " << *pg
<< dendl
;
4948 bool OSD::maybe_wait_for_max_pg(const OSDMapRef
& osdmap
,
4952 const auto max_pgs_per_osd
=
4953 (cct
->_conf
.get_val
<uint64_t>("mon_max_pg_per_osd") *
4954 cct
->_conf
.get_val
<double>("osd_max_pg_per_osd_hard_ratio"));
4956 if (num_pgs
< max_pgs_per_osd
) {
4960 std::lock_guard
l(pending_creates_lock
);
4961 if (is_mon_create
) {
4962 pending_creates_from_mon
++;
4964 bool is_primary
= osdmap
->get_pg_acting_role(pgid
, whoami
) == 0;
4965 pending_creates_from_osd
.emplace(pgid
, is_primary
);
4967 dout(1) << __func__
<< " withhold creation of pg " << pgid
4968 << ": " << num_pgs
<< " >= "<< max_pgs_per_osd
<< dendl
;
4972 // to re-trigger a peering, we have to twiddle the pg mapping a little bit,
4973 // see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
4974 // to up set if pg_temp is empty. so an empty pg_temp won't work.
4975 static vector
<int32_t> twiddle(const vector
<int>& acting
) {
4976 if (acting
.size() > 1) {
4979 vector
<int32_t> twiddled(acting
.begin(), acting
.end());
4980 twiddled
.push_back(-1);
4985 void OSD::resume_creating_pg()
4987 bool do_sub_pg_creates
= false;
4988 bool have_pending_creates
= false;
4990 const auto max_pgs_per_osd
=
4991 (cct
->_conf
.get_val
<uint64_t>("mon_max_pg_per_osd") *
4992 cct
->_conf
.get_val
<double>("osd_max_pg_per_osd_hard_ratio"));
4993 if (max_pgs_per_osd
<= num_pgs
) {
4994 // this could happen if admin decreases this setting before a PG is removed
4997 unsigned spare_pgs
= max_pgs_per_osd
- num_pgs
;
4998 std::lock_guard
l(pending_creates_lock
);
4999 if (pending_creates_from_mon
> 0) {
5000 dout(20) << __func__
<< " pending_creates_from_mon "
5001 << pending_creates_from_mon
<< dendl
;
5002 do_sub_pg_creates
= true;
5003 if (pending_creates_from_mon
>= spare_pgs
) {
5004 spare_pgs
= pending_creates_from_mon
= 0;
5006 spare_pgs
-= pending_creates_from_mon
;
5007 pending_creates_from_mon
= 0;
5010 auto pg
= pending_creates_from_osd
.cbegin();
5011 while (spare_pgs
> 0 && pg
!= pending_creates_from_osd
.cend()) {
5012 dout(20) << __func__
<< " pg " << pg
->first
<< dendl
;
5014 get_osdmap()->pg_to_up_acting_osds(pg
->first
.pgid
, nullptr, nullptr, &acting
, nullptr);
5015 service
.queue_want_pg_temp(pg
->first
.pgid
, twiddle(acting
), true);
5016 pg
= pending_creates_from_osd
.erase(pg
);
5017 do_sub_pg_creates
= true;
5020 have_pending_creates
= (pending_creates_from_mon
> 0 ||
5021 !pending_creates_from_osd
.empty());
5024 bool do_renew_subs
= false;
5025 if (do_sub_pg_creates
) {
5026 if (monc
->sub_want("osd_pg_creates", last_pg_create_epoch
, 0)) {
5027 dout(4) << __func__
<< ": resolicit pg creates from mon since "
5028 << last_pg_create_epoch
<< dendl
;
5029 do_renew_subs
= true;
5032 version_t start
= get_osdmap_epoch() + 1;
5033 if (have_pending_creates
) {
5034 // don't miss any new osdmap deleting PGs
5035 if (monc
->sub_want("osdmap", start
, 0)) {
5036 dout(4) << __func__
<< ": resolicit osdmap from mon since "
5038 do_renew_subs
= true;
5040 } else if (do_sub_pg_creates
) {
5041 // no need to subscribe the osdmap continuously anymore
5042 // once the pgtemp and/or mon_subscribe(pg_creates) is sent
5043 if (monc
->sub_want_increment("osdmap", start
, CEPH_SUBSCRIBE_ONETIME
)) {
5044 dout(4) << __func__
<< ": re-subscribe osdmap(onetime) since "
5046 do_renew_subs
= true;
5050 if (do_renew_subs
) {
5054 service
.send_pg_temp();
5057 void OSD::build_initial_pg_history(
5060 utime_t created_stamp
,
5064 dout(10) << __func__
<< " " << pgid
<< " created " << created
<< dendl
;
5065 *h
= pg_history_t(created
, created_stamp
);
5067 OSDMapRef lastmap
= service
.get_map(created
);
5068 int up_primary
, acting_primary
;
5069 vector
<int> up
, acting
;
5070 lastmap
->pg_to_up_acting_osds(
5071 pgid
.pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
5073 ostringstream debug
;
5074 for (epoch_t e
= created
+ 1; e
<= get_osdmap_epoch(); ++e
) {
5075 OSDMapRef osdmap
= service
.get_map(e
);
5076 int new_up_primary
, new_acting_primary
;
5077 vector
<int> new_up
, new_acting
;
5078 osdmap
->pg_to_up_acting_osds(
5079 pgid
.pgid
, &new_up
, &new_up_primary
, &new_acting
, &new_acting_primary
);
5081 // this is a bit imprecise, but sufficient?
5082 struct min_size_predicate_t
: public IsPGRecoverablePredicate
{
5083 const pg_pool_t
*pi
;
5084 bool operator()(const set
<pg_shard_t
> &have
) const {
5085 return have
.size() >= pi
->min_size
;
5087 explicit min_size_predicate_t(const pg_pool_t
*i
) : pi(i
) {}
5088 } min_size_predicate(osdmap
->get_pg_pool(pgid
.pgid
.pool()));
5090 bool new_interval
= PastIntervals::check_new_interval(
5097 h
->same_interval_since
,
5098 h
->last_epoch_clean
,
5106 h
->same_interval_since
= e
;
5108 h
->same_up_since
= e
;
5110 if (acting_primary
!= new_acting_primary
) {
5111 h
->same_primary_since
= e
;
5113 if (pgid
.pgid
.is_split(lastmap
->get_pg_num(pgid
.pgid
.pool()),
5114 osdmap
->get_pg_num(pgid
.pgid
.pool()),
5116 h
->last_epoch_split
= e
;
5119 acting
= new_acting
;
5120 up_primary
= new_up_primary
;
5121 acting_primary
= new_acting_primary
;
5125 dout(20) << __func__
<< " " << debug
.str() << dendl
;
5126 dout(10) << __func__
<< " " << *h
<< " " << *pi
5127 << " [" << (pi
->empty() ? pair
<epoch_t
,epoch_t
>(0,0) :
5128 pi
->get_bounds()) << ")"
5132 void OSD::_add_heartbeat_peer(int p
)
5138 map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.find(p
);
5139 if (i
== heartbeat_peers
.end()) {
5140 pair
<ConnectionRef
,ConnectionRef
> cons
= service
.get_con_osd_hb(p
, get_osdmap_epoch());
5143 assert(cons
.second
);
5145 hi
= &heartbeat_peers
[p
];
5148 auto stamps
= service
.get_hb_stamps(p
);
5150 auto sb
= ceph::make_ref
<Session
>(cct
, cons
.first
.get());
5152 sb
->stamps
= stamps
;
5153 hi
->hb_interval_start
= ceph_clock_now();
5154 hi
->con_back
= cons
.first
.get();
5155 hi
->con_back
->set_priv(sb
);
5157 auto sf
= ceph::make_ref
<Session
>(cct
, cons
.second
.get());
5159 sf
->stamps
= stamps
;
5160 hi
->con_front
= cons
.second
.get();
5161 hi
->con_front
->set_priv(sf
);
5163 dout(10) << "_add_heartbeat_peer: new peer osd." << p
5164 << " " << hi
->con_back
->get_peer_addr()
5165 << " " << hi
->con_front
->get_peer_addr()
5170 hi
->epoch
= get_osdmap_epoch();
5173 void OSD::_remove_heartbeat_peer(int n
)
5175 map
<int,HeartbeatInfo
>::iterator q
= heartbeat_peers
.find(n
);
5176 ceph_assert(q
!= heartbeat_peers
.end());
5177 dout(20) << " removing heartbeat peer osd." << n
5178 << " " << q
->second
.con_back
->get_peer_addr()
5179 << " " << (q
->second
.con_front
? q
->second
.con_front
->get_peer_addr() : entity_addr_t())
5181 q
->second
.clear_mark_down();
5182 heartbeat_peers
.erase(q
);
5185 void OSD::need_heartbeat_peer_update()
5189 dout(20) << "need_heartbeat_peer_update" << dendl
;
5190 heartbeat_set_peers_need_update();
5193 void OSD::maybe_update_heartbeat_peers()
5195 ceph_assert(ceph_mutex_is_locked(osd_lock
));
5197 if (is_waiting_for_healthy() || is_active()) {
5198 utime_t now
= ceph_clock_now();
5199 if (last_heartbeat_resample
== utime_t()) {
5200 last_heartbeat_resample
= now
;
5201 heartbeat_set_peers_need_update();
5202 } else if (!heartbeat_peers_need_update()) {
5203 utime_t dur
= now
- last_heartbeat_resample
;
5204 if (dur
> cct
->_conf
->osd_heartbeat_grace
) {
5205 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur
<< " seconds" << dendl
;
5206 heartbeat_set_peers_need_update();
5207 last_heartbeat_resample
= now
;
5208 // automatically clean up any stale heartbeat peers
5209 // if we are unhealthy, then clean all
5210 reset_heartbeat_peers(is_waiting_for_healthy());
5215 if (!heartbeat_peers_need_update())
5217 heartbeat_clear_peers_need_update();
5219 std::lock_guard
l(heartbeat_lock
);
5221 dout(10) << "maybe_update_heartbeat_peers updating" << dendl
;
5224 // build heartbeat from set
5228 for (auto& pg
: pgs
) {
5229 pg
->with_heartbeat_peers([&](int peer
) {
5230 if (get_osdmap()->is_up(peer
)) {
5231 _add_heartbeat_peer(peer
);
5237 // include next and previous up osds to ensure we have a fully-connected set
5238 set
<int> want
, extras
;
5239 const int next
= get_osdmap()->get_next_up_osd_after(whoami
);
5242 int prev
= get_osdmap()->get_previous_up_osd_before(whoami
);
5243 if (prev
>= 0 && prev
!= next
)
5246 // make sure we have at least **min_down** osds coming from different
5247 // subtree level (e.g., hosts) for fast failure detection.
5248 auto min_down
= cct
->_conf
.get_val
<uint64_t>("mon_osd_min_down_reporters");
5249 auto subtree
= cct
->_conf
.get_val
<string
>("mon_osd_reporter_subtree_level");
5250 auto limit
= std::max(min_down
, (uint64_t)cct
->_conf
->osd_heartbeat_min_peers
);
5251 get_osdmap()->get_random_up_osds_by_subtree(
5252 whoami
, subtree
, limit
, want
, &want
);
5254 for (set
<int>::iterator p
= want
.begin(); p
!= want
.end(); ++p
) {
5255 dout(10) << " adding neighbor peer osd." << *p
<< dendl
;
5257 _add_heartbeat_peer(*p
);
5260 // remove down peers; enumerate extras
5261 map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5262 while (p
!= heartbeat_peers
.end()) {
5263 if (!get_osdmap()->is_up(p
->first
)) {
5266 _remove_heartbeat_peer(o
);
5269 if (p
->second
.epoch
< get_osdmap_epoch()) {
5270 extras
.insert(p
->first
);
5276 for (int n
= next
; n
>= 0; ) {
5277 if ((int)heartbeat_peers
.size() >= cct
->_conf
->osd_heartbeat_min_peers
)
5279 if (!extras
.count(n
) && !want
.count(n
) && n
!= whoami
) {
5280 dout(10) << " adding random peer osd." << n
<< dendl
;
5282 _add_heartbeat_peer(n
);
5284 n
= get_osdmap()->get_next_up_osd_after(n
);
5286 break; // came full circle; stop
5290 for (set
<int>::iterator p
= extras
.begin();
5291 (int)heartbeat_peers
.size() > cct
->_conf
->osd_heartbeat_min_peers
&& p
!= extras
.end();
5295 _remove_heartbeat_peer(*p
);
5298 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers
.size() << " peers, extras " << extras
<< dendl
;
5300 // clean up stale failure pending
5301 for (auto it
= failure_pending
.begin(); it
!= failure_pending
.end();) {
5302 if (heartbeat_peers
.count(it
->first
) == 0) {
5303 send_still_alive(get_osdmap_epoch(), it
->first
, it
->second
.second
);
5304 failure_pending
.erase(it
++);
5311 void OSD::reset_heartbeat_peers(bool all
)
5313 ceph_assert(ceph_mutex_is_locked(osd_lock
));
5314 dout(10) << "reset_heartbeat_peers" << dendl
;
5315 utime_t stale
= ceph_clock_now();
5316 stale
-= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_stale");
5317 std::lock_guard
l(heartbeat_lock
);
5318 for (auto it
= heartbeat_peers
.begin(); it
!= heartbeat_peers
.end();) {
5319 auto& [peer
, hi
] = *it
;
5320 if (all
|| hi
.is_stale(stale
)) {
5321 hi
.clear_mark_down();
5322 // stop sending failure_report to mon too
5323 failure_queue
.erase(peer
);
5324 failure_pending
.erase(peer
);
5325 it
= heartbeat_peers
.erase(it
);
5332 void OSD::handle_osd_ping(MOSDPing
*m
)
5334 if (superblock
.cluster_fsid
!= m
->fsid
) {
5335 dout(20) << "handle_osd_ping from " << m
->get_source_inst()
5336 << " bad fsid " << m
->fsid
<< " != " << superblock
.cluster_fsid
5342 int from
= m
->get_source().num();
5344 heartbeat_lock
.lock();
5345 if (is_stopping()) {
5346 heartbeat_lock
.unlock();
5351 utime_t now
= ceph_clock_now();
5352 auto mnow
= service
.get_mnow();
5353 ConnectionRef
con(m
->get_connection());
5354 OSDMapRef curmap
= service
.get_osdmap();
5356 heartbeat_lock
.unlock();
5361 auto sref
= con
->get_priv();
5362 Session
*s
= static_cast<Session
*>(sref
.get());
5364 heartbeat_lock
.unlock();
5370 s
->stamps
= service
.get_hb_stamps(from
);
5375 case MOSDPing::PING
:
5377 if (cct
->_conf
->osd_debug_drop_ping_probability
> 0) {
5378 auto heartbeat_drop
= debug_heartbeat_drops_remaining
.find(from
);
5379 if (heartbeat_drop
!= debug_heartbeat_drops_remaining
.end()) {
5380 if (heartbeat_drop
->second
== 0) {
5381 debug_heartbeat_drops_remaining
.erase(heartbeat_drop
);
5383 --heartbeat_drop
->second
;
5384 dout(5) << "Dropping heartbeat from " << from
5385 << ", " << heartbeat_drop
->second
5386 << " remaining to drop" << dendl
;
5389 } else if (cct
->_conf
->osd_debug_drop_ping_probability
>
5390 ((((double)(rand()%100))/100.0))) {
5392 debug_heartbeat_drops_remaining
.insert(std::make_pair(from
,
5393 cct
->_conf
->osd_debug_drop_ping_duration
)).first
;
5394 dout(5) << "Dropping heartbeat from " << from
5395 << ", " << heartbeat_drop
->second
5396 << " remaining to drop" << dendl
;
5401 ceph::signedspan sender_delta_ub
{};
5402 s
->stamps
->got_ping(
5408 dout(20) << __func__
<< " new stamps " << *s
->stamps
<< dendl
;
5410 if (!cct
->get_heartbeat_map()->is_healthy()) {
5411 dout(10) << "internal heartbeat not healthy, dropping ping request"
5416 Message
*r
= new MOSDPing(monc
->get_fsid(),
5417 curmap
->get_epoch(),
5418 MOSDPing::PING_REPLY
,
5422 service
.get_up_epoch(),
5423 cct
->_conf
->osd_heartbeat_min_size
,
5425 con
->send_message(r
);
5427 if (curmap
->is_up(from
)) {
5429 ConnectionRef cluster_con
= service
.get_con_osd_cluster(
5430 from
, curmap
->get_epoch());
5432 service
.maybe_share_map(cluster_con
.get(), curmap
, m
->map_epoch
);
5435 } else if (!curmap
->exists(from
) ||
5436 curmap
->get_down_at(from
) > m
->map_epoch
) {
5437 // tell them they have died
5438 Message
*r
= new MOSDPing(monc
->get_fsid(),
5439 curmap
->get_epoch(),
5444 service
.get_up_epoch(),
5445 cct
->_conf
->osd_heartbeat_min_size
);
5446 con
->send_message(r
);
5451 case MOSDPing::PING_REPLY
:
5453 map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.find(from
);
5454 if (i
!= heartbeat_peers
.end()) {
5455 auto acked
= i
->second
.ping_history
.find(m
->ping_stamp
);
5456 if (acked
!= i
->second
.ping_history
.end()) {
5457 int &unacknowledged
= acked
->second
.second
;
5458 if (con
== i
->second
.con_back
) {
5459 dout(25) << "handle_osd_ping got reply from osd." << from
5460 << " first_tx " << i
->second
.first_tx
5461 << " last_tx " << i
->second
.last_tx
5462 << " last_rx_back " << i
->second
.last_rx_back
5464 << " last_rx_front " << i
->second
.last_rx_front
5466 i
->second
.last_rx_back
= now
;
5467 ceph_assert(unacknowledged
> 0);
5469 // if there is no front con, set both stamps.
5470 if (i
->second
.con_front
== NULL
) {
5471 i
->second
.last_rx_front
= now
;
5472 ceph_assert(unacknowledged
> 0);
5475 } else if (con
== i
->second
.con_front
) {
5476 dout(25) << "handle_osd_ping got reply from osd." << from
5477 << " first_tx " << i
->second
.first_tx
5478 << " last_tx " << i
->second
.last_tx
5479 << " last_rx_back " << i
->second
.last_rx_back
5480 << " last_rx_front " << i
->second
.last_rx_front
5483 i
->second
.last_rx_front
= now
;
5484 ceph_assert(unacknowledged
> 0);
5488 if (unacknowledged
== 0) {
5489 // succeeded in getting all replies
5490 dout(25) << "handle_osd_ping got all replies from osd." << from
5491 << " , erase pending ping(sent at " << m
->ping_stamp
<< ")"
5492 << " and older pending ping(s)"
5495 #define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
5496 ++i
->second
.hb_average_count
;
5497 uint32_t back_pingtime
= ROUND_S_TO_USEC(i
->second
.last_rx_back
- m
->ping_stamp
);
5498 i
->second
.hb_total_back
+= back_pingtime
;
5499 if (back_pingtime
< i
->second
.hb_min_back
)
5500 i
->second
.hb_min_back
= back_pingtime
;
5501 if (back_pingtime
> i
->second
.hb_max_back
)
5502 i
->second
.hb_max_back
= back_pingtime
;
5503 uint32_t front_pingtime
= ROUND_S_TO_USEC(i
->second
.last_rx_front
- m
->ping_stamp
);
5504 i
->second
.hb_total_front
+= front_pingtime
;
5505 if (front_pingtime
< i
->second
.hb_min_front
)
5506 i
->second
.hb_min_front
= front_pingtime
;
5507 if (front_pingtime
> i
->second
.hb_max_front
)
5508 i
->second
.hb_max_front
= front_pingtime
;
5510 ceph_assert(i
->second
.hb_interval_start
!= utime_t());
5511 if (i
->second
.hb_interval_start
== utime_t())
5512 i
->second
.hb_interval_start
= now
;
5513 int64_t hb_avg_time_period
= 60;
5514 if (cct
->_conf
.get_val
<int64_t>("debug_heartbeat_testing_span")) {
5515 hb_avg_time_period
= cct
->_conf
.get_val
<int64_t>("debug_heartbeat_testing_span");
5517 if (now
- i
->second
.hb_interval_start
>= utime_t(hb_avg_time_period
, 0)) {
5518 uint32_t back_avg
= i
->second
.hb_total_back
/ i
->second
.hb_average_count
;
5519 uint32_t back_min
= i
->second
.hb_min_back
;
5520 uint32_t back_max
= i
->second
.hb_max_back
;
5521 uint32_t front_avg
= i
->second
.hb_total_front
/ i
->second
.hb_average_count
;
5522 uint32_t front_min
= i
->second
.hb_min_front
;
5523 uint32_t front_max
= i
->second
.hb_max_front
;
5525 // Reset for new interval
5526 i
->second
.hb_average_count
= 0;
5527 i
->second
.hb_interval_start
= now
;
5528 i
->second
.hb_total_back
= i
->second
.hb_max_back
= 0;
5529 i
->second
.hb_min_back
= UINT_MAX
;
5530 i
->second
.hb_total_front
= i
->second
.hb_max_front
= 0;
5531 i
->second
.hb_min_front
= UINT_MAX
;
5533 // Record per osd interace ping times
5534 // Based on osd_heartbeat_interval ignoring that it is randomly short than this interval
5535 if (i
->second
.hb_back_pingtime
.size() == 0) {
5536 ceph_assert(i
->second
.hb_front_pingtime
.size() == 0);
5537 for (unsigned k
= 0 ; k
< hb_vector_size
; ++k
) {
5538 i
->second
.hb_back_pingtime
.push_back(back_avg
);
5539 i
->second
.hb_back_min
.push_back(back_min
);
5540 i
->second
.hb_back_max
.push_back(back_max
);
5541 i
->second
.hb_front_pingtime
.push_back(front_avg
);
5542 i
->second
.hb_front_min
.push_back(front_min
);
5543 i
->second
.hb_front_max
.push_back(front_max
);
5544 ++i
->second
.hb_index
;
5547 int index
= i
->second
.hb_index
& (hb_vector_size
- 1);
5548 i
->second
.hb_back_pingtime
[index
] = back_avg
;
5549 i
->second
.hb_back_min
[index
] = back_min
;
5550 i
->second
.hb_back_max
[index
] = back_max
;
5551 i
->second
.hb_front_pingtime
[index
] = front_avg
;
5552 i
->second
.hb_front_min
[index
] = front_min
;
5553 i
->second
.hb_front_max
[index
] = front_max
;
5554 ++i
->second
.hb_index
;
5558 std::lock_guard
l(service
.stat_lock
);
5559 service
.osd_stat
.hb_pingtime
[from
].last_update
= now
.sec();
5560 service
.osd_stat
.hb_pingtime
[from
].back_last
= back_pingtime
;
5563 uint32_t min
= UINT_MAX
;
5567 uint32_t size
= (uint32_t)i
->second
.hb_back_pingtime
.size();
5568 for (int32_t k
= size
- 1 ; k
>= 0; --k
) {
5570 int index
= (i
->second
.hb_index
+ k
) % size
;
5571 total
+= i
->second
.hb_back_pingtime
[index
];
5572 if (i
->second
.hb_back_min
[index
] < min
)
5573 min
= i
->second
.hb_back_min
[index
];
5574 if (i
->second
.hb_back_max
[index
] > max
)
5575 max
= i
->second
.hb_back_max
[index
];
5576 if (count
== 1 || count
== 5 || count
== 15) {
5577 service
.osd_stat
.hb_pingtime
[from
].back_pingtime
[which
] = total
/ count
;
5578 service
.osd_stat
.hb_pingtime
[from
].back_min
[which
] = min
;
5579 service
.osd_stat
.hb_pingtime
[from
].back_max
[which
] = max
;
5586 if (i
->second
.con_front
!= NULL
) {
5587 service
.osd_stat
.hb_pingtime
[from
].front_last
= front_pingtime
;
5594 for (int32_t k
= size
- 1 ; k
>= 0; --k
) {
5596 int index
= (i
->second
.hb_index
+ k
) % size
;
5597 total
+= i
->second
.hb_front_pingtime
[index
];
5598 if (i
->second
.hb_front_min
[index
] < min
)
5599 min
= i
->second
.hb_front_min
[index
];
5600 if (i
->second
.hb_front_max
[index
] > max
)
5601 max
= i
->second
.hb_front_max
[index
];
5602 if (count
== 1 || count
== 5 || count
== 15) {
5603 service
.osd_stat
.hb_pingtime
[from
].front_pingtime
[which
] = total
/ count
;
5604 service
.osd_stat
.hb_pingtime
[from
].front_min
[which
] = min
;
5605 service
.osd_stat
.hb_pingtime
[from
].front_max
[which
] = max
;
5614 std::lock_guard
l(service
.stat_lock
);
5615 service
.osd_stat
.hb_pingtime
[from
].back_last
= back_pingtime
;
5616 if (i
->second
.con_front
!= NULL
)
5617 service
.osd_stat
.hb_pingtime
[from
].front_last
= front_pingtime
;
5619 i
->second
.ping_history
.erase(i
->second
.ping_history
.begin(), ++acked
);
5622 if (i
->second
.is_healthy(now
)) {
5623 // Cancel false reports
5624 auto failure_queue_entry
= failure_queue
.find(from
);
5625 if (failure_queue_entry
!= failure_queue
.end()) {
5626 dout(10) << "handle_osd_ping canceling queued "
5627 << "failure report for osd." << from
<< dendl
;
5628 failure_queue
.erase(failure_queue_entry
);
5631 auto failure_pending_entry
= failure_pending
.find(from
);
5632 if (failure_pending_entry
!= failure_pending
.end()) {
5633 dout(10) << "handle_osd_ping canceling in-flight "
5634 << "failure report for osd." << from
<< dendl
;
5635 send_still_alive(curmap
->get_epoch(),
5637 failure_pending_entry
->second
.second
);
5638 failure_pending
.erase(failure_pending_entry
);
5642 // old replies, deprecated by newly sent pings.
5643 dout(10) << "handle_osd_ping no pending ping(sent at " << m
->ping_stamp
5644 << ") is found, treat as covered by newly sent pings "
5651 curmap
->is_up(from
)) {
5653 ConnectionRef cluster_con
= service
.get_con_osd_cluster(
5654 from
, curmap
->get_epoch());
5656 service
.maybe_share_map(cluster_con
.get(), curmap
, m
->map_epoch
);
5661 s
->stamps
->got_ping_reply(
5665 dout(20) << __func__
<< " new stamps " << *s
->stamps
<< dendl
;
5669 case MOSDPing::YOU_DIED
:
5670 dout(10) << "handle_osd_ping " << m
->get_source_inst()
5671 << " says i am down in " << m
->map_epoch
<< dendl
;
5672 osdmap_subscribe(curmap
->get_epoch()+1, false);
5676 heartbeat_lock
.unlock();
5680 void OSD::heartbeat_entry()
5682 std::unique_lock
l(heartbeat_lock
);
5685 while (!heartbeat_stop
) {
5689 if (cct
->_conf
.get_val
<bool>("debug_disable_randomized_ping")) {
5690 wait
= (float)cct
->_conf
->osd_heartbeat_interval
;
5692 wait
= .5 + ((float)(rand() % 10)/10.0) * (float)cct
->_conf
->osd_heartbeat_interval
;
5694 auto w
= ceph::make_timespan(wait
);
5695 dout(30) << "heartbeat_entry sleeping for " << wait
<< dendl
;
5696 heartbeat_cond
.wait_for(l
, w
);
5699 dout(30) << "heartbeat_entry woke up" << dendl
;
5703 void OSD::heartbeat_check()
5705 ceph_assert(ceph_mutex_is_locked(heartbeat_lock
));
5706 utime_t now
= ceph_clock_now();
5708 // check for incoming heartbeats (move me elsewhere?)
5709 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5710 p
!= heartbeat_peers
.end();
5713 if (p
->second
.first_tx
== utime_t()) {
5714 dout(25) << "heartbeat_check we haven't sent ping to osd." << p
->first
5715 << " yet, skipping" << dendl
;
5719 dout(25) << "heartbeat_check osd." << p
->first
5720 << " first_tx " << p
->second
.first_tx
5721 << " last_tx " << p
->second
.last_tx
5722 << " last_rx_back " << p
->second
.last_rx_back
5723 << " last_rx_front " << p
->second
.last_rx_front
5725 if (p
->second
.is_unhealthy(now
)) {
5726 utime_t oldest_deadline
= p
->second
.ping_history
.begin()->second
.first
;
5727 if (p
->second
.last_rx_back
== utime_t() ||
5728 p
->second
.last_rx_front
== utime_t()) {
5729 derr
<< "heartbeat_check: no reply from "
5730 << p
->second
.con_front
->get_peer_addr().get_sockaddr()
5731 << " osd." << p
->first
5732 << " ever on either front or back, first ping sent "
5733 << p
->second
.first_tx
5734 << " (oldest deadline " << oldest_deadline
<< ")"
5737 failure_queue
[p
->first
] = p
->second
.first_tx
;
5739 derr
<< "heartbeat_check: no reply from "
5740 << p
->second
.con_front
->get_peer_addr().get_sockaddr()
5741 << " osd." << p
->first
<< " since back " << p
->second
.last_rx_back
5742 << " front " << p
->second
.last_rx_front
5743 << " (oldest deadline " << oldest_deadline
<< ")"
5746 failure_queue
[p
->first
] = std::min(p
->second
.last_rx_back
, p
->second
.last_rx_front
);
5752 void OSD::heartbeat()
5754 ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock
));
5755 dout(30) << "heartbeat" << dendl
;
5759 int hb_interval
= cct
->_conf
->osd_heartbeat_interval
;
5760 int n_samples
= 86400;
5761 if (hb_interval
> 1) {
5762 n_samples
/= hb_interval
;
5767 if (getloadavg(loadavgs
, 1) == 1) {
5768 logger
->set(l_osd_loadavg
, 100 * loadavgs
[0]);
5769 daily_loadavg
= (daily_loadavg
* (n_samples
- 1) + loadavgs
[0]) / n_samples
;
5770 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg
<< dendl
;
5773 dout(30) << "heartbeat checking stats" << dendl
;
5775 // refresh peer list and osd stats
5776 vector
<int> hb_peers
;
5777 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5778 p
!= heartbeat_peers
.end();
5780 hb_peers
.push_back(p
->first
);
5782 auto new_stat
= service
.set_osd_stat(hb_peers
, get_num_pgs());
5783 dout(5) << __func__
<< " " << new_stat
<< dendl
;
5784 ceph_assert(new_stat
.statfs
.total
);
5787 float ratio
= service
.compute_adjusted_ratio(new_stat
, &pratio
);
5789 service
.check_full_status(ratio
, pratio
);
5791 utime_t now
= ceph_clock_now();
5792 auto mnow
= service
.get_mnow();
5793 utime_t deadline
= now
;
5794 deadline
+= cct
->_conf
->osd_heartbeat_grace
;
5797 for (map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.begin();
5798 i
!= heartbeat_peers
.end();
5800 int peer
= i
->first
;
5801 Session
*s
= static_cast<Session
*>(i
->second
.con_back
->get_priv().get());
5803 dout(30) << "heartbeat osd." << peer
<< " has no open con" << dendl
;
5806 dout(30) << "heartbeat sending ping to osd." << peer
<< dendl
;
5808 i
->second
.last_tx
= now
;
5809 if (i
->second
.first_tx
== utime_t())
5810 i
->second
.first_tx
= now
;
5811 i
->second
.ping_history
[now
] = make_pair(deadline
,
5812 HeartbeatInfo::HEARTBEAT_MAX_CONN
);
5813 if (i
->second
.hb_interval_start
== utime_t())
5814 i
->second
.hb_interval_start
= now
;
5816 std::optional
<ceph::signedspan
> delta_ub
;
5817 s
->stamps
->sent_ping(&delta_ub
);
5819 i
->second
.con_back
->send_message(
5820 new MOSDPing(monc
->get_fsid(),
5821 service
.get_osdmap_epoch(),
5826 service
.get_up_epoch(),
5827 cct
->_conf
->osd_heartbeat_min_size
,
5830 if (i
->second
.con_front
)
5831 i
->second
.con_front
->send_message(
5832 new MOSDPing(monc
->get_fsid(),
5833 service
.get_osdmap_epoch(),
5838 service
.get_up_epoch(),
5839 cct
->_conf
->osd_heartbeat_min_size
,
5843 logger
->set(l_osd_hb_to
, heartbeat_peers
.size());
5845 // hmm.. am i all alone?
5846 dout(30) << "heartbeat lonely?" << dendl
;
5847 if (heartbeat_peers
.empty()) {
5848 if (now
- last_mon_heartbeat
> cct
->_conf
->osd_mon_heartbeat_interval
&& is_active()) {
5849 last_mon_heartbeat
= now
;
5850 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl
;
5851 osdmap_subscribe(get_osdmap_epoch() + 1, false);
5855 dout(30) << "heartbeat done" << dendl
;
// Dispatcher hook: a heartbeat Connection was reset. If the dead con
// belongs to a tracked heartbeat peer (front or back), try to reopen
// it and re-attach the same Session; if no replacement con can be
// obtained (we raced with an osdmap update), drop the peer entirely.
// NOTE(review): lossy extraction -- several original lines (5859,
// 5864, 5866-5867, 5877, 5883, 5885, 5889-5890, 5892+) are missing,
// so braces / early returns are not visible here.
5858 bool OSD::heartbeat_reset(Connection
*con
)
// heartbeat_peers and related state are guarded by heartbeat_lock.
5860 std::lock_guard
l(heartbeat_lock
);
// Take the connection's priv ref (its Session) and detach it.
5861 auto s
= con
->get_priv();
5862 dout(20) << __func__
<< " con " << con
<< " s " << s
.get() << dendl
;
5863 con
->set_priv(nullptr);
// Nothing to re-open while shutting down.
5865 if (is_stopping()) {
5868 auto session
= static_cast<Session
*>(s
.get());
// Find the heartbeat peer this session belongs to.
5869 auto p
= heartbeat_peers
.find(session
->peer
);
// Only act if the failed con is actually this peer's back or front con.
5870 if (p
!= heartbeat_peers
.end() &&
5871 (p
->second
.con_back
== con
||
5872 p
->second
.con_front
== con
)) {
5873 dout(10) << "heartbeat_reset failed hb con " << con
<< " for osd." << p
->second
.peer
5874 << ", reopening" << dendl
;
// Mark the old connection down before replacing it.
5875 p
->second
.clear_mark_down(con
);
// Ask the service for fresh (back, front) heartbeat connections.
5876 pair
<ConnectionRef
,ConnectionRef
> newcon
= service
.get_con_osd_hb(p
->second
.peer
, p
->second
.epoch
);
5878 p
->second
.con_back
= newcon
.first
.get();
// Re-attach the same Session to the replacement connection(s).
5879 p
->second
.con_back
->set_priv(s
);
5880 if (newcon
.second
) {
5881 p
->second
.con_front
= newcon
.second
.get();
5882 p
->second
.con_front
->set_priv(s
);
// Outstanding pings were sent on the dead con; forget them.
5884 p
->second
.ping_history
.clear();
5886 dout(10) << "heartbeat_reset failed hb con " << con
<< " for osd." << p
->second
.peer
5887 << ", raced with osdmap update, closing out peer" << dendl
;
// No replacement connection available: drop the peer.
5888 heartbeat_peers
.erase(p
);
5891 dout(10) << "heartbeat_reset closing (old) failed hb con " << con
<< dendl
;
5899 // =========================================
5903 ceph_assert(ceph_mutex_is_locked(osd_lock
));
5904 dout(10) << "tick" << dendl
;
5906 utime_t now
= ceph_clock_now();
5907 // throw out any obsolete markdown log
5908 utime_t grace
= utime_t(cct
->_conf
->osd_max_markdown_period
, 0);
5909 while (!osd_markdown_log
.empty() &&
5910 osd_markdown_log
.front() + grace
< now
)
5911 osd_markdown_log
.pop_front();
5913 if (is_active() || is_waiting_for_healthy()) {
5914 maybe_update_heartbeat_peers();
5917 if (is_waiting_for_healthy()) {
5921 if (is_waiting_for_healthy() || is_booting()) {
5922 std::lock_guard
l(heartbeat_lock
);
5923 if (now
- last_mon_heartbeat
> cct
->_conf
->osd_mon_heartbeat_interval
) {
5924 last_mon_heartbeat
= now
;
5925 dout(1) << __func__
<< " checking mon for new map" << dendl
;
5926 osdmap_subscribe(get_osdmap_epoch() + 1, false);
5932 // scrub purged_snaps every deep scrub interval
5934 const utime_t last
= superblock
.last_purged_snaps_scrub
;
5935 utime_t next
= last
;
5936 next
+= cct
->_conf
->osd_scrub_min_interval
;
5938 // use a seed that is stable for each scrub interval, but varies
5939 // by OSD to avoid any herds.
5940 rng
.seed(whoami
+ superblock
.last_purged_snaps_scrub
.sec());
5941 double r
= (rng() % 1024) / 1024;
5943 cct
->_conf
->osd_scrub_min_interval
*
5944 cct
->_conf
->osd_scrub_interval_randomize_ratio
* r
;
5945 if (next
< ceph_clock_now()) {
5946 dout(20) << __func__
<< " last_purged_snaps_scrub " << last
5947 << " next " << next
<< " ... now" << dendl
;
5948 scrub_purged_snaps();
5950 dout(20) << __func__
<< " last_purged_snaps_scrub " << last
5951 << " next " << next
<< dendl
;
5955 tick_timer
.add_event_after(get_tick_interval(), new C_Tick(this));
5958 void OSD::tick_without_osd_lock()
5960 ceph_assert(ceph_mutex_is_locked(tick_timer_lock
));
5961 dout(10) << "tick_without_osd_lock" << dendl
;
5963 logger
->set(l_osd_cached_crc
, ceph::buffer::get_cached_crc());
5964 logger
->set(l_osd_cached_crc_adjusted
, ceph::buffer::get_cached_crc_adjusted());
5965 logger
->set(l_osd_missed_crc
, ceph::buffer::get_missed_crc());
5967 // refresh osd stats
5968 struct store_statfs_t stbuf
;
5969 osd_alert_list_t alerts
;
5970 int r
= store
->statfs(&stbuf
, &alerts
);
5971 ceph_assert(r
== 0);
5972 service
.set_statfs(stbuf
, alerts
);
5974 // osd_lock is not being held, which means the OSD state
5975 // might change when doing the monitor report
5976 if (is_active() || is_waiting_for_healthy()) {
5978 std::lock_guard l
{heartbeat_lock
};
5981 map_lock
.lock_shared();
5982 std::lock_guard
l(mon_report_lock
);
5985 utime_t now
= ceph_clock_now();
5986 if (service
.need_fullness_update() ||
5987 now
- last_mon_report
> cct
->_conf
->osd_mon_report_interval
) {
5988 last_mon_report
= now
;
5992 map_lock
.unlock_shared();
5994 epoch_t max_waiting_epoch
= 0;
5995 for (auto s
: shards
) {
5996 max_waiting_epoch
= std::max(max_waiting_epoch
,
5997 s
->get_max_waiting_epoch());
5999 if (max_waiting_epoch
> get_osdmap()->get_epoch()) {
6000 dout(20) << __func__
<< " max_waiting_epoch " << max_waiting_epoch
6001 << ", requesting new map" << dendl
;
6002 osdmap_subscribe(superblock
.newest_map
+ 1, false);
6007 if (!scrub_random_backoff()) {
6010 service
.promote_throttle_recalibrate();
6011 resume_creating_pg();
6012 bool need_send_beacon
= false;
6013 const auto now
= ceph::coarse_mono_clock::now();
6015 // borrow lec lock to pretect last_sent_beacon from changing
6016 std::lock_guard l
{min_last_epoch_clean_lock
};
6017 const auto elapsed
= now
- last_sent_beacon
;
6018 if (std::chrono::duration_cast
<std::chrono::seconds
>(elapsed
).count() >
6019 cct
->_conf
->osd_beacon_report_interval
) {
6020 need_send_beacon
= true;
6023 if (need_send_beacon
) {
6028 mgrc
.update_daemon_health(get_health_metrics());
6029 service
.kick_recovery_queue();
6030 tick_timer_without_osd_lock
.add_event_after(get_tick_interval(),
6031 new C_Tick_WithoutOSDLock(this));
6035 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
6036 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
6037 // setomapheader <pool-id> [namespace/]<obj-name> <header>
6038 // getomap <pool> [namespace/]<obj-name>
6039 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
6040 // injectmdataerr [namespace/]<obj-name> [shardid]
6041 // injectdataerr [namespace/]<obj-name> [shardid]
6043 // set_recovery_delay [utime]
6044 void TestOpsSocketHook::test_ops(OSDService
*service
, ObjectStore
*store
,
6045 std::string_view command
,
6046 const cmdmap_t
& cmdmap
, ostream
&ss
)
6049 //Support changing the omap on a single osd by using the Admin Socket to
6050 //directly request the osd make a change.
6051 if (command
== "setomapval" || command
== "rmomapkey" ||
6052 command
== "setomapheader" || command
== "getomap" ||
6053 command
== "truncobj" || command
== "injectmdataerr" ||
6054 command
== "injectdataerr"
6058 OSDMapRef curmap
= service
->get_osdmap();
6063 cmd_getval(cmdmap
, "pool", poolstr
);
6064 pool
= curmap
->lookup_pg_pool_name(poolstr
);
6065 //If we can't find it by name then maybe id specified
6066 if (pool
< 0 && isdigit(poolstr
[0]))
6067 pool
= atoll(poolstr
.c_str());
6069 ss
<< "Invalid pool '" << poolstr
<< "''";
6073 string objname
, nspace
;
6074 cmd_getval(cmdmap
, "objname", objname
);
6075 std::size_t found
= objname
.find_first_of('/');
6076 if (found
!= string::npos
) {
6077 nspace
= objname
.substr(0, found
);
6078 objname
= objname
.substr(found
+1);
6080 object_locator_t
oloc(pool
, nspace
);
6081 r
= curmap
->object_locator_to_pg(object_t(objname
), oloc
, rawpg
);
6084 ss
<< "Invalid namespace/objname";
6089 cmd_getval(cmdmap
, "shardid", shardid
, int64_t(shard_id_t::NO_SHARD
));
6090 hobject_t
obj(object_t(objname
), string(""), CEPH_NOSNAP
, rawpg
.ps(), pool
, nspace
);
6091 ghobject_t
gobj(obj
, ghobject_t::NO_GEN
, shard_id_t(uint8_t(shardid
)));
6092 spg_t
pgid(curmap
->raw_pg_to_pg(rawpg
), shard_id_t(shardid
));
6093 if (curmap
->pg_is_ec(rawpg
)) {
6094 if ((command
!= "injectdataerr") && (command
!= "injectmdataerr")) {
6095 ss
<< "Must not call on ec pool, except injectdataerr or injectmdataerr";
6100 ObjectStore::Transaction t
;
6102 if (command
== "setomapval") {
6103 map
<string
, bufferlist
> newattrs
;
6106 cmd_getval(cmdmap
, "key", key
);
6107 cmd_getval(cmdmap
, "val", valstr
);
6110 newattrs
[key
] = val
;
6111 t
.omap_setkeys(coll_t(pgid
), ghobject_t(obj
), newattrs
);
6112 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6114 ss
<< "error=" << r
;
6117 } else if (command
== "rmomapkey") {
6119 cmd_getval(cmdmap
, "key", key
);
6121 t
.omap_rmkey(coll_t(pgid
), ghobject_t(obj
), key
);
6122 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6124 ss
<< "error=" << r
;
6127 } else if (command
== "setomapheader") {
6128 bufferlist newheader
;
6131 cmd_getval(cmdmap
, "header", headerstr
);
6132 newheader
.append(headerstr
);
6133 t
.omap_setheader(coll_t(pgid
), ghobject_t(obj
), newheader
);
6134 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6136 ss
<< "error=" << r
;
6139 } else if (command
== "getomap") {
6140 //Debug: Output entire omap
6142 map
<string
, bufferlist
> keyvals
;
6143 auto ch
= store
->open_collection(coll_t(pgid
));
6145 ss
<< "unable to open collection for " << pgid
;
6148 r
= store
->omap_get(ch
, ghobject_t(obj
), &hdrbl
, &keyvals
);
6150 ss
<< "header=" << string(hdrbl
.c_str(), hdrbl
.length());
6151 for (map
<string
, bufferlist
>::iterator it
= keyvals
.begin();
6152 it
!= keyvals
.end(); ++it
)
6153 ss
<< " key=" << (*it
).first
<< " val="
6154 << string((*it
).second
.c_str(), (*it
).second
.length());
6156 ss
<< "error=" << r
;
6159 } else if (command
== "truncobj") {
6161 cmd_getval(cmdmap
, "len", trunclen
);
6162 t
.truncate(coll_t(pgid
), ghobject_t(obj
), trunclen
);
6163 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6165 ss
<< "error=" << r
;
6168 } else if (command
== "injectdataerr") {
6169 store
->inject_data_error(gobj
);
6171 } else if (command
== "injectmdataerr") {
6172 store
->inject_mdata_error(gobj
);
6177 if (command
== "set_recovery_delay") {
6179 cmd_getval(cmdmap
, "utime", delay
, (int64_t)0);
6182 int r
= service
->cct
->_conf
.set_val("osd_recovery_delay_start",
6185 ss
<< "set_recovery_delay: error setting "
6186 << "osd_recovery_delay_start to '" << delay
<< "': error "
6190 service
->cct
->_conf
.apply_changes(nullptr);
6191 ss
<< "set_recovery_delay: set osd_recovery_delay_start "
6192 << "to " << service
->cct
->_conf
->osd_recovery_delay_start
;
6195 if (command
== "injectfull") {
6198 OSDService::s_names state
;
6199 cmd_getval(cmdmap
, "type", type
, string("full"));
6200 cmd_getval(cmdmap
, "count", count
, (int64_t)-1);
6201 if (type
== "none" || count
== 0) {
6205 state
= service
->get_full_state(type
);
6206 if (state
== OSDService::s_names::INVALID
) {
6207 ss
<< "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
6210 service
->set_injectfull(state
, count
);
6213 ss
<< "Internal error - command=" << command
;
6216 // =========================================
6218 void OSD::ms_handle_connect(Connection
*con
)
6220 dout(10) << __func__
<< " con " << con
<< dendl
;
6221 if (con
->get_peer_type() == CEPH_ENTITY_TYPE_MON
) {
6222 std::lock_guard
l(osd_lock
);
6225 dout(10) << __func__
<< " on mon" << dendl
;
6229 } else if (is_booting()) {
6230 _send_boot(); // resend boot message
6232 map_lock
.lock_shared();
6233 std::lock_guard
l2(mon_report_lock
);
6235 utime_t now
= ceph_clock_now();
6236 last_mon_report
= now
;
6238 // resend everything, it's a new session
6241 service
.requeue_pg_temp();
6242 service
.clear_sent_ready_to_merge();
6243 service
.send_pg_temp();
6244 service
.send_ready_to_merge();
6245 service
.send_pg_created();
6249 map_lock
.unlock_shared();
6251 send_beacon(ceph::coarse_mono_clock::now());
6255 // full map requests may happen while active or pre-boot
6256 if (requested_full_first
) {
6257 rerequest_full_maps();
6262 void OSD::ms_handle_fast_connect(Connection
*con
)
6264 if (con
->get_peer_type() != CEPH_ENTITY_TYPE_MON
&&
6265 con
->get_peer_type() != CEPH_ENTITY_TYPE_MGR
) {
6266 if (auto s
= ceph::ref_cast
<Session
>(con
->get_priv()); !s
) {
6267 s
= ceph::make_ref
<Session
>(cct
, con
);
6269 dout(10) << " new session (outgoing) " << s
<< " con=" << s
->con
6270 << " addr=" << s
->con
->get_peer_addr() << dendl
;
6271 // we don't connect to clients
6272 ceph_assert(con
->get_peer_type() == CEPH_ENTITY_TYPE_OSD
);
6273 s
->entity_name
.set_type(CEPH_ENTITY_TYPE_OSD
);
6278 void OSD::ms_handle_fast_accept(Connection
*con
)
6280 if (con
->get_peer_type() != CEPH_ENTITY_TYPE_MON
&&
6281 con
->get_peer_type() != CEPH_ENTITY_TYPE_MGR
) {
6282 if (auto s
= ceph::ref_cast
<Session
>(con
->get_priv()); !s
) {
6283 s
= ceph::make_ref
<Session
>(cct
, con
);
6285 dout(10) << "new session (incoming)" << s
<< " con=" << con
6286 << " addr=" << con
->get_peer_addr()
6287 << " must have raced with connect" << dendl
;
6288 ceph_assert(con
->get_peer_type() == CEPH_ENTITY_TYPE_OSD
);
6289 s
->entity_name
.set_type(CEPH_ENTITY_TYPE_OSD
);
// Dispatcher hook: a client/cluster Connection was reset. Detach the
// Session from the connection and run session-level cleanup
// (backoffs, watches) via session_handle_reset().
// NOTE(review): lossy extraction -- lines 6295, 6298-6299, 6307+ are
// missing (presumably the null-session guard and final return).
6294 bool OSD::ms_handle_reset(Connection
*con
)
6296 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
6297 dout(2) << "ms_handle_reset con " << con
<< " session " << session
.get() << dendl
;
// Clear watch state tied to this connection.
6300 session
->wstate
.reset(con
);
6301 session
->con
->set_priv(nullptr);
6302 session
->con
.reset(); // break con <-> session ref cycle
6303 // note that we break session->con *before* the session_handle_reset
6304 // cleanup below. this avoids a race between us and
6305 // PG::add_backoff, Session::check_backoff, etc.
6306 session_handle_reset(session
);
// Dispatcher hook: an outgoing connection to a peer was refused.
// When osd_fast_fail_on_connection_refused is enabled and the peer is
// an OSD the map still considers up, report it to the mon right away
// (a refused connection means the process is gone, not just slow).
// NOTE(review): lossy extraction -- lines 6311, 6313-6314, 6317-6318,
// 6323, 6329-6331 and 6336+ are missing, including the MOSDFailure
// constructor call that these arguments belong to.
6310 bool OSD::ms_handle_refused(Connection
*con
)
// Feature is opt-in via config.
6312 if (!cct
->_conf
->osd_fast_fail_on_connection_refused
)
6315 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
6316 dout(2) << "ms_handle_refused con " << con
<< " session " << session
.get() << dendl
;
6319 int type
= con
->get_peer_type();
6320 // handle only OSD failures here
6321 if (monc
&& (type
== CEPH_ENTITY_TYPE_OSD
)) {
6322 OSDMapRef osdmap
= get_osdmap();
// Match the refused address against any of the peer's channels.
6324 int id
= osdmap
->identify_osd_on_all_channels(con
->get_peer_addr());
6325 if (id
>= 0 && osdmap
->is_up(id
)) {
6326 // I'm cheating mon heartbeat grace logic, because we know it's not going
6327 // to respawn alone. +1 so we won't hit any boundary case.
6328 monc
->send_mon_message(
6332 osdmap
->get_addrs(id
),
6333 cct
->_conf
->osd_heartbeat_grace
+ 1,
6334 osdmap
->get_epoch(),
6335 MOSDFailure::FLAG_IMMEDIATE
| MOSDFailure::FLAG_FAILED
6343 struct CB_OSD_GetVersion
{
6345 explicit CB_OSD_GetVersion(OSD
*o
) : osd(o
) {}
6346 void operator ()(boost::system::error_code ec
, version_t newest
,
6349 osd
->_got_mon_epochs(oldest
, newest
);
// Begin the boot sequence: if internal/peer health checks fail, defer
// booting (WAITING_FOR_HEALTHY); otherwise enter PREBOOT and ask the
// mon which osdmap epochs it holds. The async reply is delivered via
// CB_OSD_GetVersion -> _got_mon_epochs() -> _preboot().
// NOTE(review): lossy extraction -- lines 6354 and 6361-6363 (early
// return from the unhealthy branch) are missing.
6353 void OSD::start_boot()
6355 if (!_is_healthy()) {
6356 // if we are not healthy, do not mark ourselves up (yet)
6357 dout(1) << "not healthy; waiting to boot" << dendl
;
6358 if (!is_waiting_for_healthy())
6359 start_waiting_for_healthy();
6360 // send pings sooner rather than later
6364 dout(1) << __func__
<< dendl
;
6365 set_state(STATE_PREBOOT
);
6366 dout(10) << "start_boot - have maps " << superblock
.oldest_map
6367 << ".." << superblock
.newest_map
<< dendl
;
// Async mon query; continues in _got_mon_epochs().
6368 monc
->get_version("osdmap", CB_OSD_GetVersion(this));
// Callback target for CB_OSD_GetVersion: the mon reported the oldest
// and newest osdmap epochs it has. Continue the preboot handshake
// under osd_lock.
// NOTE(review): lossy extraction -- lines 6372 and 6374 are missing.
6371 void OSD::_got_mon_epochs(epoch_t oldest
, epoch_t newest
)
6373 std::lock_guard
l(osd_lock
);
6375 _preboot(oldest
, newest
);
6379 void OSD::_preboot(epoch_t oldest
, epoch_t newest
)
6381 ceph_assert(is_preboot());
6382 dout(10) << __func__
<< " _preboot mon has osdmaps "
6383 << oldest
<< ".." << newest
<< dendl
;
6385 // ensure our local fullness awareness is accurate
6387 std::lock_guard
l(heartbeat_lock
);
6391 const auto& monmap
= monc
->monmap
;
6392 const auto osdmap
= get_osdmap();
6393 // if our map within recent history, try to add ourselves to the osdmap.
6394 if (osdmap
->get_epoch() == 0) {
6395 derr
<< "waiting for initial osdmap" << dendl
;
6396 } else if (osdmap
->is_destroyed(whoami
)) {
6397 derr
<< "osdmap says I am destroyed" << dendl
;
6398 // provide a small margin so we don't livelock seeing if we
6399 // un-destroyed ourselves.
6400 if (osdmap
->get_epoch() > newest
- 1) {
6403 } else if (osdmap
->is_noup(whoami
)) {
6404 derr
<< "osdmap NOUP flag is set, waiting for it to clear" << dendl
;
6405 } else if (!osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
6406 derr
<< "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
6408 } else if (service
.need_fullness_update()) {
6409 derr
<< "osdmap fullness state needs update" << dendl
;
6411 } else if (monmap
.min_mon_release
>= ceph_release_t::octopus
&&
6412 superblock
.purged_snaps_last
< superblock
.current_epoch
) {
6413 dout(10) << __func__
<< " purged_snaps_last " << superblock
.purged_snaps_last
6414 << " < newest_map " << superblock
.current_epoch
<< dendl
;
6415 _get_purged_snaps();
6416 } else if (osdmap
->get_epoch() >= oldest
- 1 &&
6417 osdmap
->get_epoch() + cct
->_conf
->osd_map_message_max
> newest
) {
6419 // wait for pgs to fully catch up in a different thread, since
6420 // this thread might be required for splitting and merging PGs to
6422 boot_finisher
.queue(
6425 std::unique_lock
l(osd_lock
);
6427 dout(10) << __func__
<< " waiting for peering work to drain"
6430 for (auto shard
: shards
) {
6431 shard
->wait_min_pg_epoch(get_osdmap_epoch());
6442 // get all the latest maps
6443 if (osdmap
->get_epoch() + 1 >= oldest
)
6444 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
6446 osdmap_subscribe(oldest
- 1, true);
// Request from the mon the purged-snaps ranges we have not yet
// recorded locally: epochs (purged_snaps_last, current_epoch].
// The reply is processed by handle_get_purged_snaps_reply().
6449 void OSD::_get_purged_snaps()
6451 // NOTE: this is a naive, stateless implementation. it may send multiple
6452 // overlapping requests to the mon, which will be somewhat inefficient, but
6453 // it should be reliable.
6454 dout(10) << __func__
<< " purged_snaps_last " << superblock
.purged_snaps_last
6455 << ", newest_map " << superblock
.current_epoch
<< dendl
;
6456 MMonGetPurgedSnaps
*m
= new MMonGetPurgedSnaps(
6457 superblock
.purged_snaps_last
+ 1,
6458 superblock
.current_epoch
+ 1);
6459 monc
->send_mon_message(m
);
6462 void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply
*m
)
6464 dout(10) << __func__
<< " " << *m
<< dendl
;
6465 ObjectStore::Transaction t
;
6466 if (!is_preboot() ||
6467 m
->last
< superblock
.purged_snaps_last
) {
6470 SnapMapper::record_purged_snaps(cct
, store
, service
.meta_ch
,
6471 make_purged_snaps_oid(), &t
,
6473 superblock
.purged_snaps_last
= m
->last
;
6474 write_superblock(t
);
6475 store
->queue_transaction(
6478 service
.publish_superblock(superblock
);
6479 if (m
->last
< superblock
.current_epoch
) {
6480 _get_purged_snaps();
// Report our fullness state (full / backfillfull / nearfull, or none)
// to the mon via MOSDFull, but only when the service says the state
// actually changed since the last report.
// NOTE(review): lossy extraction -- lines 6489, 6491-6492 (the
// declaration of `state`) and 6499-6500 (the declaration of `s`)
// are missing.
6488 void OSD::send_full_update()
// Skip the message entirely if nothing changed.
6490 if (!service
.need_fullness_update())
6493 if (service
.is_full()) {
6494 state
= CEPH_OSD_FULL
;
6495 } else if (service
.is_backfillfull()) {
6496 state
= CEPH_OSD_BACKFILLFULL
;
6497 } else if (service
.is_nearfull()) {
6498 state
= CEPH_OSD_NEARFULL
;
// Render the state bits for the debug log.
6501 OSDMap::calc_state_set(state
, s
);
6502 dout(10) << __func__
<< " want state " << s
<< dendl
;
6503 monc
->send_mon_message(new MOSDFull(get_osdmap_epoch(), state
));
// Enter WAITING_FOR_HEALTHY: reset the heartbeat-resample timestamp
// (so peers get re-chosen) and subscribe to osdmap updates in case
// the cluster learns our peers are actually dead.
6506 void OSD::start_waiting_for_healthy()
6508 dout(1) << "start_waiting_for_healthy" << dendl
;
6509 set_state(STATE_WAITING_FOR_HEALTHY
);
// Force the next heartbeat pass to re-sample peers.
6510 last_heartbeat_resample
= utime_t();
6512 // subscribe to osdmap updates, in case our peers really are known to be dead
6513 osdmap_subscribe(get_osdmap_epoch() + 1, false);
6516 bool OSD::_is_healthy()
6518 if (!cct
->get_heartbeat_map()->is_healthy()) {
6519 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl
;
6523 if (is_waiting_for_healthy()) {
6524 utime_t now
= ceph_clock_now();
6525 if (osd_markdown_log
.empty()) {
6526 dout(5) << __func__
<< " force returning true since last markdown"
6527 << " was " << cct
->_conf
->osd_max_markdown_period
6528 << "s ago" << dendl
;
6531 std::lock_guard
l(heartbeat_lock
);
6532 int num
= 0, up
= 0;
6533 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
6534 p
!= heartbeat_peers
.end();
6536 if (p
->second
.is_healthy(now
))
6540 if ((float)up
< (float)num
* cct
->_conf
->osd_heartbeat_min_healthy_ratio
) {
6541 dout(1) << "is_healthy false -- only " << up
<< "/" << num
<< " up peers (less than "
6542 << int(cct
->_conf
->osd_heartbeat_min_healthy_ratio
* 100.0) << "%)" << dendl
;
6550 void OSD::_send_boot()
6552 dout(10) << "_send_boot" << dendl
;
6553 Connection
*local_connection
=
6554 cluster_messenger
->get_loopback_connection().get();
6555 entity_addrvec_t client_addrs
= client_messenger
->get_myaddrs();
6556 entity_addrvec_t cluster_addrs
= cluster_messenger
->get_myaddrs();
6557 entity_addrvec_t hb_back_addrs
= hb_back_server_messenger
->get_myaddrs();
6558 entity_addrvec_t hb_front_addrs
= hb_front_server_messenger
->get_myaddrs();
6560 dout(20) << " initial client_addrs " << client_addrs
6561 << ", cluster_addrs " << cluster_addrs
6562 << ", hb_back_addrs " << hb_back_addrs
6563 << ", hb_front_addrs " << hb_front_addrs
6565 if (cluster_messenger
->set_addr_unknowns(client_addrs
)) {
6566 dout(10) << " assuming cluster_addrs match client_addrs "
6567 << client_addrs
<< dendl
;
6568 cluster_addrs
= cluster_messenger
->get_myaddrs();
6570 if (auto session
= local_connection
->get_priv(); !session
) {
6571 cluster_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6574 local_connection
= hb_back_server_messenger
->get_loopback_connection().get();
6575 if (hb_back_server_messenger
->set_addr_unknowns(cluster_addrs
)) {
6576 dout(10) << " assuming hb_back_addrs match cluster_addrs "
6577 << cluster_addrs
<< dendl
;
6578 hb_back_addrs
= hb_back_server_messenger
->get_myaddrs();
6580 if (auto session
= local_connection
->get_priv(); !session
) {
6581 hb_back_server_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6584 local_connection
= hb_front_server_messenger
->get_loopback_connection().get();
6585 if (hb_front_server_messenger
->set_addr_unknowns(client_addrs
)) {
6586 dout(10) << " assuming hb_front_addrs match client_addrs "
6587 << client_addrs
<< dendl
;
6588 hb_front_addrs
= hb_front_server_messenger
->get_myaddrs();
6590 if (auto session
= local_connection
->get_priv(); !session
) {
6591 hb_front_server_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6594 // we now know what our front and back addrs will be, and we are
6595 // about to tell the mon what our metadata (including numa bindings)
6596 // are, so now is a good time!
6597 set_numa_affinity();
6599 MOSDBoot
*mboot
= new MOSDBoot(
6600 superblock
, get_osdmap_epoch(), service
.get_boot_epoch(),
6601 hb_back_addrs
, hb_front_addrs
, cluster_addrs
,
6603 dout(10) << " final client_addrs " << client_addrs
6604 << ", cluster_addrs " << cluster_addrs
6605 << ", hb_back_addrs " << hb_back_addrs
6606 << ", hb_front_addrs " << hb_front_addrs
6608 _collect_metadata(&mboot
->metadata
);
6609 monc
->send_mon_message(mboot
);
6610 set_state(STATE_BOOTING
);
6613 void OSD::_collect_metadata(map
<string
,string
> *pm
)
6616 (*pm
)["osd_data"] = dev_path
;
6617 if (store
->get_type() == "filestore") {
6618 // not applicable for bluestore
6619 (*pm
)["osd_journal"] = journal_path
;
6621 (*pm
)["front_addr"] = stringify(client_messenger
->get_myaddrs());
6622 (*pm
)["back_addr"] = stringify(cluster_messenger
->get_myaddrs());
6623 (*pm
)["hb_front_addr"] = stringify(hb_front_server_messenger
->get_myaddrs());
6624 (*pm
)["hb_back_addr"] = stringify(hb_back_server_messenger
->get_myaddrs());
6627 (*pm
)["osd_objectstore"] = store
->get_type();
6628 (*pm
)["rotational"] = store_is_rotational
? "1" : "0";
6629 (*pm
)["journal_rotational"] = journal_is_rotational
? "1" : "0";
6630 (*pm
)["default_device_class"] = store
->get_default_device_class();
6631 string osdspec_affinity
;
6632 int r
= store
->read_meta("osdspec_affinity", &osdspec_affinity
);
6633 if (r
< 0 || osdspec_affinity
.empty()) {
6634 osdspec_affinity
= "";
6636 (*pm
)["osdspec_affinity"] = osdspec_affinity
;
6637 store
->collect_metadata(pm
);
6639 collect_sys_info(pm
, cct
);
6641 (*pm
)["front_iface"] = pick_iface(
6643 client_messenger
->get_myaddrs().front().get_sockaddr_storage());
6644 (*pm
)["back_iface"] = pick_iface(
6646 cluster_messenger
->get_myaddrs().front().get_sockaddr_storage());
6652 set
<string
> unknown
;
6653 for (auto nm
: { "front_iface", "back_iface" }) {
6654 if (!(*pm
)[nm
].size()) {
6659 int r
= get_iface_numa_node((*pm
)[nm
], &n
);
6661 unknown
.insert((*pm
)[nm
]);
6669 if (unknown
.size()) {
6670 (*pm
)["network_numa_unknown_ifaces"] = stringify(unknown
);
6672 if (!nodes
.empty()) {
6673 (*pm
)["network_numa_nodes"] = stringify(nodes
);
6675 if (node
>= 0 && nodes
.size() == 1 && unknown
.empty()) {
6676 (*pm
)["network_numa_node"] = stringify(node
);
6680 if (numa_node
>= 0) {
6681 (*pm
)["numa_node"] = stringify(numa_node
);
6682 (*pm
)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size
,
6686 set
<string
> devnames
;
6687 store
->get_devices(&devnames
);
6688 map
<string
,string
> errs
;
6689 get_device_metadata(devnames
, pm
, &errs
);
6690 for (auto& i
: errs
) {
6691 dout(1) << __func__
<< " " << i
.first
<< ": " << i
.second
<< dendl
;
6693 dout(10) << __func__
<< " " << *pm
<< dendl
;
// Peering wants our up_thru advanced to `want`. Record the request
// (monotonically -- only ever raise up_thru_wanted) under
// mon_report_lock; map_lock is held shared to read the current value.
// NOTE(review): lossy extraction -- lines 6697, 6704, 6706-6707 and
// 6710+ are missing (dendl terminators and, presumably, the
// send_alive() call that actually notifies the mon -- confirm
// against upstream).
6696 void OSD::queue_want_up_thru(epoch_t want
)
6698 std::shared_lock map_locker
{map_lock
};
// What the current map already grants us.
6699 epoch_t cur
= get_osdmap()->get_up_thru(whoami
);
6700 std::lock_guard
report_locker(mon_report_lock
);
// Only raise the wanted epoch; stale/duplicate requests just log.
6701 if (want
> up_thru_wanted
) {
6702 dout(10) << "queue_want_up_thru now " << want
<< " (was " << up_thru_wanted
<< ")"
6703 << ", currently " << cur
6705 up_thru_wanted
= want
;
6708 dout(10) << "queue_want_up_thru want " << want
<< " <= queued " << up_thru_wanted
6709 << ", currently " << cur
// Tell the mon we are alive (MOSDAlive) when peering wants our
// up_thru epoch advanced beyond what the current map records.
// Caller must hold mon_report_lock.
// NOTE(review): lossy extraction -- lines 6715, 6719 and 6725+ are
// missing (e.g. the early return body for a nonexistent OSD).
6714 void OSD::send_alive()
6716 ceph_assert(ceph_mutex_is_locked(mon_report_lock
));
6717 const auto osdmap
= get_osdmap();
// Nothing to do if the map no longer knows us.
6718 if (!osdmap
->exists(whoami
))
6720 epoch_t up_thru
= osdmap
->get_up_thru(whoami
);
6721 dout(10) << "send_alive up_thru currently " << up_thru
<< " want " << up_thru_wanted
<< dendl
;
// Only send when the wanted epoch is actually ahead of the map.
6722 if (up_thru_wanted
> up_thru
) {
6723 dout(10) << "send_alive want " << up_thru_wanted
<< dendl
;
6724 monc
->send_mon_message(new MOSDAlive(osdmap
->get_epoch(), up_thru_wanted
));
6728 void OSD::request_full_map(epoch_t first
, epoch_t last
)
6730 dout(10) << __func__
<< " " << first
<< ".." << last
6731 << ", previously requested "
6732 << requested_full_first
<< ".." << requested_full_last
<< dendl
;
6733 ceph_assert(ceph_mutex_is_locked(osd_lock
));
6734 ceph_assert(first
> 0 && last
> 0);
6735 ceph_assert(first
<= last
);
6736 ceph_assert(first
>= requested_full_first
); // we shouldn't ever ask for older maps
6737 if (requested_full_first
== 0) {
6739 requested_full_first
= first
;
6740 requested_full_last
= last
;
6741 } else if (last
<= requested_full_last
) {
6745 // additional request
6746 first
= requested_full_last
+ 1;
6747 requested_full_last
= last
;
6749 MMonGetOSDMap
*req
= new MMonGetOSDMap
;
6750 req
->request_full(first
, last
);
6751 monc
->send_mon_message(req
);
6754 void OSD::got_full_map(epoch_t e
)
6756 ceph_assert(requested_full_first
<= requested_full_last
);
6757 ceph_assert(ceph_mutex_is_locked(osd_lock
));
6758 if (requested_full_first
== 0) {
6759 dout(20) << __func__
<< " " << e
<< ", nothing requested" << dendl
;
6762 if (e
< requested_full_first
) {
6763 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6764 << ".." << requested_full_last
6765 << ", ignoring" << dendl
;
6768 if (e
>= requested_full_last
) {
6769 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6770 << ".." << requested_full_last
<< ", resetting" << dendl
;
6771 requested_full_first
= requested_full_last
= 0;
6775 requested_full_first
= e
+ 1;
6777 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6778 << ".." << requested_full_last
6779 << ", still need more" << dendl
;
// Move every in-flight (pending) peer-failure report back onto
// failure_queue so it will be re-sent -- used after the mon session
// resets and previously-sent reports may have been lost.
6782 void OSD::requeue_failures()
6784 std::lock_guard
l(heartbeat_lock
);
// Sizes captured only for the debug log below.
6785 unsigned old_queue
= failure_queue
.size();
6786 unsigned old_pending
= failure_pending
.size();
// failure_pending maps osd -> (failed_since, addrs); requeue keeps
// only the timestamp, matching failure_queue's osd -> utime_t shape.
6787 for (auto p
= failure_pending
.begin(); p
!= failure_pending
.end(); ) {
6788 failure_queue
[p
->first
] = p
->second
.first
;
// post-increment erase keeps the iterator valid.
6789 failure_pending
.erase(p
++);
6791 dout(10) << __func__
<< " " << old_queue
<< " + " << old_pending
<< " -> "
6792 << failure_queue
.size() << dendl
;
6795 void OSD::send_failures()
6797 ceph_assert(ceph_mutex_is_locked(map_lock
));
6798 ceph_assert(ceph_mutex_is_locked(mon_report_lock
));
6799 std::lock_guard
l(heartbeat_lock
);
6800 utime_t now
= ceph_clock_now();
6801 const auto osdmap
= get_osdmap();
6802 while (!failure_queue
.empty()) {
6803 int osd
= failure_queue
.begin()->first
;
6804 if (!failure_pending
.count(osd
)) {
6805 int failed_for
= (int)(double)(now
- failure_queue
.begin()->second
);
6806 monc
->send_mon_message(
6810 osdmap
->get_addrs(osd
),
6812 osdmap
->get_epoch()));
6813 failure_pending
[osd
] = make_pair(failure_queue
.begin()->second
,
6814 osdmap
->get_addrs(osd
));
6816 failure_queue
.erase(osd
);
// Retract a previously-reported failure: send MOSDFailure with
// FLAG_ALIVE for the given osd/addrs so the mon cancels the report.
6820 void OSD::send_still_alive(epoch_t epoch
, int osd
, const entity_addrvec_t
&addrs
)
// failed_for is 0 -- this is a retraction, not a failure claim.
6822 MOSDFailure
*m
= new MOSDFailure(monc
->get_fsid(), osd
, addrs
, 0, epoch
,
6823 MOSDFailure::FLAG_ALIVE
);
6824 monc
->send_mon_message(m
);
// Retract every in-flight failure report: for each entry in
// failure_pending, send a FLAG_ALIVE message via send_still_alive()
// and drop the entry. Guarded by heartbeat_lock.
6827 void OSD::cancel_pending_failures()
6829 std::lock_guard
l(heartbeat_lock
);
6830 auto it
= failure_pending
.begin();
6831 while (it
!= failure_pending
.end()) {
6832 dout(10) << __func__
<< " canceling in-flight failure report for osd."
6833 << it
->first
<< dendl
;
// second.second holds the addrs recorded when the report was sent.
6834 send_still_alive(get_osdmap_epoch(), it
->first
, it
->second
.second
);
// post-increment erase keeps the iterator valid.
6835 failure_pending
.erase(it
++);
6839 void OSD::send_beacon(const ceph::coarse_mono_clock::time_point
& now
)
6841 const auto& monmap
= monc
->monmap
;
6842 // send beacon to mon even if we are just connected, and the monmap is not
6843 // initialized yet by then.
6844 if (monmap
.epoch
> 0 &&
6845 monmap
.get_required_features().contains_all(
6846 ceph::features::mon::FEATURE_LUMINOUS
)) {
6847 dout(20) << __func__
<< " sending" << dendl
;
6848 MOSDBeacon
* beacon
= nullptr;
6850 std::lock_guard l
{min_last_epoch_clean_lock
};
6851 beacon
= new MOSDBeacon(get_osdmap_epoch(),
6852 min_last_epoch_clean
,
6853 superblock
.last_purged_snaps_scrub
,
6854 cct
->_conf
->osd_beacon_report_interval
);
6855 beacon
->pgs
= min_last_epoch_clean_pgs
;
6856 last_sent_beacon
= now
;
6858 monc
->send_mon_message(beacon
);
6860 dout(20) << __func__
<< " not sending" << dendl
;
6864 void OSD::handle_command(MCommand
*m
)
6866 ConnectionRef con
= m
->get_connection();
6867 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
6869 con
->send_message(new MCommandReply(m
, -EACCES
));
6873 if (!session
->caps
.allow_all()) {
6874 con
->send_message(new MCommandReply(m
, -EACCES
));
6878 cct
->get_admin_socket()->queue_tell_command(m
);
6883 class unlock_guard
{
6886 explicit unlock_guard(ceph::mutex
& mutex
)
6891 unlock_guard(unlock_guard
&) = delete;
6898 void OSD::scrub_purged_snaps()
6900 dout(10) << __func__
<< dendl
;
6901 ceph_assert(ceph_mutex_is_locked(osd_lock
));
6902 SnapMapper::Scrubber
s(cct
, store
, service
.meta_ch
,
6903 make_snapmapper_oid(),
6904 make_purged_snaps_oid());
6905 clog
->debug() << "purged_snaps scrub starts";
6908 if (s
.stray
.size()) {
6909 clog
->debug() << "purged_snaps scrub found " << s
.stray
.size() << " strays";
6911 clog
->debug() << "purged_snaps scrub ok";
6913 set
<pair
<spg_t
,snapid_t
>> queued
;
6914 for (auto& [pool
, snap
, hash
, shard
] : s
.stray
) {
6915 const pg_pool_t
*pi
= get_osdmap()->get_pg_pool(pool
);
6917 dout(20) << __func__
<< " pool " << pool
<< " dne" << dendl
;
6920 pg_t
pgid(pi
->raw_hash_to_pg(hash
), pool
);
6921 spg_t
spgid(pgid
, shard
);
6922 pair
<spg_t
,snapid_t
> p(spgid
, snap
);
6923 if (queued
.count(p
)) {
6924 dout(20) << __func__
<< " pg " << spgid
<< " snap " << snap
6925 << " already queued" << dendl
;
6928 PGRef pg
= lookup_lock_pg(spgid
);
6930 dout(20) << __func__
<< " pg " << spgid
<< " not found" << dendl
;
6934 dout(10) << __func__
<< " requeue pg " << spgid
<< " " << pg
<< " snap "
6936 pg
->queue_snap_retrim(snap
);
6940 if (is_stopping()) {
6943 dout(10) << __func__
<< " done queueing pgs, updating superblock" << dendl
;
6944 ObjectStore::Transaction t
;
6945 superblock
.last_purged_snaps_scrub
= ceph_clock_now();
6946 write_superblock(t
);
6947 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
6948 ceph_assert(tr
== 0);
6950 send_beacon(ceph::coarse_mono_clock::now());
6952 dout(10) << __func__
<< " done" << dendl
;
6955 void OSD::probe_smart(const string
& only_devid
, ostream
& ss
)
6957 set
<string
> devnames
;
6958 store
->get_devices(&devnames
);
6959 uint64_t smart_timeout
= cct
->_conf
.get_val
<uint64_t>(
6960 "osd_smart_report_timeout");
6962 // == typedef std::map<std::string, mValue> mObject;
6963 json_spirit::mObject json_map
;
6965 for (auto dev
: devnames
) {
6966 // smartctl works only on physical devices; filter out any logical device
6967 if (dev
.find("dm-") == 0) {
6972 string devid
= get_device_id(dev
, &err
);
6973 if (devid
.size() == 0) {
6974 dout(10) << __func__
<< " no unique id for dev " << dev
<< " ("
6975 << err
<< "), skipping" << dendl
;
6978 if (only_devid
.size() && devid
!= only_devid
) {
6982 json_spirit::mValue smart_json
;
6983 if (block_device_get_metrics(dev
, smart_timeout
,
6985 dout(10) << "block_device_get_metrics failed for /dev/" << dev
<< dendl
;
6988 json_map
[devid
] = smart_json
;
6990 json_spirit::write(json_map
, ss
, json_spirit::pretty_print
);
6993 bool OSD::heartbeat_dispatch(Message
*m
)
6995 dout(30) << "heartbeat_dispatch " << m
<< dendl
;
6996 switch (m
->get_type()) {
6999 dout(10) << "ping from " << m
->get_source_inst() << dendl
;
7004 handle_osd_ping(static_cast<MOSDPing
*>(m
));
7008 dout(0) << "dropping unexpected message " << *m
<< " from " << m
->get_source_inst() << dendl
;
7015 bool OSD::ms_dispatch(Message
*m
)
7017 dout(20) << "OSD::ms_dispatch: " << *m
<< dendl
;
7018 if (m
->get_type() == MSG_OSD_MARK_ME_DOWN
) {
7019 service
.got_stop_ack();
7027 if (is_stopping()) {
7041 void OSDService::maybe_share_map(
7043 const OSDMapRef
& osdmap
,
7044 epoch_t peer_epoch_lb
)
7046 // NOTE: we assume caller hold something that keeps the Connection itself
7047 // pinned (e.g., an OpRequest's MessageRef).
7048 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
7053 // assume the peer has the newer of the op's sent_epoch and what
7054 // we think we sent them.
7055 session
->sent_epoch_lock
.lock();
7056 if (peer_epoch_lb
> session
->last_sent_epoch
) {
7057 dout(10) << __func__
<< " con " << con
7058 << " " << con
->get_peer_addr()
7059 << " map epoch " << session
->last_sent_epoch
7060 << " -> " << peer_epoch_lb
<< " (as per caller)" << dendl
;
7061 session
->last_sent_epoch
= peer_epoch_lb
;
7063 epoch_t last_sent_epoch
= session
->last_sent_epoch
;
7064 session
->sent_epoch_lock
.unlock();
7066 if (osdmap
->get_epoch() <= last_sent_epoch
) {
7070 send_incremental_map(last_sent_epoch
, con
, osdmap
);
7071 last_sent_epoch
= osdmap
->get_epoch();
7073 session
->sent_epoch_lock
.lock();
7074 if (session
->last_sent_epoch
< last_sent_epoch
) {
7075 dout(10) << __func__
<< " con " << con
7076 << " " << con
->get_peer_addr()
7077 << " map epoch " << session
->last_sent_epoch
7078 << " -> " << last_sent_epoch
<< " (shared)" << dendl
;
7079 session
->last_sent_epoch
= last_sent_epoch
;
7081 session
->sent_epoch_lock
.unlock();
7084 void OSD::dispatch_session_waiting(const ceph::ref_t
<Session
>& session
, OSDMapRef osdmap
)
7086 ceph_assert(ceph_mutex_is_locked(session
->session_dispatch_lock
));
7088 auto i
= session
->waiting_on_map
.begin();
7089 while (i
!= session
->waiting_on_map
.end()) {
7090 OpRequestRef op
= &(*i
);
7091 ceph_assert(ms_can_fast_dispatch(op
->get_req()));
7092 auto m
= op
->get_req
<MOSDFastDispatchOp
>();
7093 if (m
->get_min_epoch() > osdmap
->get_epoch()) {
7096 session
->waiting_on_map
.erase(i
++);
7100 if (m
->get_type() == CEPH_MSG_OSD_OP
) {
7101 pg_t actual_pgid
= osdmap
->raw_pg_to_pg(
7102 static_cast<const MOSDOp
*>(m
)->get_pg());
7103 if (!osdmap
->get_primary_shard(actual_pgid
, &pgid
)) {
7107 pgid
= m
->get_spg();
7109 enqueue_op(pgid
, std::move(op
), m
->get_map_epoch());
7112 if (session
->waiting_on_map
.empty()) {
7113 clear_session_waiting_on_map(session
);
7115 register_session_waiting_on_map(session
);
7119 void OSD::ms_fast_dispatch(Message
*m
)
7123 jaeger_tracing::init_tracer("osd-services-reinit");
7124 dout(10) << "jaeger tracer after " << opentracing::Tracer::Global() << dendl
;
7125 auto dispatch_span
= jaeger_tracing::new_span(__func__
);
7128 if (service
.is_stopping()) {
7134 switch (m
->get_type()) {
7136 dout(10) << "ping from " << m
->get_source() << dendl
;
7139 case MSG_OSD_FORCE_RECOVERY
:
7140 handle_fast_force_recovery(static_cast<MOSDForceRecovery
*>(m
));
7142 case MSG_OSD_SCRUB2
:
7143 handle_fast_scrub(static_cast<MOSDScrub2
*>(m
));
7146 case MSG_OSD_PG_CREATE2
:
7147 return handle_fast_pg_create(static_cast<MOSDPGCreate2
*>(m
));
7148 case MSG_OSD_PG_QUERY
:
7149 return handle_fast_pg_query(static_cast<MOSDPGQuery
*>(m
));
7150 case MSG_OSD_PG_NOTIFY
:
7151 return handle_fast_pg_notify(static_cast<MOSDPGNotify
*>(m
));
7152 case MSG_OSD_PG_INFO
:
7153 return handle_fast_pg_info(static_cast<MOSDPGInfo
*>(m
));
7154 case MSG_OSD_PG_REMOVE
:
7155 return handle_fast_pg_remove(static_cast<MOSDPGRemove
*>(m
));
7157 // these are single-pg messages that handle themselves
7158 case MSG_OSD_PG_LOG
:
7159 case MSG_OSD_PG_TRIM
:
7160 case MSG_OSD_PG_NOTIFY2
:
7161 case MSG_OSD_PG_QUERY2
:
7162 case MSG_OSD_PG_INFO2
:
7163 case MSG_OSD_BACKFILL_RESERVE
:
7164 case MSG_OSD_RECOVERY_RESERVE
:
7165 case MSG_OSD_PG_LEASE
:
7166 case MSG_OSD_PG_LEASE_ACK
:
7168 MOSDPeeringOp
*pm
= static_cast<MOSDPeeringOp
*>(m
);
7169 if (require_osd_peer(pm
)) {
7170 enqueue_peering_evt(
7172 PGPeeringEventRef(pm
->get_event()));
7179 OpRequestRef op
= op_tracker
.create_request
<OpRequest
, Message
*>(m
);
7182 osd_reqid_t reqid
= op
->get_reqid();
7184 tracepoint(osd
, ms_fast_dispatch
, reqid
.name
._type
,
7185 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
7188 op
->set_osd_parent_span(dispatch_span
);
7189 if (op
->osd_parent_span
) {
7190 auto op_req_span
= jaeger_tracing::child_span("op-request-created", op
->osd_parent_span
);
7191 op
->set_osd_parent_span(op_req_span
);
7195 op
->osd_trace
.init("osd op", &trace_endpoint
, &m
->trace
);
7197 // note sender epoch, min req's epoch
7198 op
->sent_epoch
= static_cast<MOSDFastDispatchOp
*>(m
)->get_map_epoch();
7199 op
->min_epoch
= static_cast<MOSDFastDispatchOp
*>(m
)->get_min_epoch();
7200 ceph_assert(op
->min_epoch
<= op
->sent_epoch
); // sanity check!
7202 service
.maybe_inject_dispatch_delay();
7204 if (m
->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT
) ||
7205 m
->get_type() != CEPH_MSG_OSD_OP
) {
7206 // queue it directly
7208 static_cast<MOSDFastDispatchOp
*>(m
)->get_spg(),
7210 static_cast<MOSDFastDispatchOp
*>(m
)->get_map_epoch());
7212 // legacy client, and this is an MOSDOp (the *only* fast dispatch
7213 // message that didn't have an explicit spg_t); we need to map
7214 // them to an spg_t while preserving delivery order.
7215 auto priv
= m
->get_connection()->get_priv();
7216 if (auto session
= static_cast<Session
*>(priv
.get()); session
) {
7217 std::lock_guard l
{session
->session_dispatch_lock
};
7219 session
->waiting_on_map
.push_back(*op
);
7220 OSDMapRef nextmap
= service
.get_nextmap_reserved();
7221 dispatch_session_waiting(session
, nextmap
);
7222 service
.release_map(nextmap
);
7225 OID_EVENT_TRACE_WITH_MSG(m
, "MS_FAST_DISPATCH_END", false);
7228 int OSD::ms_handle_authentication(Connection
*con
)
7231 auto s
= ceph::ref_cast
<Session
>(con
->get_priv());
7233 s
= ceph::make_ref
<Session
>(cct
, con
);
7235 s
->entity_name
= con
->get_peer_entity_name();
7236 dout(10) << __func__
<< " new session " << s
<< " con " << s
->con
7237 << " entity " << s
->entity_name
7238 << " addr " << con
->get_peer_addrs() << dendl
;
7240 dout(10) << __func__
<< " existing session " << s
<< " con " << s
->con
7241 << " entity " << s
->entity_name
7242 << " addr " << con
->get_peer_addrs() << dendl
;
7245 AuthCapsInfo
&caps_info
= con
->get_peer_caps_info();
7246 if (caps_info
.allow_all
) {
7247 s
->caps
.set_allow_all();
7248 } else if (caps_info
.caps
.length() > 0) {
7249 bufferlist::const_iterator p
= caps_info
.caps
.cbegin();
7254 catch (ceph::buffer::error
& e
) {
7255 dout(10) << __func__
<< " session " << s
<< " " << s
->entity_name
7256 << " failed to decode caps string" << dendl
;
7260 bool success
= s
->caps
.parse(str
);
7262 dout(10) << __func__
<< " session " << s
7263 << " " << s
->entity_name
7264 << " has caps " << s
->caps
<< " '" << str
<< "'" << dendl
;
7267 dout(10) << __func__
<< " session " << s
<< " " << s
->entity_name
7268 << " failed to parse caps '" << str
<< "'" << dendl
;
7276 void OSD::do_waiters()
7278 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7280 dout(10) << "do_waiters -- start" << dendl
;
7281 while (!finished
.empty()) {
7282 OpRequestRef next
= finished
.front();
7283 finished
.pop_front();
7286 dout(10) << "do_waiters -- finish" << dendl
;
7289 void OSD::dispatch_op(OpRequestRef op
)
7291 switch (op
->get_req()->get_type()) {
7293 case MSG_OSD_PG_CREATE
:
7294 handle_pg_create(op
);
7299 void OSD::_dispatch(Message
*m
)
7301 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7302 dout(20) << "_dispatch " << m
<< " " << *m
<< dendl
;
7304 switch (m
->get_type()) {
7305 // -- don't need OSDMap --
7307 // map and replication
7308 case CEPH_MSG_OSD_MAP
:
7309 handle_osd_map(static_cast<MOSDMap
*>(m
));
7311 case MSG_MON_GET_PURGED_SNAPS_REPLY
:
7312 handle_get_purged_snaps_reply(static_cast<MMonGetPurgedSnapsReply
*>(m
));
7317 handle_scrub(static_cast<MOSDScrub
*>(m
));
7321 handle_command(static_cast<MCommand
*>(m
));
7324 // -- need OSDMap --
7326 case MSG_OSD_PG_CREATE
:
7328 OpRequestRef op
= op_tracker
.create_request
<OpRequest
, Message
*>(m
);
7330 op
->osd_trace
.init("osd op", &trace_endpoint
, &m
->trace
);
7331 // no map? starting up?
7332 if (!get_osdmap()) {
7333 dout(7) << "no OSDMap, not booted" << dendl
;
7334 logger
->inc(l_osd_waiting_for_map
);
7335 waiting_for_osdmap
.push_back(op
);
7336 op
->mark_delayed("no osdmap");
7346 // remove me post-nautilus
7347 void OSD::handle_scrub(MOSDScrub
*m
)
7349 dout(10) << "handle_scrub " << *m
<< dendl
;
7350 if (!require_mon_or_mgr_peer(m
)) {
7354 if (m
->fsid
!= monc
->get_fsid()) {
7355 dout(0) << "handle_scrub fsid " << m
->fsid
<< " != " << monc
->get_fsid()
7364 if (!m
->scrub_pgs
.empty()) {
7366 for (auto pgid
: m
->scrub_pgs
) {
7368 if (get_osdmap()->get_primary_shard(pgid
, &pcand
) &&
7369 std::find(spgs
.begin(), spgs
.end(), pcand
) != spgs
.end()) {
7376 for (auto pgid
: spgs
) {
7377 enqueue_peering_evt(
7380 std::make_shared
<PGPeeringEvent
>(
7383 PeeringState::RequestScrub(m
->deep
, m
->repair
))));
7389 void OSD::handle_fast_scrub(MOSDScrub2
*m
)
7391 dout(10) << __func__
<< " " << *m
<< dendl
;
7392 if (!require_mon_or_mgr_peer(m
)) {
7396 if (m
->fsid
!= monc
->get_fsid()) {
7397 dout(0) << __func__
<< " fsid " << m
->fsid
<< " != " << monc
->get_fsid()
7402 for (auto pgid
: m
->scrub_pgs
) {
7403 enqueue_peering_evt(
7406 std::make_shared
<PGPeeringEvent
>(
7409 PeeringState::RequestScrub(m
->deep
, m
->repair
))));
7414 bool OSD::scrub_random_backoff()
7416 bool coin_flip
= (rand() / (double)RAND_MAX
>=
7417 cct
->_conf
->osd_scrub_backoff_ratio
);
7419 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl
;
7425 OSDService::ScrubJob::ScrubJob(CephContext
* cct
,
7426 const spg_t
& pg
, const utime_t
& timestamp
,
7427 double pool_scrub_min_interval
,
7428 double pool_scrub_max_interval
, bool must
)
7431 sched_time(timestamp
),
7434 // if not explicitly requested, postpone the scrub with a random delay
7436 double scrub_min_interval
= pool_scrub_min_interval
> 0 ?
7437 pool_scrub_min_interval
: cct
->_conf
->osd_scrub_min_interval
;
7438 double scrub_max_interval
= pool_scrub_max_interval
> 0 ?
7439 pool_scrub_max_interval
: cct
->_conf
->osd_scrub_max_interval
;
7441 sched_time
+= scrub_min_interval
;
7442 double r
= rand() / (double)RAND_MAX
;
7444 scrub_min_interval
* cct
->_conf
->osd_scrub_interval_randomize_ratio
* r
;
7445 if (scrub_max_interval
== 0) {
7446 deadline
= utime_t();
7448 deadline
+= scrub_max_interval
;
7454 bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob
& rhs
) const {
7455 if (sched_time
< rhs
.sched_time
)
7457 if (sched_time
> rhs
.sched_time
)
7459 return pgid
< rhs
.pgid
;
7462 void OSDService::dumps_scrub(ceph::Formatter
*f
)
7464 ceph_assert(f
!= nullptr);
7465 std::lock_guard
l(sched_scrub_lock
);
7467 f
->open_array_section("scrubs");
7468 for (const auto &i
: sched_scrub_pg
) {
7469 f
->open_object_section("scrub");
7470 f
->dump_stream("pgid") << i
.pgid
;
7471 f
->dump_stream("sched_time") << i
.sched_time
;
7472 f
->dump_stream("deadline") << i
.deadline
;
7473 f
->dump_bool("forced", i
.sched_time
== PgScrubber::scrub_must_stamp());
7479 double OSD::scrub_sleep_time(bool must_scrub
)
7482 return cct
->_conf
->osd_scrub_sleep
;
7484 utime_t now
= ceph_clock_now();
7485 if (scrub_time_permit(now
)) {
7486 return cct
->_conf
->osd_scrub_sleep
;
7488 double normal_sleep
= cct
->_conf
->osd_scrub_sleep
;
7489 double extended_sleep
= cct
->_conf
->osd_scrub_extended_sleep
;
7490 return std::max(extended_sleep
, normal_sleep
);
7493 bool OSD::scrub_time_permit(utime_t now
)
7496 time_t tt
= now
.sec();
7497 localtime_r(&tt
, &bdt
);
7499 bool day_permit
= false;
7500 if (cct
->_conf
->osd_scrub_begin_week_day
< cct
->_conf
->osd_scrub_end_week_day
) {
7501 if (bdt
.tm_wday
>= cct
->_conf
->osd_scrub_begin_week_day
&& bdt
.tm_wday
< cct
->_conf
->osd_scrub_end_week_day
) {
7505 if (bdt
.tm_wday
>= cct
->_conf
->osd_scrub_begin_week_day
|| bdt
.tm_wday
< cct
->_conf
->osd_scrub_end_week_day
) {
7511 dout(20) << __func__
<< " should run between week day " << cct
->_conf
->osd_scrub_begin_week_day
7512 << " - " << cct
->_conf
->osd_scrub_end_week_day
7513 << " now " << bdt
.tm_wday
<< " = no" << dendl
;
7517 bool time_permit
= false;
7518 if (cct
->_conf
->osd_scrub_begin_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7519 if (bdt
.tm_hour
>= cct
->_conf
->osd_scrub_begin_hour
&& bdt
.tm_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7523 if (bdt
.tm_hour
>= cct
->_conf
->osd_scrub_begin_hour
|| bdt
.tm_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7528 dout(20) << __func__
<< " should run between " << cct
->_conf
->osd_scrub_begin_hour
7529 << " - " << cct
->_conf
->osd_scrub_end_hour
7530 << " now " << bdt
.tm_hour
<< " = yes" << dendl
;
7532 dout(20) << __func__
<< " should run between " << cct
->_conf
->osd_scrub_begin_hour
7533 << " - " << cct
->_conf
->osd_scrub_end_hour
7534 << " now " << bdt
.tm_hour
<< " = no" << dendl
;
7539 bool OSD::scrub_load_below_threshold()
7542 if (getloadavg(loadavgs
, 3) != 3) {
7543 dout(10) << __func__
<< " couldn't read loadavgs\n" << dendl
;
7547 // allow scrub if below configured threshold
7548 long cpus
= sysconf(_SC_NPROCESSORS_ONLN
);
7549 double loadavg_per_cpu
= cpus
> 0 ? loadavgs
[0] / cpus
: loadavgs
[0];
7550 if (loadavg_per_cpu
< cct
->_conf
->osd_scrub_load_threshold
) {
7551 dout(20) << __func__
<< " loadavg per cpu " << loadavg_per_cpu
7552 << " < max " << cct
->_conf
->osd_scrub_load_threshold
7553 << " = yes" << dendl
;
7557 // allow scrub if below daily avg and currently decreasing
7558 if (loadavgs
[0] < daily_loadavg
&& loadavgs
[0] < loadavgs
[2]) {
7559 dout(20) << __func__
<< " loadavg " << loadavgs
[0]
7560 << " < daily_loadavg " << daily_loadavg
7561 << " and < 15m avg " << loadavgs
[2]
7562 << " = yes" << dendl
;
7566 dout(20) << __func__
<< " loadavg " << loadavgs
[0]
7567 << " >= max " << cct
->_conf
->osd_scrub_load_threshold
7568 << " and ( >= daily_loadavg " << daily_loadavg
7569 << " or >= 15m avg " << loadavgs
[2]
7570 << ") = no" << dendl
;
7574 void OSD::sched_scrub()
7576 dout(20) << __func__
<< " sched_scrub starts" << dendl
;
7578 // if not permitted, fail fast
7579 if (!service
.can_inc_scrubs()) {
7580 dout(20) << __func__
<< ": OSD cannot inc scrubs" << dendl
;
7583 bool allow_requested_repair_only
= false;
7584 if (service
.is_recovery_active() && !cct
->_conf
->osd_scrub_during_recovery
) {
7585 if (!cct
->_conf
->osd_repair_during_recovery
) {
7586 dout(15) << __func__
<< ": not scheduling scrubs due to active recovery" << dendl
;
7589 dout(10) << __func__
7590 << " will only schedule explicitly requested repair due to active recovery"
7592 allow_requested_repair_only
= true;
7595 utime_t now
= ceph_clock_now();
7596 bool time_permit
= scrub_time_permit(now
);
7597 bool load_is_low
= scrub_load_below_threshold();
7598 dout(20) << "sched_scrub load_is_low=" << (int)load_is_low
<< dendl
;
7600 OSDService::ScrubJob scrub_job
;
7601 if (service
.first_scrub_stamp(&scrub_job
)) {
7603 dout(30) << "sched_scrub examine " << scrub_job
.pgid
<< " at " << scrub_job
.sched_time
<< dendl
;
7605 if (scrub_job
.sched_time
> now
) {
7606 // save ourselves some effort
7607 dout(20) << "sched_scrub " << scrub_job
.pgid
<< " scheduled at " << scrub_job
.sched_time
7608 << " > " << now
<< dendl
;
7612 if ((scrub_job
.deadline
.is_zero() || scrub_job
.deadline
>= now
) && !(time_permit
&& load_is_low
)) {
7613 dout(15) << __func__
<< " not scheduling scrub for " << scrub_job
.pgid
<< " due to "
7614 << (!time_permit
? "time not permit" : "high load") << dendl
;
7618 PGRef pg
= _lookup_lock_pg(scrub_job
.pgid
);
7620 dout(20) << __func__
<< " pg " << scrub_job
.pgid
<< " not found" << dendl
;
7624 // This has already started, so go on to the next scrub job
7625 if (pg
->is_scrub_active()) {
7627 dout(20) << __func__
<< ": already in progress pgid " << scrub_job
.pgid
<< dendl
;
7630 // Skip other kinds of scrubbing if only explicitly requested repairing is allowed
7631 if (allow_requested_repair_only
&& !pg
->m_planned_scrub
.must_repair
) {
7633 dout(10) << __func__
<< " skip " << scrub_job
.pgid
7634 << " because repairing is not explicitly requested on it"
7639 // If it is reserving, let it resolve before going to the next scrub job
7640 if (pg
->m_scrubber
->is_reserving()) {
7642 dout(10) << __func__
<< ": reserve in progress pgid " << scrub_job
.pgid
<< dendl
;
7645 dout(15) << "sched_scrub scrubbing " << scrub_job
.pgid
<< " at " << scrub_job
.sched_time
7646 << (pg
->get_must_scrub() ? ", explicitly requested" :
7647 (load_is_low
? ", load_is_low" : " deadline < now"))
7649 if (pg
->sched_scrub()) {
7651 dout(10) << __func__
<< " scheduled a scrub!" << " (~" << scrub_job
.pgid
<< "~)" << dendl
;
7655 } while (service
.next_scrub_stamp(scrub_job
, &scrub_job
));
7657 dout(20) << "sched_scrub done" << dendl
;
7660 void OSD::resched_all_scrubs()
7662 dout(10) << __func__
<< ": start" << dendl
;
7663 const vector
<spg_t
> pgs
= [this] {
7665 OSDService::ScrubJob job
;
7666 if (service
.first_scrub_stamp(&job
)) {
7668 pgs
.push_back(job
.pgid
);
7669 } while (service
.next_scrub_stamp(job
, &job
));
7673 for (auto& pgid
: pgs
) {
7674 dout(20) << __func__
<< ": examine " << pgid
<< dendl
;
7675 PGRef pg
= _lookup_lock_pg(pgid
);
7678 if (!pg
->m_planned_scrub
.must_scrub
&& !pg
->m_planned_scrub
.need_auto
) {
7679 dout(15) << __func__
<< ": reschedule " << pgid
<< dendl
;
7680 pg
->on_info_history_change();
7684 dout(10) << __func__
<< ": done" << dendl
;
7687 MPGStats
* OSD::collect_pg_stats()
7689 // This implementation unconditionally sends every is_primary PG's
7690 // stats every time we're called. This has equivalent cost to the
7691 // previous implementation's worst case where all PGs are busy and
7692 // their stats are always enqueued for sending.
7693 std::shared_lock l
{map_lock
};
7695 osd_stat_t cur_stat
= service
.get_osd_stat();
7696 cur_stat
.os_perf_stat
= store
->get_cur_stats();
7698 auto m
= new MPGStats(monc
->get_fsid(), get_osdmap_epoch());
7699 m
->osd_stat
= cur_stat
;
7701 std::lock_guard lec
{min_last_epoch_clean_lock
};
7702 min_last_epoch_clean
= get_osdmap_epoch();
7703 min_last_epoch_clean_pgs
.clear();
7705 std::set
<int64_t> pool_set
;
7708 for (auto& pg
: pgs
) {
7709 auto pool
= pg
->pg_id
.pgid
.pool();
7710 pool_set
.emplace((int64_t)pool
);
7711 if (!pg
->is_primary()) {
7714 pg
->get_pg_stats([&](const pg_stat_t
& s
, epoch_t lec
) {
7715 m
->pg_stat
[pg
->pg_id
.pgid
] = s
;
7716 min_last_epoch_clean
= std::min(min_last_epoch_clean
, lec
);
7717 min_last_epoch_clean_pgs
.push_back(pg
->pg_id
.pgid
);
7721 bool per_pool_stats
= false;
7722 bool per_pool_omap_stats
= false;
7723 for (auto p
: pool_set
) {
7724 int r
= store
->pool_statfs(p
, &st
, &per_pool_omap_stats
);
7725 if (r
== -ENOTSUP
) {
7729 m
->pool_stat
[p
] = st
;
7730 per_pool_stats
= true;
7734 // indicate whether we are reporting per-pool stats
7735 m
->osd_stat
.num_osds
= 1;
7736 m
->osd_stat
.num_per_pool_osds
= per_pool_stats
? 1 : 0;
7737 m
->osd_stat
.num_per_pool_omap_osds
= per_pool_omap_stats
? 1 : 0;
7742 vector
<DaemonHealthMetric
> OSD::get_health_metrics()
7744 vector
<DaemonHealthMetric
> metrics
;
7746 utime_t oldest_secs
;
7747 const utime_t now
= ceph_clock_now();
7749 too_old
-= cct
->_conf
.get_val
<double>("osd_op_complaint_time");
7751 TrackedOpRef oldest_op
;
7752 auto count_slow_ops
= [&](TrackedOp
& op
) {
7753 if (op
.get_initiated() < too_old
) {
7755 ss
<< "slow request " << op
.get_desc()
7757 << op
.get_initiated()
7759 << op
.state_string();
7760 lgeneric_subdout(cct
,osd
,20) << ss
.str() << dendl
;
7761 clog
->warn() << ss
.str();
7763 if (!oldest_op
|| op
.get_initiated() < oldest_op
->get_initiated()) {
7771 if (op_tracker
.visit_ops_in_flight(&oldest_secs
, count_slow_ops
)) {
7773 derr
<< __func__
<< " reporting " << slow
<< " slow ops, oldest is "
7774 << oldest_op
->get_desc() << dendl
;
7776 metrics
.emplace_back(daemon_metric::SLOW_OPS
, slow
, oldest_secs
);
7778 // no news is not good news.
7779 metrics
.emplace_back(daemon_metric::SLOW_OPS
, 0, 0);
7783 std::lock_guard
l(pending_creates_lock
);
7784 auto n_primaries
= pending_creates_from_mon
;
7785 for (const auto& create
: pending_creates_from_osd
) {
7786 if (create
.second
) {
7790 metrics
.emplace_back(daemon_metric::PENDING_CREATING_PGS
, n_primaries
);
7795 // =====================================================
7798 void OSD::wait_for_new_map(OpRequestRef op
)
7801 if (waiting_for_osdmap
.empty()) {
7802 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7805 logger
->inc(l_osd_waiting_for_map
);
7806 waiting_for_osdmap
.push_back(op
);
7807 op
->mark_delayed("wait for new map");
7812 * assimilate new OSDMap(s). scan pgs, etc.
7815 void OSD::note_down_osd(int peer
)
7817 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7818 cluster_messenger
->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer
));
7820 std::lock_guard l
{heartbeat_lock
};
7821 failure_queue
.erase(peer
);
7822 failure_pending
.erase(peer
);
7823 map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.find(peer
);
7824 if (p
!= heartbeat_peers
.end()) {
7825 p
->second
.clear_mark_down();
7826 heartbeat_peers
.erase(p
);
7830 void OSD::note_up_osd(int peer
)
7832 heartbeat_set_peers_need_update();
7835 struct C_OnMapCommit
: public Context
{
7837 epoch_t first
, last
;
7839 C_OnMapCommit(OSD
*o
, epoch_t f
, epoch_t l
, MOSDMap
*m
)
7840 : osd(o
), first(f
), last(l
), msg(m
) {}
7841 void finish(int r
) override
{
7842 osd
->_committed_osd_maps(first
, last
, msg
);
7847 void OSD::osdmap_subscribe(version_t epoch
, bool force_request
)
7849 std::lock_guard
l(osdmap_subscribe_lock
);
7850 if (latest_subscribed_epoch
>= epoch
&& !force_request
)
7853 latest_subscribed_epoch
= std::max
<uint64_t>(epoch
, latest_subscribed_epoch
);
7855 if (monc
->sub_want_increment("osdmap", epoch
, CEPH_SUBSCRIBE_ONETIME
) ||
7861 void OSD::trim_maps(epoch_t oldest
, int nreceived
, bool skip_maps
)
7863 epoch_t min
= std::min(oldest
, service
.map_cache
.cached_key_lower_bound());
7864 if (min
<= superblock
.oldest_map
)
7868 ObjectStore::Transaction t
;
7869 for (epoch_t e
= superblock
.oldest_map
; e
< min
; ++e
) {
7870 dout(20) << " removing old osdmap epoch " << e
<< dendl
;
7871 t
.remove(coll_t::meta(), get_osdmap_pobject_name(e
));
7872 t
.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e
));
7873 superblock
.oldest_map
= e
+ 1;
7875 if (num
>= cct
->_conf
->osd_target_transaction_size
&& num
>= nreceived
) {
7876 service
.publish_superblock(superblock
);
7877 write_superblock(t
);
7878 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
7879 ceph_assert(tr
== 0);
7882 // skip_maps leaves us with a range of old maps if we fail to remove all
7883 // of them before moving superblock.oldest_map forward to the first map
7884 // in the incoming MOSDMap msg. so we should continue removing them in
7885 // this case, even we could do huge series of delete transactions all at
7892 service
.publish_superblock(superblock
);
7893 write_superblock(t
);
7894 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
7895 ceph_assert(tr
== 0);
7897 // we should not remove the cached maps
7898 ceph_assert(min
<= service
.map_cache
.cached_key_lower_bound());
7901 void OSD::handle_osd_map(MOSDMap
*m
)
7903 // wait for pgs to catch up
7905 // we extend the map cache pins to accomodate pgs slow to consume maps
7906 // for some period, until we hit the max_lag_factor bound, at which point
7907 // we block here to stop injesting more maps than they are able to keep
7909 epoch_t max_lag
= cct
->_conf
->osd_map_cache_size
*
7910 m_osd_pg_epoch_max_lag_factor
;
7911 ceph_assert(max_lag
> 0);
7912 epoch_t osd_min
= 0;
7913 for (auto shard
: shards
) {
7914 epoch_t min
= shard
->get_min_pg_epoch();
7915 if (osd_min
== 0 || min
< osd_min
) {
7919 epoch_t osdmap_epoch
= get_osdmap_epoch();
7921 osdmap_epoch
> max_lag
&&
7922 osdmap_epoch
- max_lag
> osd_min
) {
7923 epoch_t need
= osdmap_epoch
- max_lag
;
7924 dout(10) << __func__
<< " waiting for pgs to catch up (need " << need
7925 << " max_lag " << max_lag
<< ")" << dendl
;
7926 for (auto shard
: shards
) {
7927 epoch_t min
= shard
->get_min_pg_epoch();
7929 dout(10) << __func__
<< " waiting for pgs to consume " << need
7930 << " (shard " << shard
->shard_id
<< " min " << min
7931 << ", map cache is " << cct
->_conf
->osd_map_cache_size
7932 << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
7934 unlock_guard unlock
{osd_lock
};
7935 shard
->wait_min_pg_epoch(need
);
7941 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7942 map
<epoch_t
,OSDMapRef
> added_maps
;
7943 map
<epoch_t
,bufferlist
> added_maps_bl
;
7944 if (m
->fsid
!= monc
->get_fsid()) {
7945 dout(0) << "handle_osd_map fsid " << m
->fsid
<< " != "
7946 << monc
->get_fsid() << dendl
;
7950 if (is_initializing()) {
7951 dout(0) << "ignoring osdmap until we have initialized" << dendl
;
7956 auto session
= ceph::ref_cast
<Session
>(m
->get_connection()->get_priv());
7957 if (session
&& !(session
->entity_name
.is_mon() ||
7958 session
->entity_name
.is_osd())) {
7960 dout(10) << "got osd map from Session " << session
7961 << " which we can't take maps from (not a mon or osd)" << dendl
;
7966 // share with the objecter
7968 service
.objecter
->handle_osd_map(m
);
7970 epoch_t first
= m
->get_first();
7971 epoch_t last
= m
->get_last();
7972 dout(3) << "handle_osd_map epochs [" << first
<< "," << last
<< "], i have "
7973 << superblock
.newest_map
7974 << ", src has [" << m
->oldest_map
<< "," << m
->newest_map
<< "]"
7977 logger
->inc(l_osd_map
);
7978 logger
->inc(l_osd_mape
, last
- first
+ 1);
7979 if (first
<= superblock
.newest_map
)
7980 logger
->inc(l_osd_mape_dup
, superblock
.newest_map
- first
+ 1);
7981 if (service
.max_oldest_map
< m
->oldest_map
) {
7982 service
.max_oldest_map
= m
->oldest_map
;
7983 ceph_assert(service
.max_oldest_map
>= superblock
.oldest_map
);
7986 // make sure there is something new, here, before we bother flushing
7987 // the queues and such
7988 if (last
<= superblock
.newest_map
) {
7989 dout(10) << " no new maps here, dropping" << dendl
;
7995 bool skip_maps
= false;
7996 if (first
> superblock
.newest_map
+ 1) {
7997 dout(10) << "handle_osd_map message skips epochs "
7998 << superblock
.newest_map
+ 1 << ".." << (first
-1) << dendl
;
7999 if (m
->oldest_map
<= superblock
.newest_map
+ 1) {
8000 osdmap_subscribe(superblock
.newest_map
+ 1, false);
8004 // always try to get the full range of maps--as many as we can. this
8005 // 1- is good to have
8006 // 2- is at present the only way to ensure that we get a *full* map as
8008 if (m
->oldest_map
< first
) {
8009 osdmap_subscribe(m
->oldest_map
- 1, true);
8016 ObjectStore::Transaction t
;
8017 uint64_t txn_size
= 0;
8019 map
<epoch_t
,mempool::osdmap::map
<int64_t,snap_interval_set_t
>> purged_snaps
;
8021 // store new maps: queue for disk and put in the osdmap cache
8022 epoch_t start
= std::max(superblock
.newest_map
+ 1, first
);
8023 for (epoch_t e
= start
; e
<= last
; e
++) {
8024 if (txn_size
>= t
.get_num_bytes()) {
8025 derr
<< __func__
<< " transaction size overflowed" << dendl
;
8026 ceph_assert(txn_size
< t
.get_num_bytes());
8028 txn_size
= t
.get_num_bytes();
8029 map
<epoch_t
,bufferlist
>::iterator p
;
8030 p
= m
->maps
.find(e
);
8031 if (p
!= m
->maps
.end()) {
8032 dout(10) << "handle_osd_map got full map for epoch " << e
<< dendl
;
8033 OSDMap
*o
= new OSDMap
;
8034 bufferlist
& bl
= p
->second
;
8038 purged_snaps
[e
] = o
->get_new_purged_snaps();
8040 ghobject_t fulloid
= get_osdmap_pobject_name(e
);
8041 t
.write(coll_t::meta(), fulloid
, 0, bl
.length(), bl
);
8042 added_maps
[e
] = add_map(o
);
8043 added_maps_bl
[e
] = bl
;
8048 p
= m
->incremental_maps
.find(e
);
8049 if (p
!= m
->incremental_maps
.end()) {
8050 dout(10) << "handle_osd_map got inc map for epoch " << e
<< dendl
;
8051 bufferlist
& bl
= p
->second
;
8052 ghobject_t oid
= get_inc_osdmap_pobject_name(e
);
8053 t
.write(coll_t::meta(), oid
, 0, bl
.length(), bl
);
8055 OSDMap
*o
= new OSDMap
;
8058 bool got
= get_map_bl(e
- 1, obl
);
8060 auto p
= added_maps_bl
.find(e
- 1);
8061 ceph_assert(p
!= added_maps_bl
.end());
8067 OSDMap::Incremental inc
;
8068 auto p
= bl
.cbegin();
8071 if (o
->apply_incremental(inc
) < 0) {
8072 derr
<< "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc
.fsid
<< dendl
;
8073 ceph_abort_msg("bad fsid");
8077 o
->encode(fbl
, inc
.encode_features
| CEPH_FEATURE_RESERVED
);
8079 bool injected_failure
= false;
8080 if (cct
->_conf
->osd_inject_bad_map_crc_probability
> 0 &&
8081 (rand() % 10000) < cct
->_conf
->osd_inject_bad_map_crc_probability
*10000.0) {
8082 derr
<< __func__
<< " injecting map crc failure" << dendl
;
8083 injected_failure
= true;
8086 if ((inc
.have_crc
&& o
->get_crc() != inc
.full_crc
) || injected_failure
) {
8087 dout(2) << "got incremental " << e
8088 << " but failed to encode full with correct crc; requesting"
8090 clog
->warn() << "failed to encode map e" << e
<< " with expected crc";
8091 dout(20) << "my encoded map was:\n";
8092 fbl
.hexdump(*_dout
);
8095 request_full_map(e
, last
);
8098 // don't continue committing if we failed to enc the first inc map
8100 dout(10) << __func__
<< " bailing because last < start (" << last
<< "<" << start
<< ")" << dendl
;
8107 purged_snaps
[e
] = o
->get_new_purged_snaps();
8109 ghobject_t fulloid
= get_osdmap_pobject_name(e
);
8110 t
.write(coll_t::meta(), fulloid
, 0, fbl
.length(), fbl
);
8111 added_maps
[e
] = add_map(o
);
8112 added_maps_bl
[e
] = fbl
;
8116 ceph_abort_msg("MOSDMap lied about what maps it had?");
8119 // even if this map isn't from a mon, we may have satisfied our subscription
8120 monc
->sub_got("osdmap", last
);
8122 if (!m
->maps
.empty() && requested_full_first
) {
8123 dout(10) << __func__
<< " still missing full maps " << requested_full_first
8124 << ".." << requested_full_last
<< dendl
;
8125 rerequest_full_maps();
8128 if (superblock
.oldest_map
) {
8129 // make sure we at least keep pace with incoming maps
8130 trim_maps(m
->oldest_map
, last
- first
+ 1, skip_maps
);
8131 pg_num_history
.prune(superblock
.oldest_map
);
8134 if (!superblock
.oldest_map
|| skip_maps
)
8135 superblock
.oldest_map
= first
;
8136 superblock
.newest_map
= last
;
8137 superblock
.current_epoch
= last
;
8139 // note in the superblock that we were clean thru the prior epoch
8140 epoch_t boot_epoch
= service
.get_boot_epoch();
8141 if (boot_epoch
&& boot_epoch
>= superblock
.mounted
) {
8142 superblock
.mounted
= boot_epoch
;
8143 superblock
.clean_thru
= last
;
8146 // check for pg_num changes and deleted pools
8148 for (auto& i
: added_maps
) {
8150 if (!(lastmap
= service
.try_get_map(i
.first
- 1))) {
8151 dout(10) << __func__
<< " can't get previous map " << i
.first
- 1
8152 << " probably first start of this osd" << dendl
;
8156 ceph_assert(lastmap
->get_epoch() + 1 == i
.second
->get_epoch());
8157 for (auto& j
: lastmap
->get_pools()) {
8158 if (!i
.second
->have_pg_pool(j
.first
)) {
8159 pg_num_history
.log_pool_delete(i
.first
, j
.first
);
8160 dout(10) << __func__
<< " recording final pg_pool_t for pool "
8161 << j
.first
<< dendl
;
8162 // this information is needed by _make_pg() if have to restart before
8163 // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
8164 ghobject_t obj
= make_final_pool_info_oid(j
.first
);
8166 encode(j
.second
, bl
, CEPH_FEATURES_ALL
);
8167 string name
= lastmap
->get_pool_name(j
.first
);
8169 map
<string
,string
> profile
;
8170 if (lastmap
->get_pg_pool(j
.first
)->is_erasure()) {
8171 profile
= lastmap
->get_erasure_code_profile(
8172 lastmap
->get_pg_pool(j
.first
)->erasure_code_profile
);
8174 encode(profile
, bl
);
8175 t
.write(coll_t::meta(), obj
, 0, bl
.length(), bl
);
8176 } else if (unsigned new_pg_num
= i
.second
->get_pg_num(j
.first
);
8177 new_pg_num
!= j
.second
.get_pg_num()) {
8178 dout(10) << __func__
<< " recording pool " << j
.first
<< " pg_num "
8179 << j
.second
.get_pg_num() << " -> " << new_pg_num
<< dendl
;
8180 pg_num_history
.log_pg_num_change(i
.first
, j
.first
, new_pg_num
);
8183 for (auto& j
: i
.second
->get_pools()) {
8184 if (!lastmap
->have_pg_pool(j
.first
)) {
8185 dout(10) << __func__
<< " recording new pool " << j
.first
<< " pg_num "
8186 << j
.second
.get_pg_num() << dendl
;
8187 pg_num_history
.log_pg_num_change(i
.first
, j
.first
,
8188 j
.second
.get_pg_num());
8193 pg_num_history
.epoch
= last
;
8196 ::encode(pg_num_history
, bl
);
8197 t
.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl
.length(), bl
);
8198 dout(20) << __func__
<< " pg_num_history " << pg_num_history
<< dendl
;
8201 // record new purged_snaps
8202 if (superblock
.purged_snaps_last
== start
- 1) {
8203 SnapMapper::record_purged_snaps(cct
, store
, service
.meta_ch
,
8204 make_purged_snaps_oid(), &t
,
8206 superblock
.purged_snaps_last
= last
;
8208 dout(10) << __func__
<< " superblock purged_snaps_last is "
8209 << superblock
.purged_snaps_last
8210 << ", not recording new purged_snaps" << dendl
;
8213 // superblock and commit
8214 write_superblock(t
);
8215 t
.register_on_commit(new C_OnMapCommit(this, start
, last
, m
));
8216 store
->queue_transaction(
8219 service
.publish_superblock(superblock
);
8222 void OSD::_committed_osd_maps(epoch_t first
, epoch_t last
, MOSDMap
*m
)
8224 dout(10) << __func__
<< " " << first
<< ".." << last
<< dendl
;
8225 if (is_stopping()) {
8226 dout(10) << __func__
<< " bailing, we are shutting down" << dendl
;
8229 std::lock_guard
l(osd_lock
);
8230 if (is_stopping()) {
8231 dout(10) << __func__
<< " bailing, we are shutting down" << dendl
;
8236 ceph_assert(first
<= last
);
8238 bool do_shutdown
= false;
8239 bool do_restart
= false;
8240 bool network_error
= false;
8241 OSDMapRef osdmap
= get_osdmap();
8243 // advance through the new maps
8244 for (epoch_t cur
= first
; cur
<= last
; cur
++) {
8245 dout(10) << " advance to epoch " << cur
8246 << " (<= last " << last
8247 << " <= newest_map " << superblock
.newest_map
8250 OSDMapRef newmap
= get_map(cur
);
8251 ceph_assert(newmap
); // we just cached it above!
8253 // start blocklisting messages sent to peers that go down.
8254 service
.pre_publish_map(newmap
);
8256 // kill connections to newly down osds
8257 bool waited_for_reservations
= false;
8259 osdmap
= get_osdmap();
8260 osdmap
->get_all_osds(old
);
8261 for (set
<int>::iterator p
= old
.begin(); p
!= old
.end(); ++p
) {
8263 osdmap
->is_up(*p
) && // in old map
8264 newmap
->is_down(*p
)) { // but not the new one
8265 if (!waited_for_reservations
) {
8266 service
.await_reserved_maps();
8267 waited_for_reservations
= true;
8270 } else if (*p
!= whoami
&&
8271 osdmap
->is_down(*p
) &&
8272 newmap
->is_up(*p
)) {
8277 if (osdmap
->is_noup(whoami
) != newmap
->is_noup(whoami
)) {
8278 dout(10) << __func__
<< " NOUP flag changed in " << newmap
->get_epoch()
8281 // this captures the case where we sent the boot message while
8282 // NOUP was being set on the mon and our boot request was
8283 // dropped, and then later it is cleared. it imperfectly
8284 // handles the case where our original boot message was not
8285 // dropped and we restart even though we might have booted, but
8286 // that is harmless (boot will just take slightly longer).
8291 osdmap
= std::move(newmap
);
8295 service
.retrieve_epochs(&boot_epoch
, &up_epoch
, NULL
);
8297 osdmap
->is_up(whoami
) &&
8298 osdmap
->get_addrs(whoami
) == client_messenger
->get_myaddrs()) {
8299 up_epoch
= osdmap
->get_epoch();
8300 dout(10) << "up_epoch is " << up_epoch
<< dendl
;
8302 boot_epoch
= osdmap
->get_epoch();
8303 dout(10) << "boot_epoch is " << boot_epoch
<< dendl
;
8305 service
.set_epochs(&boot_epoch
, &up_epoch
, NULL
);
8309 epoch_t _bind_epoch
= service
.get_bind_epoch();
8310 if (osdmap
->is_up(whoami
) &&
8311 osdmap
->get_addrs(whoami
).legacy_equals(
8312 client_messenger
->get_myaddrs()) &&
8313 _bind_epoch
< osdmap
->get_up_from(whoami
)) {
8316 dout(1) << "state: booting -> active" << dendl
;
8317 set_state(STATE_ACTIVE
);
8320 // set incarnation so that osd_reqid_t's we generate for our
8321 // objecter requests are unique across restarts.
8322 service
.objecter
->set_client_incarnation(osdmap
->get_epoch());
8323 cancel_pending_failures();
8327 if (osdmap
->get_epoch() > 0 &&
8329 if (!osdmap
->exists(whoami
)) {
8330 derr
<< "map says i do not exist. shutting down." << dendl
;
8331 do_shutdown
= true; // don't call shutdown() while we have
8332 // everything paused
8333 } else if (osdmap
->is_stop(whoami
)) {
8334 derr
<< "map says i am stopped by admin. shutting down." << dendl
;
8336 } else if (!osdmap
->is_up(whoami
) ||
8337 !osdmap
->get_addrs(whoami
).legacy_equals(
8338 client_messenger
->get_myaddrs()) ||
8339 !osdmap
->get_cluster_addrs(whoami
).legacy_equals(
8340 cluster_messenger
->get_myaddrs()) ||
8341 !osdmap
->get_hb_back_addrs(whoami
).legacy_equals(
8342 hb_back_server_messenger
->get_myaddrs()) ||
8343 !osdmap
->get_hb_front_addrs(whoami
).legacy_equals(
8344 hb_front_server_messenger
->get_myaddrs())) {
8345 if (!osdmap
->is_up(whoami
)) {
8346 if (service
.is_preparing_to_stop() || service
.is_stopping()) {
8347 service
.got_stop_ack();
8349 clog
->warn() << "Monitor daemon marked osd." << whoami
<< " down, "
8350 "but it is still running";
8351 clog
->debug() << "map e" << osdmap
->get_epoch()
8352 << " wrongly marked me down at e"
8353 << osdmap
->get_down_at(whoami
);
8355 if (monc
->monmap
.min_mon_release
>= ceph_release_t::octopus
) {
8356 // note that this is best-effort...
8357 monc
->send_mon_message(
8361 osdmap
->get_epoch()));
8363 } else if (!osdmap
->get_addrs(whoami
).legacy_equals(
8364 client_messenger
->get_myaddrs())) {
8365 clog
->error() << "map e" << osdmap
->get_epoch()
8366 << " had wrong client addr (" << osdmap
->get_addrs(whoami
)
8367 << " != my " << client_messenger
->get_myaddrs() << ")";
8368 } else if (!osdmap
->get_cluster_addrs(whoami
).legacy_equals(
8369 cluster_messenger
->get_myaddrs())) {
8370 clog
->error() << "map e" << osdmap
->get_epoch()
8371 << " had wrong cluster addr ("
8372 << osdmap
->get_cluster_addrs(whoami
)
8373 << " != my " << cluster_messenger
->get_myaddrs() << ")";
8374 } else if (!osdmap
->get_hb_back_addrs(whoami
).legacy_equals(
8375 hb_back_server_messenger
->get_myaddrs())) {
8376 clog
->error() << "map e" << osdmap
->get_epoch()
8377 << " had wrong heartbeat back addr ("
8378 << osdmap
->get_hb_back_addrs(whoami
)
8379 << " != my " << hb_back_server_messenger
->get_myaddrs()
8381 } else if (!osdmap
->get_hb_front_addrs(whoami
).legacy_equals(
8382 hb_front_server_messenger
->get_myaddrs())) {
8383 clog
->error() << "map e" << osdmap
->get_epoch()
8384 << " had wrong heartbeat front addr ("
8385 << osdmap
->get_hb_front_addrs(whoami
)
8386 << " != my " << hb_front_server_messenger
->get_myaddrs()
8390 if (!service
.is_stopping()) {
8391 epoch_t up_epoch
= 0;
8392 epoch_t bind_epoch
= osdmap
->get_epoch();
8393 service
.set_epochs(NULL
,&up_epoch
, &bind_epoch
);
8397 utime_t now
= ceph_clock_now();
8398 utime_t grace
= utime_t(cct
->_conf
->osd_max_markdown_period
, 0);
8399 osd_markdown_log
.push_back(now
);
8400 if ((int)osd_markdown_log
.size() > cct
->_conf
->osd_max_markdown_count
) {
8401 derr
<< __func__
<< " marked down "
8402 << osd_markdown_log
.size()
8403 << " > osd_max_markdown_count "
8404 << cct
->_conf
->osd_max_markdown_count
8405 << " in last " << grace
<< " seconds, shutting down"
8411 start_waiting_for_healthy();
8413 set
<int> avoid_ports
;
8414 #if defined(__FreeBSD__)
8415 // prevent FreeBSD from grabbing the client_messenger port during
8416 // rebinding. In which case a cluster_meesneger will connect also
8418 client_messenger
->get_myaddrs().get_ports(&avoid_ports
);
8420 cluster_messenger
->get_myaddrs().get_ports(&avoid_ports
);
8422 int r
= cluster_messenger
->rebind(avoid_ports
);
8424 do_shutdown
= true; // FIXME: do_restart?
8425 network_error
= true;
8426 derr
<< __func__
<< " marked down:"
8427 << " rebind cluster_messenger failed" << dendl
;
8430 hb_back_server_messenger
->mark_down_all();
8431 hb_front_server_messenger
->mark_down_all();
8432 hb_front_client_messenger
->mark_down_all();
8433 hb_back_client_messenger
->mark_down_all();
8435 reset_heartbeat_peers(true);
8442 check_osdmap_features();
8447 if (is_active() || is_waiting_for_healthy())
8448 maybe_update_heartbeat_peers();
8455 if (network_error
) {
8456 cancel_pending_failures();
8458 // trigger shutdown in a different thread
8459 dout(0) << __func__
<< " shutdown OSD via async signal" << dendl
;
8460 queue_async_signal(SIGINT
);
8462 else if (m
->newest_map
&& m
->newest_map
> last
) {
8463 dout(10) << " msg say newest map is " << m
->newest_map
8464 << ", requesting more" << dendl
;
8465 osdmap_subscribe(osdmap
->get_epoch()+1, false);
8467 else if (is_preboot()) {
8468 if (m
->get_source().is_mon())
8469 _preboot(m
->oldest_map
, m
->newest_map
);
8473 else if (do_restart
)
8478 void OSD::check_osdmap_features()
8480 // adjust required feature bits?
8482 // we have to be a bit careful here, because we are accessing the
8483 // Policy structures without taking any lock. in particular, only
8484 // modify integer values that can safely be read by a racing CPU.
8485 // since we are only accessing existing Policy structures a their
8486 // current memory location, and setting or clearing bits in integer
8487 // fields, and we are the only writer, this is not a problem.
8489 const auto osdmap
= get_osdmap();
8491 Messenger::Policy p
= client_messenger
->get_default_policy();
8493 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_CLIENT
, &mask
);
8494 if ((p
.features_required
& mask
) != features
) {
8495 dout(0) << "crush map has features " << features
8496 << ", adjusting msgr requires for clients" << dendl
;
8497 p
.features_required
= (p
.features_required
& ~mask
) | features
;
8498 client_messenger
->set_default_policy(p
);
8502 Messenger::Policy p
= client_messenger
->get_policy(entity_name_t::TYPE_MON
);
8504 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_MON
, &mask
);
8505 if ((p
.features_required
& mask
) != features
) {
8506 dout(0) << "crush map has features " << features
8507 << " was " << p
.features_required
8508 << ", adjusting msgr requires for mons" << dendl
;
8509 p
.features_required
= (p
.features_required
& ~mask
) | features
;
8510 client_messenger
->set_policy(entity_name_t::TYPE_MON
, p
);
8514 Messenger::Policy p
= cluster_messenger
->get_policy(entity_name_t::TYPE_OSD
);
8516 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_OSD
, &mask
);
8518 if ((p
.features_required
& mask
) != features
) {
8519 dout(0) << "crush map has features " << features
8520 << ", adjusting msgr requires for osds" << dendl
;
8521 p
.features_required
= (p
.features_required
& ~mask
) | features
;
8522 cluster_messenger
->set_policy(entity_name_t::TYPE_OSD
, p
);
8525 if (!superblock
.compat_features
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
)) {
8526 dout(0) << __func__
<< " enabling on-disk ERASURE CODES compat feature" << dendl
;
8527 superblock
.compat_features
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
);
8528 ObjectStore::Transaction t
;
8529 write_superblock(t
);
8530 int err
= store
->queue_transaction(service
.meta_ch
, std::move(t
), NULL
);
8531 ceph_assert(err
== 0);
8535 if (osdmap
->require_osd_release
< ceph_release_t::nautilus
) {
8536 hb_front_server_messenger
->set_require_authorizer(false);
8537 hb_back_server_messenger
->set_require_authorizer(false);
8539 hb_front_server_messenger
->set_require_authorizer(true);
8540 hb_back_server_messenger
->set_require_authorizer(true);
8543 if (osdmap
->require_osd_release
!= last_require_osd_release
) {
8544 dout(1) << __func__
<< " require_osd_release " << last_require_osd_release
8545 << " -> " << to_string(osdmap
->require_osd_release
) << dendl
;
8546 store
->write_meta("require_osd_release",
8547 stringify((int)osdmap
->require_osd_release
));
8548 last_require_osd_release
= osdmap
->require_osd_release
;
8552 struct C_FinishSplits
: public Context
{
8555 C_FinishSplits(OSD
*osd
, const set
<PGRef
> &in
)
8556 : osd(osd
), pgs(in
) {}
8557 void finish(int r
) override
{
8558 osd
->_finish_splits(pgs
);
8562 void OSD::_finish_splits(set
<PGRef
>& pgs
)
8564 dout(10) << __func__
<< " " << pgs
<< dendl
;
8567 for (set
<PGRef
>::iterator i
= pgs
.begin();
8572 PeeringCtx rctx
= create_context();
8574 dout(10) << __func__
<< " " << *pg
<< dendl
;
8575 epoch_t e
= pg
->get_osdmap_epoch();
8576 pg
->handle_initialize(rctx
);
8577 pg
->queue_null(e
, e
);
8578 dispatch_context(rctx
, pg
, service
.get_osdmap());
8581 unsigned shard_index
= pg
->pg_id
.hash_to_shard(num_shards
);
8582 shards
[shard_index
]->register_and_wake_split_child(pg
);
8586 bool OSD::add_merge_waiter(OSDMapRef nextmap
, spg_t target
, PGRef src
,
8589 std::lock_guard
l(merge_lock
);
8590 auto& p
= merge_waiters
[nextmap
->get_epoch()][target
];
8591 p
[src
->pg_id
] = src
;
8592 dout(10) << __func__
<< " added merge_waiter " << src
->pg_id
8593 << " for " << target
<< ", have " << p
.size() << "/" << need
8595 return p
.size() == need
;
8598 bool OSD::advance_pg(
8601 ThreadPool::TPHandle
&handle
,
8604 if (osd_epoch
<= pg
->get_osdmap_epoch()) {
8607 ceph_assert(pg
->is_locked());
8608 OSDMapRef lastmap
= pg
->get_osdmap();
8609 set
<PGRef
> new_pgs
; // any split children
8612 unsigned old_pg_num
= lastmap
->have_pg_pool(pg
->pg_id
.pool()) ?
8613 lastmap
->get_pg_num(pg
->pg_id
.pool()) : 0;
8614 for (epoch_t next_epoch
= pg
->get_osdmap_epoch() + 1;
8615 next_epoch
<= osd_epoch
;
8617 OSDMapRef nextmap
= service
.try_get_map(next_epoch
);
8619 dout(20) << __func__
<< " missing map " << next_epoch
<< dendl
;
8623 unsigned new_pg_num
=
8624 (old_pg_num
&& nextmap
->have_pg_pool(pg
->pg_id
.pool())) ?
8625 nextmap
->get_pg_num(pg
->pg_id
.pool()) : 0;
8626 if (old_pg_num
&& new_pg_num
&& old_pg_num
!= new_pg_num
) {
8628 if (nextmap
->have_pg_pool(pg
->pg_id
.pool())) {
8630 if (pg
->pg_id
.is_merge_source(
8634 // we are merge source
8635 PGRef spg
= pg
; // carry a ref
8636 dout(1) << __func__
<< " " << pg
->pg_id
8637 << " is merge source, target is " << parent
8639 pg
->write_if_dirty(rctx
);
8640 if (!new_pgs
.empty()) {
8641 rctx
.transaction
.register_on_applied(new C_FinishSplits(this,
8645 dispatch_context(rctx
, pg
, pg
->get_osdmap(), &handle
);
8647 // release backoffs explicitly, since the on_shutdown path
8648 // aggressively tears down backoff state.
8649 if (pg
->is_primary()) {
8650 pg
->release_pg_backoffs();
8653 OSDShard
*sdata
= pg
->osd_shard
;
8655 std::lock_guard
l(sdata
->shard_lock
);
8657 sdata
->_detach_pg(pg
->pg_slot
);
8658 // update pg count now since we might not get an osdmap
8660 if (pg
->is_primary())
8661 logger
->dec(l_osd_pg_primary
);
8662 else if (pg
->is_nonprimary())
8663 logger
->dec(l_osd_pg_replica
); // misnomer
8665 logger
->dec(l_osd_pg_stray
);
8670 set
<spg_t
> children
;
8671 parent
.is_split(new_pg_num
, old_pg_num
, &children
);
8672 if (add_merge_waiter(nextmap
, parent
, pg
, children
.size())) {
8673 enqueue_peering_evt(
8676 std::make_shared
<PGPeeringEvent
>(
8677 nextmap
->get_epoch(),
8678 nextmap
->get_epoch(),
8683 } else if (pg
->pg_id
.is_merge_target(old_pg_num
, new_pg_num
)) {
8684 // we are merge target
8685 set
<spg_t
> children
;
8686 pg
->pg_id
.is_split(new_pg_num
, old_pg_num
, &children
);
8687 dout(20) << __func__
<< " " << pg
->pg_id
8688 << " is merge target, sources are " << children
8690 map
<spg_t
,PGRef
> sources
;
8692 std::lock_guard
l(merge_lock
);
8693 auto& s
= merge_waiters
[nextmap
->get_epoch()][pg
->pg_id
];
8694 unsigned need
= children
.size();
8695 dout(20) << __func__
<< " have " << s
.size() << "/"
8697 if (s
.size() == need
) {
8699 merge_waiters
[nextmap
->get_epoch()].erase(pg
->pg_id
);
8700 if (merge_waiters
[nextmap
->get_epoch()].empty()) {
8701 merge_waiters
.erase(nextmap
->get_epoch());
8705 if (!sources
.empty()) {
8706 unsigned new_pg_num
= nextmap
->get_pg_num(pg
->pg_id
.pool());
8707 unsigned split_bits
= pg
->pg_id
.get_split_bits(new_pg_num
);
8708 dout(1) << __func__
<< " merging " << pg
->pg_id
<< dendl
;
8710 sources
, rctx
, split_bits
,
8711 nextmap
->get_pg_pool(
8712 pg
->pg_id
.pool())->last_pg_merge_meta
);
8713 pg
->pg_slot
->waiting_for_merge_epoch
= 0;
8715 dout(20) << __func__
<< " not ready to merge yet" << dendl
;
8716 pg
->write_if_dirty(rctx
);
8717 if (!new_pgs
.empty()) {
8718 rctx
.transaction
.register_on_applied(new C_FinishSplits(this,
8722 dispatch_context(rctx
, pg
, pg
->get_osdmap(), &handle
);
8724 // kick source(s) to get them ready
8725 for (auto& i
: children
) {
8726 dout(20) << __func__
<< " kicking source " << i
<< dendl
;
8727 enqueue_peering_evt(
8730 std::make_shared
<PGPeeringEvent
>(
8731 nextmap
->get_epoch(),
8732 nextmap
->get_epoch(),
8742 vector
<int> newup
, newacting
;
8743 int up_primary
, acting_primary
;
8744 nextmap
->pg_to_up_acting_osds(
8746 &newup
, &up_primary
,
8747 &newacting
, &acting_primary
);
8748 pg
->handle_advance_map(
8749 nextmap
, lastmap
, newup
, up_primary
,
8750 newacting
, acting_primary
, rctx
);
8752 auto oldpool
= lastmap
->get_pools().find(pg
->pg_id
.pool());
8753 auto newpool
= nextmap
->get_pools().find(pg
->pg_id
.pool());
8754 if (oldpool
!= lastmap
->get_pools().end()
8755 && newpool
!= nextmap
->get_pools().end()) {
8756 dout(20) << __func__
8757 << " new pool opts " << newpool
->second
.opts
8758 << " old pool opts " << oldpool
->second
.opts
8761 double old_min_interval
= 0, new_min_interval
= 0;
8762 oldpool
->second
.opts
.get(pool_opts_t::SCRUB_MIN_INTERVAL
, &old_min_interval
);
8763 newpool
->second
.opts
.get(pool_opts_t::SCRUB_MIN_INTERVAL
, &new_min_interval
);
8765 double old_max_interval
= 0, new_max_interval
= 0;
8766 oldpool
->second
.opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &old_max_interval
);
8767 newpool
->second
.opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &new_max_interval
);
8769 // Assume if an interval is change from set to unset or vice versa the actual config
8770 // is different. Keep it simple even if it is possible to call resched_all_scrub()
8772 if (old_min_interval
!= new_min_interval
|| old_max_interval
!= new_max_interval
) {
8773 pg
->on_info_history_change();
8777 if (new_pg_num
&& old_pg_num
!= new_pg_num
) {
8779 set
<spg_t
> children
;
8780 if (pg
->pg_id
.is_split(
8785 pg
, children
, &new_pgs
, lastmap
, nextmap
,
8791 old_pg_num
= new_pg_num
;
8792 handle
.reset_tp_timeout();
8794 pg
->handle_activate_map(rctx
);
8798 if (!new_pgs
.empty()) {
8799 rctx
.transaction
.register_on_applied(new C_FinishSplits(this, new_pgs
));
8804 void OSD::consume_map()
8806 ceph_assert(ceph_mutex_is_locked(osd_lock
));
8807 auto osdmap
= get_osdmap();
8808 dout(7) << "consume_map version " << osdmap
->get_epoch() << dendl
;
8810 /** make sure the cluster is speaking in SORTBITWISE, because we don't
8811 * speak the older sorting version any more. Be careful not to force
8812 * a shutdown if we are merely processing old maps, though.
8814 if (!osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
) && is_active()) {
8815 derr
<< __func__
<< " SORTBITWISE flag is not set" << dendl
;
8819 service
.pre_publish_map(osdmap
);
8820 service
.await_reserved_maps();
8821 service
.publish_map(osdmap
);
8823 // prime splits and merges
8824 set
<pair
<spg_t
,epoch_t
>> newly_split
; // splits, and when
8825 set
<pair
<spg_t
,epoch_t
>> merge_pgs
; // merge participants, and when
8826 for (auto& shard
: shards
) {
8827 shard
->identify_splits_and_merges(osdmap
, &newly_split
, &merge_pgs
);
8829 if (!newly_split
.empty()) {
8830 for (auto& shard
: shards
) {
8831 shard
->prime_splits(osdmap
, &newly_split
);
8833 ceph_assert(newly_split
.empty());
8836 // prune sent_ready_to_merge
8837 service
.prune_sent_ready_to_merge(osdmap
);
8839 // FIXME, maybe: We could race against an incoming peering message
8840 // that instantiates a merge PG after identify_merges() below and
8841 // never set up its peer to complete the merge. An OSD restart
8842 // would clear it up. This is a hard race to resolve,
8843 // extraordinarily rare (we only merge PGs that are stable and
8844 // clean, so it'd have to be an imported PG to an OSD with a
8845 // slightly stale OSDMap...), so I'm ignoring it for now. We plan to
8846 // replace all of this with a seastar-based code soon anyway.
8847 if (!merge_pgs
.empty()) {
8848 // mark the pgs we already have, or create new and empty merge
8849 // participants for those we are missing. do this all under the
8850 // shard lock so we don't have to worry about racing pg creates
8852 for (auto& shard
: shards
) {
8853 shard
->prime_merges(osdmap
, &merge_pgs
);
8855 ceph_assert(merge_pgs
.empty());
8858 service
.prune_pg_created();
8860 unsigned pushes_to_free
= 0;
8861 for (auto& shard
: shards
) {
8862 shard
->consume_map(osdmap
, &pushes_to_free
);
8865 vector
<spg_t
> pgids
;
8868 // count (FIXME, probably during seastar rewrite)
8869 int num_pg_primary
= 0, num_pg_replica
= 0, num_pg_stray
= 0;
8872 for (auto& pg
: pgs
) {
8873 // FIXME (probably during seastar rewrite): this is lockless and
8874 // racy, but we don't want to take pg lock here.
8875 if (pg
->is_primary())
8877 else if (pg
->is_nonprimary())
8878 num_pg_replica
++; // misnomer
8884 // FIXME (as part of seastar rewrite): move to OSDShard
8885 std::lock_guard
l(pending_creates_lock
);
8886 for (auto pg
= pending_creates_from_osd
.begin();
8887 pg
!= pending_creates_from_osd
.end();) {
8888 if (osdmap
->get_pg_acting_role(pg
->first
, whoami
) < 0) {
8889 dout(10) << __func__
<< " pg " << pg
->first
<< " doesn't map here, "
8890 << "discarding pending_create_from_osd" << dendl
;
8891 pg
= pending_creates_from_osd
.erase(pg
);
8898 service
.maybe_inject_dispatch_delay();
8900 dispatch_sessions_waiting_on_map();
8902 service
.maybe_inject_dispatch_delay();
8904 service
.release_reserved_pushes(pushes_to_free
);
8906 // queue null events to push maps down to individual PGs
8907 for (auto pgid
: pgids
) {
8908 enqueue_peering_evt(
8911 std::make_shared
<PGPeeringEvent
>(
8912 osdmap
->get_epoch(),
8913 osdmap
->get_epoch(),
8916 logger
->set(l_osd_pg
, pgids
.size());
8917 logger
->set(l_osd_pg_primary
, num_pg_primary
);
8918 logger
->set(l_osd_pg_replica
, num_pg_replica
);
8919 logger
->set(l_osd_pg_stray
, num_pg_stray
);
8922 void OSD::activate_map()
8924 ceph_assert(ceph_mutex_is_locked(osd_lock
));
8925 auto osdmap
= get_osdmap();
8927 dout(7) << "activate_map version " << osdmap
->get_epoch() << dendl
;
8930 if (osdmap
->test_flag(CEPH_OSDMAP_NORECOVER
)) {
8931 if (!service
.recovery_is_paused()) {
8932 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl
;
8933 service
.pause_recovery();
8936 if (service
.recovery_is_paused()) {
8937 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl
;
8938 service
.unpause_recovery();
8942 service
.activate_map();
8945 take_waiters(waiting_for_osdmap
);
8948 bool OSD::require_mon_peer(const Message
*m
)
8950 if (!m
->get_connection()->peer_is_mon()) {
8951 dout(0) << "require_mon_peer received from non-mon "
8952 << m
->get_connection()->get_peer_addr()
8953 << " " << *m
<< dendl
;
8959 bool OSD::require_mon_or_mgr_peer(const Message
*m
)
8961 if (!m
->get_connection()->peer_is_mon() &&
8962 !m
->get_connection()->peer_is_mgr()) {
8963 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8964 << m
->get_connection()->get_peer_addr()
8965 << " " << *m
<< dendl
;
8971 bool OSD::require_osd_peer(const Message
*m
)
8973 if (!m
->get_connection()->peer_is_osd()) {
8974 dout(0) << "require_osd_peer received from non-osd "
8975 << m
->get_connection()->get_peer_addr()
8976 << " " << *m
<< dendl
;
8982 bool OSD::require_self_aliveness(const Message
*m
, epoch_t epoch
)
8984 epoch_t up_epoch
= service
.get_up_epoch();
8985 if (epoch
< up_epoch
) {
8986 dout(7) << "from pre-up epoch " << epoch
<< " < " << up_epoch
<< dendl
;
8991 dout(7) << "still in boot state, dropping message " << *m
<< dendl
;
8998 bool OSD::require_same_peer_instance(const Message
*m
, const OSDMapRef
& map
,
8999 bool is_fast_dispatch
)
9001 int from
= m
->get_source().num();
9003 if (map
->is_down(from
) ||
9004 (map
->get_cluster_addrs(from
) != m
->get_source_addrs())) {
9005 dout(5) << "from dead osd." << from
<< ", marking down, "
9006 << " msg was " << m
->get_source_inst().addr
9008 << (map
->is_up(from
) ?
9009 map
->get_cluster_addrs(from
) : entity_addrvec_t())
9011 ConnectionRef con
= m
->get_connection();
9013 if (auto s
= ceph::ref_cast
<Session
>(con
->get_priv()); s
) {
9014 if (!is_fast_dispatch
)
9015 s
->session_dispatch_lock
.lock();
9016 clear_session_waiting_on_map(s
);
9017 con
->set_priv(nullptr); // break ref <-> session cycle, if any
9019 if (!is_fast_dispatch
)
9020 s
->session_dispatch_lock
.unlock();
9029 * require that we have same (or newer) map, and that
9030 * the source is the pg primary.
9032 bool OSD::require_same_or_newer_map(OpRequestRef
& op
, epoch_t epoch
,
9033 bool is_fast_dispatch
)
9035 const Message
*m
= op
->get_req();
9036 const auto osdmap
= get_osdmap();
9037 dout(15) << "require_same_or_newer_map " << epoch
9038 << " (i am " << osdmap
->get_epoch() << ") " << m
<< dendl
;
9040 ceph_assert(ceph_mutex_is_locked(osd_lock
));
9042 // do they have a newer map?
9043 if (epoch
> osdmap
->get_epoch()) {
9044 dout(7) << "waiting for newer map epoch " << epoch
9045 << " > my " << osdmap
->get_epoch() << " with " << m
<< dendl
;
9046 wait_for_new_map(op
);
9050 if (!require_self_aliveness(op
->get_req(), epoch
)) {
9054 // ok, our map is same or newer.. do they still exist?
9055 if (m
->get_connection()->get_messenger() == cluster_messenger
&&
9056 !require_same_peer_instance(op
->get_req(), osdmap
, is_fast_dispatch
)) {
9067 // ----------------------------------------
9070 void OSD::split_pgs(
9072 const set
<spg_t
> &childpgids
, set
<PGRef
> *out_pgs
,
9077 unsigned pg_num
= nextmap
->get_pg_num(parent
->pg_id
.pool());
9078 parent
->update_snap_mapper_bits(parent
->get_pgid().get_split_bits(pg_num
));
9080 vector
<object_stat_sum_t
> updated_stats
;
9081 parent
->start_split_stats(childpgids
, &updated_stats
);
9083 vector
<object_stat_sum_t
>::iterator stat_iter
= updated_stats
.begin();
9084 for (set
<spg_t
>::const_iterator i
= childpgids
.begin();
9085 i
!= childpgids
.end();
9087 ceph_assert(stat_iter
!= updated_stats
.end());
9088 dout(10) << __func__
<< " splitting " << *parent
<< " into " << *i
<< dendl
;
9089 PG
* child
= _make_pg(nextmap
, *i
);
9091 out_pgs
->insert(child
);
9092 child
->ch
= store
->create_new_collection(child
->coll
);
9095 uint32_t shard_index
= i
->hash_to_shard(shards
.size());
9096 assert(NULL
!= shards
[shard_index
]);
9097 store
->set_collection_commit_queue(child
->coll
, &(shards
[shard_index
]->context_queue
));
9100 unsigned split_bits
= i
->get_split_bits(pg_num
);
9101 dout(10) << " pg_num is " << pg_num
9102 << ", m_seed " << i
->ps()
9103 << ", split_bits is " << split_bits
<< dendl
;
9104 parent
->split_colls(
9108 &child
->get_pool().info
,
9115 child
->init_collection_pool_opts();
9117 child
->finish_split_stats(*stat_iter
, rctx
.transaction
);
9120 ceph_assert(stat_iter
!= updated_stats
.end());
9121 parent
->finish_split_stats(*stat_iter
, rctx
.transaction
);
9127 void OSD::handle_pg_create(OpRequestRef op
)
9129 // NOTE: this can be removed in P release (mimic is the last version to
9130 // send MOSDPGCreate messages).
9132 auto m
= op
->get_req
<MOSDPGCreate
>();
9133 ceph_assert(m
->get_type() == MSG_OSD_PG_CREATE
);
9135 dout(10) << "handle_pg_create " << *m
<< dendl
;
9137 if (!require_mon_peer(op
->get_req())) {
9141 if (!require_same_or_newer_map(op
, m
->epoch
, false))
9146 const auto osdmap
= get_osdmap();
9147 map
<pg_t
,utime_t
>::const_iterator ci
= m
->ctimes
.begin();
9148 for (map
<pg_t
,pg_create_t
>::const_iterator p
= m
->mkpg
.begin();
9151 ceph_assert(ci
!= m
->ctimes
.end() && ci
->first
== p
->first
);
9152 epoch_t created
= p
->second
.created
;
9153 if (p
->second
.split_bits
) // Skip split pgs
9157 if (!osdmap
->have_pg_pool(on
.pool())) {
9158 dout(20) << "ignoring pg on deleted pool " << on
<< dendl
;
9162 dout(20) << "mkpg " << on
<< " e" << created
<< "@" << ci
->second
<< dendl
;
9165 bool mapped
= osdmap
->get_primary_shard(on
, &pgid
);
9166 ceph_assert(mapped
);
9168 // is it still ours?
9169 vector
<int> up
, acting
;
9170 int up_primary
= -1;
9171 int acting_primary
= -1;
9172 osdmap
->pg_to_up_acting_osds(on
, &up
, &up_primary
, &acting
, &acting_primary
);
9173 int role
= osdmap
->calc_pg_role(pg_shard_t(whoami
, pgid
.shard
), acting
);
9175 if (acting_primary
!= whoami
) {
9176 dout(10) << "mkpg " << on
<< " not acting_primary (" << acting_primary
9177 << "), my role=" << role
<< ", skipping" << dendl
;
9183 pg_history_t history
;
9184 build_initial_pg_history(pgid
, created
, ci
->second
, &history
, &pi
);
9186 // The mon won't resend unless the primary changed, so we ignore
9187 // same_interval_since. We'll pass this history with the current
9188 // epoch as the event.
9189 if (history
.same_primary_since
> m
->epoch
) {
9190 dout(10) << __func__
<< ": got obsolete pg create on pgid "
9191 << pgid
<< " from epoch " << m
->epoch
9192 << ", primary changed in " << history
.same_primary_since
9196 enqueue_peering_evt(
9199 std::make_shared
<PGPeeringEvent
>(
9200 osdmap
->get_epoch(),
9201 osdmap
->get_epoch(),
9206 osdmap
->get_epoch(),
9214 std::lock_guard
l(pending_creates_lock
);
9215 if (pending_creates_from_mon
== 0) {
9216 last_pg_create_epoch
= m
->epoch
;
9220 maybe_update_heartbeat_peers();
9224 // ----------------------------------------
9225 // peering and recovery
9227 PeeringCtx
OSD::create_context()
9229 return PeeringCtx(get_osdmap()->require_osd_release
);
9232 void OSD::dispatch_context(PeeringCtx
&ctx
, PG
*pg
, OSDMapRef curmap
,
9233 ThreadPool::TPHandle
*handle
)
9235 if (!service
.get_osdmap()->is_up(whoami
)) {
9236 dout(20) << __func__
<< " not up in osdmap" << dendl
;
9237 } else if (!is_active()) {
9238 dout(20) << __func__
<< " not active" << dendl
;
9240 for (auto& [osd
, ls
] : ctx
.message_map
) {
9241 if (!curmap
->is_up(osd
)) {
9242 dout(20) << __func__
<< " skipping down osd." << osd
<< dendl
;
9245 ConnectionRef con
= service
.get_con_osd_cluster(
9246 osd
, curmap
->get_epoch());
9248 dout(20) << __func__
<< " skipping osd." << osd
<< " (NULL con)"
9252 service
.maybe_share_map(con
.get(), curmap
);
9254 con
->send_message2(m
);
9259 if ((!ctx
.transaction
.empty() || ctx
.transaction
.has_contexts()) && pg
) {
9260 int tr
= store
->queue_transaction(
9262 std::move(ctx
.transaction
), TrackedOpRef(),
9264 ceph_assert(tr
== 0);
9268 void OSD::handle_fast_pg_create(MOSDPGCreate2
*m
)
9270 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9271 if (!require_mon_peer(m
)) {
9275 for (auto& p
: m
->pgs
) {
9276 spg_t pgid
= p
.first
;
9277 epoch_t created
= p
.second
.first
;
9278 utime_t created_stamp
= p
.second
.second
;
9279 auto q
= m
->pg_extra
.find(pgid
);
9280 if (q
== m
->pg_extra
.end()) {
9281 dout(20) << __func__
<< " " << pgid
<< " e" << created
9282 << "@" << created_stamp
9283 << " (no history or past_intervals)" << dendl
;
9284 // pre-octopus ... no pg history. this can be removed in Q release.
9285 enqueue_peering_evt(
9288 std::make_shared
<PGPeeringEvent
>(
9296 pg_history_t(created
, created_stamp
),
9301 dout(20) << __func__
<< " " << pgid
<< " e" << created
9302 << "@" << created_stamp
9303 << " history " << q
->second
.first
9304 << " pi " << q
->second
.second
<< dendl
;
9305 if (!q
->second
.second
.empty() &&
9306 m
->epoch
< q
->second
.second
.get_bounds().second
) {
9307 clog
->error() << "got pg_create on " << pgid
<< " epoch " << m
->epoch
9308 << " and unmatched past_intervals " << q
->second
.second
9309 << " (history " << q
->second
.first
<< ")";
9311 enqueue_peering_evt(
9314 std::make_shared
<PGPeeringEvent
>(
9331 std::lock_guard
l(pending_creates_lock
);
9332 if (pending_creates_from_mon
== 0) {
9333 last_pg_create_epoch
= m
->epoch
;
9340 void OSD::handle_fast_pg_query(MOSDPGQuery
*m
)
9342 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9343 if (!require_osd_peer(m
)) {
9347 int from
= m
->get_source().num();
9348 for (auto& p
: m
->pg_list
) {
9349 enqueue_peering_evt(
9352 std::make_shared
<PGPeeringEvent
>(
9353 p
.second
.epoch_sent
, p
.second
.epoch_sent
,
9356 pg_shard_t(from
, p
.second
.from
),
9358 p
.second
.epoch_sent
),
9365 void OSD::handle_fast_pg_notify(MOSDPGNotify
* m
)
9367 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9368 if (!require_osd_peer(m
)) {
9372 int from
= m
->get_source().num();
9373 for (auto& p
: m
->get_pg_list()) {
9374 spg_t
pgid(p
.info
.pgid
.pgid
, p
.to
);
9375 enqueue_peering_evt(
9378 std::make_shared
<PGPeeringEvent
>(
9382 pgid
, pg_shard_t(from
, p
.from
),
9384 m
->get_connection()->get_features()),
9397 void OSD::handle_fast_pg_info(MOSDPGInfo
* m
)
9399 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9400 if (!require_osd_peer(m
)) {
9404 int from
= m
->get_source().num();
9405 for (auto& p
: m
->pg_list
) {
9406 enqueue_peering_evt(
9407 spg_t(p
.info
.pgid
.pgid
, p
.to
),
9409 std::make_shared
<PGPeeringEvent
>(
9410 p
.epoch_sent
, p
.query_epoch
,
9412 pg_shard_t(from
, p
.from
),
9420 void OSD::handle_fast_pg_remove(MOSDPGRemove
*m
)
9422 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9423 if (!require_osd_peer(m
)) {
9427 for (auto& pgid
: m
->pg_list
) {
9428 enqueue_peering_evt(
9431 std::make_shared
<PGPeeringEvent
>(
9432 m
->get_epoch(), m
->get_epoch(),
9433 PeeringState::DeleteStart())));
9438 void OSD::handle_fast_force_recovery(MOSDForceRecovery
*m
)
9440 dout(10) << __func__
<< " " << *m
<< dendl
;
9441 if (!require_mon_or_mgr_peer(m
)) {
9445 epoch_t epoch
= get_osdmap_epoch();
9446 for (auto pgid
: m
->forced_pgs
) {
9447 if (m
->options
& OFR_BACKFILL
) {
9448 if (m
->options
& OFR_CANCEL
) {
9449 enqueue_peering_evt(
9452 std::make_shared
<PGPeeringEvent
>(
9454 PeeringState::UnsetForceBackfill())));
9456 enqueue_peering_evt(
9459 std::make_shared
<PGPeeringEvent
>(
9461 PeeringState::SetForceBackfill())));
9463 } else if (m
->options
& OFR_RECOVERY
) {
9464 if (m
->options
& OFR_CANCEL
) {
9465 enqueue_peering_evt(
9468 std::make_shared
<PGPeeringEvent
>(
9470 PeeringState::UnsetForceRecovery())));
9472 enqueue_peering_evt(
9475 std::make_shared
<PGPeeringEvent
>(
9477 PeeringState::SetForceRecovery())));
9484 void OSD::handle_pg_query_nopg(const MQuery
& q
)
9486 spg_t pgid
= q
.pgid
;
9487 dout(10) << __func__
<< " " << pgid
<< dendl
;
9489 OSDMapRef osdmap
= get_osdmap();
9490 if (!osdmap
->have_pg_pool(pgid
.pool()))
9493 dout(10) << " pg " << pgid
<< " dne" << dendl
;
9494 pg_info_t
empty(spg_t(pgid
.pgid
, q
.query
.to
));
9495 ConnectionRef con
= service
.get_con_osd_cluster(q
.from
.osd
, osdmap
->get_epoch());
9498 if (q
.query
.type
== pg_query_t::LOG
||
9499 q
.query
.type
== pg_query_t::FULLLOG
) {
9501 q
.query
.from
, q
.query
.to
,
9502 osdmap
->get_epoch(), empty
,
9503 q
.query
.epoch_sent
);
9505 vector
<pg_notify_t
> ls
;
9508 q
.query
.from
, q
.query
.to
,
9510 osdmap
->get_epoch(),
9513 m
= new MOSDPGNotify(osdmap
->get_epoch(), std::move(ls
));
9515 service
.maybe_share_map(con
.get(), osdmap
);
9516 con
->send_message(m
);
9520 void OSDService::queue_check_readable(spg_t spgid
,
9522 ceph::signedspan delay
)
9524 if (delay
== ceph::signedspan::zero()) {
9525 osd
->enqueue_peering_evt(
9528 std::make_shared
<PGPeeringEvent
>(
9530 PeeringState::CheckReadable())));
9532 mono_timer
.add_event(
9534 [this, spgid
, lpr
]() {
9535 queue_check_readable(spgid
, lpr
);
9541 // =========================================================
9544 void OSDService::_maybe_queue_recovery() {
9545 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock
));
9546 uint64_t available_pushes
;
9547 while (!awaiting_throttle
.empty() &&
9548 _recover_now(&available_pushes
)) {
9549 uint64_t to_start
= std::min(
9551 cct
->_conf
->osd_recovery_max_single_start
);
9552 _queue_for_recovery(awaiting_throttle
.front(), to_start
);
9553 awaiting_throttle
.pop_front();
9554 dout(10) << __func__
<< " starting " << to_start
9555 << ", recovery_ops_reserved " << recovery_ops_reserved
9556 << " -> " << (recovery_ops_reserved
+ to_start
) << dendl
;
9557 recovery_ops_reserved
+= to_start
;
9561 bool OSDService::_recover_now(uint64_t *available_pushes
)
9563 if (available_pushes
)
9564 *available_pushes
= 0;
9566 if (ceph_clock_now() < defer_recovery_until
) {
9567 dout(15) << __func__
<< " defer until " << defer_recovery_until
<< dendl
;
9571 if (recovery_paused
) {
9572 dout(15) << __func__
<< " paused" << dendl
;
9576 uint64_t max
= osd
->get_recovery_max_active();
9577 if (max
<= recovery_ops_active
+ recovery_ops_reserved
) {
9578 dout(15) << __func__
<< " active " << recovery_ops_active
9579 << " + reserved " << recovery_ops_reserved
9580 << " >= max " << max
<< dendl
;
9584 if (available_pushes
)
9585 *available_pushes
= max
- recovery_ops_active
- recovery_ops_reserved
;
9590 unsigned OSDService::get_target_pg_log_entries() const
9592 auto num_pgs
= osd
->get_num_pgs();
9593 auto target
= cct
->_conf
->osd_target_pg_log_entries_per_osd
;
9594 if (num_pgs
> 0 && target
> 0) {
9595 // target an even spread of our budgeted log entries across all
9596 // PGs. note that while we only get to control the entry count
9597 // for primary PGs, we'll normally be responsible for a mix of
9598 // primary and replica PGs (for the same pool(s) even), so this
9600 return std::max
<unsigned>(
9601 std::min
<unsigned>(target
/ num_pgs
,
9602 cct
->_conf
->osd_max_pg_log_entries
),
9603 cct
->_conf
->osd_min_pg_log_entries
);
9605 // fall back to a per-pg value.
9606 return cct
->_conf
->osd_min_pg_log_entries
;
9610 void OSD::do_recovery(
9611 PG
*pg
, epoch_t queued
, uint64_t reserved_pushes
,
9612 ThreadPool::TPHandle
&handle
)
9614 uint64_t started
= 0;
9617 * When the value of osd_recovery_sleep is set greater than zero, recovery
9618 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9619 * recovery event's schedule time. This is done by adding a
9620 * recovery_requeue_callback event, which re-queues the recovery op using
9621 * queue_recovery_after_sleep.
9623 float recovery_sleep
= get_osd_recovery_sleep();
9625 std::lock_guard
l(service
.sleep_lock
);
9626 if (recovery_sleep
> 0 && service
.recovery_needs_sleep
) {
9628 auto recovery_requeue_callback
= new LambdaContext([this, pgref
, queued
, reserved_pushes
](int r
) {
9629 dout(20) << "do_recovery wake up at "
9631 << ", re-queuing recovery" << dendl
;
9632 std::lock_guard
l(service
.sleep_lock
);
9633 service
.recovery_needs_sleep
= false;
9634 service
.queue_recovery_after_sleep(pgref
.get(), queued
, reserved_pushes
);
9637 // This is true for the first recovery op and when the previous recovery op
9638 // has been scheduled in the past. The next recovery op is scheduled after
9639 // completing the sleep from now.
9641 if (auto now
= ceph::real_clock::now();
9642 service
.recovery_schedule_time
< now
) {
9643 service
.recovery_schedule_time
= now
;
9645 service
.recovery_schedule_time
+= ceph::make_timespan(recovery_sleep
);
9646 service
.sleep_timer
.add_event_at(service
.recovery_schedule_time
,
9647 recovery_requeue_callback
);
9648 dout(20) << "Recovery event scheduled at "
9649 << service
.recovery_schedule_time
<< dendl
;
9656 std::lock_guard
l(service
.sleep_lock
);
9657 service
.recovery_needs_sleep
= true;
9660 if (pg
->pg_has_reset_since(queued
)) {
9664 dout(10) << "do_recovery starting " << reserved_pushes
<< " " << *pg
<< dendl
;
9665 #ifdef DEBUG_RECOVERY_OIDS
9666 dout(20) << " active was " << service
.recovery_oids
[pg
->pg_id
] << dendl
;
9669 bool do_unfound
= pg
->start_recovery_ops(reserved_pushes
, handle
, &started
);
9670 dout(10) << "do_recovery started " << started
<< "/" << reserved_pushes
9671 << " on " << *pg
<< dendl
;
9674 PeeringCtx rctx
= create_context();
9675 rctx
.handle
= &handle
;
9676 pg
->find_unfound(queued
, rctx
);
9677 dispatch_context(rctx
, pg
, pg
->get_osdmap());
9682 ceph_assert(started
<= reserved_pushes
);
9683 service
.release_reserved_pushes(reserved_pushes
);
9686 void OSDService::start_recovery_op(PG
*pg
, const hobject_t
& soid
)
9688 std::lock_guard
l(recovery_lock
);
9689 dout(10) << "start_recovery_op " << *pg
<< " " << soid
9690 << " (" << recovery_ops_active
<< "/"
9691 << osd
->get_recovery_max_active() << " rops)"
9693 recovery_ops_active
++;
9695 #ifdef DEBUG_RECOVERY_OIDS
9696 dout(20) << " active was " << recovery_oids
[pg
->pg_id
] << dendl
;
9697 ceph_assert(recovery_oids
[pg
->pg_id
].count(soid
) == 0);
9698 recovery_oids
[pg
->pg_id
].insert(soid
);
9702 void OSDService::finish_recovery_op(PG
*pg
, const hobject_t
& soid
, bool dequeue
)
9704 std::lock_guard
l(recovery_lock
);
9705 dout(10) << "finish_recovery_op " << *pg
<< " " << soid
9706 << " dequeue=" << dequeue
9707 << " (" << recovery_ops_active
<< "/"
9708 << osd
->get_recovery_max_active() << " rops)"
9712 ceph_assert(recovery_ops_active
> 0);
9713 recovery_ops_active
--;
9715 #ifdef DEBUG_RECOVERY_OIDS
9716 dout(20) << " active oids was " << recovery_oids
[pg
->pg_id
] << dendl
;
9717 ceph_assert(recovery_oids
[pg
->pg_id
].count(soid
));
9718 recovery_oids
[pg
->pg_id
].erase(soid
);
9721 _maybe_queue_recovery();
9724 bool OSDService::is_recovery_active()
9726 if (cct
->_conf
->osd_debug_pretend_recovery_active
) {
9729 return local_reserver
.has_reservation() || remote_reserver
.has_reservation();
9732 void OSDService::release_reserved_pushes(uint64_t pushes
)
9734 std::lock_guard
l(recovery_lock
);
9735 dout(10) << __func__
<< "(" << pushes
<< "), recovery_ops_reserved "
9736 << recovery_ops_reserved
<< " -> " << (recovery_ops_reserved
-pushes
)
9738 ceph_assert(recovery_ops_reserved
>= pushes
);
9739 recovery_ops_reserved
-= pushes
;
9740 _maybe_queue_recovery();
9743 // =========================================================
9746 bool OSD::op_is_discardable(const MOSDOp
*op
)
9748 // drop client request if they are not connected and can't get the
9750 if (!op
->get_connection()->is_connected()) {
9756 void OSD::enqueue_op(spg_t pg
, OpRequestRef
&& op
, epoch_t epoch
)
9758 const utime_t stamp
= op
->get_req()->get_recv_stamp();
9759 const utime_t latency
= ceph_clock_now() - stamp
;
9760 const unsigned priority
= op
->get_req()->get_priority();
9761 const int cost
= op
->get_req()->get_cost();
9762 const uint64_t owner
= op
->get_req()->get_source().num();
9763 const int type
= op
->get_req()->get_type();
9765 dout(15) << "enqueue_op " << op
<< " prio " << priority
9768 << " latency " << latency
9769 << " epoch " << epoch
9770 << " " << *(op
->get_req()) << dendl
;
9771 op
->osd_trace
.event("enqueue op");
9772 op
->osd_trace
.keyval("priority", priority
);
9773 op
->osd_trace
.keyval("cost", cost
);
9775 if (op
->osd_parent_span
) {
9776 auto enqueue_span
= jaeger_tracing::child_span(__func__
, op
->osd_parent_span
);
9778 {"priority", priority
},
9786 op
->mark_queued_for_pg();
9787 logger
->tinc(l_osd_op_before_queue_op_lat
, latency
);
9788 if (type
== MSG_OSD_PG_PUSH
||
9789 type
== MSG_OSD_PG_PUSH_REPLY
) {
9792 unique_ptr
<OpSchedulerItem::OpQueueable
>(new PGRecoveryMsg(pg
, std::move(op
))),
9793 cost
, priority
, stamp
, owner
, epoch
));
9797 unique_ptr
<OpSchedulerItem::OpQueueable
>(new PGOpItem(pg
, std::move(op
))),
9798 cost
, priority
, stamp
, owner
, epoch
));
9802 void OSD::enqueue_peering_evt(spg_t pgid
, PGPeeringEventRef evt
)
9804 dout(15) << __func__
<< " " << pgid
<< " " << evt
->get_desc() << dendl
;
9807 unique_ptr
<OpSchedulerItem::OpQueueable
>(new PGPeeringItem(pgid
, evt
)),
9809 cct
->_conf
->osd_peering_op_priority
,
9812 evt
->get_epoch_sent()));
9816 * NOTE: dequeue called in worker thread, with pg lock
9818 void OSD::dequeue_op(
9819 PGRef pg
, OpRequestRef op
,
9820 ThreadPool::TPHandle
&handle
)
9822 const Message
*m
= op
->get_req();
9825 OID_EVENT_TRACE_WITH_MSG(m
, "DEQUEUE_OP_BEGIN", false);
9827 utime_t now
= ceph_clock_now();
9828 op
->set_dequeued_time(now
);
9830 utime_t latency
= now
- m
->get_recv_stamp();
9831 dout(10) << "dequeue_op " << op
<< " prio " << m
->get_priority()
9832 << " cost " << m
->get_cost()
9833 << " latency " << latency
9835 << " pg " << *pg
<< dendl
;
9837 logger
->tinc(l_osd_op_before_dequeue_op_lat
, latency
);
9839 service
.maybe_share_map(m
->get_connection().get(),
9843 if (pg
->is_deleting())
9846 op
->mark_reached_pg();
9847 op
->osd_trace
.event("dequeue_op");
9849 pg
->do_request(op
, handle
);
9852 dout(10) << "dequeue_op " << op
<< " finish" << dendl
;
9853 OID_EVENT_TRACE_WITH_MSG(m
, "DEQUEUE_OP_END", false);
9857 void OSD::dequeue_peering_evt(
9860 PGPeeringEventRef evt
,
9861 ThreadPool::TPHandle
& handle
)
9863 PeeringCtx rctx
= create_context();
9864 auto curmap
= sdata
->get_osdmap();
9865 bool need_up_thru
= false;
9866 epoch_t same_interval_since
= 0;
9868 if (const MQuery
*q
= dynamic_cast<const MQuery
*>(evt
->evt
.get())) {
9869 handle_pg_query_nopg(*q
);
9871 derr
<< __func__
<< " unrecognized pg-less event " << evt
->get_desc() << dendl
;
9874 } else if (advance_pg(curmap
->get_epoch(), pg
, handle
, rctx
)) {
9875 pg
->do_peering_event(evt
, rctx
);
9876 if (pg
->is_deleted()) {
9880 dispatch_context(rctx
, pg
, curmap
, &handle
);
9881 need_up_thru
= pg
->get_need_up_thru();
9882 same_interval_since
= pg
->get_same_interval_since();
9887 queue_want_up_thru(same_interval_since
);
9890 service
.send_pg_temp();
9893 void OSD::dequeue_delete(
9897 ThreadPool::TPHandle
& handle
)
9899 dequeue_peering_evt(
9903 std::make_shared
<PGPeeringEvent
>(
9905 PeeringState::DeleteSome())),
9911 // --------------------------------
9913 const char** OSD::get_tracked_conf_keys() const
9915 static const char* KEYS
[] = {
9916 "osd_max_backfills",
9917 "osd_min_recovery_priority",
9918 "osd_max_trimming_pgs",
9919 "osd_op_complaint_time",
9920 "osd_op_log_threshold",
9921 "osd_op_history_size",
9922 "osd_op_history_duration",
9923 "osd_op_history_slow_op_size",
9924 "osd_op_history_slow_op_threshold",
9925 "osd_enable_op_tracker",
9926 "osd_map_cache_size",
9927 "osd_pg_epoch_max_lag_factor",
9928 "osd_pg_epoch_persisted_max_stale",
9929 "osd_recovery_sleep",
9930 "osd_recovery_sleep_hdd",
9931 "osd_recovery_sleep_ssd",
9932 "osd_recovery_sleep_hybrid",
9934 "osd_delete_sleep_hdd",
9935 "osd_delete_sleep_ssd",
9936 "osd_delete_sleep_hybrid",
9937 "osd_snap_trim_sleep",
9938 "osd_snap_trim_sleep_hdd",
9939 "osd_snap_trim_sleep_ssd",
9940 "osd_snap_trim_sleep_hybrid"
9942 "osd_recovery_max_active",
9943 "osd_recovery_max_active_hdd",
9944 "osd_recovery_max_active_ssd",
9945 // clog & admin clog
9948 "clog_to_syslog_facility",
9949 "clog_to_syslog_level",
9950 "osd_objectstore_fuse",
9952 "clog_to_graylog_host",
9953 "clog_to_graylog_port",
9956 "osd_recovery_delay_start",
9957 "osd_client_message_size_cap",
9958 "osd_client_message_cap",
9959 "osd_heartbeat_min_size",
9960 "osd_heartbeat_interval",
9961 "osd_object_clean_region_max_num_intervals",
9962 "osd_scrub_min_interval",
9963 "osd_scrub_max_interval",
9969 void OSD::handle_conf_change(const ConfigProxy
& conf
,
9970 const std::set
<std::string
> &changed
)
9972 std::lock_guard l
{osd_lock
};
9974 if (changed
.count("osd_max_backfills") ||
9975 changed
.count("osd_delete_sleep") ||
9976 changed
.count("osd_delete_sleep_hdd") ||
9977 changed
.count("osd_delete_sleep_ssd") ||
9978 changed
.count("osd_delete_sleep_hybrid") ||
9979 changed
.count("osd_snap_trim_sleep") ||
9980 changed
.count("osd_snap_trim_sleep_hdd") ||
9981 changed
.count("osd_snap_trim_sleep_ssd") ||
9982 changed
.count("osd_snap_trim_sleep_hybrid") ||
9983 changed
.count("osd_scrub_sleep") ||
9984 changed
.count("osd_recovery_sleep") ||
9985 changed
.count("osd_recovery_sleep_hdd") ||
9986 changed
.count("osd_recovery_sleep_ssd") ||
9987 changed
.count("osd_recovery_sleep_hybrid") ||
9988 changed
.count("osd_recovery_max_active") ||
9989 changed
.count("osd_recovery_max_active_hdd") ||
9990 changed
.count("osd_recovery_max_active_ssd")) {
9991 if (!maybe_override_options_for_qos() &&
9992 changed
.count("osd_max_backfills")) {
9993 // Scheduler is not "mclock". Fallback to earlier behavior
9994 service
.local_reserver
.set_max(cct
->_conf
->osd_max_backfills
);
9995 service
.remote_reserver
.set_max(cct
->_conf
->osd_max_backfills
);
9998 if (changed
.count("osd_min_recovery_priority")) {
9999 service
.local_reserver
.set_min_priority(cct
->_conf
->osd_min_recovery_priority
);
10000 service
.remote_reserver
.set_min_priority(cct
->_conf
->osd_min_recovery_priority
);
10002 if (changed
.count("osd_max_trimming_pgs")) {
10003 service
.snap_reserver
.set_max(cct
->_conf
->osd_max_trimming_pgs
);
10005 if (changed
.count("osd_op_complaint_time") ||
10006 changed
.count("osd_op_log_threshold")) {
10007 op_tracker
.set_complaint_and_threshold(cct
->_conf
->osd_op_complaint_time
,
10008 cct
->_conf
->osd_op_log_threshold
);
10010 if (changed
.count("osd_op_history_size") ||
10011 changed
.count("osd_op_history_duration")) {
10012 op_tracker
.set_history_size_and_duration(cct
->_conf
->osd_op_history_size
,
10013 cct
->_conf
->osd_op_history_duration
);
10015 if (changed
.count("osd_op_history_slow_op_size") ||
10016 changed
.count("osd_op_history_slow_op_threshold")) {
10017 op_tracker
.set_history_slow_op_size_and_threshold(cct
->_conf
->osd_op_history_slow_op_size
,
10018 cct
->_conf
->osd_op_history_slow_op_threshold
);
10020 if (changed
.count("osd_enable_op_tracker")) {
10021 op_tracker
.set_tracking(cct
->_conf
->osd_enable_op_tracker
);
10023 if (changed
.count("osd_map_cache_size")) {
10024 service
.map_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
10025 service
.map_bl_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
10026 service
.map_bl_inc_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
10028 if (changed
.count("clog_to_monitors") ||
10029 changed
.count("clog_to_syslog") ||
10030 changed
.count("clog_to_syslog_level") ||
10031 changed
.count("clog_to_syslog_facility") ||
10032 changed
.count("clog_to_graylog") ||
10033 changed
.count("clog_to_graylog_host") ||
10034 changed
.count("clog_to_graylog_port") ||
10035 changed
.count("host") ||
10036 changed
.count("fsid")) {
10037 update_log_config();
10039 if (changed
.count("osd_pg_epoch_max_lag_factor")) {
10040 m_osd_pg_epoch_max_lag_factor
= conf
.get_val
<double>(
10041 "osd_pg_epoch_max_lag_factor");
10044 #ifdef HAVE_LIBFUSE
10045 if (changed
.count("osd_objectstore_fuse")) {
10047 enable_disable_fuse(false);
10052 if (changed
.count("osd_recovery_delay_start")) {
10053 service
.defer_recovery(cct
->_conf
->osd_recovery_delay_start
);
10054 service
.kick_recovery_queue();
10057 if (changed
.count("osd_client_message_cap")) {
10058 uint64_t newval
= cct
->_conf
->osd_client_message_cap
;
10059 Messenger::Policy pol
= client_messenger
->get_policy(entity_name_t::TYPE_CLIENT
);
10060 if (pol
.throttler_messages
&& newval
> 0) {
10061 pol
.throttler_messages
->reset_max(newval
);
10064 if (changed
.count("osd_client_message_size_cap")) {
10065 uint64_t newval
= cct
->_conf
->osd_client_message_size_cap
;
10066 Messenger::Policy pol
= client_messenger
->get_policy(entity_name_t::TYPE_CLIENT
);
10067 if (pol
.throttler_bytes
&& newval
> 0) {
10068 pol
.throttler_bytes
->reset_max(newval
);
10071 if (changed
.count("osd_object_clean_region_max_num_intervals")) {
10072 ObjectCleanRegions::set_max_num_intervals(cct
->_conf
->osd_object_clean_region_max_num_intervals
);
10075 if (changed
.count("osd_scrub_min_interval") ||
10076 changed
.count("osd_scrub_max_interval")) {
10077 resched_all_scrubs();
10078 dout(0) << __func__
<< ": scrub interval change" << dendl
;
10081 if (changed
.count("osd_asio_thread_count")) {
10082 service
.poolctx
.stop();
10083 service
.poolctx
.start(conf
.get_val
<std::uint64_t>("osd_asio_thread_count"));
10087 bool OSD::maybe_override_options_for_qos()
10089 // If the scheduler enabled is mclock, override the recovery, backfill
10090 // and sleep options so that mclock can meet the QoS goals.
10091 if (cct
->_conf
.get_val
<std::string
>("osd_op_queue") == "mclock_scheduler") {
10092 dout(1) << __func__
10093 << ": Changing recovery/backfill/sleep settings for QoS" << dendl
;
10095 // Set high value for recovery max active
10096 uint32_t rec_max_active
= 1000;
10097 cct
->_conf
.set_val(
10098 "osd_recovery_max_active", std::to_string(rec_max_active
));
10099 cct
->_conf
.set_val(
10100 "osd_recovery_max_active_hdd", std::to_string(rec_max_active
));
10101 cct
->_conf
.set_val(
10102 "osd_recovery_max_active_ssd", std::to_string(rec_max_active
));
10104 // Set high value for osd_max_backfill
10105 uint32_t max_backfills
= 1000;
10106 cct
->_conf
.set_val("osd_max_backfills", std::to_string(max_backfills
));
10107 service
.local_reserver
.set_max(max_backfills
);
10108 service
.remote_reserver
.set_max(max_backfills
);
10110 // Disable recovery sleep
10111 cct
->_conf
.set_val("osd_recovery_sleep", std::to_string(0));
10112 cct
->_conf
.set_val("osd_recovery_sleep_hdd", std::to_string(0));
10113 cct
->_conf
.set_val("osd_recovery_sleep_ssd", std::to_string(0));
10114 cct
->_conf
.set_val("osd_recovery_sleep_hybrid", std::to_string(0));
10116 // Disable delete sleep
10117 cct
->_conf
.set_val("osd_delete_sleep", std::to_string(0));
10118 cct
->_conf
.set_val("osd_delete_sleep_hdd", std::to_string(0));
10119 cct
->_conf
.set_val("osd_delete_sleep_ssd", std::to_string(0));
10120 cct
->_conf
.set_val("osd_delete_sleep_hybrid", std::to_string(0));
10122 // Disable snap trim sleep
10123 cct
->_conf
.set_val("osd_snap_trim_sleep", std::to_string(0));
10124 cct
->_conf
.set_val("osd_snap_trim_sleep_hdd", std::to_string(0));
10125 cct
->_conf
.set_val("osd_snap_trim_sleep_ssd", std::to_string(0));
10126 cct
->_conf
.set_val("osd_snap_trim_sleep_hybrid", std::to_string(0));
10128 // Disable scrub sleep
10129 cct
->_conf
.set_val("osd_scrub_sleep", std::to_string(0));
10135 void OSD::update_log_config()
10137 map
<string
,string
> log_to_monitors
;
10138 map
<string
,string
> log_to_syslog
;
10139 map
<string
,string
> log_channel
;
10140 map
<string
,string
> log_prio
;
10141 map
<string
,string
> log_to_graylog
;
10142 map
<string
,string
> log_to_graylog_host
;
10143 map
<string
,string
> log_to_graylog_port
;
10147 if (parse_log_client_options(cct
, log_to_monitors
, log_to_syslog
,
10148 log_channel
, log_prio
, log_to_graylog
,
10149 log_to_graylog_host
, log_to_graylog_port
,
10151 clog
->update_config(log_to_monitors
, log_to_syslog
,
10152 log_channel
, log_prio
, log_to_graylog
,
10153 log_to_graylog_host
, log_to_graylog_port
,
10155 derr
<< "log_to_monitors " << log_to_monitors
<< dendl
;
10158 void OSD::check_config()
10160 // some sanity checks
10161 if (cct
->_conf
->osd_map_cache_size
<= (int)cct
->_conf
->osd_pg_epoch_persisted_max_stale
+ 2) {
10162 clog
->warn() << "osd_map_cache_size (" << cct
->_conf
->osd_map_cache_size
<< ")"
10163 << " is not > osd_pg_epoch_persisted_max_stale ("
10164 << cct
->_conf
->osd_pg_epoch_persisted_max_stale
<< ")";
10166 if (cct
->_conf
->osd_object_clean_region_max_num_intervals
< 0) {
10167 clog
->warn() << "osd_object_clean_region_max_num_intervals ("
10168 << cct
->_conf
->osd_object_clean_region_max_num_intervals
10173 // --------------------------------
10175 void OSD::get_latest_osdmap()
10177 dout(10) << __func__
<< " -- start" << dendl
;
10179 boost::system::error_code ec
;
10180 service
.objecter
->wait_for_latest_osdmap(ceph::async::use_blocked
[ec
]);
10182 dout(10) << __func__
<< " -- finish" << dendl
;
10185 // --------------------------------
10187 void OSD::set_perf_queries(const ConfigPayload
&config_payload
) {
10188 const OSDConfigPayload
&osd_config_payload
= boost::get
<OSDConfigPayload
>(config_payload
);
10189 const std::map
<OSDPerfMetricQuery
, OSDPerfMetricLimits
> &queries
= osd_config_payload
.config
;
10190 dout(10) << "setting " << queries
.size() << " queries" << dendl
;
10192 std::list
<OSDPerfMetricQuery
> supported_queries
;
10193 for (auto &it
: queries
) {
10194 auto &query
= it
.first
;
10195 if (!query
.key_descriptor
.empty()) {
10196 supported_queries
.push_back(query
);
10199 if (supported_queries
.size() < queries
.size()) {
10200 dout(1) << queries
.size() - supported_queries
.size()
10201 << " unsupported queries" << dendl
;
10204 std::lock_guard locker
{m_perf_queries_lock
};
10205 m_perf_queries
= supported_queries
;
10206 m_perf_limits
= queries
;
10208 std::vector
<PGRef
> pgs
;
10210 for (auto& pg
: pgs
) {
10211 std::scoped_lock l
{*pg
};
10212 pg
->set_dynamic_perf_stats_queries(supported_queries
);
10216 MetricPayload
OSD::get_perf_reports() {
10217 OSDMetricPayload payload
;
10218 std::map
<OSDPerfMetricQuery
, OSDPerfMetricReport
> &reports
= payload
.report
;
10220 std::vector
<PGRef
> pgs
;
10222 DynamicPerfStats dps
;
10223 for (auto& pg
: pgs
) {
10224 // m_perf_queries can be modified only in set_perf_queries by mgr client
10225 // request, and it is protected by by mgr client's lock, which is held
10226 // when set_perf_queries/get_perf_reports are called, so we may not hold
10227 // m_perf_queries_lock here.
10228 DynamicPerfStats
pg_dps(m_perf_queries
);
10230 pg
->get_dynamic_perf_stats(&pg_dps
);
10234 dps
.add_to_reports(m_perf_limits
, &reports
);
10235 dout(20) << "reports for " << reports
.size() << " queries" << dendl
;
10240 // =============================================================
10242 #undef dout_context
10243 #define dout_context cct
10245 #define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
10247 void OSDShard::_attach_pg(OSDShardPGSlot
*slot
, PG
*pg
)
10249 dout(10) << pg
->pg_id
<< " " << pg
<< dendl
;
10251 pg
->osd_shard
= this;
10252 pg
->pg_slot
= slot
;
10253 osd
->inc_num_pgs();
10255 slot
->epoch
= pg
->get_osdmap_epoch();
10256 pg_slots_by_epoch
.insert(*slot
);
10259 void OSDShard::_detach_pg(OSDShardPGSlot
*slot
)
10261 dout(10) << slot
->pg
->pg_id
<< " " << slot
->pg
<< dendl
;
10262 slot
->pg
->osd_shard
= nullptr;
10263 slot
->pg
->pg_slot
= nullptr;
10264 slot
->pg
= nullptr;
10265 osd
->dec_num_pgs();
10267 pg_slots_by_epoch
.erase(pg_slots_by_epoch
.iterator_to(*slot
));
10269 if (waiting_for_min_pg_epoch
) {
10270 min_pg_epoch_cond
.notify_all();
10274 void OSDShard::update_pg_epoch(OSDShardPGSlot
*slot
, epoch_t e
)
10276 std::lock_guard
l(shard_lock
);
10277 dout(30) << "min was " << pg_slots_by_epoch
.begin()->epoch
10278 << " on " << pg_slots_by_epoch
.begin()->pg
->pg_id
<< dendl
;
10279 pg_slots_by_epoch
.erase(pg_slots_by_epoch
.iterator_to(*slot
));
10280 dout(20) << slot
->pg
->pg_id
<< " " << slot
->epoch
<< " -> " << e
<< dendl
;
10282 pg_slots_by_epoch
.insert(*slot
);
10283 dout(30) << "min is now " << pg_slots_by_epoch
.begin()->epoch
10284 << " on " << pg_slots_by_epoch
.begin()->pg
->pg_id
<< dendl
;
10285 if (waiting_for_min_pg_epoch
) {
10286 min_pg_epoch_cond
.notify_all();
10290 epoch_t
OSDShard::get_min_pg_epoch()
10292 std::lock_guard
l(shard_lock
);
10293 auto p
= pg_slots_by_epoch
.begin();
10294 if (p
== pg_slots_by_epoch
.end()) {
10300 void OSDShard::wait_min_pg_epoch(epoch_t need
)
10302 std::unique_lock l
{shard_lock
};
10303 ++waiting_for_min_pg_epoch
;
10304 min_pg_epoch_cond
.wait(l
, [need
, this] {
10305 if (pg_slots_by_epoch
.empty()) {
10307 } else if (pg_slots_by_epoch
.begin()->epoch
>= need
) {
10310 dout(10) << need
<< " waiting on "
10311 << pg_slots_by_epoch
.begin()->epoch
<< dendl
;
10315 --waiting_for_min_pg_epoch
;
10318 epoch_t
OSDShard::get_max_waiting_epoch()
10320 std::lock_guard
l(shard_lock
);
10322 for (auto& i
: pg_slots
) {
10323 if (!i
.second
->waiting_peering
.empty()) {
10324 r
= std::max(r
, i
.second
->waiting_peering
.rbegin()->first
);
10330 void OSDShard::consume_map(
10331 const OSDMapRef
& new_osdmap
,
10332 unsigned *pushes_to_free
)
10334 std::lock_guard
l(shard_lock
);
10335 OSDMapRef old_osdmap
;
10337 std::lock_guard
l(osdmap_lock
);
10338 old_osdmap
= std::move(shard_osdmap
);
10339 shard_osdmap
= new_osdmap
;
10341 dout(10) << new_osdmap
->get_epoch()
10342 << " (was " << (old_osdmap
? old_osdmap
->get_epoch() : 0) << ")"
10344 bool queued
= false;
10347 auto p
= pg_slots
.begin();
10348 while (p
!= pg_slots
.end()) {
10349 OSDShardPGSlot
*slot
= p
->second
.get();
10350 const spg_t
& pgid
= p
->first
;
10351 dout(20) << __func__
<< " " << pgid
<< dendl
;
10352 if (!slot
->waiting_for_split
.empty()) {
10353 dout(20) << __func__
<< " " << pgid
10354 << " waiting for split " << slot
->waiting_for_split
<< dendl
;
10358 if (slot
->waiting_for_merge_epoch
> new_osdmap
->get_epoch()) {
10359 dout(20) << __func__
<< " " << pgid
10360 << " waiting for merge by epoch " << slot
->waiting_for_merge_epoch
10365 if (!slot
->waiting_peering
.empty()) {
10366 epoch_t first
= slot
->waiting_peering
.begin()->first
;
10367 if (first
<= new_osdmap
->get_epoch()) {
10368 dout(20) << __func__
<< " " << pgid
10369 << " pending_peering first epoch " << first
10370 << " <= " << new_osdmap
->get_epoch() << ", requeueing" << dendl
;
10371 _wake_pg_slot(pgid
, slot
);
10377 if (!slot
->waiting
.empty()) {
10378 if (new_osdmap
->is_up_acting_osd_shard(pgid
, osd
->get_nodeid())) {
10379 dout(20) << __func__
<< " " << pgid
<< " maps to us, keeping"
10384 while (!slot
->waiting
.empty() &&
10385 slot
->waiting
.front().get_map_epoch() <= new_osdmap
->get_epoch()) {
10386 auto& qi
= slot
->waiting
.front();
10387 dout(20) << __func__
<< " " << pgid
10388 << " waiting item " << qi
10389 << " epoch " << qi
.get_map_epoch()
10390 << " <= " << new_osdmap
->get_epoch()
10392 << (qi
.get_map_epoch() < new_osdmap
->get_epoch() ? "stale" :
10394 << ", dropping" << dendl
;
10395 *pushes_to_free
+= qi
.get_reserved_pushes();
10396 slot
->waiting
.pop_front();
10399 if (slot
->waiting
.empty() &&
10400 slot
->num_running
== 0 &&
10401 slot
->waiting_for_split
.empty() &&
10403 dout(20) << __func__
<< " " << pgid
<< " empty, pruning" << dendl
;
10404 p
= pg_slots
.erase(p
);
10411 std::lock_guard l
{sdata_wait_lock
};
10412 sdata_cond
.notify_one();
10416 void OSDShard::_wake_pg_slot(
10418 OSDShardPGSlot
*slot
)
10420 dout(20) << __func__
<< " " << pgid
10421 << " to_process " << slot
->to_process
10422 << " waiting " << slot
->waiting
10423 << " waiting_peering " << slot
->waiting_peering
<< dendl
;
10424 for (auto i
= slot
->to_process
.rbegin();
10425 i
!= slot
->to_process
.rend();
10427 scheduler
->enqueue_front(std::move(*i
));
10429 slot
->to_process
.clear();
10430 for (auto i
= slot
->waiting
.rbegin();
10431 i
!= slot
->waiting
.rend();
10433 scheduler
->enqueue_front(std::move(*i
));
10435 slot
->waiting
.clear();
10436 for (auto i
= slot
->waiting_peering
.rbegin();
10437 i
!= slot
->waiting_peering
.rend();
10439 // this is overkill; we requeue everything, even if some of these
10440 // items are waiting for maps we don't have yet. FIXME, maybe,
10441 // someday, if we decide this inefficiency matters
10442 for (auto j
= i
->second
.rbegin(); j
!= i
->second
.rend(); ++j
) {
10443 scheduler
->enqueue_front(std::move(*j
));
10446 slot
->waiting_peering
.clear();
10447 ++slot
->requeue_seq
;
10450 void OSDShard::identify_splits_and_merges(
10451 const OSDMapRef
& as_of_osdmap
,
10452 set
<pair
<spg_t
,epoch_t
>> *split_pgs
,
10453 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
10455 std::lock_guard
l(shard_lock
);
10456 if (shard_osdmap
) {
10457 for (auto& i
: pg_slots
) {
10458 const spg_t
& pgid
= i
.first
;
10459 auto *slot
= i
.second
.get();
10461 osd
->service
.identify_splits_and_merges(
10462 shard_osdmap
, as_of_osdmap
, pgid
,
10463 split_pgs
, merge_pgs
);
10464 } else if (!slot
->waiting_for_split
.empty()) {
10465 osd
->service
.identify_splits_and_merges(
10466 shard_osdmap
, as_of_osdmap
, pgid
,
10467 split_pgs
, nullptr);
10469 dout(20) << __func__
<< " slot " << pgid
10470 << " has no pg and waiting_for_split " << dendl
;
10476 void OSDShard::prime_splits(const OSDMapRef
& as_of_osdmap
,
10477 set
<pair
<spg_t
,epoch_t
>> *pgids
)
10479 std::lock_guard
l(shard_lock
);
10480 _prime_splits(pgids
);
10481 if (shard_osdmap
->get_epoch() > as_of_osdmap
->get_epoch()) {
10482 set
<pair
<spg_t
,epoch_t
>> newer_children
;
10483 for (auto i
: *pgids
) {
10484 osd
->service
.identify_splits_and_merges(
10485 as_of_osdmap
, shard_osdmap
, i
.first
,
10486 &newer_children
, nullptr);
10488 newer_children
.insert(pgids
->begin(), pgids
->end());
10489 dout(10) << "as_of_osdmap " << as_of_osdmap
->get_epoch() << " < shard "
10490 << shard_osdmap
->get_epoch() << ", new children " << newer_children
10492 _prime_splits(&newer_children
);
10493 // note: we don't care what is left over here for other shards.
10494 // if this shard is ahead of us and one isn't, e.g., one thread is
10495 // calling into prime_splits via _process (due to a newly created
10496 // pg) and this shard has a newer map due to a racing consume_map,
10497 // then any grandchildren left here will be identified (or were
10498 // identified) when the slower shard's osdmap is advanced.
10499 // _prime_splits() will tolerate the case where the pgid is
10504 void OSDShard::_prime_splits(set
<pair
<spg_t
,epoch_t
>> *pgids
)
10506 dout(10) << *pgids
<< dendl
;
10507 auto p
= pgids
->begin();
10508 while (p
!= pgids
->end()) {
10509 unsigned shard_index
= p
->first
.hash_to_shard(osd
->num_shards
);
10510 if (shard_index
== shard_id
) {
10511 auto r
= pg_slots
.emplace(p
->first
, nullptr);
10513 dout(10) << "priming slot " << p
->first
<< " e" << p
->second
<< dendl
;
10514 r
.first
->second
= make_unique
<OSDShardPGSlot
>();
10515 r
.first
->second
->waiting_for_split
.insert(p
->second
);
10518 ceph_assert(q
!= pg_slots
.end());
10519 dout(10) << "priming (existing) slot " << p
->first
<< " e" << p
->second
10521 q
->second
->waiting_for_split
.insert(p
->second
);
10523 p
= pgids
->erase(p
);
10530 void OSDShard::prime_merges(const OSDMapRef
& as_of_osdmap
,
10531 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
10533 std::lock_guard
l(shard_lock
);
10534 dout(20) << __func__
<< " checking shard " << shard_id
10535 << " for remaining merge pgs " << merge_pgs
<< dendl
;
10536 auto p
= merge_pgs
->begin();
10537 while (p
!= merge_pgs
->end()) {
10538 spg_t pgid
= p
->first
;
10539 epoch_t epoch
= p
->second
;
10540 unsigned shard_index
= pgid
.hash_to_shard(osd
->num_shards
);
10541 if (shard_index
!= shard_id
) {
10545 OSDShardPGSlot
*slot
;
10546 auto r
= pg_slots
.emplace(pgid
, nullptr);
10548 r
.first
->second
= make_unique
<OSDShardPGSlot
>();
10550 slot
= r
.first
->second
.get();
10553 dout(20) << __func__
<< " have merge participant pg " << pgid
10554 << " " << slot
->pg
<< dendl
;
10555 } else if (!slot
->waiting_for_split
.empty() &&
10556 *slot
->waiting_for_split
.begin() < epoch
) {
10557 dout(20) << __func__
<< " pending split on merge participant pg " << pgid
10558 << " " << slot
->waiting_for_split
<< dendl
;
10560 dout(20) << __func__
<< " creating empty merge participant " << pgid
10561 << " for merge in " << epoch
<< dendl
;
10562 // leave history zeroed; PG::merge_from() will fill it in.
10563 pg_history_t history
;
10564 PGCreateInfo
cinfo(pgid
, epoch
- 1,
10565 history
, PastIntervals(), false);
10566 PGRef pg
= osd
->handle_pg_create_info(shard_osdmap
, &cinfo
);
10567 _attach_pg(r
.first
->second
.get(), pg
.get());
10568 _wake_pg_slot(pgid
, slot
);
10571 // mark slot for merge
10572 dout(20) << __func__
<< " marking merge participant " << pgid
<< dendl
;
10573 slot
->waiting_for_merge_epoch
= epoch
;
10574 p
= merge_pgs
->erase(p
);
10578 void OSDShard::register_and_wake_split_child(PG
*pg
)
10582 std::lock_guard
l(shard_lock
);
10583 dout(10) << pg
->pg_id
<< " " << pg
<< dendl
;
10584 auto p
= pg_slots
.find(pg
->pg_id
);
10585 ceph_assert(p
!= pg_slots
.end());
10586 auto *slot
= p
->second
.get();
10587 dout(20) << pg
->pg_id
<< " waiting_for_split " << slot
->waiting_for_split
10589 ceph_assert(!slot
->pg
);
10590 ceph_assert(!slot
->waiting_for_split
.empty());
10591 _attach_pg(slot
, pg
);
10593 epoch
= pg
->get_osdmap_epoch();
10594 ceph_assert(slot
->waiting_for_split
.count(epoch
));
10595 slot
->waiting_for_split
.erase(epoch
);
10596 if (slot
->waiting_for_split
.empty()) {
10597 _wake_pg_slot(pg
->pg_id
, slot
);
10599 dout(10) << __func__
<< " still waiting for split on "
10600 << slot
->waiting_for_split
<< dendl
;
10604 // kick child to ensure it pulls up to the latest osdmap
10605 osd
->enqueue_peering_evt(
10608 std::make_shared
<PGPeeringEvent
>(
10613 std::lock_guard l
{sdata_wait_lock
};
10614 sdata_cond
.notify_one();
10617 void OSDShard::unprime_split_children(spg_t parent
, unsigned old_pg_num
)
10619 std::lock_guard
l(shard_lock
);
10620 vector
<spg_t
> to_delete
;
10621 for (auto& i
: pg_slots
) {
10622 if (i
.first
!= parent
&&
10623 i
.first
.get_ancestor(old_pg_num
) == parent
) {
10624 dout(10) << __func__
<< " parent " << parent
<< " clearing " << i
.first
10626 _wake_pg_slot(i
.first
, i
.second
.get());
10627 to_delete
.push_back(i
.first
);
10630 for (auto pgid
: to_delete
) {
10631 pg_slots
.erase(pgid
);
// OSDShard constructor (member-initializer list).  Derives the per-shard
// lock/condvar names from the shard id, creates the shard's op scheduler
// (sized by shard count and backing-store rotational-ness), and wires the
// context queue to the shard's wait lock/condvar.
//
// NOTE(review): the parameter list and the first initializers fall outside
// this excerpt (lines dropped by extraction); the surviving tokens are
// preserved byte-for-byte below.
OSDShard::OSDShard(
  // [excerpt gap: parameters and leading member initializers]
  shard_name(string("OSDShard.") + stringify(id)),
  sdata_wait_lock_name(shard_name + "::sdata_wait_lock"),
  sdata_wait_lock{make_mutex(sdata_wait_lock_name)},
  osdmap_lock{make_mutex(shard_name + "::osdmap_lock")},
  shard_lock_name(shard_name + "::shard_lock"),
  shard_lock{make_mutex(shard_lock_name)},
  // scheduler choice depends on shard count and whether the store sits on
  // rotational media
  scheduler(ceph::osd::scheduler::make_scheduler(
    cct, osd->num_shards, osd->store->is_rotational())),
  context_queue(sdata_wait_lock, sdata_cond)
  // [excerpt gap: constructor body opening brace]
  // level 0 so the chosen op scheduler is always visible in the OSD log
  dout(0) << "using op scheduler " << *scheduler << dendl;
10656 // =============================================================
10658 #undef dout_context
10659 #define dout_context osd->cct
10661 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10663 void OSD::ShardedOpWQ::_add_slot_waiter(
10665 OSDShardPGSlot
*slot
,
10666 OpSchedulerItem
&& qi
)
10668 if (qi
.is_peering()) {
10669 dout(20) << __func__
<< " " << pgid
10670 << " peering, item epoch is "
10671 << qi
.get_map_epoch()
10672 << ", will wait on " << qi
<< dendl
;
10673 slot
->waiting_peering
[qi
.get_map_epoch()].push_back(std::move(qi
));
10675 dout(20) << __func__
<< " " << pgid
10676 << " item epoch is "
10677 << qi
.get_map_epoch()
10678 << ", will wait on " << qi
<< dendl
;
10679 slot
->waiting
.push_back(std::move(qi
));
10684 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
// Worker-thread entry point for the sharded op queue: block until this
// thread's shard has work (the lowest-indexed thread additionally services
// the shard's context_queue of oncommit callbacks), dequeue one
// OpSchedulerItem, attach it to the PG slot keyed by its ordering token,
// and run it.
//
// NOTE(review): this excerpt is garbled -- a number of physical lines
// (closing braces, return statements, some declarations) were dropped by
// extraction.  The surviving tokens are preserved byte-for-byte below;
// "[excerpt gap]" comments mark where lines are missing.  Do not treat
// this block as compilable as-is.
void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
// [excerpt gap: opening brace]
  uint32_t shard_index = thread_index % osd->num_shards;
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);

  // If all threads of shards do oncommits, there is a out-of-order
  // problem. So we choose the thread which has the smallest
  // thread_index(thread_index < num_shards) of shard to do oncommit
  bool is_smallest_thread_index = thread_index < osd->num_shards;

  // wait for work on the scheduler (or the context queue, if we are the
  // designated oncommit thread)
  sdata->shard_lock.lock();
  if (sdata->scheduler->empty() &&
      (!is_smallest_thread_index || sdata->context_queue.empty())) {
    std::unique_lock wait_lock{sdata->sdata_wait_lock};
    if (is_smallest_thread_index && !sdata->context_queue.empty()) {
      // we raced with a context_queue addition, don't wait
      wait_lock.unlock();
    } else if (!sdata->stop_waiting) {
      dout(20) << __func__ << " empty q, waiting" << dendl;
      // stop the heartbeat timeout while parked on the condvar
      osd->cct->get_heartbeat_map()->clear_timeout(hb);
      sdata->shard_lock.unlock();
      sdata->sdata_cond.wait(wait_lock);
      wait_lock.unlock();
      sdata->shard_lock.lock();
      if (sdata->scheduler->empty() &&
          !(is_smallest_thread_index && !sdata->context_queue.empty())) {
        sdata->shard_lock.unlock();
        // [excerpt gap: early return + closing brace]
      // found a work item; reapply default wq timeouts
      osd->cct->get_heartbeat_map()->reset_timeout(hb,
        timeout_interval, suicide_interval);
      // [excerpt gap: else branch header (stop_waiting case)]
      dout(20) << __func__ << " need return immediately" << dendl;
      wait_lock.unlock();
      sdata->shard_lock.unlock();
      // [excerpt gap: early return + closing braces]

  // gather oncommit contexts if we are the designated thread
  list<Context *> oncommits;
  if (is_smallest_thread_index) {
    sdata->context_queue.move_to(oncommits);
  // [excerpt gap: closing brace]

  // dequeue until we obtain an actual OpSchedulerItem (the scheduler may
  // instead return a double meaning "retry at this future time")
  WorkItem work_item;
  while (!std::get_if<OpSchedulerItem>(&work_item)) {
    if (sdata->scheduler->empty()) {
      if (osd->is_stopping()) {
        sdata->shard_lock.unlock();
        for (auto c : oncommits) {
          dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
          // [excerpt gap: context disposal + closing brace]
        return; // OSD shutdown, discard.
      // [excerpt gap: closing brace]
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      // [excerpt gap: return + closing brace]
    work_item = sdata->scheduler->dequeue();
    if (osd->is_stopping()) {
      sdata->shard_lock.unlock();
      for (auto c : oncommits) {
        dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
        // [excerpt gap: context disposal + closing brace]
      return; // OSD shutdown, discard.
    // [excerpt gap: closing braces]

    // If the work item is scheduled in the future, wait until
    // the time returned in the dequeue response before retrying.
    if (auto when_ready = std::get_if<double>(&work_item)) {
      if (is_smallest_thread_index) {
        sdata->shard_lock.unlock();
        handle_oncommits(oncommits);
        // [excerpt gap: return + closing brace]
      std::unique_lock wait_lock{sdata->sdata_wait_lock};
      auto future_time = ceph::real_clock::from_double(*when_ready);
      dout(10) << __func__ << " dequeue future request at " << future_time << dendl;
      sdata->shard_lock.unlock();
      ++sdata->waiting_threads;
      sdata->sdata_cond.wait_until(wait_lock, future_time);
      --sdata->waiting_threads;
      wait_lock.unlock();
      sdata->shard_lock.lock();
      // [excerpt gap: closing braces]

  // Access the stored item
  auto item = std::move(std::get<OpSchedulerItem>(work_item));
  if (osd->is_stopping()) {
    sdata->shard_lock.unlock();
    for (auto c : oncommits) {
      dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
      // [excerpt gap: context disposal + closing brace]
    return; // OSD shutdown, discard.
  // [excerpt gap: closing braces]

  // find or create the pg_slot keyed by this item's ordering token
  const auto token = item.get_ordering_token();
  auto r = sdata->pg_slots.emplace(token, nullptr);
  // [excerpt gap: new-slot check]
  r.first->second = make_unique<OSDShardPGSlot>();
  // [excerpt gap: closing brace]
  OSDShardPGSlot *slot = r.first->second.get();
  dout(20) << __func__ << " " << token
           << (r.second ? " (new)" : "")
           << " to_process " << slot->to_process
           << " waiting " << slot->waiting
           << " waiting_peering " << slot->waiting_peering
  // [excerpt gap: stream terminator]
  slot->to_process.push_back(std::move(item));
  dout(20) << __func__ << " " << slot->to_process.back()
           << " queued" << dendl;
  // [excerpt gap]

  PGRef pg = slot->pg;

  // lock pg (if we have it)
  // [excerpt gap]
  // note the requeue seq now...
  uint64_t requeue_seq = slot->requeue_seq;
  ++slot->num_running;

  sdata->shard_lock.unlock();
  osd->service.maybe_inject_dispatch_delay();
  // [excerpt gap: presumably the pg lock acquisition -- confirm upstream]
  osd->service.maybe_inject_dispatch_delay();
  sdata->shard_lock.lock();

  // re-find the slot: it may have been removed or shuffled while we were
  // unlocked
  auto q = sdata->pg_slots.find(token);
  if (q == sdata->pg_slots.end()) {
    // this can happen if we race with pg removal.
    dout(20) << __func__ << " slot " << token << " no longer there" << dendl;
    // [excerpt gap]
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
    // [excerpt gap: return + closing brace]
  slot = q->second.get();
  --slot->num_running;

  if (slot->to_process.empty()) {
    // raced with _wake_pg_slot or consume_map
    dout(20) << __func__ << " " << token
             << " nothing queued" << dendl;
    // [excerpt gap]
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
    // [excerpt gap: return + closing brace]
  if (requeue_seq != slot->requeue_seq) {
    dout(20) << __func__ << " " << token
             << " requeue_seq " << slot->requeue_seq << " > our "
             << requeue_seq << ", we raced with _wake_pg_slot"
    // [excerpt gap: rest of message + cleanup]
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
    // [excerpt gap: return + closing brace]
  if (slot->pg != pg) {
    // this can happen if we race with pg removal.
    dout(20) << __func__ << " slot " << token << " no longer attached to "
    // [excerpt gap: rest of this branch]
  dout(20) << __func__ << " " << token
           << " to_process " << slot->to_process
           << " waiting " << slot->waiting
           << " waiting_peering " << slot->waiting_peering << dendl;

  ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
  // [excerpt gap: remaining TPHandle constructor argument(s)]

  // take the next queued item for this slot
  auto qi = std::move(slot->to_process.front());
  slot->to_process.pop_front();
  dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
  set<pair<spg_t,epoch_t>> new_children;
  // [excerpt gap: pg-less handling preamble, incl. osdmap declaration]

  // should this pg shard exist on this osd in this (or a later) epoch?
  osdmap = sdata->shard_osdmap;
  const PGCreateInfo *create_info = qi.creates_pg();
  if (!slot->waiting_for_split.empty()) {
    dout(20) << __func__ << " " << token
             << " splitting " << slot->waiting_for_split << dendl;
    _add_slot_waiter(token, slot, std::move(qi));
  } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
    dout(20) << __func__ << " " << token
             << " map " << qi.get_map_epoch() << " > "
             << osdmap->get_epoch() << dendl;
    _add_slot_waiter(token, slot, std::move(qi));
  } else if (qi.is_peering()) {
    if (!qi.peering_requires_pg()) {
      // for pg-less events, we run them under the ordering lock, since
      // we don't have the pg lock to keep them ordered.
      qi.run(osd, sdata, pg, tp_handle);
    } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
      // [excerpt gap: create_info null-check]
      if (create_info->by_mon &&
          osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
        dout(20) << __func__ << " " << token
                 << " no pg, no longer primary, ignoring mon create on "
        // [excerpt gap: rest of message + else header]
        dout(20) << __func__ << " " << token
                 << " no pg, should create on " << qi << dendl;
        pg = osd->handle_pg_create_info(osdmap, create_info);
        // [excerpt gap: success check]
        // we created the pg! drop out and continue "normally"!
        sdata->_attach_pg(slot, pg.get());
        sdata->_wake_pg_slot(token, slot);

        // identify split children between create epoch and shard epoch.
        osd->service.identify_splits_and_merges(
          pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
        sdata->_prime_splits(&new_children);
        // distribute remaining split children to other shards below!
        // [excerpt gap: else branch header]
        dout(20) << __func__ << " ignored create on " << qi << dendl;
      // [excerpt gap: closing braces + else header]
      dout(20) << __func__ << " " << token
               << " no pg, peering, !create, discarding " << qi << dendl;
      // [excerpt gap: closing braces + else header]
      dout(20) << __func__ << " " << token
               << " no pg, peering, doesn't map here e" << osdmap->get_epoch()
               << ", discarding " << qi
      // [excerpt gap: stream terminator + closing brace]
  } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
    dout(20) << __func__ << " " << token
             << " no pg, should exist e" << osdmap->get_epoch()
             << ", will wait on " << qi << dendl;
    _add_slot_waiter(token, slot, std::move(qi));
  // [excerpt gap: else header]
    dout(20) << __func__ << " " << token
             << " no pg, shouldn't exist e" << osdmap->get_epoch()
             << ", dropping " << qi << dendl;
    // share map with client?
    if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
      osd->service.maybe_share_map((*_op)->get_req()->get_connection().get(),
                                   sdata->shard_osdmap,
                                   (*_op)->sent_epoch);
    // [excerpt gap: closing brace]
    unsigned pushes_to_free = qi.get_reserved_pushes();
    if (pushes_to_free > 0) {
      sdata->shard_lock.unlock();
      osd->service.release_reserved_pushes(pushes_to_free);
      handle_oncommits(oncommits);
      // [excerpt gap: return + closing braces]
  sdata->shard_lock.unlock();
  handle_oncommits(oncommits);
  // [excerpt gap]

  if (qi.is_peering()) {
    OSDMapRef osdmap = sdata->shard_osdmap;
    if (qi.get_map_epoch() > osdmap->get_epoch()) {
      _add_slot_waiter(token, slot, std::move(qi));
      sdata->shard_lock.unlock();
      // [excerpt gap]
      handle_oncommits(oncommits);
      // [excerpt gap: return + closing braces]
  sdata->shard_lock.unlock();

  // hand any split children we discovered to every shard
  if (!new_children.empty()) {
    for (auto shard : osd->shards) {
      shard->prime_splits(osdmap, &new_children);
    // [excerpt gap: closing brace]
    ceph_assert(new_children.empty());
  // [excerpt gap: closing brace]

  // osd_opwq_process marks the point at which an operation has been dequeued
  // and will begin to be handled by a worker thread.
  // [excerpt gap: tracing preamble, incl. reqid declaration]
  if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
    reqid = (*_op)->get_reqid();
  // [excerpt gap: closing brace]
  tracepoint(osd, opwq_process_start, reqid.name._type,
             reqid.name._num, reqid.tid, reqid.inc);

  // dump queue status at very high debug levels
  lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
  Formatter *f = Formatter::create("json");
  f->open_object_section("q");
  // [excerpt gap: dump call]
  f->close_section();
  // [excerpt gap: formatter flush/delete]

  qi.run(osd, sdata, pg, tp_handle);

  // [excerpt gap: tracing preamble]
  if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
    reqid = (*_op)->get_reqid();
  // [excerpt gap: closing brace]
  tracepoint(osd, opwq_process_finish, reqid.name._type,
             reqid.name._num, reqid.tid, reqid.inc);

  handle_oncommits(oncommits);
11014 void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem
&& item
) {
11015 uint32_t shard_index
=
11016 item
.get_ordering_token().hash_to_shard(osd
->shards
.size());
11018 dout(20) << __func__
<< " " << item
<< dendl
;
11020 OSDShard
* sdata
= osd
->shards
[shard_index
];
11021 assert (NULL
!= sdata
);
11025 std::lock_guard l
{sdata
->shard_lock
};
11026 empty
= sdata
->scheduler
->empty();
11027 sdata
->scheduler
->enqueue(std::move(item
));
11031 std::lock_guard l
{sdata
->sdata_wait_lock
};
11033 sdata
->sdata_cond
.notify_all();
11034 } else if (sdata
->waiting_threads
) {
11035 sdata
->sdata_cond
.notify_one();
11040 void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem
&& item
)
11042 auto shard_index
= item
.get_ordering_token().hash_to_shard(osd
->shards
.size());
11043 auto& sdata
= osd
->shards
[shard_index
];
11044 ceph_assert(sdata
);
11045 sdata
->shard_lock
.lock();
11046 auto p
= sdata
->pg_slots
.find(item
.get_ordering_token());
11047 if (p
!= sdata
->pg_slots
.end() &&
11048 !p
->second
->to_process
.empty()) {
11049 // we may be racing with _process, which has dequeued a new item
11050 // from scheduler, put it on to_process, and is now busy taking the
11051 // pg lock. ensure this old requeued item is ordered before any
11052 // such newer item in to_process.
11053 p
->second
->to_process
.push_front(std::move(item
));
11054 item
= std::move(p
->second
->to_process
.back());
11055 p
->second
->to_process
.pop_back();
11056 dout(20) << __func__
11057 << " " << p
->second
->to_process
.front()
11058 << " shuffled w/ " << item
<< dendl
;
11060 dout(20) << __func__
<< " " << item
<< dendl
;
11062 sdata
->scheduler
->enqueue_front(std::move(item
));
11063 sdata
->shard_lock
.unlock();
11064 std::lock_guard l
{sdata
->sdata_wait_lock
};
11065 sdata
->sdata_cond
.notify_one();
11068 namespace ceph::osd_cmds
{
11070 int heap(CephContext
& cct
, const cmdmap_t
& cmdmap
, Formatter
& f
,
11073 if (!ceph_using_tcmalloc()) {
11074 os
<< "could not issue heap profiler command -- not using tcmalloc!";
11075 return -EOPNOTSUPP
;
11079 if (!cmd_getval(cmdmap
, "heapcmd", cmd
)) {
11080 os
<< "unable to get value for command \"" << cmd
<< "\"";
11084 std::vector
<std::string
> cmd_vec
;
11085 get_str_vec(cmd
, cmd_vec
);
11088 if (cmd_getval(cmdmap
, "value", val
)) {
11089 cmd_vec
.push_back(val
);
11092 ceph_heap_profiler_handle_command(cmd_vec
, os
);
11097 } // namespace ceph::osd_cmds