1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
27 #include <boost/scoped_ptr.hpp>
28 #include <boost/range/adaptor/reversed.hpp>
30 #ifdef HAVE_SYS_PARAM_H
31 #include <sys/param.h>
34 #ifdef HAVE_SYS_MOUNT_H
35 #include <sys/mount.h>
40 #include "include/types.h"
41 #include "include/compat.h"
42 #include "include/random.h"
47 #include "osdc/Objecter.h"
49 #include "common/errno.h"
50 #include "common/ceph_argparse.h"
51 #include "common/ceph_releases.h"
52 #include "common/ceph_time.h"
53 #include "common/version.h"
54 #include "common/pick_address.h"
55 #include "common/blkdev.h"
56 #include "common/numa.h"
58 #include "os/ObjectStore.h"
60 #include "os/FuseStore.h"
63 #include "PrimaryLogPG.h"
65 #include "msg/Messenger.h"
66 #include "msg/Message.h"
68 #include "mon/MonClient.h"
70 #include "messages/MLog.h"
72 #include "messages/MGenericMessage.h"
73 #include "messages/MOSDPing.h"
74 #include "messages/MOSDFailure.h"
75 #include "messages/MOSDMarkMeDown.h"
76 #include "messages/MOSDMarkMeDead.h"
77 #include "messages/MOSDFull.h"
78 #include "messages/MOSDOp.h"
79 #include "messages/MOSDOpReply.h"
80 #include "messages/MOSDBackoff.h"
81 #include "messages/MOSDBeacon.h"
82 #include "messages/MOSDRepOp.h"
83 #include "messages/MOSDRepOpReply.h"
84 #include "messages/MOSDBoot.h"
85 #include "messages/MOSDPGTemp.h"
86 #include "messages/MOSDPGReadyToMerge.h"
88 #include "messages/MOSDMap.h"
89 #include "messages/MMonGetOSDMap.h"
90 #include "messages/MOSDPGNotify.h"
91 #include "messages/MOSDPGNotify2.h"
92 #include "messages/MOSDPGQuery.h"
93 #include "messages/MOSDPGQuery2.h"
94 #include "messages/MOSDPGLog.h"
95 #include "messages/MOSDPGRemove.h"
96 #include "messages/MOSDPGInfo.h"
97 #include "messages/MOSDPGInfo2.h"
98 #include "messages/MOSDPGCreate.h"
99 #include "messages/MOSDPGCreate2.h"
100 #include "messages/MOSDPGScan.h"
101 #include "messages/MBackfillReserve.h"
102 #include "messages/MRecoveryReserve.h"
103 #include "messages/MOSDForceRecovery.h"
104 #include "messages/MOSDECSubOpWrite.h"
105 #include "messages/MOSDECSubOpWriteReply.h"
106 #include "messages/MOSDECSubOpRead.h"
107 #include "messages/MOSDECSubOpReadReply.h"
108 #include "messages/MOSDPGCreated.h"
109 #include "messages/MOSDPGUpdateLogMissing.h"
110 #include "messages/MOSDPGUpdateLogMissingReply.h"
112 #include "messages/MOSDPeeringOp.h"
114 #include "messages/MOSDAlive.h"
116 #include "messages/MOSDScrub.h"
117 #include "messages/MOSDScrub2.h"
118 #include "messages/MOSDRepScrub.h"
120 #include "messages/MCommand.h"
121 #include "messages/MCommandReply.h"
123 #include "messages/MPGStats.h"
124 #include "messages/MPGStatsAck.h"
126 #include "messages/MWatchNotify.h"
127 #include "messages/MOSDPGPush.h"
128 #include "messages/MOSDPGPushReply.h"
129 #include "messages/MOSDPGPull.h"
131 #include "messages/MMonGetPurgedSnaps.h"
132 #include "messages/MMonGetPurgedSnapsReply.h"
134 #include "common/perf_counters.h"
135 #include "common/Timer.h"
136 #include "common/LogClient.h"
137 #include "common/AsyncReserver.h"
138 #include "common/HeartbeatMap.h"
139 #include "common/admin_socket.h"
140 #include "common/ceph_context.h"
142 #include "global/signal_handler.h"
143 #include "global/pidfile.h"
145 #include "include/color.h"
146 #include "perfglue/cpu_profiler.h"
147 #include "perfglue/heap_profiler.h"
149 #include "osd/OpRequest.h"
151 #include "auth/AuthAuthorizeHandler.h"
152 #include "auth/RotatingKeyRing.h"
154 #include "objclass/objclass.h"
156 #include "common/cmdparse.h"
157 #include "include/str_list.h"
158 #include "include/util.h"
160 #include "include/ceph_assert.h"
161 #include "common/config.h"
162 #include "common/EventTrace.h"
164 #include "json_spirit/json_spirit_reader.h"
165 #include "json_spirit/json_spirit_writer.h"
168 #define TRACEPOINT_DEFINE
169 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
170 #include "tracing/osd.h"
171 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
172 #undef TRACEPOINT_DEFINE
174 #define tracepoint(...)
177 #define dout_context cct
178 #define dout_subsys ceph_subsys_osd
180 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
182 using namespace ceph::osd::scheduler
;
183 using TOPNSPC::common::cmd_getval
;
185 static ostream
& _prefix(std::ostream
* _dout
, int whoami
, epoch_t epoch
) {
186 return *_dout
<< "osd." << whoami
<< " " << epoch
<< " ";
189 //Initial features in new superblock.
190 //Features here are also automatically upgraded
191 CompatSet
OSD::get_osd_initial_compat_set() {
192 CompatSet::FeatureSet ceph_osd_feature_compat
;
193 CompatSet::FeatureSet ceph_osd_feature_ro_compat
;
194 CompatSet::FeatureSet ceph_osd_feature_incompat
;
195 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE
);
196 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO
);
197 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC
);
198 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC
);
199 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES
);
200 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL
);
201 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO
);
202 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO
);
203 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG
);
204 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER
);
205 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS
);
206 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA
);
207 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING
);
208 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO
);
209 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES
);
210 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2
);
211 return CompatSet(ceph_osd_feature_compat
, ceph_osd_feature_ro_compat
,
212 ceph_osd_feature_incompat
);
215 //Features are added here that this OSD supports.
216 CompatSet
OSD::get_osd_compat_set() {
217 CompatSet compat
= get_osd_initial_compat_set();
218 //Any features here can be set in code, but not in initial superblock
219 compat
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
);
223 OSDService::OSDService(OSD
*osd
) :
226 whoami(osd
->whoami
), store(osd
->store
),
227 log_client(osd
->log_client
), clog(osd
->clog
),
228 pg_recovery_stats(osd
->pg_recovery_stats
),
229 cluster_messenger(osd
->cluster_messenger
),
230 client_messenger(osd
->client_messenger
),
232 recoverystate_perf(osd
->recoverystate_perf
),
234 osd_max_object_size(cct
->_conf
, "osd_max_object_size"),
235 osd_skip_data_digest(cct
->_conf
, "osd_skip_data_digest"),
236 publish_lock
{ceph::make_mutex("OSDService::publish_lock")},
237 pre_publish_lock
{ceph::make_mutex("OSDService::pre_publish_lock")},
241 agent_valid_iterator(false),
243 flush_mode_high_count(0),
246 agent_stop_flag(false),
247 agent_timer(osd
->client_messenger
->cct
, agent_timer_lock
),
248 last_recalibrate(ceph_clock_now()),
249 promote_max_objects(0),
250 promote_max_bytes(0),
251 objecter(make_unique
<Objecter
>(osd
->client_messenger
->cct
,
252 osd
->objecter_messenger
,
253 osd
->monc
, nullptr)),
254 m_objecter_finishers(cct
->_conf
->osd_objecter_finishers
),
255 watch_timer(osd
->client_messenger
->cct
, watch_lock
),
257 recovery_request_timer(cct
, recovery_request_lock
, false),
258 sleep_timer(cct
, sleep_lock
, false),
259 reserver_finisher(cct
),
260 local_reserver(cct
, &reserver_finisher
, cct
->_conf
->osd_max_backfills
,
261 cct
->_conf
->osd_min_recovery_priority
),
262 remote_reserver(cct
, &reserver_finisher
, cct
->_conf
->osd_max_backfills
,
263 cct
->_conf
->osd_min_recovery_priority
),
264 snap_reserver(cct
, &reserver_finisher
,
265 cct
->_conf
->osd_max_trimming_pgs
),
266 recovery_ops_active(0),
267 recovery_ops_reserved(0),
268 recovery_paused(false),
269 map_cache(cct
, cct
->_conf
->osd_map_cache_size
),
270 map_bl_cache(cct
->_conf
->osd_map_cache_size
),
271 map_bl_inc_cache(cct
->_conf
->osd_map_cache_size
),
273 cur_ratio(0), physical_ratio(0),
274 boot_epoch(0), up_epoch(0), bind_epoch(0)
278 for (int i
= 0; i
< m_objecter_finishers
; i
++) {
280 str
<< "objecter-finisher-" << i
;
281 auto fin
= make_unique
<Finisher
>(osd
->client_messenger
->cct
, str
.str(), "finisher");
282 objecter_finishers
.push_back(std::move(fin
));
287 void OSDService::add_pgid(spg_t pgid
, PG
*pg
){
288 std::lock_guard
l(pgid_lock
);
289 if (!pgid_tracker
.count(pgid
)) {
292 pgid_tracker
[pgid
]++;
294 void OSDService::remove_pgid(spg_t pgid
, PG
*pg
)
296 std::lock_guard
l(pgid_lock
);
297 ceph_assert(pgid_tracker
.count(pgid
));
298 ceph_assert(pgid_tracker
[pgid
] > 0);
299 pgid_tracker
[pgid
]--;
300 if (pgid_tracker
[pgid
] == 0) {
301 pgid_tracker
.erase(pgid
);
302 live_pgs
.erase(pgid
);
305 void OSDService::dump_live_pgids()
307 std::lock_guard
l(pgid_lock
);
308 derr
<< "live pgids:" << dendl
;
309 for (map
<spg_t
, int>::const_iterator i
= pgid_tracker
.cbegin();
310 i
!= pgid_tracker
.cend();
312 derr
<< "\t" << *i
<< dendl
;
313 live_pgs
[i
->first
]->dump_live_ids();
319 ceph::signedspan
OSDService::get_mnow()
321 return ceph::mono_clock::now() - osd
->startup_time
;
324 void OSDService::identify_splits_and_merges(
328 set
<pair
<spg_t
,epoch_t
>> *split_children
,
329 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
331 if (!old_map
->have_pg_pool(pgid
.pool())) {
334 int old_pgnum
= old_map
->get_pg_num(pgid
.pool());
335 auto p
= osd
->pg_num_history
.pg_nums
.find(pgid
.pool());
336 if (p
== osd
->pg_num_history
.pg_nums
.end()) {
339 dout(20) << __func__
<< " " << pgid
<< " e" << old_map
->get_epoch()
340 << " to e" << new_map
->get_epoch()
341 << " pg_nums " << p
->second
<< dendl
;
343 queue
.push_back(pgid
);
345 while (!queue
.empty()) {
346 auto cur
= queue
.front();
349 unsigned pgnum
= old_pgnum
;
350 for (auto q
= p
->second
.lower_bound(old_map
->get_epoch());
351 q
!= p
->second
.end() &&
352 q
->first
<= new_map
->get_epoch();
354 if (pgnum
< q
->second
) {
356 if (cur
.ps() < pgnum
) {
358 if (cur
.is_split(pgnum
, q
->second
, &children
)) {
359 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
360 << " pg_num " << pgnum
<< " -> " << q
->second
361 << " children " << children
<< dendl
;
362 for (auto i
: children
) {
363 split_children
->insert(make_pair(i
, q
->first
));
368 } else if (cur
.ps() < q
->second
) {
369 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
370 << " pg_num " << pgnum
<< " -> " << q
->second
371 << " is a child" << dendl
;
372 // normally we'd capture this from the parent, but it's
373 // possible the parent doesn't exist yet (it will be
374 // fabricated to allow an intervening merge). note this PG
375 // as a split child here to be sure we catch it.
376 split_children
->insert(make_pair(cur
, q
->first
));
378 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
379 << " pg_num " << pgnum
<< " -> " << q
->second
380 << " is post-split, skipping" << dendl
;
382 } else if (merge_pgs
) {
384 if (cur
.ps() >= q
->second
) {
385 if (cur
.ps() < pgnum
) {
387 if (cur
.is_merge_source(pgnum
, q
->second
, &parent
)) {
389 parent
.is_split(q
->second
, pgnum
, &children
);
390 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
391 << " pg_num " << pgnum
<< " -> " << q
->second
392 << " is merge source, target " << parent
393 << ", source(s) " << children
<< dendl
;
394 merge_pgs
->insert(make_pair(parent
, q
->first
));
395 if (!did
.count(parent
)) {
396 // queue (and re-scan) parent in case it might not exist yet
397 // and there are some future splits pending on it
398 queue
.push_back(parent
);
400 for (auto c
: children
) {
401 merge_pgs
->insert(make_pair(c
, q
->first
));
407 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
408 << " pg_num " << pgnum
<< " -> " << q
->second
409 << " is beyond old pgnum, skipping" << dendl
;
413 if (cur
.is_split(q
->second
, pgnum
, &children
)) {
414 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
415 << " pg_num " << pgnum
<< " -> " << q
->second
416 << " is merge target, source " << children
<< dendl
;
417 for (auto c
: children
) {
418 merge_pgs
->insert(make_pair(c
, q
->first
));
422 merge_pgs
->insert(make_pair(cur
, q
->first
));
431 void OSDService::need_heartbeat_peer_update()
433 osd
->need_heartbeat_peer_update();
436 HeartbeatStampsRef
OSDService::get_hb_stamps(unsigned peer
)
438 std::lock_guard
l(hb_stamp_lock
);
439 if (peer
>= hb_stamps
.size()) {
440 hb_stamps
.resize(peer
+ 1);
442 if (!hb_stamps
[peer
]) {
443 hb_stamps
[peer
] = ceph::make_ref
<HeartbeatStamps
>(peer
);
445 return hb_stamps
[peer
];
448 void OSDService::queue_renew_lease(epoch_t epoch
, spg_t spgid
)
450 osd
->enqueue_peering_evt(
453 std::make_shared
<PGPeeringEvent
>(
458 void OSDService::start_shutdown()
461 std::lock_guard
l(agent_timer_lock
);
462 agent_timer
.shutdown();
466 std::lock_guard
l(sleep_lock
);
467 sleep_timer
.shutdown();
471 std::lock_guard
l(recovery_request_lock
);
472 recovery_request_timer
.shutdown();
476 void OSDService::shutdown_reserver()
478 reserver_finisher
.wait_for_empty();
479 reserver_finisher
.stop();
482 void OSDService::shutdown()
484 mono_timer
.suspend();
487 std::lock_guard
l(watch_lock
);
488 watch_timer
.shutdown();
491 objecter
->shutdown();
492 for (auto& f
: objecter_finishers
) {
497 publish_map(OSDMapRef());
498 next_osdmap
= OSDMapRef();
501 void OSDService::init()
503 reserver_finisher
.start();
504 for (auto& f
: objecter_finishers
) {
507 objecter
->set_client_incarnation(0);
509 // deprioritize objecter in daemonperf output
510 objecter
->get_logger()->set_prio_adjust(-3);
516 agent_thread
.create("osd_srv_agent");
518 if (cct
->_conf
->osd_recovery_delay_start
)
519 defer_recovery(cct
->_conf
->osd_recovery_delay_start
);
522 void OSDService::final_init()
524 objecter
->start(osdmap
.get());
527 void OSDService::activate_map()
529 // wake/unwake the tiering agent
530 std::lock_guard l
{agent_lock
};
532 !osdmap
->test_flag(CEPH_OSDMAP_NOTIERAGENT
) &&
534 agent_cond
.notify_all();
537 void OSDService::request_osdmap_update(epoch_t e
)
539 osd
->osdmap_subscribe(e
, false);
543 class AgentTimeoutCB
: public Context
{
546 explicit AgentTimeoutCB(PGRef _pg
) : pg(_pg
) {}
547 void finish(int) override
{
548 pg
->agent_choose_mode_restart();
552 void OSDService::agent_entry()
554 dout(10) << __func__
<< " start" << dendl
;
555 std::unique_lock agent_locker
{agent_lock
};
557 while (!agent_stop_flag
) {
558 if (agent_queue
.empty()) {
559 dout(20) << __func__
<< " empty queue" << dendl
;
560 agent_cond
.wait(agent_locker
);
563 uint64_t level
= agent_queue
.rbegin()->first
;
564 set
<PGRef
>& top
= agent_queue
.rbegin()->second
;
566 << " tiers " << agent_queue
.size()
567 << ", top is " << level
568 << " with pgs " << top
.size()
569 << ", ops " << agent_ops
<< "/"
570 << cct
->_conf
->osd_agent_max_ops
571 << (agent_active
? " active" : " NOT ACTIVE")
573 dout(20) << __func__
<< " oids " << agent_oids
<< dendl
;
574 int max
= cct
->_conf
->osd_agent_max_ops
- agent_ops
;
575 int agent_flush_quota
= max
;
576 if (!flush_mode_high_count
)
577 agent_flush_quota
= cct
->_conf
->osd_agent_max_low_ops
- agent_ops
;
578 if (agent_flush_quota
<= 0 || top
.empty() || !agent_active
) {
579 agent_cond
.wait(agent_locker
);
583 if (!agent_valid_iterator
|| agent_queue_pos
== top
.end()) {
584 agent_queue_pos
= top
.begin();
585 agent_valid_iterator
= true;
587 PGRef pg
= *agent_queue_pos
;
588 dout(10) << "high_count " << flush_mode_high_count
589 << " agent_ops " << agent_ops
590 << " flush_quota " << agent_flush_quota
<< dendl
;
591 agent_locker
.unlock();
592 if (!pg
->agent_work(max
, agent_flush_quota
)) {
593 dout(10) << __func__
<< " " << pg
->pg_id
594 << " no agent_work, delay for " << cct
->_conf
->osd_agent_delay_time
595 << " seconds" << dendl
;
597 osd
->logger
->inc(l_osd_tier_delay
);
598 // Queue a timer to call agent_choose_mode for this pg in 5 seconds
599 std::lock_guard timer_locker
{agent_timer_lock
};
600 Context
*cb
= new AgentTimeoutCB(pg
);
601 agent_timer
.add_event_after(cct
->_conf
->osd_agent_delay_time
, cb
);
605 dout(10) << __func__
<< " finish" << dendl
;
608 void OSDService::agent_stop()
611 std::lock_guard
l(agent_lock
);
613 // By this time all ops should be cancelled
614 ceph_assert(agent_ops
== 0);
615 // By this time all PGs are shutdown and dequeued
616 if (!agent_queue
.empty()) {
617 set
<PGRef
>& top
= agent_queue
.rbegin()->second
;
618 derr
<< "agent queue not empty, for example " << (*top
.begin())->get_pgid() << dendl
;
619 ceph_abort_msg("agent queue not empty");
622 agent_stop_flag
= true;
623 agent_cond
.notify_all();
628 // -------------------------------------
630 void OSDService::promote_throttle_recalibrate()
632 utime_t now
= ceph_clock_now();
633 double dur
= now
- last_recalibrate
;
634 last_recalibrate
= now
;
635 unsigned prob
= promote_probability_millis
;
637 uint64_t target_obj_sec
= cct
->_conf
->osd_tier_promote_max_objects_sec
;
638 uint64_t target_bytes_sec
= cct
->_conf
->osd_tier_promote_max_bytes_sec
;
640 unsigned min_prob
= 1;
642 uint64_t attempts
, obj
, bytes
;
643 promote_counter
.sample_and_attenuate(&attempts
, &obj
, &bytes
);
644 dout(10) << __func__
<< " " << attempts
<< " attempts, promoted "
645 << obj
<< " objects and " << byte_u_t(bytes
) << "; target "
646 << target_obj_sec
<< " obj/sec or "
647 << byte_u_t(target_bytes_sec
) << "/sec"
650 // calculate what the probability *should* be, given the targets
652 if (attempts
&& dur
> 0) {
653 uint64_t avg_size
= 1;
655 avg_size
= std::max
<uint64_t>(bytes
/ obj
, 1);
656 unsigned po
= (double)target_obj_sec
* dur
* 1000.0 / (double)attempts
;
657 unsigned pb
= (double)target_bytes_sec
/ (double)avg_size
* dur
* 1000.0
659 dout(20) << __func__
<< " po " << po
<< " pb " << pb
<< " avg_size "
660 << avg_size
<< dendl
;
661 if (target_obj_sec
&& target_bytes_sec
)
662 new_prob
= std::min(po
, pb
);
663 else if (target_obj_sec
)
665 else if (target_bytes_sec
)
672 dout(20) << __func__
<< " new_prob " << new_prob
<< dendl
;
674 // correct for persistent skew between target rate and actual rate, adjust
677 if (attempts
&& obj
) {
678 actual
= obj
* 1000 / attempts
;
679 ratio
= (double)actual
/ (double)prob
;
680 new_prob
= (double)new_prob
/ ratio
;
682 new_prob
= std::max(new_prob
, min_prob
);
683 new_prob
= std::min(new_prob
, 1000u);
686 prob
= (prob
+ new_prob
) / 2;
687 prob
= std::max(prob
, min_prob
);
688 prob
= std::min(prob
, 1000u);
689 dout(10) << __func__
<< " actual " << actual
690 << ", actual/prob ratio " << ratio
691 << ", adjusted new_prob " << new_prob
692 << ", prob " << promote_probability_millis
<< " -> " << prob
694 promote_probability_millis
= prob
;
696 // set hard limits for this interval to mitigate stampedes
697 promote_max_objects
= target_obj_sec
* osd
->OSD_TICK_INTERVAL
* 2;
698 promote_max_bytes
= target_bytes_sec
* osd
->OSD_TICK_INTERVAL
* 2;
701 // -------------------------------------
703 float OSDService::get_failsafe_full_ratio()
705 float full_ratio
= cct
->_conf
->osd_failsafe_full_ratio
;
706 if (full_ratio
> 1.0) full_ratio
/= 100.0;
710 OSDService::s_names
OSDService::recalc_full_state(float ratio
, float pratio
, string
&inject
)
712 // The OSDMap ratios take precendence. So if the failsafe is .95 and
713 // the admin sets the cluster full to .96, the failsafe moves up to .96
714 // too. (Not that having failsafe == full is ideal, but it's better than
715 // dropping writes before the clusters appears full.)
716 OSDMapRef osdmap
= get_osdmap();
717 if (!osdmap
|| osdmap
->get_epoch() == 0) {
720 float nearfull_ratio
= osdmap
->get_nearfull_ratio();
721 float backfillfull_ratio
= std::max(osdmap
->get_backfillfull_ratio(), nearfull_ratio
);
722 float full_ratio
= std::max(osdmap
->get_full_ratio(), backfillfull_ratio
);
723 float failsafe_ratio
= std::max(get_failsafe_full_ratio(), full_ratio
);
725 if (osdmap
->require_osd_release
< ceph_release_t::luminous
) {
726 // use the failsafe for nearfull and full; the mon isn't using the
727 // flags anyway because we're mid-upgrade.
728 full_ratio
= failsafe_ratio
;
729 backfillfull_ratio
= failsafe_ratio
;
730 nearfull_ratio
= failsafe_ratio
;
731 } else if (full_ratio
<= 0 ||
732 backfillfull_ratio
<= 0 ||
733 nearfull_ratio
<= 0) {
734 derr
<< __func__
<< " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl
;
735 // use failsafe flag. ick. the monitor did something wrong or the user
736 // did something stupid.
737 full_ratio
= failsafe_ratio
;
738 backfillfull_ratio
= failsafe_ratio
;
739 nearfull_ratio
= failsafe_ratio
;
742 if (injectfull_state
> NONE
&& injectfull
) {
743 inject
= "(Injected)";
744 return injectfull_state
;
745 } else if (pratio
> failsafe_ratio
) {
747 } else if (ratio
> full_ratio
) {
749 } else if (ratio
> backfillfull_ratio
) {
751 } else if (pratio
> nearfull_ratio
) {
757 void OSDService::check_full_status(float ratio
, float pratio
)
759 std::lock_guard
l(full_status_lock
);
762 physical_ratio
= pratio
;
766 new_state
= recalc_full_state(ratio
, pratio
, inject
);
768 dout(20) << __func__
<< " cur ratio " << ratio
769 << ", physical ratio " << pratio
770 << ", new state " << get_full_state_name(new_state
)
775 if (cur_state
!= new_state
) {
776 dout(10) << __func__
<< " " << get_full_state_name(cur_state
)
777 << " -> " << get_full_state_name(new_state
) << dendl
;
778 if (new_state
== FAILSAFE
) {
779 clog
->error() << "full status failsafe engaged, dropping updates, now "
780 << (int)roundf(ratio
* 100) << "% full";
781 } else if (cur_state
== FAILSAFE
) {
782 clog
->error() << "full status failsafe disengaged, no longer dropping "
783 << "updates, now " << (int)roundf(ratio
* 100) << "% full";
785 cur_state
= new_state
;
789 bool OSDService::need_fullness_update()
791 OSDMapRef osdmap
= get_osdmap();
793 if (osdmap
->exists(whoami
)) {
794 if (osdmap
->get_state(whoami
) & CEPH_OSD_FULL
) {
796 } else if (osdmap
->get_state(whoami
) & CEPH_OSD_BACKFILLFULL
) {
798 } else if (osdmap
->get_state(whoami
) & CEPH_OSD_NEARFULL
) {
805 else if (is_backfillfull())
807 else if (is_nearfull())
812 bool OSDService::_check_inject_full(DoutPrefixProvider
*dpp
, s_names type
) const
814 if (injectfull
&& injectfull_state
>= type
) {
815 // injectfull is either a count of the number of times to return failsafe full
816 // or if -1 then always return full
819 ldpp_dout(dpp
, 10) << __func__
<< " Injected " << get_full_state_name(type
) << " OSD ("
820 << (injectfull
< 0 ? "set" : std::to_string(injectfull
)) << ")"
827 bool OSDService::_check_full(DoutPrefixProvider
*dpp
, s_names type
) const
829 std::lock_guard
l(full_status_lock
);
831 if (_check_inject_full(dpp
, type
))
834 if (cur_state
>= type
)
835 ldpp_dout(dpp
, 10) << __func__
<< " current usage is " << cur_ratio
836 << " physical " << physical_ratio
<< dendl
;
838 return cur_state
>= type
;
841 bool OSDService::_tentative_full(DoutPrefixProvider
*dpp
, s_names type
, uint64_t adjust_used
, osd_stat_t adjusted_stat
)
843 ldpp_dout(dpp
, 20) << __func__
<< " type " << get_full_state_name(type
) << " adjust_used " << (adjust_used
>> 10) << "KiB" << dendl
;
845 std::lock_guard
l(full_status_lock
);
846 if (_check_inject_full(dpp
, type
)) {
852 float ratio
= compute_adjusted_ratio(adjusted_stat
, &pratio
, adjust_used
);
855 s_names tentative_state
= recalc_full_state(ratio
, pratio
, notused
);
857 if (tentative_state
>= type
)
858 ldpp_dout(dpp
, 10) << __func__
<< " tentative usage is " << ratio
<< dendl
;
860 return tentative_state
>= type
;
863 bool OSDService::check_failsafe_full(DoutPrefixProvider
*dpp
) const
865 return _check_full(dpp
, FAILSAFE
);
868 bool OSDService::check_full(DoutPrefixProvider
*dpp
) const
870 return _check_full(dpp
, FULL
);
873 bool OSDService::tentative_backfill_full(DoutPrefixProvider
*dpp
, uint64_t adjust_used
, osd_stat_t stats
)
875 return _tentative_full(dpp
, BACKFILLFULL
, adjust_used
, stats
);
878 bool OSDService::check_backfill_full(DoutPrefixProvider
*dpp
) const
880 return _check_full(dpp
, BACKFILLFULL
);
883 bool OSDService::check_nearfull(DoutPrefixProvider
*dpp
) const
885 return _check_full(dpp
, NEARFULL
);
888 bool OSDService::is_failsafe_full() const
890 std::lock_guard
l(full_status_lock
);
891 return cur_state
== FAILSAFE
;
894 bool OSDService::is_full() const
896 std::lock_guard
l(full_status_lock
);
897 return cur_state
>= FULL
;
900 bool OSDService::is_backfillfull() const
902 std::lock_guard
l(full_status_lock
);
903 return cur_state
>= BACKFILLFULL
;
906 bool OSDService::is_nearfull() const
908 std::lock_guard
l(full_status_lock
);
909 return cur_state
>= NEARFULL
;
912 void OSDService::set_injectfull(s_names type
, int64_t count
)
914 std::lock_guard
l(full_status_lock
);
915 injectfull_state
= type
;
919 void OSDService::set_statfs(const struct store_statfs_t
&stbuf
,
920 osd_alert_list_t
& alerts
)
922 uint64_t bytes
= stbuf
.total
;
923 uint64_t avail
= stbuf
.available
;
924 uint64_t used
= stbuf
.get_used_raw();
926 // For testing fake statfs values so it doesn't matter if all
927 // OSDs are using the same partition.
928 if (cct
->_conf
->fake_statfs_for_testing
) {
929 uint64_t total_num_bytes
= 0;
933 total_num_bytes
+= p
->get_stats_num_bytes();
935 bytes
= cct
->_conf
->fake_statfs_for_testing
;
936 if (total_num_bytes
< bytes
)
937 avail
= bytes
- total_num_bytes
;
940 dout(0) << __func__
<< " fake total " << cct
->_conf
->fake_statfs_for_testing
941 << " adjust available " << avail
943 used
= bytes
- avail
;
946 osd
->logger
->set(l_osd_stat_bytes
, bytes
);
947 osd
->logger
->set(l_osd_stat_bytes_used
, used
);
948 osd
->logger
->set(l_osd_stat_bytes_avail
, avail
);
950 std::lock_guard
l(stat_lock
);
951 osd_stat
.statfs
= stbuf
;
952 osd_stat
.os_alerts
.clear();
953 osd_stat
.os_alerts
[whoami
].swap(alerts
);
954 if (cct
->_conf
->fake_statfs_for_testing
) {
955 osd_stat
.statfs
.total
= bytes
;
956 osd_stat
.statfs
.available
= avail
;
957 // For testing don't want used to go negative, so clear reserved
958 osd_stat
.statfs
.internally_reserved
= 0;
962 osd_stat_t
OSDService::set_osd_stat(vector
<int>& hb_peers
,
965 utime_t now
= ceph_clock_now();
966 auto stale_time
= g_conf().get_val
<int64_t>("osd_mon_heartbeat_stat_stale");
967 std::lock_guard
l(stat_lock
);
968 osd_stat
.hb_peers
.swap(hb_peers
);
969 osd
->op_tracker
.get_age_ms_histogram(&osd_stat
.op_queue_age_hist
);
970 osd_stat
.num_pgs
= num_pgs
;
971 // Clean entries that aren't updated
972 // This is called often enough that we can just remove 1 at a time
973 for (auto i
: osd_stat
.hb_pingtime
) {
974 if (i
.second
.last_update
== 0)
976 if (stale_time
&& now
.sec() - i
.second
.last_update
> stale_time
) {
977 dout(20) << __func__
<< " time out heartbeat for osd " << i
.first
978 << " last_update " << i
.second
.last_update
<< dendl
;
979 osd_stat
.hb_pingtime
.erase(i
.first
);
986 void OSDService::inc_osd_stat_repaired()
988 std::lock_guard
l(stat_lock
);
989 osd_stat
.num_shards_repaired
++;
993 float OSDService::compute_adjusted_ratio(osd_stat_t new_stat
, float *pratio
,
994 uint64_t adjust_used
)
997 ((float)new_stat
.statfs
.get_used_raw()) / ((float)new_stat
.statfs
.total
);
1000 dout(20) << __func__
<< " Before kb_used() " << new_stat
.statfs
.kb_used() << dendl
;
1001 if (new_stat
.statfs
.available
> adjust_used
)
1002 new_stat
.statfs
.available
-= adjust_used
;
1004 new_stat
.statfs
.available
= 0;
1005 dout(20) << __func__
<< " After kb_used() " << new_stat
.statfs
.kb_used() << dendl
;
1008 // Check all pgs and adjust kb_used to include all pending backfill data
1009 int backfill_adjusted
= 0;
1011 osd
->_get_pgs(&pgs
);
1012 for (auto p
: pgs
) {
1013 backfill_adjusted
+= p
->pg_stat_adjust(&new_stat
);
1015 if (backfill_adjusted
) {
1016 dout(20) << __func__
<< " backfill adjusted " << new_stat
<< dendl
;
1018 return ((float)new_stat
.statfs
.get_used_raw()) / ((float)new_stat
.statfs
.total
);
1021 void OSDService::send_message_osd_cluster(int peer
, Message
*m
, epoch_t from_epoch
)
1023 OSDMapRef next_map
= get_nextmap_reserved();
1024 // service map is always newer/newest
1025 ceph_assert(from_epoch
<= next_map
->get_epoch());
1027 if (next_map
->is_down(peer
) ||
1028 next_map
->get_info(peer
).up_from
> from_epoch
) {
1030 release_map(next_map
);
1033 ConnectionRef peer_con
;
1034 if (peer
== whoami
) {
1035 peer_con
= osd
->cluster_messenger
->get_loopback_connection();
1037 peer_con
= osd
->cluster_messenger
->connect_to_osd(
1038 next_map
->get_cluster_addrs(peer
), false, true);
1040 maybe_share_map(peer_con
.get(), next_map
);
1041 peer_con
->send_message(m
);
1042 release_map(next_map
);
1045 void OSDService::send_message_osd_cluster(std::vector
<std::pair
<int, Message
*>>& messages
, epoch_t from_epoch
)
1047 OSDMapRef next_map
= get_nextmap_reserved();
1048 // service map is always newer/newest
1049 ceph_assert(from_epoch
<= next_map
->get_epoch());
1051 for (auto& iter
: messages
) {
1052 if (next_map
->is_down(iter
.first
) ||
1053 next_map
->get_info(iter
.first
).up_from
> from_epoch
) {
1057 ConnectionRef peer_con
;
1058 if (iter
.first
== whoami
) {
1059 peer_con
= osd
->cluster_messenger
->get_loopback_connection();
1061 peer_con
= osd
->cluster_messenger
->connect_to_osd(
1062 next_map
->get_cluster_addrs(iter
.first
), false, true);
1064 maybe_share_map(peer_con
.get(), next_map
);
1065 peer_con
->send_message(iter
.second
);
1067 release_map(next_map
);
1069 ConnectionRef
OSDService::get_con_osd_cluster(int peer
, epoch_t from_epoch
)
1071 OSDMapRef next_map
= get_nextmap_reserved();
1072 // service map is always newer/newest
1073 ceph_assert(from_epoch
<= next_map
->get_epoch());
1075 if (next_map
->is_down(peer
) ||
1076 next_map
->get_info(peer
).up_from
> from_epoch
) {
1077 release_map(next_map
);
1081 if (peer
== whoami
) {
1082 con
= osd
->cluster_messenger
->get_loopback_connection();
1084 con
= osd
->cluster_messenger
->connect_to_osd(
1085 next_map
->get_cluster_addrs(peer
), false, true);
1087 release_map(next_map
);
1091 pair
<ConnectionRef
,ConnectionRef
> OSDService::get_con_osd_hb(int peer
, epoch_t from_epoch
)
1093 OSDMapRef next_map
= get_nextmap_reserved();
1094 // service map is always newer/newest
1095 ceph_assert(from_epoch
<= next_map
->get_epoch());
1097 pair
<ConnectionRef
,ConnectionRef
> ret
;
1098 if (next_map
->is_down(peer
) ||
1099 next_map
->get_info(peer
).up_from
> from_epoch
) {
1100 release_map(next_map
);
1103 ret
.first
= osd
->hb_back_client_messenger
->connect_to_osd(
1104 next_map
->get_hb_back_addrs(peer
));
1105 ret
.second
= osd
->hb_front_client_messenger
->connect_to_osd(
1106 next_map
->get_hb_front_addrs(peer
));
1107 release_map(next_map
);
1111 entity_name_t
OSDService::get_cluster_msgr_name() const
1113 return cluster_messenger
->get_myname();
1116 void OSDService::queue_want_pg_temp(pg_t pgid
,
1117 const vector
<int>& want
,
1120 std::lock_guard
l(pg_temp_lock
);
1121 auto p
= pg_temp_pending
.find(pgid
);
1122 if (p
== pg_temp_pending
.end() ||
1123 p
->second
.acting
!= want
||
1125 pg_temp_wanted
[pgid
] = {want
, forced
};
1129 void OSDService::remove_want_pg_temp(pg_t pgid
)
1131 std::lock_guard
l(pg_temp_lock
);
1132 pg_temp_wanted
.erase(pgid
);
1133 pg_temp_pending
.erase(pgid
);
1136 void OSDService::_sent_pg_temp()
1138 #ifdef HAVE_STDLIB_MAP_SPLICING
1139 pg_temp_pending
.merge(pg_temp_wanted
);
1141 pg_temp_pending
.insert(make_move_iterator(begin(pg_temp_wanted
)),
1142 make_move_iterator(end(pg_temp_wanted
)));
1144 pg_temp_wanted
.clear();
1147 void OSDService::requeue_pg_temp()
1149 std::lock_guard
l(pg_temp_lock
);
1150 // wanted overrides pending. note that remove_want_pg_temp
1151 // clears the item out of both.
1152 unsigned old_wanted
= pg_temp_wanted
.size();
1153 unsigned old_pending
= pg_temp_pending
.size();
1155 pg_temp_wanted
.swap(pg_temp_pending
);
1156 dout(10) << __func__
<< " " << old_wanted
<< " + " << old_pending
<< " -> "
1157 << pg_temp_wanted
.size() << dendl
;
1160 std::ostream
& operator<<(std::ostream
& out
,
1161 const OSDService::pg_temp_t
& pg_temp
)
1163 out
<< pg_temp
.acting
;
1164 if (pg_temp
.forced
) {
1170 void OSDService::send_pg_temp()
1172 std::lock_guard
l(pg_temp_lock
);
1173 if (pg_temp_wanted
.empty())
1175 dout(10) << "send_pg_temp " << pg_temp_wanted
<< dendl
;
1176 MOSDPGTemp
*ms
[2] = {nullptr, nullptr};
1177 for (auto& [pgid
, pg_temp
] : pg_temp_wanted
) {
1178 auto& m
= ms
[pg_temp
.forced
];
1180 m
= new MOSDPGTemp(osdmap
->get_epoch());
1181 m
->forced
= pg_temp
.forced
;
1183 m
->pg_temp
.emplace(pgid
, pg_temp
.acting
);
1187 monc
->send_mon_message(m
);
1193 void OSDService::send_pg_created(pg_t pgid
)
1195 std::lock_guard
l(pg_created_lock
);
1196 dout(20) << __func__
<< dendl
;
1197 auto o
= get_osdmap();
1198 if (o
->require_osd_release
>= ceph_release_t::luminous
) {
1199 pg_created
.insert(pgid
);
1200 monc
->send_mon_message(new MOSDPGCreated(pgid
));
1204 void OSDService::send_pg_created()
1206 std::lock_guard
l(pg_created_lock
);
1207 dout(20) << __func__
<< dendl
;
1208 auto o
= get_osdmap();
1209 if (o
->require_osd_release
>= ceph_release_t::luminous
) {
1210 for (auto pgid
: pg_created
) {
1211 monc
->send_mon_message(new MOSDPGCreated(pgid
));
1216 void OSDService::prune_pg_created()
1218 std::lock_guard
l(pg_created_lock
);
1219 dout(20) << __func__
<< dendl
;
1220 auto o
= get_osdmap();
1221 auto i
= pg_created
.begin();
1222 while (i
!= pg_created
.end()) {
1223 auto p
= o
->get_pg_pool(i
->pool());
1224 if (!p
|| !p
->has_flag(pg_pool_t::FLAG_CREATING
)) {
1225 dout(20) << __func__
<< " pruning " << *i
<< dendl
;
1226 i
= pg_created
.erase(i
);
1228 dout(20) << __func__
<< " keeping " << *i
<< dendl
;
1235 // --------------------------------------
1238 bool OSDService::can_inc_scrubs()
1240 bool can_inc
= false;
1241 std::lock_guard
l(sched_scrub_lock
);
1243 if (scrubs_local
+ scrubs_remote
< cct
->_conf
->osd_max_scrubs
) {
1244 dout(20) << __func__
<< " == true " << scrubs_local
<< " local + " << scrubs_remote
1245 << " remote < max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1248 dout(20) << __func__
<< " == false " << scrubs_local
<< " local + " << scrubs_remote
1249 << " remote >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1255 bool OSDService::inc_scrubs_local()
1257 bool result
= false;
1258 std::lock_guard l
{sched_scrub_lock
};
1259 if (scrubs_local
+ scrubs_remote
< cct
->_conf
->osd_max_scrubs
) {
1260 dout(20) << __func__
<< " " << scrubs_local
<< " -> " << (scrubs_local
+1)
1261 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", remote " << scrubs_remote
<< ")" << dendl
;
1265 dout(20) << __func__
<< " " << scrubs_local
<< " local + " << scrubs_remote
<< " remote >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1270 void OSDService::dec_scrubs_local()
1272 std::lock_guard l
{sched_scrub_lock
};
1273 dout(20) << __func__
<< " " << scrubs_local
<< " -> " << (scrubs_local
-1)
1274 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", remote " << scrubs_remote
<< ")" << dendl
;
1276 ceph_assert(scrubs_local
>= 0);
1279 bool OSDService::inc_scrubs_remote()
1281 bool result
= false;
1282 std::lock_guard l
{sched_scrub_lock
};
1283 if (scrubs_local
+ scrubs_remote
< cct
->_conf
->osd_max_scrubs
) {
1284 dout(20) << __func__
<< " " << scrubs_remote
<< " -> " << (scrubs_remote
+1)
1285 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", local " << scrubs_local
<< ")" << dendl
;
1289 dout(20) << __func__
<< " " << scrubs_local
<< " local + " << scrubs_remote
<< " remote >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1294 void OSDService::dec_scrubs_remote()
1296 std::lock_guard l
{sched_scrub_lock
};
1297 dout(20) << __func__
<< " " << scrubs_remote
<< " -> " << (scrubs_remote
-1)
1298 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", local " << scrubs_local
<< ")" << dendl
;
1300 ceph_assert(scrubs_remote
>= 0);
1303 void OSDService::dump_scrub_reservations(Formatter
*f
)
1305 std::lock_guard l
{sched_scrub_lock
};
1306 f
->dump_int("scrubs_local", scrubs_local
);
1307 f
->dump_int("scrubs_remote", scrubs_remote
);
1308 f
->dump_int("osd_max_scrubs", cct
->_conf
->osd_max_scrubs
);
1311 void OSDService::retrieve_epochs(epoch_t
*_boot_epoch
, epoch_t
*_up_epoch
,
1312 epoch_t
*_bind_epoch
) const
1314 std::lock_guard
l(epoch_lock
);
1316 *_boot_epoch
= boot_epoch
;
1318 *_up_epoch
= up_epoch
;
1320 *_bind_epoch
= bind_epoch
;
1323 void OSDService::set_epochs(const epoch_t
*_boot_epoch
, const epoch_t
*_up_epoch
,
1324 const epoch_t
*_bind_epoch
)
1326 std::lock_guard
l(epoch_lock
);
1328 ceph_assert(*_boot_epoch
== 0 || *_boot_epoch
>= boot_epoch
);
1329 boot_epoch
= *_boot_epoch
;
1332 ceph_assert(*_up_epoch
== 0 || *_up_epoch
>= up_epoch
);
1333 up_epoch
= *_up_epoch
;
1336 ceph_assert(*_bind_epoch
== 0 || *_bind_epoch
>= bind_epoch
);
1337 bind_epoch
= *_bind_epoch
;
1341 bool OSDService::prepare_to_stop()
1343 std::unique_lock
l(is_stopping_lock
);
1344 if (get_state() != NOT_STOPPING
)
1347 OSDMapRef osdmap
= get_osdmap();
1348 if (osdmap
&& osdmap
->is_up(whoami
)) {
1349 dout(0) << __func__
<< " telling mon we are shutting down" << dendl
;
1350 set_state(PREPARING_TO_STOP
);
1351 monc
->send_mon_message(
1355 osdmap
->get_addrs(whoami
),
1356 osdmap
->get_epoch(),
1359 const auto timeout
= ceph::make_timespan(cct
->_conf
->osd_mon_shutdown_timeout
);
1360 is_stopping_cond
.wait_for(l
, timeout
,
1361 [this] { return get_state() == STOPPING
; });
1363 dout(0) << __func__
<< " starting shutdown" << dendl
;
1364 set_state(STOPPING
);
1368 void OSDService::got_stop_ack()
1370 std::scoped_lock
l(is_stopping_lock
);
1371 if (get_state() == PREPARING_TO_STOP
) {
1372 dout(0) << __func__
<< " starting shutdown" << dendl
;
1373 set_state(STOPPING
);
1374 is_stopping_cond
.notify_all();
1376 dout(10) << __func__
<< " ignoring msg" << dendl
;
1380 MOSDMap
*OSDService::build_incremental_map_msg(epoch_t since
, epoch_t to
,
1381 OSDSuperblock
& sblock
)
1383 MOSDMap
*m
= new MOSDMap(monc
->get_fsid(),
1384 osdmap
->get_encoding_features());
1385 m
->oldest_map
= max_oldest_map
;
1386 m
->newest_map
= sblock
.newest_map
;
1388 int max
= cct
->_conf
->osd_map_message_max
;
1389 ssize_t max_bytes
= cct
->_conf
->osd_map_message_max_bytes
;
1391 if (since
< m
->oldest_map
) {
1392 // we don't have the next map the target wants, so start with a
1395 dout(10) << __func__
<< " oldest map " << max_oldest_map
<< " > since "
1396 << since
<< ", starting with full map" << dendl
;
1397 since
= m
->oldest_map
;
1398 if (!get_map_bl(since
, bl
)) {
1399 derr
<< __func__
<< " missing full map " << since
<< dendl
;
1403 max_bytes
-= bl
.length();
1404 m
->maps
[since
].claim(bl
);
1406 for (epoch_t e
= since
+ 1; e
<= to
; ++e
) {
1408 if (get_inc_map_bl(e
, bl
)) {
1409 m
->incremental_maps
[e
].claim(bl
);
1411 dout(10) << __func__
<< " missing incremental map " << e
<< dendl
;
1412 if (!get_map_bl(e
, bl
)) {
1413 derr
<< __func__
<< " also missing full map " << e
<< dendl
;
1416 m
->maps
[e
].claim(bl
);
1419 max_bytes
-= bl
.length();
1420 if (max
<= 0 || max_bytes
<= 0) {
1427 if (!m
->maps
.empty() ||
1428 !m
->incremental_maps
.empty()) {
1429 // send what we have so far
1434 if (get_inc_map_bl(m
->newest_map
, bl
)) {
1435 m
->incremental_maps
[m
->newest_map
].claim(bl
);
1437 derr
<< __func__
<< " unable to load latest map " << m
->newest_map
<< dendl
;
1438 if (!get_map_bl(m
->newest_map
, bl
)) {
1439 derr
<< __func__
<< " unable to load latest full map " << m
->newest_map
1443 m
->maps
[m
->newest_map
].claim(bl
);
1448 void OSDService::send_map(MOSDMap
*m
, Connection
*con
)
1450 con
->send_message(m
);
1453 void OSDService::send_incremental_map(epoch_t since
, Connection
*con
,
1454 const OSDMapRef
& osdmap
)
1456 epoch_t to
= osdmap
->get_epoch();
1457 dout(10) << "send_incremental_map " << since
<< " -> " << to
1458 << " to " << con
<< " " << con
->get_peer_addr() << dendl
;
1462 OSDSuperblock
sblock(get_superblock());
1463 if (since
< sblock
.oldest_map
) {
1464 // just send latest full map
1465 MOSDMap
*m
= new MOSDMap(monc
->get_fsid(),
1466 osdmap
->get_encoding_features());
1467 m
->oldest_map
= max_oldest_map
;
1468 m
->newest_map
= sblock
.newest_map
;
1469 get_map_bl(to
, m
->maps
[to
]);
1474 if (to
> since
&& (int64_t)(to
- since
) > cct
->_conf
->osd_map_share_max_epochs
) {
1475 dout(10) << " " << (to
- since
) << " > max " << cct
->_conf
->osd_map_share_max_epochs
1476 << ", only sending most recent" << dendl
;
1477 since
= to
- cct
->_conf
->osd_map_share_max_epochs
;
1480 m
= build_incremental_map_msg(since
, to
, sblock
);
1485 bool OSDService::_get_map_bl(epoch_t e
, bufferlist
& bl
)
1487 bool found
= map_bl_cache
.lookup(e
, &bl
);
1490 logger
->inc(l_osd_map_bl_cache_hit
);
1494 logger
->inc(l_osd_map_bl_cache_miss
);
1495 found
= store
->read(meta_ch
,
1496 OSD::get_osdmap_pobject_name(e
), 0, 0, bl
,
1497 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
) >= 0;
1504 bool OSDService::get_inc_map_bl(epoch_t e
, bufferlist
& bl
)
1506 std::lock_guard
l(map_cache_lock
);
1507 bool found
= map_bl_inc_cache
.lookup(e
, &bl
);
1510 logger
->inc(l_osd_map_bl_cache_hit
);
1514 logger
->inc(l_osd_map_bl_cache_miss
);
1515 found
= store
->read(meta_ch
,
1516 OSD::get_inc_osdmap_pobject_name(e
), 0, 0, bl
,
1517 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
) >= 0;
1519 _add_map_inc_bl(e
, bl
);
1524 void OSDService::_add_map_bl(epoch_t e
, bufferlist
& bl
)
1526 dout(10) << "add_map_bl " << e
<< " " << bl
.length() << " bytes" << dendl
;
1527 // cache a contiguous buffer
1528 if (bl
.get_num_buffers() > 1) {
1531 bl
.try_assign_to_mempool(mempool::mempool_osd_mapbl
);
1532 map_bl_cache
.add(e
, bl
);
1535 void OSDService::_add_map_inc_bl(epoch_t e
, bufferlist
& bl
)
1537 dout(10) << "add_map_inc_bl " << e
<< " " << bl
.length() << " bytes" << dendl
;
1538 // cache a contiguous buffer
1539 if (bl
.get_num_buffers() > 1) {
1542 bl
.try_assign_to_mempool(mempool::mempool_osd_mapbl
);
1543 map_bl_inc_cache
.add(e
, bl
);
1546 OSDMapRef
OSDService::_add_map(OSDMap
*o
)
1548 epoch_t e
= o
->get_epoch();
1550 if (cct
->_conf
->osd_map_dedup
) {
1551 // Dedup against an existing map at a nearby epoch
1552 OSDMapRef for_dedup
= map_cache
.lower_bound(e
);
1554 OSDMap::dedup(for_dedup
.get(), o
);
1558 OSDMapRef l
= map_cache
.add(e
, o
, &existed
);
1565 OSDMapRef
OSDService::try_get_map(epoch_t epoch
)
1567 std::lock_guard
l(map_cache_lock
);
1568 OSDMapRef retval
= map_cache
.lookup(epoch
);
1570 dout(30) << "get_map " << epoch
<< " -cached" << dendl
;
1572 logger
->inc(l_osd_map_cache_hit
);
1577 logger
->inc(l_osd_map_cache_miss
);
1578 epoch_t lb
= map_cache
.cached_key_lower_bound();
1580 dout(30) << "get_map " << epoch
<< " - miss, below lower bound" << dendl
;
1581 logger
->inc(l_osd_map_cache_miss_low
);
1582 logger
->inc(l_osd_map_cache_miss_low_avg
, lb
- epoch
);
1586 OSDMap
*map
= new OSDMap
;
1588 dout(20) << "get_map " << epoch
<< " - loading and decoding " << map
<< dendl
;
1590 if (!_get_map_bl(epoch
, bl
) || bl
.length() == 0) {
1591 derr
<< "failed to load OSD map for epoch " << epoch
<< ", got " << bl
.length() << " bytes" << dendl
;
1597 dout(20) << "get_map " << epoch
<< " - return initial " << map
<< dendl
;
1599 return _add_map(map
);
1605 void OSDService::reply_op_error(OpRequestRef op
, int err
)
1607 reply_op_error(op
, err
, eversion_t(), 0, {});
1610 void OSDService::reply_op_error(OpRequestRef op
, int err
, eversion_t v
,
1612 vector
<pg_log_op_return_item_t
> op_returns
)
1614 auto m
= op
->get_req
<MOSDOp
>();
1615 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1617 flags
= m
->get_flags() & (CEPH_OSD_FLAG_ACK
|CEPH_OSD_FLAG_ONDISK
);
1619 MOSDOpReply
*reply
= new MOSDOpReply(m
, err
, osdmap
->get_epoch(), flags
,
1620 !m
->has_flag(CEPH_OSD_FLAG_RETURNVEC
));
1621 reply
->set_reply_versions(v
, uv
);
1622 reply
->set_op_returns(op_returns
);
1623 m
->get_connection()->send_message(reply
);
1626 void OSDService::handle_misdirected_op(PG
*pg
, OpRequestRef op
)
1628 if (!cct
->_conf
->osd_debug_misdirected_ops
) {
1632 auto m
= op
->get_req
<MOSDOp
>();
1633 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1635 ceph_assert(m
->get_map_epoch() >= pg
->get_history().same_primary_since
);
1637 if (pg
->is_ec_pg()) {
1639 * OSD recomputes op target based on current OSDMap. With an EC pg, we
1640 * can get this result:
1641 * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
1642 * [CRUSH_ITEM_NONE, 2, 3]/3
1643 * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
1645 * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
1647 * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
1650 * We can't compute the op target based on the sending map epoch due to
1651 * splitting. The simplest thing is to detect such cases here and drop
1652 * them without an error (the client will resend anyway).
1654 ceph_assert(m
->get_map_epoch() <= superblock
.newest_map
);
1655 OSDMapRef opmap
= try_get_map(m
->get_map_epoch());
1657 dout(7) << __func__
<< ": " << *pg
<< " no longer have map for "
1658 << m
->get_map_epoch() << ", dropping" << dendl
;
1661 pg_t _pgid
= m
->get_raw_pg();
1663 if ((m
->get_flags() & CEPH_OSD_FLAG_PGOP
) == 0)
1664 _pgid
= opmap
->raw_pg_to_pg(_pgid
);
1665 if (opmap
->get_primary_shard(_pgid
, &pgid
) &&
1666 pgid
.shard
!= pg
->pg_id
.shard
) {
1667 dout(7) << __func__
<< ": " << *pg
<< " primary changed since "
1668 << m
->get_map_epoch() << ", dropping" << dendl
;
1673 dout(7) << *pg
<< " misdirected op in " << m
->get_map_epoch() << dendl
;
1674 clog
->warn() << m
->get_source_inst() << " misdirected " << m
->get_reqid()
1675 << " pg " << m
->get_raw_pg()
1676 << " to osd." << whoami
1677 << " not " << pg
->get_acting()
1678 << " in e" << m
->get_map_epoch() << "/" << osdmap
->get_epoch();
1681 void OSDService::enqueue_back(OpSchedulerItem
&& qi
)
1683 osd
->op_shardedwq
.queue(std::move(qi
));
1686 void OSDService::enqueue_front(OpSchedulerItem
&& qi
)
1688 osd
->op_shardedwq
.queue_front(std::move(qi
));
1691 void OSDService::queue_recovery_context(
1693 GenContext
<ThreadPool::TPHandle
&> *c
)
1695 epoch_t e
= get_osdmap_epoch();
1698 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1699 new PGRecoveryContext(pg
->get_pgid(), c
, e
)),
1700 cct
->_conf
->osd_recovery_cost
,
1701 cct
->_conf
->osd_recovery_priority
,
1707 void OSDService::queue_for_snap_trim(PG
*pg
)
1709 dout(10) << "queueing " << *pg
<< " for snaptrim" << dendl
;
1712 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1713 new PGSnapTrim(pg
->get_pgid(), pg
->get_osdmap_epoch())),
1714 cct
->_conf
->osd_snap_trim_cost
,
1715 cct
->_conf
->osd_snap_trim_priority
,
1718 pg
->get_osdmap_epoch()));
1721 void OSDService::queue_for_scrub(PG
*pg
, bool with_high_priority
)
1723 unsigned scrub_queue_priority
= pg
->scrubber
.priority
;
1724 if (with_high_priority
&& scrub_queue_priority
< cct
->_conf
->osd_client_op_priority
) {
1725 scrub_queue_priority
= cct
->_conf
->osd_client_op_priority
;
1727 const auto epoch
= pg
->get_osdmap_epoch();
1730 unique_ptr
<OpSchedulerItem::OpQueueable
>(new PGScrub(pg
->get_pgid(), epoch
)),
1731 cct
->_conf
->osd_scrub_cost
,
1732 scrub_queue_priority
,
1738 void OSDService::queue_for_pg_delete(spg_t pgid
, epoch_t e
)
1740 dout(10) << __func__
<< " on " << pgid
<< " e " << e
<< dendl
;
1743 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1744 new PGDelete(pgid
, e
)),
1745 cct
->_conf
->osd_pg_delete_cost
,
1746 cct
->_conf
->osd_pg_delete_priority
,
1752 bool OSDService::try_finish_pg_delete(PG
*pg
, unsigned old_pg_num
)
1754 return osd
->try_finish_pg_delete(pg
, old_pg_num
);
1759 void OSDService::set_ready_to_merge_source(PG
*pg
, eversion_t version
)
1761 std::lock_guard
l(merge_lock
);
1762 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1763 ready_to_merge_source
[pg
->pg_id
.pgid
] = version
;
1764 assert(not_ready_to_merge_source
.count(pg
->pg_id
.pgid
) == 0);
1765 _send_ready_to_merge();
1768 void OSDService::set_ready_to_merge_target(PG
*pg
,
1770 epoch_t last_epoch_started
,
1771 epoch_t last_epoch_clean
)
1773 std::lock_guard
l(merge_lock
);
1774 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1775 ready_to_merge_target
.insert(make_pair(pg
->pg_id
.pgid
,
1778 last_epoch_clean
)));
1779 assert(not_ready_to_merge_target
.count(pg
->pg_id
.pgid
) == 0);
1780 _send_ready_to_merge();
1783 void OSDService::set_not_ready_to_merge_source(pg_t source
)
1785 std::lock_guard
l(merge_lock
);
1786 dout(10) << __func__
<< " " << source
<< dendl
;
1787 not_ready_to_merge_source
.insert(source
);
1788 assert(ready_to_merge_source
.count(source
) == 0);
1789 _send_ready_to_merge();
1792 void OSDService::set_not_ready_to_merge_target(pg_t target
, pg_t source
)
1794 std::lock_guard
l(merge_lock
);
1795 dout(10) << __func__
<< " " << target
<< " source " << source
<< dendl
;
1796 not_ready_to_merge_target
[target
] = source
;
1797 assert(ready_to_merge_target
.count(target
) == 0);
1798 _send_ready_to_merge();
1801 void OSDService::send_ready_to_merge()
1803 std::lock_guard
l(merge_lock
);
1804 _send_ready_to_merge();
1807 void OSDService::_send_ready_to_merge()
1809 dout(20) << __func__
1810 << " ready_to_merge_source " << ready_to_merge_source
1811 << " not_ready_to_merge_source " << not_ready_to_merge_source
1812 << " ready_to_merge_target " << ready_to_merge_target
1813 << " not_ready_to_merge_target " << not_ready_to_merge_target
1814 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1816 for (auto src
: not_ready_to_merge_source
) {
1817 if (sent_ready_to_merge_source
.count(src
) == 0) {
1818 monc
->send_mon_message(new MOSDPGReadyToMerge(
1822 osdmap
->get_epoch()));
1823 sent_ready_to_merge_source
.insert(src
);
1826 for (auto p
: not_ready_to_merge_target
) {
1827 if (sent_ready_to_merge_source
.count(p
.second
) == 0) {
1828 monc
->send_mon_message(new MOSDPGReadyToMerge(
1832 osdmap
->get_epoch()));
1833 sent_ready_to_merge_source
.insert(p
.second
);
1836 for (auto src
: ready_to_merge_source
) {
1837 if (not_ready_to_merge_source
.count(src
.first
) ||
1838 not_ready_to_merge_target
.count(src
.first
.get_parent())) {
1841 auto p
= ready_to_merge_target
.find(src
.first
.get_parent());
1842 if (p
!= ready_to_merge_target
.end() &&
1843 sent_ready_to_merge_source
.count(src
.first
) == 0) {
1844 monc
->send_mon_message(new MOSDPGReadyToMerge(
1845 src
.first
, // source pgid
1846 src
.second
, // src version
1847 std::get
<0>(p
->second
), // target version
1848 std::get
<1>(p
->second
), // PG's last_epoch_started
1849 std::get
<2>(p
->second
), // PG's last_epoch_clean
1851 osdmap
->get_epoch()));
1852 sent_ready_to_merge_source
.insert(src
.first
);
1857 void OSDService::clear_ready_to_merge(PG
*pg
)
1859 std::lock_guard
l(merge_lock
);
1860 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1861 ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
1862 ready_to_merge_target
.erase(pg
->pg_id
.pgid
);
1863 not_ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
1864 not_ready_to_merge_target
.erase(pg
->pg_id
.pgid
);
1865 sent_ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
1868 void OSDService::clear_sent_ready_to_merge()
1870 std::lock_guard
l(merge_lock
);
1871 sent_ready_to_merge_source
.clear();
1874 void OSDService::prune_sent_ready_to_merge(const OSDMapRef
& osdmap
)
1876 std::lock_guard
l(merge_lock
);
1877 auto i
= sent_ready_to_merge_source
.begin();
1878 while (i
!= sent_ready_to_merge_source
.end()) {
1879 if (!osdmap
->pg_exists(*i
)) {
1880 dout(10) << __func__
<< " " << *i
<< dendl
;
1881 i
= sent_ready_to_merge_source
.erase(i
);
1890 void OSDService::_queue_for_recovery(
1891 std::pair
<epoch_t
, PGRef
> p
,
1892 uint64_t reserved_pushes
)
1894 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock
));
1897 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1899 p
.second
->get_pgid(), p
.first
, reserved_pushes
)),
1900 cct
->_conf
->osd_recovery_cost
,
1901 cct
->_conf
->osd_recovery_priority
,
1907 // ====================================================================
1911 #define dout_prefix *_dout
1913 // Commands shared between OSD's console and admin console:
1915 namespace osd_cmds
{
1917 int heap(CephContext
& cct
, const cmdmap_t
& cmdmap
, Formatter
& f
, std::ostream
& os
);
1919 }} // namespace ceph::osd_cmds
1921 int OSD::mkfs(CephContext
*cct
, ObjectStore
*store
, uuid_d fsid
, int whoami
, string osdspec_affinity
)
1927 ObjectStore::CollectionHandle ch
;
1929 // if we are fed a uuid for this osd, use it.
1930 store
->set_fsid(cct
->_conf
->osd_uuid
);
1932 ret
= store
->mkfs();
1934 derr
<< "OSD::mkfs: ObjectStore::mkfs failed with error "
1935 << cpp_strerror(ret
) << dendl
;
1939 store
->set_cache_shards(1); // doesn't matter for mkfs!
1941 ret
= store
->mount();
1943 derr
<< "OSD::mkfs: couldn't mount ObjectStore: error "
1944 << cpp_strerror(ret
) << dendl
;
1948 ch
= store
->open_collection(coll_t::meta());
1950 ret
= store
->read(ch
, OSD_SUPERBLOCK_GOBJECT
, 0, 0, sbbl
);
1952 derr
<< "OSD::mkfs: have meta collection but no superblock" << dendl
;
1955 /* if we already have superblock, check content of superblock */
1956 dout(0) << " have superblock" << dendl
;
1957 auto p
= sbbl
.cbegin();
1959 if (whoami
!= sb
.whoami
) {
1960 derr
<< "provided osd id " << whoami
<< " != superblock's " << sb
.whoami
1965 if (fsid
!= sb
.cluster_fsid
) {
1966 derr
<< "provided cluster fsid " << fsid
1967 << " != superblock's " << sb
.cluster_fsid
<< dendl
;
1972 // create superblock
1973 sb
.cluster_fsid
= fsid
;
1974 sb
.osd_fsid
= store
->get_fsid();
1976 sb
.compat_features
= get_osd_initial_compat_set();
1981 ObjectStore::CollectionHandle ch
= store
->create_new_collection(
1983 ObjectStore::Transaction t
;
1984 t
.create_collection(coll_t::meta(), 0);
1985 t
.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT
, 0, bl
.length(), bl
);
1986 ret
= store
->queue_transaction(ch
, std::move(t
));
1988 derr
<< "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
1989 << "queue_transaction returned " << cpp_strerror(ret
) << dendl
;
1994 ret
= write_meta(cct
, store
, sb
.cluster_fsid
, sb
.osd_fsid
, whoami
, osdspec_affinity
);
1996 derr
<< "OSD::mkfs: failed to write fsid file: error "
1997 << cpp_strerror(ret
) << dendl
;
2011 int OSD::write_meta(CephContext
*cct
, ObjectStore
*store
, uuid_d
& cluster_fsid
, uuid_d
& osd_fsid
, int whoami
, string
& osdspec_affinity
)
2016 snprintf(val
, sizeof(val
), "%s", CEPH_OSD_ONDISK_MAGIC
);
2017 r
= store
->write_meta("magic", val
);
2021 snprintf(val
, sizeof(val
), "%d", whoami
);
2022 r
= store
->write_meta("whoami", val
);
2026 cluster_fsid
.print(val
);
2027 r
= store
->write_meta("ceph_fsid", val
);
2031 string key
= cct
->_conf
.get_val
<string
>("key");
2033 r
= store
->write_meta("osd_key", key
);
2037 string keyfile
= cct
->_conf
.get_val
<string
>("keyfile");
2038 if (!keyfile
.empty()) {
2041 r
= keybl
.read_file(keyfile
.c_str(), &err
);
2043 derr
<< __func__
<< " failed to read keyfile " << keyfile
<< ": "
2044 << err
<< ": " << cpp_strerror(r
) << dendl
;
2047 r
= store
->write_meta("osd_key", keybl
.to_str());
2052 if (!osdspec_affinity
.empty()) {
2053 r
= store
->write_meta("osdspec_affinity", osdspec_affinity
.c_str());
2058 r
= store
->write_meta("ready", "ready");
2065 int OSD::peek_meta(ObjectStore
*store
,
2067 uuid_d
*cluster_fsid
,
2070 ceph_release_t
*require_osd_release
)
2074 int r
= store
->read_meta("magic", &val
);
2079 r
= store
->read_meta("whoami", &val
);
2082 *whoami
= atoi(val
.c_str());
2084 r
= store
->read_meta("ceph_fsid", &val
);
2087 r
= cluster_fsid
->parse(val
.c_str());
2091 r
= store
->read_meta("fsid", &val
);
2093 *osd_fsid
= uuid_d();
2095 r
= osd_fsid
->parse(val
.c_str());
2100 r
= store
->read_meta("require_osd_release", &val
);
2102 *require_osd_release
= ceph_release_from_name(val
);
2110 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2114 OSD::OSD(CephContext
*cct_
, ObjectStore
*store_
,
2116 Messenger
*internal_messenger
,
2117 Messenger
*external_messenger
,
2118 Messenger
*hb_client_front
,
2119 Messenger
*hb_client_back
,
2120 Messenger
*hb_front_serverm
,
2121 Messenger
*hb_back_serverm
,
2122 Messenger
*osdc_messenger
,
2124 const std::string
&dev
, const std::string
&jdev
) :
2126 tick_timer(cct
, osd_lock
),
2127 tick_timer_without_osd_lock(cct
, tick_timer_lock
),
2128 gss_ktfile_client(cct
->_conf
.get_val
<std::string
>("gss_ktab_client_file")),
2129 cluster_messenger(internal_messenger
),
2130 client_messenger(external_messenger
),
2131 objecter_messenger(osdc_messenger
),
2133 mgrc(cct_
, client_messenger
, &mc
->monmap
),
2135 recoverystate_perf(NULL
),
2137 log_client(cct
, client_messenger
, &mc
->monmap
, LogClient::NO_FLAGS
),
2138 clog(log_client
.create_channel()),
2140 dev_path(dev
), journal_path(jdev
),
2141 store_is_rotational(store
->is_rotational()),
2142 trace_endpoint("0.0.0.0", 0, "osd"),
2144 m_osd_pg_epoch_max_lag_factor(cct
->_conf
.get_val
<double>(
2145 "osd_pg_epoch_max_lag_factor")),
2146 osd_compat(get_osd_compat_set()),
2147 osd_op_tp(cct
, "OSD::osd_op_tp", "tp_osd_tp",
2148 get_num_op_threads()),
2149 heartbeat_stop(false),
2150 heartbeat_need_update(true),
2151 hb_front_client_messenger(hb_client_front
),
2152 hb_back_client_messenger(hb_client_back
),
2153 hb_front_server_messenger(hb_front_serverm
),
2154 hb_back_server_messenger(hb_back_serverm
),
2156 heartbeat_thread(this),
2157 heartbeat_dispatcher(this),
2158 op_tracker(cct
, cct
->_conf
->osd_enable_op_tracker
,
2159 cct
->_conf
->osd_num_op_tracker_shard
),
2160 test_ops_hook(NULL
),
2163 cct
->_conf
->osd_op_thread_timeout
,
2164 cct
->_conf
->osd_op_thread_suicide_timeout
,
2166 last_pg_create_epoch(0),
2169 requested_full_first(0),
2170 requested_full_last(0),
2174 if (!gss_ktfile_client
.empty()) {
2175 // Assert we can export environment variable
2177 The default client keytab is used, if it is present and readable,
2178 to automatically obtain initial credentials for GSSAPI client
2179 applications. The principal name of the first entry in the client
2180 keytab is used by default when obtaining initial credentials.
2181 1. The KRB5_CLIENT_KTNAME environment variable.
2182 2. The default_client_keytab_name profile variable in [libdefaults].
2183 3. The hardcoded default, DEFCKTNAME.
2185 const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
2186 gss_ktfile_client
.c_str(), 1));
2187 ceph_assert(set_result
== 0);
2190 monc
->set_messenger(client_messenger
);
2191 op_tracker
.set_complaint_and_threshold(cct
->_conf
->osd_op_complaint_time
,
2192 cct
->_conf
->osd_op_log_threshold
);
2193 op_tracker
.set_history_size_and_duration(cct
->_conf
->osd_op_history_size
,
2194 cct
->_conf
->osd_op_history_duration
);
2195 op_tracker
.set_history_slow_op_size_and_threshold(cct
->_conf
->osd_op_history_slow_op_size
,
2196 cct
->_conf
->osd_op_history_slow_op_threshold
);
2197 ObjectCleanRegions::set_max_num_intervals(cct
->_conf
->osd_object_clean_region_max_num_intervals
);
2199 std::stringstream ss
;
2200 ss
<< "osd." << whoami
;
2201 trace_endpoint
.copy_name(ss
.str());
2204 // initialize shards
2205 num_shards
= get_num_op_shards();
2206 for (uint32_t i
= 0; i
< num_shards
; i
++) {
2207 OSDShard
*one_shard
= new OSDShard(
2211 shards
.push_back(one_shard
);
2217 while (!shards
.empty()) {
2218 delete shards
.back();
2221 cct
->get_perfcounters_collection()->remove(recoverystate_perf
);
2222 cct
->get_perfcounters_collection()->remove(logger
);
2223 delete recoverystate_perf
;
2228 double OSD::get_tick_interval() const
2230 // vary +/- 5% to avoid scrub scheduling livelocks
2231 constexpr auto delta
= 0.05;
2232 return (OSD_TICK_INTERVAL
*
2233 ceph::util::generate_random_number(1.0 - delta
, 1.0 + delta
));
2236 void OSD::handle_signal(int signum
)
2238 ceph_assert(signum
== SIGINT
|| signum
== SIGTERM
);
2239 derr
<< "*** Got signal " << sig_str(signum
) << " ***" << dendl
;
2245 std::lock_guard
lock(osd_lock
);
2249 if (store
->test_mount_in_use()) {
2250 derr
<< "OSD::pre_init: object store '" << dev_path
<< "' is "
2251 << "currently in use. (Is ceph-osd already running?)" << dendl
;
2255 cct
->_conf
.add_observer(this);
2259 int OSD::set_numa_affinity()
2261 // storage numa node
2262 int store_node
= -1;
2263 store
->get_numa_node(&store_node
, nullptr, nullptr);
2264 if (store_node
>= 0) {
2265 dout(1) << __func__
<< " storage numa node " << store_node
<< dendl
;
2268 // check network numa node(s)
2269 int front_node
= -1, back_node
= -1;
2270 string front_iface
= pick_iface(
2272 client_messenger
->get_myaddrs().front().get_sockaddr_storage());
2273 string back_iface
= pick_iface(
2275 cluster_messenger
->get_myaddrs().front().get_sockaddr_storage());
2276 int r
= get_iface_numa_node(front_iface
, &front_node
);
2277 if (r
>= 0 && front_node
>= 0) {
2278 dout(1) << __func__
<< " public network " << front_iface
<< " numa node "
2279 << front_node
<< dendl
;
2280 r
= get_iface_numa_node(back_iface
, &back_node
);
2281 if (r
>= 0 && back_node
>= 0) {
2282 dout(1) << __func__
<< " cluster network " << back_iface
<< " numa node "
2283 << back_node
<< dendl
;
2284 if (front_node
== back_node
&&
2285 front_node
== store_node
) {
2286 dout(1) << " objectstore and network numa nodes all match" << dendl
;
2287 if (g_conf().get_val
<bool>("osd_numa_auto_affinity")) {
2288 numa_node
= front_node
;
2290 } else if (front_node
!= back_node
) {
2291 dout(1) << __func__
<< " public and cluster network numa nodes do not match"
2294 dout(1) << __func__
<< " objectstore and network numa nodes do not match"
2297 } else if (back_node
== -2) {
2298 dout(1) << __func__
<< " cluster network " << back_iface
2299 << " ports numa nodes do not match" << dendl
;
2301 derr
<< __func__
<< " unable to identify cluster interface '" << back_iface
2302 << "' numa node: " << cpp_strerror(r
) << dendl
;
2304 } else if (front_node
== -2) {
2305 dout(1) << __func__
<< " public network " << front_iface
2306 << " ports numa nodes do not match" << dendl
;
2308 derr
<< __func__
<< " unable to identify public interface '" << front_iface
2309 << "' numa node: " << cpp_strerror(r
) << dendl
;
2311 if (int node
= g_conf().get_val
<int64_t>("osd_numa_node"); node
>= 0) {
2312 // this takes precedence over the automagic logic above
2315 if (numa_node
>= 0) {
2316 int r
= get_numa_node_cpu_set(numa_node
, &numa_cpu_set_size
, &numa_cpu_set
);
2318 dout(1) << __func__
<< " unable to determine numa node " << numa_node
2319 << " CPUs" << dendl
;
2322 dout(1) << __func__
<< " setting numa affinity to node " << numa_node
2324 << cpu_set_to_str_list(numa_cpu_set_size
, &numa_cpu_set
)
2326 r
= set_cpu_affinity_all_threads(numa_cpu_set_size
, &numa_cpu_set
);
2329 derr
<< __func__
<< " failed to set numa affinity: " << cpp_strerror(r
)
2335 dout(1) << __func__
<< " not setting numa affinity" << dendl
;
2342 class OSDSocketHook
: public AdminSocketHook
{
2345 explicit OSDSocketHook(OSD
*o
) : osd(o
) {}
2346 int call(std::string_view prefix
, const cmdmap_t
& cmdmap
,
2349 bufferlist
& out
) override
{
2350 ceph_abort("should use async hook");
2353 std::string_view prefix
,
2354 const cmdmap_t
& cmdmap
,
2356 const bufferlist
& inbl
,
2357 std::function
<void(int,const std::string
&,bufferlist
&)> on_finish
) override
{
2359 osd
->asok_command(prefix
, cmdmap
, f
, inbl
, on_finish
);
2360 } catch (const TOPNSPC::common::bad_cmd_get
& e
) {
2362 on_finish(-EINVAL
, e
.what(), empty
);
2367 std::set
<int64_t> OSD::get_mapped_pools()
2369 std::set
<int64_t> pools
;
2370 std::vector
<spg_t
> pgids
;
2372 for (const auto &pgid
: pgids
) {
2373 pools
.insert(pgid
.pool());
2378 void OSD::asok_command(
2379 std::string_view prefix
, const cmdmap_t
& cmdmap
,
2381 const bufferlist
& inbl
,
2382 std::function
<void(int,const std::string
&,bufferlist
&)> on_finish
)
2385 stringstream ss
; // stderr error message stream
2386 bufferlist outbl
; // if empty at end, we'll dump formatter as output
2388 // --- PG commands are routed here to PG::do_command ---
2389 if (prefix
== "pg" ||
2390 prefix
== "query" ||
2391 prefix
== "mark_unfound_lost" ||
2392 prefix
== "list_unfound" ||
2393 prefix
== "scrub" ||
2394 prefix
== "deep_scrub"
2398 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
2399 ss
<< "no pgid specified";
2403 if (!pgid
.parse(pgidstr
.c_str())) {
2404 ss
<< "couldn't parse pgid '" << pgidstr
<< "'";
2410 if (get_osdmap()->get_primary_shard(pgid
, &pcand
) &&
2411 (pg
= _lookup_lock_pg(pcand
))) {
2412 if (pg
->is_primary()) {
2413 cmdmap_t new_cmdmap
= cmdmap
;
2415 pg
->do_command(prefix
, new_cmdmap
, inbl
, on_finish
);
2417 return; // the pg handler calls on_finish directly
2418 } catch (const TOPNSPC::common::bad_cmd_get
& e
) {
2425 ss
<< "not primary for pgid " << pgid
;
2426 // do not reply; they will get newer maps and realize they
2433 ss
<< "i don't have pgid " << pgid
;
2438 // --- OSD commands follow ---
2440 else if (prefix
== "status") {
2441 lock_guard
l(osd_lock
);
2442 f
->open_object_section("status");
2443 f
->dump_stream("cluster_fsid") << superblock
.cluster_fsid
;
2444 f
->dump_stream("osd_fsid") << superblock
.osd_fsid
;
2445 f
->dump_unsigned("whoami", superblock
.whoami
);
2446 f
->dump_string("state", get_state_name(get_state()));
2447 f
->dump_unsigned("oldest_map", superblock
.oldest_map
);
2448 f
->dump_unsigned("newest_map", superblock
.newest_map
);
2449 f
->dump_unsigned("num_pgs", num_pgs
);
2451 } else if (prefix
== "flush_journal") {
2452 store
->flush_journal();
2453 } else if (prefix
== "dump_ops_in_flight" ||
2455 prefix
== "dump_blocked_ops" ||
2456 prefix
== "dump_historic_ops" ||
2457 prefix
== "dump_historic_ops_by_duration" ||
2458 prefix
== "dump_historic_slow_ops") {
2460 const string error_str
= "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2461 even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2462 will start to track new ops received afterwards.";
2464 set
<string
> filters
;
2465 vector
<string
> filter_str
;
2466 if (cmd_getval(cmdmap
, "filterstr", filter_str
)) {
2467 copy(filter_str
.begin(), filter_str
.end(),
2468 inserter(filters
, filters
.end()));
2471 if (prefix
== "dump_ops_in_flight" ||
2473 if (!op_tracker
.dump_ops_in_flight(f
, false, filters
)) {
2479 if (prefix
== "dump_blocked_ops") {
2480 if (!op_tracker
.dump_ops_in_flight(f
, true, filters
)) {
2486 if (prefix
== "dump_historic_ops") {
2487 if (!op_tracker
.dump_historic_ops(f
, false, filters
)) {
2493 if (prefix
== "dump_historic_ops_by_duration") {
2494 if (!op_tracker
.dump_historic_ops(f
, true, filters
)) {
2500 if (prefix
== "dump_historic_slow_ops") {
2501 if (!op_tracker
.dump_historic_slow_ops(f
, filters
)) {
2507 } else if (prefix
== "dump_op_pq_state") {
2508 f
->open_object_section("pq");
2509 op_shardedwq
.dump(f
);
2511 } else if (prefix
== "dump_blacklist") {
2512 list
<pair
<entity_addr_t
,utime_t
> > bl
;
2513 OSDMapRef curmap
= service
.get_osdmap();
2515 f
->open_array_section("blacklist");
2516 curmap
->get_blacklist(&bl
);
2517 for (list
<pair
<entity_addr_t
,utime_t
> >::iterator it
= bl
.begin();
2518 it
!= bl
.end(); ++it
) {
2519 f
->open_object_section("entry");
2520 f
->open_object_section("entity_addr_t");
2522 f
->close_section(); //entity_addr_t
2523 it
->second
.localtime(f
->dump_stream("expire_time"));
2524 f
->close_section(); //entry
2526 f
->close_section(); //blacklist
2527 } else if (prefix
== "dump_watchers") {
2528 list
<obj_watch_item_t
> watchers
;
2532 for (auto& pg
: pgs
) {
2533 list
<obj_watch_item_t
> pg_watchers
;
2534 pg
->get_watchers(&pg_watchers
);
2535 watchers
.splice(watchers
.end(), pg_watchers
);
2538 f
->open_array_section("watchers");
2539 for (list
<obj_watch_item_t
>::iterator it
= watchers
.begin();
2540 it
!= watchers
.end(); ++it
) {
2542 f
->open_object_section("watch");
2544 f
->dump_string("namespace", it
->obj
.nspace
);
2545 f
->dump_string("object", it
->obj
.oid
.name
);
2547 f
->open_object_section("entity_name");
2548 it
->wi
.name
.dump(f
);
2549 f
->close_section(); //entity_name_t
2551 f
->dump_unsigned("cookie", it
->wi
.cookie
);
2552 f
->dump_unsigned("timeout", it
->wi
.timeout_seconds
);
2554 f
->open_object_section("entity_addr_t");
2555 it
->wi
.addr
.dump(f
);
2556 f
->close_section(); //entity_addr_t
2558 f
->close_section(); //watch
2561 f
->close_section(); //watchers
2562 } else if (prefix
== "dump_recovery_reservations") {
2563 f
->open_object_section("reservations");
2564 f
->open_object_section("local_reservations");
2565 service
.local_reserver
.dump(f
);
2567 f
->open_object_section("remote_reservations");
2568 service
.remote_reserver
.dump(f
);
2571 } else if (prefix
== "dump_scrub_reservations") {
2572 f
->open_object_section("scrub_reservations");
2573 service
.dump_scrub_reservations(f
);
2575 } else if (prefix
== "get_latest_osdmap") {
2576 get_latest_osdmap();
2577 } else if (prefix
== "set_heap_property") {
2581 bool success
= false;
2582 if (!cmd_getval(cmdmap
, "property", property
)) {
2583 error
= "unable to get property";
2585 } else if (!cmd_getval(cmdmap
, "value", value
)) {
2586 error
= "unable to get value";
2588 } else if (value
< 0) {
2589 error
= "negative value not allowed";
2591 } else if (!ceph_heap_set_numeric_property(property
.c_str(), (size_t)value
)) {
2592 error
= "invalid property";
2597 f
->open_object_section("result");
2598 f
->dump_string("error", error
);
2599 f
->dump_bool("success", success
);
2601 } else if (prefix
== "get_heap_property") {
2605 bool success
= false;
2606 if (!cmd_getval(cmdmap
, "property", property
)) {
2607 error
= "unable to get property";
2609 } else if (!ceph_heap_get_numeric_property(property
.c_str(), &value
)) {
2610 error
= "invalid property";
2615 f
->open_object_section("result");
2616 f
->dump_string("error", error
);
2617 f
->dump_bool("success", success
);
2618 f
->dump_int("value", value
);
2620 } else if (prefix
== "dump_objectstore_kv_stats") {
2621 store
->get_db_statistics(f
);
2622 } else if (prefix
== "dump_scrubs") {
2623 service
.dumps_scrub(f
);
2624 } else if (prefix
== "calc_objectstore_db_histogram") {
2625 store
->generate_db_histogram(f
);
2626 } else if (prefix
== "flush_store_cache") {
2627 store
->flush_cache(&ss
);
2628 } else if (prefix
== "dump_pgstate_history") {
2629 f
->open_object_section("pgstate_history");
2630 f
->open_array_section("pgs");
2633 for (auto& pg
: pgs
) {
2634 f
->open_object_section("pg");
2635 f
->dump_stream("pg") << pg
->pg_id
;
2636 f
->dump_string("currently", pg
->get_current_state());
2637 pg
->dump_pgstate_history(f
);
2642 } else if (prefix
== "compact") {
2643 dout(1) << "triggering manual compaction" << dendl
;
2644 auto start
= ceph::coarse_mono_clock::now();
2646 auto end
= ceph::coarse_mono_clock::now();
2647 double duration
= std::chrono::duration
<double>(end
-start
).count();
2648 dout(1) << "finished manual compaction in "
2650 << " seconds" << dendl
;
2651 f
->open_object_section("compact_result");
2652 f
->dump_float("elapsed_time", duration
);
2654 } else if (prefix
== "get_mapped_pools") {
2655 f
->open_array_section("mapped_pools");
2656 set
<int64_t> poollist
= get_mapped_pools();
2657 for (auto pool
: poollist
) {
2658 f
->dump_int("pool_id", pool
);
2661 } else if (prefix
== "smart") {
2663 cmd_getval(cmdmap
, "devid", devid
);
2665 probe_smart(devid
, out
);
2666 outbl
.append(out
.str());
2667 } else if (prefix
== "list_devices") {
2668 set
<string
> devnames
;
2669 store
->get_devices(&devnames
);
2670 f
->open_array_section("list_devices");
2671 for (auto dev
: devnames
) {
2672 if (dev
.find("dm-") == 0) {
2676 f
->open_object_section("device");
2677 f
->dump_string("device", "/dev/" + dev
);
2678 f
->dump_string("device_id", get_device_id(dev
, &err
));
2682 } else if (prefix
== "send_beacon") {
2683 lock_guard
l(osd_lock
);
2685 send_beacon(ceph::coarse_mono_clock::now());
2689 else if (prefix
== "cluster_log") {
2691 cmd_getval(cmdmap
, "message", msg
);
2694 ss
<< "ignoring empty log message";
2697 string message
= msg
.front();
2698 for (vector
<string
>::iterator a
= ++msg
.begin(); a
!= msg
.end(); ++a
)
2699 message
+= " " + *a
;
2701 cmd_getval(cmdmap
, "level", lvl
);
2702 clog_type level
= string_to_clog_type(lvl
);
2705 ss
<< "unknown level '" << lvl
<< "'";
2708 clog
->do_log(level
, message
);
2711 else if (prefix
== "bench") {
2714 int64_t osize
, onum
;
2715 // default count 1G, size 4MB
2716 cmd_getval(cmdmap
, "count", count
, (int64_t)1 << 30);
2717 cmd_getval(cmdmap
, "size", bsize
, (int64_t)4 << 20);
2718 cmd_getval(cmdmap
, "object_size", osize
, (int64_t)0);
2719 cmd_getval(cmdmap
, "object_num", onum
, (int64_t)0);
2721 uint32_t duration
= cct
->_conf
->osd_bench_duration
;
2723 if (bsize
> (int64_t) cct
->_conf
->osd_bench_max_block_size
) {
2724 // let us limit the block size because the next checks rely on it
2725 // having a sane value. If we allow any block size to be set things
2726 // can still go sideways.
2727 ss
<< "block 'size' values are capped at "
2728 << byte_u_t(cct
->_conf
->osd_bench_max_block_size
) << ". If you wish to use"
2729 << " a higher value, please adjust 'osd_bench_max_block_size'";
2732 } else if (bsize
< (int64_t) (1 << 20)) {
2733 // entering the realm of small block sizes.
2734 // limit the count to a sane value, assuming a configurable amount of
2735 // IOPS and duration, so that the OSD doesn't get hung up on this,
2736 // preventing timeouts from going off
2738 bsize
* duration
* cct
->_conf
->osd_bench_small_size_max_iops
;
2739 if (count
> max_count
) {
2740 ss
<< "'count' values greater than " << max_count
2741 << " for a block size of " << byte_u_t(bsize
) << ", assuming "
2742 << cct
->_conf
->osd_bench_small_size_max_iops
<< " IOPS,"
2743 << " for " << duration
<< " seconds,"
2744 << " can cause ill effects on osd. "
2745 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
2746 << " value if you wish to use a higher 'count'.";
2751 // 1MB block sizes are big enough so that we get more stuff done.
2752 // However, to avoid the osd from getting hung on this and having
2753 // timers being triggered, we are going to limit the count assuming
2754 // a configurable throughput and duration.
2755 // NOTE: max_count is the total amount of bytes that we believe we
2756 // will be able to write during 'duration' for the given
2757 // throughput. The block size hardly impacts this unless it's
2758 // way too big. Given we already check how big the block size
2759 // is, it's safe to assume everything will check out.
2761 cct
->_conf
->osd_bench_large_size_max_throughput
* duration
;
2762 if (count
> max_count
) {
2763 ss
<< "'count' values greater than " << max_count
2764 << " for a block size of " << byte_u_t(bsize
) << ", assuming "
2765 << byte_u_t(cct
->_conf
->osd_bench_large_size_max_throughput
) << "/s,"
2766 << " for " << duration
<< " seconds,"
2767 << " can cause ill effects on osd. "
2768 << " Please adjust 'osd_bench_large_size_max_throughput'"
2769 << " with a higher value if you wish to use a higher 'count'.";
2775 if (osize
&& bsize
> osize
)
2778 dout(1) << " bench count " << count
2779 << " bsize " << byte_u_t(bsize
) << dendl
;
2781 ObjectStore::Transaction cleanupt
;
2783 if (osize
&& onum
) {
2785 bufferptr
bp(osize
);
2787 bl
.push_back(std::move(bp
));
2788 bl
.rebuild_page_aligned();
2789 for (int i
=0; i
<onum
; ++i
) {
2791 snprintf(nm
, sizeof(nm
), "disk_bw_test_%d", i
);
2793 hobject_t
soid(sobject_t(oid
, 0));
2794 ObjectStore::Transaction t
;
2795 t
.write(coll_t(), ghobject_t(soid
), 0, osize
, bl
);
2796 store
->queue_transaction(service
.meta_ch
, std::move(t
), NULL
);
2797 cleanupt
.remove(coll_t(), ghobject_t(soid
));
2802 bufferptr
bp(bsize
);
2804 bl
.push_back(std::move(bp
));
2805 bl
.rebuild_page_aligned();
2809 if (!service
.meta_ch
->flush_commit(&waiter
)) {
2814 utime_t start
= ceph_clock_now();
2815 for (int64_t pos
= 0; pos
< count
; pos
+= bsize
) {
2817 unsigned offset
= 0;
2818 if (onum
&& osize
) {
2819 snprintf(nm
, sizeof(nm
), "disk_bw_test_%d", (int)(rand() % onum
));
2820 offset
= rand() % (osize
/ bsize
) * bsize
;
2822 snprintf(nm
, sizeof(nm
), "disk_bw_test_%lld", (long long)pos
);
2825 hobject_t
soid(sobject_t(oid
, 0));
2826 ObjectStore::Transaction t
;
2827 t
.write(coll_t::meta(), ghobject_t(soid
), offset
, bsize
, bl
);
2828 store
->queue_transaction(service
.meta_ch
, std::move(t
), NULL
);
2829 if (!onum
|| !osize
)
2830 cleanupt
.remove(coll_t::meta(), ghobject_t(soid
));
2835 if (!service
.meta_ch
->flush_commit(&waiter
)) {
2839 utime_t end
= ceph_clock_now();
2842 store
->queue_transaction(service
.meta_ch
, std::move(cleanupt
), NULL
);
2845 if (!service
.meta_ch
->flush_commit(&waiter
)) {
2850 double elapsed
= end
- start
;
2851 double rate
= count
/ elapsed
;
2852 double iops
= rate
/ bsize
;
2853 f
->open_object_section("osd_bench_results");
2854 f
->dump_int("bytes_written", count
);
2855 f
->dump_int("blocksize", bsize
);
2856 f
->dump_float("elapsed_sec", elapsed
);
2857 f
->dump_float("bytes_per_sec", rate
);
2858 f
->dump_float("iops", iops
);
2862 else if (prefix
== "flush_pg_stats") {
2863 mgrc
.send_pgstats();
2864 f
->dump_unsigned("stat_seq", service
.get_osd_stat_seq());
2867 else if (prefix
== "heap") {
2868 ret
= ceph::osd_cmds::heap(*cct
, cmdmap
, *f
, ss
);
2871 else if (prefix
== "debug dump_missing") {
2872 f
->open_array_section("pgs");
2875 for (auto& pg
: pgs
) {
2876 string s
= stringify(pg
->pg_id
);
2877 f
->open_array_section(s
.c_str());
2879 pg
->dump_missing(f
);
2886 else if (prefix
== "debug kick_recovery_wq") {
2888 cmd_getval(cmdmap
, "delay", delay
);
2891 ret
= cct
->_conf
.set_val("osd_recovery_delay_start", oss
.str().c_str());
2893 ss
<< "kick_recovery_wq: error setting "
2894 << "osd_recovery_delay_start to '" << delay
<< "': error "
2898 cct
->_conf
.apply_changes(nullptr);
2899 ss
<< "kicking recovery queue. set osd_recovery_delay_start "
2900 << "to " << cct
->_conf
->osd_recovery_delay_start
;
2903 else if (prefix
== "cpu_profiler") {
2906 cmd_getval(cmdmap
, "arg", arg
);
2907 vector
<string
> argvec
;
2908 get_str_vec(arg
, argvec
);
2909 cpu_profiler_handle_command(argvec
, ds
);
2910 outbl
.append(ds
.str());
2913 else if (prefix
== "dump_pg_recovery_stats") {
2914 lock_guard
l(osd_lock
);
2915 pg_recovery_stats
.dump_formatted(f
);
2918 else if (prefix
== "reset_pg_recovery_stats") {
2919 lock_guard
l(osd_lock
);
2920 pg_recovery_stats
.reset();
2923 else if (prefix
== "perf histogram dump") {
2925 std::string counter
;
2926 cmd_getval(cmdmap
, "logger", logger
);
2927 cmd_getval(cmdmap
, "counter", counter
);
2928 cct
->get_perfcounters_collection()->dump_formatted_histograms(
2929 f
, false, logger
, counter
);
2932 else if (prefix
== "cache drop") {
2933 lock_guard
l(osd_lock
);
2934 dout(20) << "clearing all caches" << dendl
;
2935 // Clear the objectstore's cache - onode and buffer for Bluestore,
2936 // system's pagecache for Filestore
2937 ret
= store
->flush_cache(&ss
);
2939 ss
<< "Error flushing objectstore cache: " << cpp_strerror(ret
);
2942 // Clear the objectcontext cache (per PG)
2945 for (auto& pg
: pgs
) {
2950 else if (prefix
== "cache status") {
2951 lock_guard
l(osd_lock
);
2952 int obj_ctx_count
= 0;
2955 for (auto& pg
: pgs
) {
2956 obj_ctx_count
+= pg
->get_cache_obj_count();
2958 f
->open_object_section("cache_status");
2959 f
->dump_int("object_ctx", obj_ctx_count
);
2960 store
->dump_cache_stats(f
);
2964 else if (prefix
== "scrub_purged_snaps") {
2965 lock_guard
l(osd_lock
);
2966 scrub_purged_snaps();
2969 else if (prefix
== "dump_osd_network") {
2970 lock_guard
l(osd_lock
);
2972 if (!(cmd_getval(cmdmap
, "value", value
))) {
2973 // Convert milliseconds to microseconds
2974 value
= static_cast<double>(g_conf().get_val
<double>(
2975 "mon_warn_on_slow_ping_time")) * 1000;
2977 double ratio
= g_conf().get_val
<double>("mon_warn_on_slow_ping_ratio");
2978 value
= g_conf().get_val
<int64_t>("osd_heartbeat_grace");
2979 value
*= 1000000 * ratio
; // Seconds of grace to microseconds at ratio
2982 // Convert user input to microseconds
2985 if (value
< 0) value
= 0;
2987 struct osd_ping_time_t
{
2991 std::array
<uint32_t,3> times
;
2992 std::array
<uint32_t,3> min
;
2993 std::array
<uint32_t,3> max
;
2995 uint32_t last_update
;
2997 bool operator<(const osd_ping_time_t
& rhs
) const {
2998 if (pingtime
< rhs
.pingtime
)
3000 if (pingtime
> rhs
.pingtime
)
3010 set
<osd_ping_time_t
> sorted
;
3011 // Get pingtimes under lock and not on the stack
3012 map
<int, osd_stat_t::Interfaces
> *pingtimes
= new map
<int, osd_stat_t::Interfaces
>;
3013 service
.get_hb_pingtime(pingtimes
);
3014 for (auto j
: *pingtimes
) {
3015 if (j
.second
.last_update
== 0)
3017 osd_ping_time_t item
;
3018 item
.pingtime
= std::max(j
.second
.back_pingtime
[0], j
.second
.back_pingtime
[1]);
3019 item
.pingtime
= std::max(item
.pingtime
, j
.second
.back_pingtime
[2]);
3020 if (item
.pingtime
>= value
) {
3022 item
.times
[0] = j
.second
.back_pingtime
[0];
3023 item
.times
[1] = j
.second
.back_pingtime
[1];
3024 item
.times
[2] = j
.second
.back_pingtime
[2];
3025 item
.min
[0] = j
.second
.back_min
[0];
3026 item
.min
[1] = j
.second
.back_min
[1];
3027 item
.min
[2] = j
.second
.back_min
[2];
3028 item
.max
[0] = j
.second
.back_max
[0];
3029 item
.max
[1] = j
.second
.back_max
[1];
3030 item
.max
[2] = j
.second
.back_max
[2];
3031 item
.last
= j
.second
.back_last
;
3033 item
.last_update
= j
.second
.last_update
;
3034 sorted
.emplace(item
);
3036 if (j
.second
.front_last
== 0)
3038 item
.pingtime
= std::max(j
.second
.front_pingtime
[0], j
.second
.front_pingtime
[1]);
3039 item
.pingtime
= std::max(item
.pingtime
, j
.second
.front_pingtime
[2]);
3040 if (item
.pingtime
>= value
) {
3042 item
.times
[0] = j
.second
.front_pingtime
[0];
3043 item
.times
[1] = j
.second
.front_pingtime
[1];
3044 item
.times
[2] = j
.second
.front_pingtime
[2];
3045 item
.min
[0] = j
.second
.front_min
[0];
3046 item
.min
[1] = j
.second
.front_min
[1];
3047 item
.min
[2] = j
.second
.front_min
[2];
3048 item
.max
[0] = j
.second
.front_max
[0];
3049 item
.max
[1] = j
.second
.front_max
[1];
3050 item
.max
[2] = j
.second
.front_max
[2];
3051 item
.last
= j
.second
.front_last
;
3052 item
.last_update
= j
.second
.last_update
;
3054 sorted
.emplace(item
);
3059 // Network ping times (1min 5min 15min)
3060 f
->open_object_section("network_ping_times");
3061 f
->dump_int("threshold", value
/ 1000);
3062 f
->open_array_section("entries");
3063 for (auto &sitem
: boost::adaptors::reverse(sorted
)) {
3064 ceph_assert(sitem
.pingtime
>= value
);
3065 f
->open_object_section("entry");
3067 const time_t lu(sitem
.last_update
);
3069 string
lustr(ctime_r(&lu
, buffer
));
3070 lustr
.pop_back(); // Remove trailing \n
3071 auto stale
= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_stale");
3072 f
->dump_string("last update", lustr
);
3073 f
->dump_bool("stale", ceph_clock_now().sec() - sitem
.last_update
> stale
);
3074 f
->dump_int("from osd", whoami
);
3075 f
->dump_int("to osd", sitem
.to
);
3076 f
->dump_string("interface", (sitem
.back
? "back" : "front"));
3077 f
->open_object_section("average");
3078 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.times
[0],3).c_str());
3079 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.times
[1],3).c_str());
3080 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.times
[2],3).c_str());
3081 f
->close_section(); // average
3082 f
->open_object_section("min");
3083 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.max
[0],3).c_str());
3084 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.max
[1],3).c_str());
3085 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.max
[2],3).c_str());
3086 f
->close_section(); // min
3087 f
->open_object_section("max");
3088 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.max
[0],3).c_str());
3089 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.max
[1],3).c_str());
3090 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.max
[2],3).c_str());
3091 f
->close_section(); // max
3092 f
->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem
.last
,3).c_str());
3093 f
->close_section(); // entry
3095 f
->close_section(); // entries
3096 f
->close_section(); // network_ping_times
3098 ceph_abort_msg("broken asok registration");
3102 on_finish(ret
, ss
.str(), outbl
);
3105 class TestOpsSocketHook
: public AdminSocketHook
{
3106 OSDService
*service
;
3109 TestOpsSocketHook(OSDService
*s
, ObjectStore
*st
) : service(s
), store(st
) {}
3110 int call(std::string_view command
, const cmdmap_t
& cmdmap
,
3112 std::ostream
& errss
,
3113 bufferlist
& out
) override
{
3117 test_ops(service
, store
, command
, cmdmap
, outss
);
3119 } catch (const TOPNSPC::common::bad_cmd_get
& e
) {
3125 void test_ops(OSDService
*service
, ObjectStore
*store
,
3126 std::string_view command
, const cmdmap_t
& cmdmap
, ostream
&ss
);
3130 class OSD::C_Tick
: public Context
{
3133 explicit C_Tick(OSD
*o
) : osd(o
) {}
3134 void finish(int r
) override
{
3139 class OSD::C_Tick_WithoutOSDLock
: public Context
{
3142 explicit C_Tick_WithoutOSDLock(OSD
*o
) : osd(o
) {}
3143 void finish(int r
) override
{
3144 osd
->tick_without_osd_lock();
3148 int OSD::enable_disable_fuse(bool stop
)
3152 string mntpath
= cct
->_conf
->osd_data
+ "/fuse";
3153 if (fuse_store
&& (stop
|| !cct
->_conf
->osd_objectstore_fuse
)) {
3154 dout(1) << __func__
<< " disabling" << dendl
;
3158 r
= ::rmdir(mntpath
.c_str());
3161 derr
<< __func__
<< " failed to rmdir " << mntpath
<< ": "
3162 << cpp_strerror(r
) << dendl
;
3167 if (!fuse_store
&& cct
->_conf
->osd_objectstore_fuse
) {
3168 dout(1) << __func__
<< " enabling" << dendl
;
3169 r
= ::mkdir(mntpath
.c_str(), 0700);
3172 if (r
< 0 && r
!= -EEXIST
) {
3173 derr
<< __func__
<< " unable to create " << mntpath
<< ": "
3174 << cpp_strerror(r
) << dendl
;
3177 fuse_store
= new FuseStore(store
, mntpath
);
3178 r
= fuse_store
->start();
3180 derr
<< __func__
<< " unable to start fuse: " << cpp_strerror(r
) << dendl
;
3186 #endif // HAVE_LIBFUSE
3190 size_t OSD::get_num_cache_shards()
3192 return cct
->_conf
.get_val
<Option::size_t>("osd_num_cache_shards");
3195 int OSD::get_num_op_shards()
3197 if (cct
->_conf
->osd_op_num_shards
)
3198 return cct
->_conf
->osd_op_num_shards
;
3199 if (store_is_rotational
)
3200 return cct
->_conf
->osd_op_num_shards_hdd
;
3202 return cct
->_conf
->osd_op_num_shards_ssd
;
3205 int OSD::get_num_op_threads()
3207 if (cct
->_conf
->osd_op_num_threads_per_shard
)
3208 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard
;
3209 if (store_is_rotational
)
3210 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard_hdd
;
3212 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard_ssd
;
3215 float OSD::get_osd_recovery_sleep()
3217 if (cct
->_conf
->osd_recovery_sleep
)
3218 return cct
->_conf
->osd_recovery_sleep
;
3219 if (!store_is_rotational
&& !journal_is_rotational
)
3220 return cct
->_conf
->osd_recovery_sleep_ssd
;
3221 else if (store_is_rotational
&& !journal_is_rotational
)
3222 return cct
->_conf
.get_val
<double>("osd_recovery_sleep_hybrid");
3224 return cct
->_conf
->osd_recovery_sleep_hdd
;
3227 float OSD::get_osd_delete_sleep()
3229 float osd_delete_sleep
= cct
->_conf
.get_val
<double>("osd_delete_sleep");
3230 if (osd_delete_sleep
> 0)
3231 return osd_delete_sleep
;
3232 if (!store_is_rotational
&& !journal_is_rotational
)
3233 return cct
->_conf
.get_val
<double>("osd_delete_sleep_ssd");
3234 if (store_is_rotational
&& !journal_is_rotational
)
3235 return cct
->_conf
.get_val
<double>("osd_delete_sleep_hybrid");
3236 return cct
->_conf
.get_val
<double>("osd_delete_sleep_hdd");
3239 int OSD::get_recovery_max_active()
3241 if (cct
->_conf
->osd_recovery_max_active
)
3242 return cct
->_conf
->osd_recovery_max_active
;
3243 if (store_is_rotational
)
3244 return cct
->_conf
->osd_recovery_max_active_hdd
;
3246 return cct
->_conf
->osd_recovery_max_active_ssd
;
3249 float OSD::get_osd_snap_trim_sleep()
3251 float osd_snap_trim_sleep
= cct
->_conf
.get_val
<double>("osd_snap_trim_sleep");
3252 if (osd_snap_trim_sleep
> 0)
3253 return osd_snap_trim_sleep
;
3254 if (!store_is_rotational
&& !journal_is_rotational
)
3255 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_ssd");
3256 if (store_is_rotational
&& !journal_is_rotational
)
3257 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_hybrid");
3258 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_hdd");
3264 CompatSet initial
, diff
;
3265 std::lock_guard
lock(osd_lock
);
3270 tick_timer_without_osd_lock
.init();
3271 service
.recovery_request_timer
.init();
3272 service
.sleep_timer
.init();
3274 boot_finisher
.start();
3278 store
->read_meta("require_osd_release", &val
);
3279 last_require_osd_release
= ceph_release_from_name(val
);
3283 dout(2) << "init " << dev_path
3284 << " (looks like " << (store_is_rotational
? "hdd" : "ssd") << ")"
3286 dout(2) << "journal " << journal_path
<< dendl
;
3287 ceph_assert(store
); // call pre_init() first!
3289 store
->set_cache_shards(get_num_cache_shards());
3291 int r
= store
->mount();
3293 derr
<< "OSD:init: unable to mount object store" << dendl
;
3296 journal_is_rotational
= store
->is_journal_rotational();
3297 dout(2) << "journal looks like " << (journal_is_rotational
? "hdd" : "ssd")
3300 enable_disable_fuse(false);
3302 dout(2) << "boot" << dendl
;
3304 service
.meta_ch
= store
->open_collection(coll_t::meta());
3306 // initialize the daily loadavg with current 15min loadavg
3308 if (getloadavg(loadavgs
, 3) == 3) {
3309 daily_loadavg
= loadavgs
[2];
3311 derr
<< "OSD::init() : couldn't read loadavgs\n" << dendl
;
3312 daily_loadavg
= 1.0;
3315 int rotating_auth_attempts
= 0;
3316 auto rotating_auth_timeout
=
3317 g_conf().get_val
<int64_t>("rotating_keys_bootstrap_timeout");
3319 // sanity check long object name handling
3322 l
.oid
.name
= string(cct
->_conf
->osd_max_object_name_len
, 'n');
3323 l
.set_key(string(cct
->_conf
->osd_max_object_name_len
, 'k'));
3324 l
.nspace
= string(cct
->_conf
->osd_max_object_namespace_len
, 's');
3325 r
= store
->validate_hobject_key(l
);
3327 derr
<< "backend (" << store
->get_type() << ") is unable to support max "
3328 << "object name[space] len" << dendl
;
3329 derr
<< " osd max object name len = "
3330 << cct
->_conf
->osd_max_object_name_len
<< dendl
;
3331 derr
<< " osd max object namespace len = "
3332 << cct
->_conf
->osd_max_object_namespace_len
<< dendl
;
3333 derr
<< cpp_strerror(r
) << dendl
;
3334 if (cct
->_conf
->osd_check_max_object_name_len_on_startup
) {
3337 derr
<< "osd_check_max_object_name_len_on_startup = false, starting anyway"
3340 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl
;
3345 r
= read_superblock();
3347 derr
<< "OSD::init() : unable to read osd superblock" << dendl
;
3352 if (osd_compat
.compare(superblock
.compat_features
) < 0) {
3353 derr
<< "The disk uses features unsupported by the executable." << dendl
;
3354 derr
<< " ondisk features " << superblock
.compat_features
<< dendl
;
3355 derr
<< " daemon features " << osd_compat
<< dendl
;
3357 if (osd_compat
.writeable(superblock
.compat_features
)) {
3358 CompatSet diff
= osd_compat
.unsupported(superblock
.compat_features
);
3359 derr
<< "it is still writeable, though. Missing features: " << diff
<< dendl
;
3364 CompatSet diff
= osd_compat
.unsupported(superblock
.compat_features
);
3365 derr
<< "Cannot write to disk! Missing features: " << diff
<< dendl
;
3371 assert_warn(whoami
== superblock
.whoami
);
3372 if (whoami
!= superblock
.whoami
) {
3373 derr
<< "OSD::init: superblock says osd"
3374 << superblock
.whoami
<< " but I am osd." << whoami
<< dendl
;
3379 startup_time
= ceph::mono_clock::now();
3381 // load up "current" osdmap
3382 assert_warn(!get_osdmap());
3384 derr
<< "OSD::init: unable to read current osdmap" << dendl
;
3388 osdmap
= get_map(superblock
.current_epoch
);
3391 // make sure we don't have legacy pgs deleting
3394 int r
= store
->list_collections(ls
);
3395 ceph_assert(r
>= 0);
3398 if (c
.is_pg(&pgid
) &&
3399 !osdmap
->have_pg_pool(pgid
.pool())) {
3400 ghobject_t oid
= make_final_pool_info_oid(pgid
.pool());
3401 if (!store
->exists(service
.meta_ch
, oid
)) {
3402 derr
<< __func__
<< " missing pg_pool_t for deleted pool "
3403 << pgid
.pool() << " for pg " << pgid
3404 << "; please downgrade to luminous and allow "
3405 << "pg deletion to complete before upgrading" << dendl
;
3412 initial
= get_osd_initial_compat_set();
3413 diff
= superblock
.compat_features
.unsupported(initial
);
3414 if (superblock
.compat_features
.merge(initial
)) {
3415 // Are we adding SNAPMAPPER2?
3416 if (diff
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2
)) {
3417 dout(1) << __func__
<< " upgrade snap_mapper (first start as octopus)"
3419 auto ch
= service
.meta_ch
;
3420 auto hoid
= make_snapmapper_oid();
3421 unsigned max
= cct
->_conf
->osd_target_transaction_size
;
3422 r
= SnapMapper::convert_legacy(cct
, store
, ch
, hoid
, max
);
3426 // We need to persist the new compat_set before we
3428 dout(5) << "Upgrading superblock adding: " << diff
<< dendl
;
3429 ObjectStore::Transaction t
;
3430 write_superblock(t
);
3431 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3436 // make sure snap mapper object exists
3437 if (!store
->exists(service
.meta_ch
, OSD::make_snapmapper_oid())) {
3438 dout(10) << "init creating/touching snapmapper object" << dendl
;
3439 ObjectStore::Transaction t
;
3440 t
.touch(coll_t::meta(), OSD::make_snapmapper_oid());
3441 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3445 if (!store
->exists(service
.meta_ch
, OSD::make_purged_snaps_oid())) {
3446 dout(10) << "init creating/touching purged_snaps object" << dendl
;
3447 ObjectStore::Transaction t
;
3448 t
.touch(coll_t::meta(), OSD::make_purged_snaps_oid());
3449 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3454 if (cct
->_conf
->osd_open_classes_on_start
) {
3455 int r
= ClassHandler::get_instance().open_all_classes();
3457 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r
) << dendl
;
3460 check_osdmap_features();
3462 create_recoverystate_perf();
3465 epoch_t bind_epoch
= osdmap
->get_epoch();
3466 service
.set_epochs(NULL
, NULL
, &bind_epoch
);
3469 clear_temp_objects();
3471 // initialize osdmap references in sharded wq
3472 for (auto& shard
: shards
) {
3473 std::lock_guard
l(shard
->osdmap_lock
);
3474 shard
->shard_osdmap
= osdmap
;
3477 // load up pgs (as they previously existed)
3480 dout(2) << "superblock: I am osd." << superblock
.whoami
<< dendl
;
3486 struct store_statfs_t stbuf
;
3487 osd_alert_list_t alerts
;
3488 int r
= store
->statfs(&stbuf
, &alerts
);
3489 ceph_assert(r
== 0);
3490 service
.set_statfs(stbuf
, alerts
);
3493 // client_messenger auth_client is already set up by monc.
3494 for (auto m
: { cluster_messenger
,
3496 hb_front_client_messenger
,
3497 hb_back_client_messenger
,
3498 hb_front_server_messenger
,
3499 hb_back_server_messenger
} ) {
3500 m
->set_auth_client(monc
);
3502 for (auto m
: { client_messenger
,
3504 hb_front_server_messenger
,
3505 hb_back_server_messenger
}) {
3506 m
->set_auth_server(monc
);
3508 monc
->set_handle_authentication_dispatcher(this);
3510 monc
->set_want_keys(CEPH_ENTITY_TYPE_MON
| CEPH_ENTITY_TYPE_OSD
3511 | CEPH_ENTITY_TYPE_MGR
);
3516 mgrc
.set_pgstats_cb([this](){ return collect_pg_stats(); });
3517 mgrc
.set_perf_metric_query_cb(
3518 [this](const ConfigPayload
&config_payload
) {
3519 set_perf_queries(config_payload
);
3522 return get_perf_reports();
3526 // tell monc about log_client so it will know about mon session resets
3527 monc
->set_log_client(&log_client
);
3528 update_log_config();
3531 client_messenger
->add_dispatcher_tail(&mgrc
);
3532 client_messenger
->add_dispatcher_tail(this);
3533 cluster_messenger
->add_dispatcher_head(this);
3535 hb_front_client_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3536 hb_back_client_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3537 hb_front_server_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3538 hb_back_server_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3540 objecter_messenger
->add_dispatcher_head(service
.objecter
.get());
3543 service
.publish_map(osdmap
);
3544 service
.publish_superblock(superblock
);
3545 service
.max_oldest_map
= superblock
.oldest_map
;
3547 for (auto& shard
: shards
) {
3548 // put PGs in a temporary set because we may modify pg_slots
3549 // unordered_map below.
3551 for (auto& i
: shard
->pg_slots
) {
3552 PGRef pg
= i
.second
->pg
;
3558 for (auto pg
: pgs
) {
3559 std::scoped_lock l
{*pg
};
3560 set
<pair
<spg_t
,epoch_t
>> new_children
;
3561 set
<pair
<spg_t
,epoch_t
>> merge_pgs
;
3562 service
.identify_splits_and_merges(pg
->get_osdmap(), osdmap
, pg
->pg_id
,
3563 &new_children
, &merge_pgs
);
3564 if (!new_children
.empty()) {
3565 for (auto shard
: shards
) {
3566 shard
->prime_splits(osdmap
, &new_children
);
3568 assert(new_children
.empty());
3570 if (!merge_pgs
.empty()) {
3571 for (auto shard
: shards
) {
3572 shard
->prime_merges(osdmap
, &merge_pgs
);
3574 assert(merge_pgs
.empty());
3581 // start the heartbeat
3582 heartbeat_thread
.create("osd_srv_heartbt");
3585 tick_timer
.add_event_after(get_tick_interval(),
3588 std::lock_guard
l(tick_timer_lock
);
3589 tick_timer_without_osd_lock
.add_event_after(get_tick_interval(),
3590 new C_Tick_WithoutOSDLock(this));
3595 r
= monc
->authenticate();
3597 derr
<< __func__
<< " authentication failed: " << cpp_strerror(r
)
3602 while (monc
->wait_auth_rotating(rotating_auth_timeout
) < 0) {
3603 derr
<< "unable to obtain rotating service keys; retrying" << dendl
;
3604 ++rotating_auth_attempts
;
3605 if (rotating_auth_attempts
> g_conf()->max_rotating_auth_attempts
) {
3606 derr
<< __func__
<< " wait_auth_rotating timed out" << dendl
;
3611 r
= update_crush_device_class();
3613 derr
<< __func__
<< " unable to update_crush_device_class: "
3614 << cpp_strerror(r
) << dendl
;
3618 r
= update_crush_location();
3620 derr
<< __func__
<< " unable to update_crush_location: "
3621 << cpp_strerror(r
) << dendl
;
3629 // start objecter *after* we have authenticated, so that we don't ignore
3630 // the OSDMaps it requests.
3631 service
.final_init();
3635 dout(10) << "ensuring pgs have consumed prior maps" << dendl
;
3638 dout(0) << "done with init, starting boot process" << dendl
;
3640 // subscribe to any pg creations
3641 monc
->sub_want("osd_pg_creates", last_pg_create_epoch
, 0);
3643 // MgrClient needs this (it doesn't have MonClient reference itself)
3644 monc
->sub_want("mgrmap", 0, 0);
3646 // we don't need to ask for an osdmap here; objecter will
3647 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
3656 enable_disable_fuse(true);
3663 void OSD::final_init()
3665 AdminSocket
*admin_socket
= cct
->get_admin_socket();
3666 asok_hook
= new OSDSocketHook(this);
3667 int r
= admin_socket
->register_command("status", asok_hook
,
3668 "high-level status of OSD");
3669 ceph_assert(r
== 0);
3670 r
= admin_socket
->register_command("flush_journal",
3672 "flush the journal to permanent store");
3673 ceph_assert(r
== 0);
3674 r
= admin_socket
->register_command("dump_ops_in_flight " \
3675 "name=filterstr,type=CephString,n=N,req=false",
3677 "show the ops currently in flight");
3678 ceph_assert(r
== 0);
3679 r
= admin_socket
->register_command("ops " \
3680 "name=filterstr,type=CephString,n=N,req=false",
3682 "show the ops currently in flight");
3683 ceph_assert(r
== 0);
3684 r
= admin_socket
->register_command("dump_blocked_ops " \
3685 "name=filterstr,type=CephString,n=N,req=false",
3687 "show the blocked ops currently in flight");
3688 ceph_assert(r
== 0);
3689 r
= admin_socket
->register_command("dump_historic_ops " \
3690 "name=filterstr,type=CephString,n=N,req=false",
3693 ceph_assert(r
== 0);
3694 r
= admin_socket
->register_command("dump_historic_slow_ops " \
3695 "name=filterstr,type=CephString,n=N,req=false",
3697 "show slowest recent ops");
3698 ceph_assert(r
== 0);
3699 r
= admin_socket
->register_command("dump_historic_ops_by_duration " \
3700 "name=filterstr,type=CephString,n=N,req=false",
3702 "show slowest recent ops, sorted by duration");
3703 ceph_assert(r
== 0);
3704 r
= admin_socket
->register_command("dump_op_pq_state",
3706 "dump op priority queue state");
3707 ceph_assert(r
== 0);
3708 r
= admin_socket
->register_command("dump_blacklist",
3710 "dump blacklisted clients and times");
3711 ceph_assert(r
== 0);
3712 r
= admin_socket
->register_command("dump_watchers",
3714 "show clients which have active watches,"
3715 " and on which objects");
3716 ceph_assert(r
== 0);
3717 r
= admin_socket
->register_command("dump_recovery_reservations",
3719 "show recovery reservations");
3720 ceph_assert(r
== 0);
3721 r
= admin_socket
->register_command("dump_scrub_reservations",
3723 "show scrub reservations");
3724 ceph_assert(r
== 0);
3725 r
= admin_socket
->register_command("get_latest_osdmap",
3727 "force osd to update the latest map from "
3729 ceph_assert(r
== 0);
3731 r
= admin_socket
->register_command("set_heap_property " \
3732 "name=property,type=CephString " \
3733 "name=value,type=CephInt",
3735 "update malloc extension heap property");
3736 ceph_assert(r
== 0);
3738 r
= admin_socket
->register_command("get_heap_property " \
3739 "name=property,type=CephString",
3741 "get malloc extension heap property");
3742 ceph_assert(r
== 0);
3744 r
= admin_socket
->register_command("dump_objectstore_kv_stats",
3746 "print statistics of kvdb which used by bluestore");
3747 ceph_assert(r
== 0);
3749 r
= admin_socket
->register_command("dump_scrubs",
3751 "print scheduled scrubs");
3752 ceph_assert(r
== 0);
3754 r
= admin_socket
->register_command("calc_objectstore_db_histogram",
3756 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
3757 ceph_assert(r
== 0);
3759 r
= admin_socket
->register_command("flush_store_cache",
3761 "Flush bluestore internal cache");
3762 ceph_assert(r
== 0);
3763 r
= admin_socket
->register_command("dump_pgstate_history",
3765 "show recent state history");
3766 ceph_assert(r
== 0);
3768 r
= admin_socket
->register_command("compact",
3770 "Commpact object store's omap."
3771 " WARNING: Compaction probably slows your requests");
3772 ceph_assert(r
== 0);
3774 r
= admin_socket
->register_command("get_mapped_pools",
3776 "dump pools whose PG(s) are mapped to this OSD.");
3778 ceph_assert(r
== 0);
3780 r
= admin_socket
->register_command("smart name=devid,type=CephString,req=false",
3782 "probe OSD devices for SMART data.");
3784 ceph_assert(r
== 0);
3786 r
= admin_socket
->register_command("list_devices",
3788 "list OSD devices.");
3789 r
= admin_socket
->register_command("send_beacon",
3791 "send OSD beacon to mon immediately");
3793 r
= admin_socket
->register_command(
3794 "dump_osd_network name=value,type=CephInt,req=false", asok_hook
,
3795 "Dump osd heartbeat network ping times");
3796 ceph_assert(r
== 0);
3798 test_ops_hook
= new TestOpsSocketHook(&(this->service
), this->store
);
3799 // Note: pools are CephString instead of CephPoolname because
3800 // these commands traditionally support both pool names and numbers
3801 r
= admin_socket
->register_command(
3803 "name=pool,type=CephString " \
3804 "name=objname,type=CephObjectname " \
3805 "name=key,type=CephString "\
3806 "name=val,type=CephString",
3809 ceph_assert(r
== 0);
3810 r
= admin_socket
->register_command(
3812 "name=pool,type=CephString " \
3813 "name=objname,type=CephObjectname " \
3814 "name=key,type=CephString",
3817 ceph_assert(r
== 0);
3818 r
= admin_socket
->register_command(
3820 "name=pool,type=CephString " \
3821 "name=objname,type=CephObjectname " \
3822 "name=header,type=CephString",
3825 ceph_assert(r
== 0);
3827 r
= admin_socket
->register_command(
3829 "name=pool,type=CephString " \
3830 "name=objname,type=CephObjectname",
3832 "output entire object map");
3833 ceph_assert(r
== 0);
3835 r
= admin_socket
->register_command(
3837 "name=pool,type=CephString " \
3838 "name=objname,type=CephObjectname " \
3839 "name=len,type=CephInt",
3841 "truncate object to length");
3842 ceph_assert(r
== 0);
3844 r
= admin_socket
->register_command(
3846 "name=pool,type=CephString " \
3847 "name=objname,type=CephObjectname " \
3848 "name=shardid,type=CephInt,req=false,range=0|255",
3850 "inject data error to an object");
3851 ceph_assert(r
== 0);
3853 r
= admin_socket
->register_command(
3855 "name=pool,type=CephString " \
3856 "name=objname,type=CephObjectname " \
3857 "name=shardid,type=CephInt,req=false,range=0|255",
3859 "inject metadata error to an object");
3860 ceph_assert(r
== 0);
3861 r
= admin_socket
->register_command(
3862 "set_recovery_delay " \
3863 "name=utime,type=CephInt,req=false",
3865 "Delay osd recovery by specified seconds");
3866 ceph_assert(r
== 0);
3867 r
= admin_socket
->register_command(
3869 "name=type,type=CephString,req=false " \
3870 "name=count,type=CephInt,req=false ",
3872 "Inject a full disk (optional count times)");
3873 ceph_assert(r
== 0);
3874 r
= admin_socket
->register_command(
3876 "name=count,type=CephInt,req=false " \
3877 "name=size,type=CephInt,req=false " \
3878 "name=object_size,type=CephInt,req=false " \
3879 "name=object_num,type=CephInt,req=false ",
3881 "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
3882 "(default count=1G default size=4MB). Results in log.");
3883 ceph_assert(r
== 0);
3884 r
= admin_socket
->register_command(
3886 "name=level,type=CephChoices,strings=error,warning,info,debug " \
3887 "name=message,type=CephString,n=N",
3889 "log a message to the cluster log");
3890 ceph_assert(r
== 0);
3891 r
= admin_socket
->register_command(
3895 ceph_assert(r
== 0);
3896 r
= admin_socket
->register_command(
3898 "name=heapcmd,type=CephChoices,strings=" \
3899 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
3900 "name=value,type=CephString,req=false",
3902 "show heap usage info (available only if compiled with tcmalloc)");
3903 ceph_assert(r
== 0);
3904 r
= admin_socket
->register_command(
3905 "debug dump_missing " \
3906 "name=filename,type=CephFilepath",
3908 "dump missing objects to a named file");
3909 ceph_assert(r
== 0);
3910 r
= admin_socket
->register_command(
3911 "debug kick_recovery_wq " \
3912 "name=delay,type=CephInt,range=0",
3914 "set osd_recovery_delay_start to <val>");
3915 ceph_assert(r
== 0);
3916 r
= admin_socket
->register_command(
3918 "name=arg,type=CephChoices,strings=status|flush",
3920 "run cpu profiling on daemon");
3921 ceph_assert(r
== 0);
3922 r
= admin_socket
->register_command(
3923 "dump_pg_recovery_stats",
3925 "dump pg recovery statistics");
3926 ceph_assert(r
== 0);
3927 r
= admin_socket
->register_command(
3928 "reset_pg_recovery_stats",
3930 "reset pg recovery statistics");
3931 ceph_assert(r
== 0);
3932 r
= admin_socket
->register_command(
3935 "Drop all OSD caches");
3936 ceph_assert(r
== 0);
3937 r
= admin_socket
->register_command(
3940 "Get OSD caches statistics");
3941 ceph_assert(r
== 0);
3942 r
= admin_socket
->register_command(
3943 "scrub_purged_snaps",
3945 "Scrub purged_snaps vs snapmapper index");
3946 ceph_assert(r
== 0);
3948 // -- pg commands --
3949 // old form: ceph pg <pgid> command ...
3950 r
= admin_socket
->register_command(
3952 "name=pgid,type=CephPgid " \
3953 "name=cmd,type=CephChoices,strings=query",
3956 ceph_assert(r
== 0);
3957 r
= admin_socket
->register_command(
3959 "name=pgid,type=CephPgid " \
3960 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
3961 "name=mulcmd,type=CephChoices,strings=revert|delete",
3964 ceph_assert(r
== 0);
3965 r
= admin_socket
->register_command(
3967 "name=pgid,type=CephPgid " \
3968 "name=cmd,type=CephChoices,strings=list_unfound " \
3969 "name=offset,type=CephString,req=false",
3972 ceph_assert(r
== 0);
3973 r
= admin_socket
->register_command(
3975 "name=pgid,type=CephPgid " \
3976 "name=cmd,type=CephChoices,strings=scrub " \
3977 "name=time,type=CephInt,req=false",
3980 ceph_assert(r
== 0);
3981 r
= admin_socket
->register_command(
3983 "name=pgid,type=CephPgid " \
3984 "name=cmd,type=CephChoices,strings=deep_scrub " \
3985 "name=time,type=CephInt,req=false",
3988 ceph_assert(r
== 0);
3989 // new form: tell <pgid> <cmd> for both cli and rest
3990 r
= admin_socket
->register_command(
3993 "show details of a specific pg");
3994 ceph_assert(r
== 0);
3995 r
= admin_socket
->register_command(
3996 "mark_unfound_lost " \
3997 "name=pgid,type=CephPgid,req=false " \
3998 "name=mulcmd,type=CephChoices,strings=revert|delete",
4000 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available");
4001 ceph_assert(r
== 0);
4002 r
= admin_socket
->register_command(
4004 "name=pgid,type=CephPgid,req=false " \
4005 "name=offset,type=CephString,req=false",
4007 "list unfound objects on this pg, perhaps starting at an offset given in JSON");
4008 ceph_assert(r
== 0);
4009 r
= admin_socket
->register_command(
4011 "name=pgid,type=CephPgid,req=false " \
4012 "name=time,type=CephInt,req=false",
4014 "Trigger a scheduled scrub ");
4015 ceph_assert(r
== 0);
4016 r
= admin_socket
->register_command(
4018 "name=pgid,type=CephPgid,req=false " \
4019 "name=time,type=CephInt,req=false",
4021 "Trigger a scheduled deep scrub ");
4022 ceph_assert(r
== 0);
4025 void OSD::create_logger()
4027 dout(10) << "create_logger" << dendl
;
4029 logger
= build_osd_logger(cct
);
4030 cct
->get_perfcounters_collection()->add(logger
);
4033 void OSD::create_recoverystate_perf()
4035 dout(10) << "create_recoverystate_perf" << dendl
;
4037 recoverystate_perf
= build_recoverystate_perf(cct
);
4038 cct
->get_perfcounters_collection()->add(recoverystate_perf
);
4043 if (cct
->_conf
->osd_fast_shutdown
) {
4044 derr
<< "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl
;
4045 if (cct
->_conf
->osd_fast_shutdown_notify_mon
)
4046 service
.prepare_to_stop();
4051 if (!service
.prepare_to_stop())
4052 return 0; // already shutting down
4054 if (is_stopping()) {
4058 dout(0) << "shutdown" << dendl
;
4060 set_state(STATE_STOPPING
);
4063 if (cct
->_conf
.get_val
<bool>("osd_debug_shutdown")) {
4064 cct
->_conf
.set_val("debug_osd", "100");
4065 cct
->_conf
.set_val("debug_journal", "100");
4066 cct
->_conf
.set_val("debug_filestore", "100");
4067 cct
->_conf
.set_val("debug_bluestore", "100");
4068 cct
->_conf
.set_val("debug_ms", "100");
4069 cct
->_conf
.apply_changes(nullptr);
4072 // stop MgrClient earlier as it's more like an internal consumer of OSD
4075 service
.start_shutdown();
4077 // stop sending work to pgs. this just prevents any new work in _process
4078 // from racing with on_shutdown and potentially entering the pg after.
4079 op_shardedwq
.drain();
4085 for (auto pg
: pgs
) {
4090 // drain op queue again (in case PGs requeued something)
4091 op_shardedwq
.drain();
4093 finished
.clear(); // zap waiters (bleh, this is messy)
4094 waiting_for_osdmap
.clear();
4097 // unregister commands
4098 cct
->get_admin_socket()->unregister_commands(asok_hook
);
4102 cct
->get_admin_socket()->unregister_commands(test_ops_hook
);
4103 delete test_ops_hook
;
4104 test_ops_hook
= NULL
;
4109 std::lock_guard l
{heartbeat_lock
};
4110 heartbeat_stop
= true;
4111 heartbeat_cond
.notify_all();
4112 heartbeat_peers
.clear();
4114 heartbeat_thread
.join();
4116 hb_back_server_messenger
->mark_down_all();
4117 hb_front_server_messenger
->mark_down_all();
4118 hb_front_client_messenger
->mark_down_all();
4119 hb_back_client_messenger
->mark_down_all();
4123 dout(10) << "op sharded tp stopped" << dendl
;
4125 dout(10) << "stopping agent" << dendl
;
4126 service
.agent_stop();
4128 boot_finisher
.wait_for_empty();
4132 boot_finisher
.stop();
4133 reset_heartbeat_peers(true);
4135 tick_timer
.shutdown();
4138 std::lock_guard
l(tick_timer_lock
);
4139 tick_timer_without_osd_lock
.shutdown();
4142 // note unmount epoch
4143 dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl
;
4144 superblock
.mounted
= service
.get_boot_epoch();
4145 superblock
.clean_thru
= get_osdmap_epoch();
4146 ObjectStore::Transaction t
;
4147 write_superblock(t
);
4148 int r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
4150 derr
<< "OSD::shutdown: error writing superblock: "
4151 << cpp_strerror(r
) << dendl
;
4155 service
.shutdown_reserver();
4158 #ifdef PG_DEBUG_REFS
4159 service
.dump_live_pgids();
4163 _get_pgs(&pgs
, true);
4167 for (auto& pg
: pgs
) {
4168 if (pg
->is_deleted()) {
4171 dout(20) << " kicking pg " << pg
<< dendl
;
4173 if (pg
->get_num_ref() != 1) {
4174 derr
<< "pgid " << pg
->get_pgid() << " has ref count of "
4175 << pg
->get_num_ref() << dendl
;
4176 #ifdef PG_DEBUG_REFS
4177 pg
->dump_live_ids();
4179 if (cct
->_conf
->osd_shutdown_pgref_assert
) {
4187 #ifdef PG_DEBUG_REFS
4188 service
.dump_live_pgids();
4192 cct
->_conf
.remove_observer(this);
4195 service
.meta_ch
.reset();
4197 dout(10) << "syncing store" << dendl
;
4198 enable_disable_fuse(true);
4200 if (cct
->_conf
->osd_journal_flush_on_shutdown
) {
4201 dout(10) << "flushing journal" << dendl
;
4202 store
->flush_journal();
4208 std::unique_lock l
{map_lock
};
4209 set_osdmap(OSDMapRef());
4211 for (auto s
: shards
) {
4212 std::lock_guard
l(s
->osdmap_lock
);
4213 s
->shard_osdmap
= OSDMapRef();
4217 std::lock_guard
lock(osd_lock
);
4221 dout(10) << "Store synced" << dendl
;
4223 op_tracker
.on_shutdown();
4225 ClassHandler::get_instance().shutdown();
4226 client_messenger
->shutdown();
4227 cluster_messenger
->shutdown();
4228 hb_front_client_messenger
->shutdown();
4229 hb_back_client_messenger
->shutdown();
4230 objecter_messenger
->shutdown();
4231 hb_front_server_messenger
->shutdown();
4232 hb_back_server_messenger
->shutdown();
4237 int OSD::mon_cmd_maybe_osd_create(string
&cmd
)
4239 bool created
= false;
4241 dout(10) << __func__
<< " cmd: " << cmd
<< dendl
;
4242 vector
<string
> vcmd
{cmd
};
4246 monc
->start_mon_command(vcmd
, inbl
, NULL
, &outs
, &w
);
4249 if (r
== -ENOENT
&& !created
) {
4250 string newcmd
= "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami
)
4251 + ", \"uuid\": \"" + stringify(superblock
.osd_fsid
) + "\"}";
4252 vector
<string
> vnewcmd
{newcmd
};
4256 monc
->start_mon_command(vnewcmd
, inbl
, NULL
, &outs
, &w
);
4259 derr
<< __func__
<< " fail: osd does not exist and created failed: "
4260 << cpp_strerror(r
) << dendl
;
4266 derr
<< __func__
<< " fail: '" << outs
<< "': " << cpp_strerror(r
) << dendl
;
4275 int OSD::update_crush_location()
4277 if (!cct
->_conf
->osd_crush_update_on_start
) {
4278 dout(10) << __func__
<< " osd_crush_update_on_start = false" << dendl
;
4283 if (cct
->_conf
->osd_crush_initial_weight
>= 0) {
4284 snprintf(weight
, sizeof(weight
), "%.4lf", cct
->_conf
->osd_crush_initial_weight
);
4286 struct store_statfs_t st
;
4287 osd_alert_list_t alerts
;
4288 int r
= store
->statfs(&st
, &alerts
);
4290 derr
<< "statfs: " << cpp_strerror(r
) << dendl
;
4293 snprintf(weight
, sizeof(weight
), "%.4lf",
4296 double(1ull << 40 /* TB */)));
4299 dout(10) << __func__
<< " crush location is " << cct
->crush_location
<< dendl
;
4302 string("{\"prefix\": \"osd crush create-or-move\", ") +
4303 string("\"id\": ") + stringify(whoami
) + ", " +
4304 string("\"weight\":") + weight
+ ", " +
4305 string("\"args\": [") + stringify(cct
->crush_location
) + "]}";
4306 return mon_cmd_maybe_osd_create(cmd
);
4309 int OSD::update_crush_device_class()
4311 if (!cct
->_conf
->osd_class_update_on_start
) {
4312 dout(10) << __func__
<< " osd_class_update_on_start = false" << dendl
;
4316 string device_class
;
4317 int r
= store
->read_meta("crush_device_class", &device_class
);
4318 if (r
< 0 || device_class
.empty()) {
4319 device_class
= store
->get_default_device_class();
4322 if (device_class
.empty()) {
4323 dout(20) << __func__
<< " no device class stored locally" << dendl
;
4328 string("{\"prefix\": \"osd crush set-device-class\", ") +
4329 string("\"class\": \"") + device_class
+ string("\", ") +
4330 string("\"ids\": [\"") + stringify(whoami
) + string("\"]}");
4332 r
= mon_cmd_maybe_osd_create(cmd
);
4334 // good, already bound to a device-class
4341 void OSD::write_superblock(ObjectStore::Transaction
& t
)
4343 dout(10) << "write_superblock " << superblock
<< dendl
;
4345 //hack: at minimum it's using the baseline feature set
4346 if (!superblock
.compat_features
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE
))
4347 superblock
.compat_features
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE
);
4350 encode(superblock
, bl
);
4351 t
.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT
, 0, bl
.length(), bl
);
4354 int OSD::read_superblock()
4357 int r
= store
->read(service
.meta_ch
, OSD_SUPERBLOCK_GOBJECT
, 0, 0, bl
);
4361 auto p
= bl
.cbegin();
4362 decode(superblock
, p
);
4364 dout(10) << "read_superblock " << superblock
<< dendl
;
4369 void OSD::clear_temp_objects()
4371 dout(10) << __func__
<< dendl
;
4373 store
->list_collections(ls
);
4374 for (vector
<coll_t
>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
4376 if (!p
->is_pg(&pgid
))
4379 // list temp objects
4380 dout(20) << " clearing temps in " << *p
<< " pgid " << pgid
<< dendl
;
4382 vector
<ghobject_t
> temps
;
4385 vector
<ghobject_t
> objects
;
4386 auto ch
= store
->open_collection(*p
);
4388 store
->collection_list(ch
, next
, ghobject_t::get_max(),
4389 store
->get_ideal_list_max(),
4391 if (objects
.empty())
4393 vector
<ghobject_t
>::iterator q
;
4394 for (q
= objects
.begin(); q
!= objects
.end(); ++q
) {
4395 // Hammer set pool for temps to -1, so check for clean-up
4396 if (q
->hobj
.is_temp() || (q
->hobj
.pool
== -1)) {
4397 temps
.push_back(*q
);
4402 // If we saw a non-temp object and hit the break above we can
4403 // break out of the while loop too.
4404 if (q
!= objects
.end())
4407 if (!temps
.empty()) {
4408 ObjectStore::Transaction t
;
4410 for (vector
<ghobject_t
>::iterator q
= temps
.begin(); q
!= temps
.end(); ++q
) {
4411 dout(20) << " removing " << *p
<< " object " << *q
<< dendl
;
4413 if (++removed
> cct
->_conf
->osd_target_transaction_size
) {
4414 store
->queue_transaction(service
.meta_ch
, std::move(t
));
4415 t
= ObjectStore::Transaction();
4420 store
->queue_transaction(service
.meta_ch
, std::move(t
));
4426 void OSD::recursive_remove_collection(CephContext
* cct
,
4427 ObjectStore
*store
, spg_t pgid
,
4433 make_snapmapper_oid());
4435 ObjectStore::CollectionHandle ch
= store
->open_collection(tmp
);
4436 ObjectStore::Transaction t
;
4437 SnapMapper
mapper(cct
, &driver
, 0, 0, 0, pgid
.shard
);
4440 int max
= cct
->_conf
->osd_target_transaction_size
;
4441 vector
<ghobject_t
> objects
;
4442 objects
.reserve(max
);
4445 store
->collection_list(ch
, next
, ghobject_t::get_max(),
4446 max
, &objects
, &next
);
4447 generic_dout(10) << __func__
<< " " << objects
<< dendl
;
4448 if (objects
.empty())
4450 for (auto& p
: objects
) {
4451 OSDriver::OSTransaction
_t(driver
.get_transaction(&t
));
4452 int r
= mapper
.remove_oid(p
.hobj
, &_t
);
4453 if (r
!= 0 && r
!= -ENOENT
)
4457 int r
= store
->queue_transaction(ch
, std::move(t
));
4458 ceph_assert(r
== 0);
4459 t
= ObjectStore::Transaction();
4461 t
.remove_collection(tmp
);
4462 int r
= store
->queue_transaction(ch
, std::move(t
));
4463 ceph_assert(r
== 0);
4466 if (!ch
->flush_commit(&waiter
)) {
4472 // ======================================================
4476 OSDMapRef createmap
,
4479 dout(10) << __func__
<< " " << pgid
<< dendl
;
4481 map
<string
,string
> ec_profile
;
4483 if (createmap
->have_pg_pool(pgid
.pool())) {
4484 pi
= *createmap
->get_pg_pool(pgid
.pool());
4485 name
= createmap
->get_pool_name(pgid
.pool());
4486 if (pi
.is_erasure()) {
4487 ec_profile
= createmap
->get_erasure_code_profile(pi
.erasure_code_profile
);
4490 // pool was deleted; grab final pg_pool_t off disk.
4491 ghobject_t oid
= make_final_pool_info_oid(pgid
.pool());
4493 int r
= store
->read(service
.meta_ch
, oid
, 0, 0, bl
);
4495 derr
<< __func__
<< " missing pool " << pgid
.pool() << " tombstone"
4499 ceph_assert(r
>= 0);
4500 auto p
= bl
.cbegin();
4503 if (p
.end()) { // dev release v13.0.2 did not include ec_profile
4504 derr
<< __func__
<< " missing ec_profile from pool " << pgid
.pool()
4505 << " tombstone" << dendl
;
4508 decode(ec_profile
, p
);
4510 PGPool
pool(cct
, createmap
, pgid
.pool(), pi
, name
);
4512 if (pi
.type
== pg_pool_t::TYPE_REPLICATED
||
4513 pi
.type
== pg_pool_t::TYPE_ERASURE
)
4514 pg
= new PrimaryLogPG(&service
, createmap
, pool
, ec_profile
, pgid
);
4520 void OSD::_get_pgs(vector
<PGRef
> *v
, bool clear_too
)
4523 v
->reserve(get_num_pgs());
4524 for (auto& s
: shards
) {
4525 std::lock_guard
l(s
->shard_lock
);
4526 for (auto& j
: s
->pg_slots
) {
4528 !j
.second
->pg
->is_deleted()) {
4529 v
->push_back(j
.second
->pg
);
4531 s
->_detach_pg(j
.second
.get());
4538 void OSD::_get_pgids(vector
<spg_t
> *v
)
4541 v
->reserve(get_num_pgs());
4542 for (auto& s
: shards
) {
4543 std::lock_guard
l(s
->shard_lock
);
4544 for (auto& j
: s
->pg_slots
) {
4546 !j
.second
->pg
->is_deleted()) {
4547 v
->push_back(j
.first
);
4553 void OSD::register_pg(PGRef pg
)
4555 spg_t pgid
= pg
->get_pgid();
4556 uint32_t shard_index
= pgid
.hash_to_shard(num_shards
);
4557 auto sdata
= shards
[shard_index
];
4558 std::lock_guard
l(sdata
->shard_lock
);
4559 auto r
= sdata
->pg_slots
.emplace(pgid
, make_unique
<OSDShardPGSlot
>());
4560 ceph_assert(r
.second
);
4561 auto *slot
= r
.first
->second
.get();
4562 dout(20) << __func__
<< " " << pgid
<< " " << pg
<< dendl
;
4563 sdata
->_attach_pg(slot
, pg
.get());
4566 bool OSD::try_finish_pg_delete(PG
*pg
, unsigned old_pg_num
)
4568 auto sdata
= pg
->osd_shard
;
4571 std::lock_guard
l(sdata
->shard_lock
);
4572 auto p
= sdata
->pg_slots
.find(pg
->pg_id
);
4573 if (p
== sdata
->pg_slots
.end() ||
4575 dout(20) << __func__
<< " " << pg
->pg_id
<< " not found" << dendl
;
4578 if (p
->second
->waiting_for_merge_epoch
) {
4579 dout(20) << __func__
<< " " << pg
->pg_id
<< " waiting for merge" << dendl
;
4582 dout(20) << __func__
<< " " << pg
->pg_id
<< " " << pg
<< dendl
;
4583 sdata
->_detach_pg(p
->second
.get());
4586 for (auto shard
: shards
) {
4587 shard
->unprime_split_children(pg
->pg_id
, old_pg_num
);
4590 // update pg count now since we might not get an osdmap any time soon.
4591 if (pg
->is_primary())
4592 service
.logger
->dec(l_osd_pg_primary
);
4593 else if (pg
->is_nonprimary())
4594 service
.logger
->dec(l_osd_pg_replica
); // misnomver
4596 service
.logger
->dec(l_osd_pg_stray
);
4601 PGRef
OSD::_lookup_pg(spg_t pgid
)
4603 uint32_t shard_index
= pgid
.hash_to_shard(num_shards
);
4604 auto sdata
= shards
[shard_index
];
4605 std::lock_guard
l(sdata
->shard_lock
);
4606 auto p
= sdata
->pg_slots
.find(pgid
);
4607 if (p
== sdata
->pg_slots
.end()) {
4610 return p
->second
->pg
;
4613 PGRef
OSD::_lookup_lock_pg(spg_t pgid
)
4615 PGRef pg
= _lookup_pg(pgid
);
4620 if (!pg
->is_deleted()) {
4627 PGRef
OSD::lookup_lock_pg(spg_t pgid
)
4629 return _lookup_lock_pg(pgid
);
4632 void OSD::load_pgs()
4634 ceph_assert(ceph_mutex_is_locked(osd_lock
));
4635 dout(0) << "load_pgs" << dendl
;
4638 auto pghist
= make_pg_num_history_oid();
4640 int r
= store
->read(service
.meta_ch
, pghist
, 0, 0, bl
, 0);
4641 if (r
>= 0 && bl
.length() > 0) {
4642 auto p
= bl
.cbegin();
4643 decode(pg_num_history
, p
);
4645 dout(20) << __func__
<< " pg_num_history " << pg_num_history
<< dendl
;
4649 int r
= store
->list_collections(ls
);
4651 derr
<< "failed to list pgs: " << cpp_strerror(-r
) << dendl
;
4655 for (vector
<coll_t
>::iterator it
= ls
.begin();
4659 if (it
->is_temp(&pgid
) ||
4660 (it
->is_pg(&pgid
) && PG::_has_removal_flag(store
, pgid
))) {
4661 dout(10) << "load_pgs " << *it
4662 << " removing, legacy or flagged for removal pg" << dendl
;
4663 recursive_remove_collection(cct
, store
, pgid
, *it
);
4667 if (!it
->is_pg(&pgid
)) {
4668 dout(10) << "load_pgs ignoring unrecognized " << *it
<< dendl
;
4672 dout(10) << "pgid " << pgid
<< " coll " << coll_t(pgid
) << dendl
;
4673 epoch_t map_epoch
= 0;
4674 int r
= PG::peek_map_epoch(store
, pgid
, &map_epoch
);
4676 derr
<< __func__
<< " unable to peek at " << pgid
<< " metadata, skipping"
4682 if (map_epoch
> 0) {
4683 OSDMapRef pgosdmap
= service
.try_get_map(map_epoch
);
4685 if (!get_osdmap()->have_pg_pool(pgid
.pool())) {
4686 derr
<< __func__
<< ": could not find map for epoch " << map_epoch
4687 << " on pg " << pgid
<< ", but the pool is not present in the "
4688 << "current map, so this is probably a result of bug 10617. "
4689 << "Skipping the pg for now, you can use ceph-objectstore-tool "
4690 << "to clean it up later." << dendl
;
4693 derr
<< __func__
<< ": have pgid " << pgid
<< " at epoch "
4694 << map_epoch
<< ", but missing map. Crashing."
4696 ceph_abort_msg("Missing map in load_pgs");
4699 pg
= _make_pg(pgosdmap
, pgid
);
4701 pg
= _make_pg(get_osdmap(), pgid
);
4704 recursive_remove_collection(cct
, store
, pgid
, *it
);
4708 // there can be no waiters here, so we don't call _wake_pg_slot
4711 pg
->ch
= store
->open_collection(pg
->coll
);
4713 // read pg state, log
4714 pg
->read_state(store
);
4717 dout(10) << "load_pgs " << *it
<< " deleting dne" << dendl
;
4720 recursive_remove_collection(cct
, store
, pgid
, *it
);
4724 uint32_t shard_index
= pgid
.hash_to_shard(shards
.size());
4725 assert(NULL
!= shards
[shard_index
]);
4726 store
->set_collection_commit_queue(pg
->coll
, &(shards
[shard_index
]->context_queue
));
4729 pg
->reg_next_scrub();
4731 dout(10) << __func__
<< " loaded " << *pg
<< dendl
;
4737 dout(0) << __func__
<< " opened " << num
<< " pgs" << dendl
;
4741 PGRef
OSD::handle_pg_create_info(const OSDMapRef
& osdmap
,
4742 const PGCreateInfo
*info
)
4744 spg_t pgid
= info
->pgid
;
4746 if (maybe_wait_for_max_pg(osdmap
, pgid
, info
->by_mon
)) {
4747 dout(10) << __func__
<< " hit max pg, dropping" << dendl
;
4751 PeeringCtx rctx
= create_context();
4753 OSDMapRef startmap
= get_map(info
->epoch
);
4756 int64_t pool_id
= pgid
.pgid
.pool();
4757 const pg_pool_t
*pool
= osdmap
->get_pg_pool(pool_id
);
4759 dout(10) << __func__
<< " ignoring " << pgid
<< ", pool dne" << dendl
;
4762 if (osdmap
->require_osd_release
>= ceph_release_t::nautilus
&&
4763 !pool
->has_flag(pg_pool_t::FLAG_CREATING
)) {
4764 // this ensures we do not process old creating messages after the
4765 // pool's initial pgs have been created (and pg are subsequently
4766 // allowed to split or merge).
4767 dout(20) << __func__
<< " dropping " << pgid
4768 << "create, pool does not have CREATING flag set" << dendl
;
4773 int up_primary
, acting_primary
;
4774 vector
<int> up
, acting
;
4775 startmap
->pg_to_up_acting_osds(
4776 pgid
.pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
4778 const pg_pool_t
* pp
= startmap
->get_pg_pool(pgid
.pool());
4779 if (pp
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
) &&
4780 store
->get_type() != "bluestore") {
4781 clog
->warn() << "pg " << pgid
4782 << " is at risk of silent data corruption: "
4783 << "the pool allows ec overwrites but is not stored in "
4784 << "bluestore, so deep scrubbing will not detect bitrot";
4786 create_pg_collection(
4787 rctx
.transaction
, pgid
, pgid
.get_split_bits(pp
->get_pg_num()));
4788 init_pg_ondisk(rctx
.transaction
, pgid
, pp
);
4790 int role
= startmap
->calc_pg_role(pg_shard_t(whoami
, pgid
.shard
), acting
);
4792 PGRef pg
= _make_pg(startmap
, pgid
);
4793 pg
->ch
= store
->create_new_collection(pg
->coll
);
4796 uint32_t shard_index
= pgid
.hash_to_shard(shards
.size());
4797 assert(NULL
!= shards
[shard_index
]);
4798 store
->set_collection_commit_queue(pg
->coll
, &(shards
[shard_index
]->context_queue
));
4803 // we are holding the shard lock
4804 ceph_assert(!pg
->is_deleted());
4813 info
->past_intervals
,
4817 pg
->init_collection_pool_opts();
4819 if (pg
->is_primary()) {
4820 std::lock_guard locker
{m_perf_queries_lock
};
4821 pg
->set_dynamic_perf_stats_queries(m_perf_queries
);
4824 pg
->handle_initialize(rctx
);
4825 pg
->handle_activate_map(rctx
);
4827 dispatch_context(rctx
, pg
.get(), osdmap
, nullptr);
4829 dout(10) << __func__
<< " new pg " << *pg
<< dendl
;
4833 bool OSD::maybe_wait_for_max_pg(const OSDMapRef
& osdmap
,
4837 const auto max_pgs_per_osd
=
4838 (cct
->_conf
.get_val
<uint64_t>("mon_max_pg_per_osd") *
4839 cct
->_conf
.get_val
<double>("osd_max_pg_per_osd_hard_ratio"));
4841 if (num_pgs
< max_pgs_per_osd
) {
4845 std::lock_guard
l(pending_creates_lock
);
4846 if (is_mon_create
) {
4847 pending_creates_from_mon
++;
4849 bool is_primary
= osdmap
->get_pg_acting_role(pgid
, whoami
) == 0;
4850 pending_creates_from_osd
.emplace(pgid
, is_primary
);
4852 dout(1) << __func__
<< " withhold creation of pg " << pgid
4853 << ": " << num_pgs
<< " >= "<< max_pgs_per_osd
<< dendl
;
4857 // to re-trigger a peering, we have to twiddle the pg mapping a little bit,
4858 // see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
4859 // to up set if pg_temp is empty. so an empty pg_temp won't work.
4860 static vector
<int32_t> twiddle(const vector
<int>& acting
) {
4861 if (acting
.size() > 1) {
4864 vector
<int32_t> twiddled(acting
.begin(), acting
.end());
4865 twiddled
.push_back(-1);
4870 void OSD::resume_creating_pg()
4872 bool do_sub_pg_creates
= false;
4873 bool have_pending_creates
= false;
4875 const auto max_pgs_per_osd
=
4876 (cct
->_conf
.get_val
<uint64_t>("mon_max_pg_per_osd") *
4877 cct
->_conf
.get_val
<double>("osd_max_pg_per_osd_hard_ratio"));
4878 if (max_pgs_per_osd
<= num_pgs
) {
4879 // this could happen if admin decreases this setting before a PG is removed
4882 unsigned spare_pgs
= max_pgs_per_osd
- num_pgs
;
4883 std::lock_guard
l(pending_creates_lock
);
4884 if (pending_creates_from_mon
> 0) {
4885 dout(20) << __func__
<< " pending_creates_from_mon "
4886 << pending_creates_from_mon
<< dendl
;
4887 do_sub_pg_creates
= true;
4888 if (pending_creates_from_mon
>= spare_pgs
) {
4889 spare_pgs
= pending_creates_from_mon
= 0;
4891 spare_pgs
-= pending_creates_from_mon
;
4892 pending_creates_from_mon
= 0;
4895 auto pg
= pending_creates_from_osd
.cbegin();
4896 while (spare_pgs
> 0 && pg
!= pending_creates_from_osd
.cend()) {
4897 dout(20) << __func__
<< " pg " << pg
->first
<< dendl
;
4899 get_osdmap()->pg_to_up_acting_osds(pg
->first
.pgid
, nullptr, nullptr, &acting
, nullptr);
4900 service
.queue_want_pg_temp(pg
->first
.pgid
, twiddle(acting
), true);
4901 pg
= pending_creates_from_osd
.erase(pg
);
4902 do_sub_pg_creates
= true;
4905 have_pending_creates
= (pending_creates_from_mon
> 0 ||
4906 !pending_creates_from_osd
.empty());
4909 bool do_renew_subs
= false;
4910 if (do_sub_pg_creates
) {
4911 if (monc
->sub_want("osd_pg_creates", last_pg_create_epoch
, 0)) {
4912 dout(4) << __func__
<< ": resolicit pg creates from mon since "
4913 << last_pg_create_epoch
<< dendl
;
4914 do_renew_subs
= true;
4917 version_t start
= get_osdmap_epoch() + 1;
4918 if (have_pending_creates
) {
4919 // don't miss any new osdmap deleting PGs
4920 if (monc
->sub_want("osdmap", start
, 0)) {
4921 dout(4) << __func__
<< ": resolicit osdmap from mon since "
4923 do_renew_subs
= true;
4925 } else if (do_sub_pg_creates
) {
4926 // no need to subscribe the osdmap continuously anymore
4927 // once the pgtemp and/or mon_subscribe(pg_creates) is sent
4928 if (monc
->sub_want_increment("osdmap", start
, CEPH_SUBSCRIBE_ONETIME
)) {
4929 dout(4) << __func__
<< ": re-subscribe osdmap(onetime) since "
4931 do_renew_subs
= true;
4935 if (do_renew_subs
) {
4939 service
.send_pg_temp();
4942 void OSD::build_initial_pg_history(
4945 utime_t created_stamp
,
4949 dout(10) << __func__
<< " " << pgid
<< " created " << created
<< dendl
;
4950 *h
= pg_history_t(created
, created_stamp
);
4952 OSDMapRef lastmap
= service
.get_map(created
);
4953 int up_primary
, acting_primary
;
4954 vector
<int> up
, acting
;
4955 lastmap
->pg_to_up_acting_osds(
4956 pgid
.pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
4958 ostringstream debug
;
4959 for (epoch_t e
= created
+ 1; e
<= get_osdmap_epoch(); ++e
) {
4960 OSDMapRef osdmap
= service
.get_map(e
);
4961 int new_up_primary
, new_acting_primary
;
4962 vector
<int> new_up
, new_acting
;
4963 osdmap
->pg_to_up_acting_osds(
4964 pgid
.pgid
, &new_up
, &new_up_primary
, &new_acting
, &new_acting_primary
);
4966 // this is a bit imprecise, but sufficient?
4967 struct min_size_predicate_t
: public IsPGRecoverablePredicate
{
4968 const pg_pool_t
*pi
;
4969 bool operator()(const set
<pg_shard_t
> &have
) const {
4970 return have
.size() >= pi
->min_size
;
4972 explicit min_size_predicate_t(const pg_pool_t
*i
) : pi(i
) {}
4973 } min_size_predicate(osdmap
->get_pg_pool(pgid
.pgid
.pool()));
4975 bool new_interval
= PastIntervals::check_new_interval(
4982 h
->same_interval_since
,
4983 h
->last_epoch_clean
,
4991 h
->same_interval_since
= e
;
4993 h
->same_up_since
= e
;
4995 if (acting_primary
!= new_acting_primary
) {
4996 h
->same_primary_since
= e
;
4998 if (pgid
.pgid
.is_split(lastmap
->get_pg_num(pgid
.pgid
.pool()),
4999 osdmap
->get_pg_num(pgid
.pgid
.pool()),
5001 h
->last_epoch_split
= e
;
5004 acting
= new_acting
;
5005 up_primary
= new_up_primary
;
5006 acting_primary
= new_acting_primary
;
5010 dout(20) << __func__
<< " " << debug
.str() << dendl
;
5011 dout(10) << __func__
<< " " << *h
<< " " << *pi
5012 << " [" << (pi
->empty() ? pair
<epoch_t
,epoch_t
>(0,0) :
5013 pi
->get_bounds()) << ")"
5017 void OSD::_add_heartbeat_peer(int p
)
5023 map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.find(p
);
5024 if (i
== heartbeat_peers
.end()) {
5025 pair
<ConnectionRef
,ConnectionRef
> cons
= service
.get_con_osd_hb(p
, get_osdmap_epoch());
5028 assert(cons
.second
);
5030 hi
= &heartbeat_peers
[p
];
5033 auto stamps
= service
.get_hb_stamps(p
);
5035 auto sb
= ceph::make_ref
<Session
>(cct
, cons
.first
.get());
5037 sb
->stamps
= stamps
;
5038 hi
->hb_interval_start
= ceph_clock_now();
5039 hi
->con_back
= cons
.first
.get();
5040 hi
->con_back
->set_priv(sb
);
5042 auto sf
= ceph::make_ref
<Session
>(cct
, cons
.second
.get());
5044 sf
->stamps
= stamps
;
5045 hi
->con_front
= cons
.second
.get();
5046 hi
->con_front
->set_priv(sf
);
5048 dout(10) << "_add_heartbeat_peer: new peer osd." << p
5049 << " " << hi
->con_back
->get_peer_addr()
5050 << " " << hi
->con_front
->get_peer_addr()
5055 hi
->epoch
= get_osdmap_epoch();
5058 void OSD::_remove_heartbeat_peer(int n
)
5060 map
<int,HeartbeatInfo
>::iterator q
= heartbeat_peers
.find(n
);
5061 ceph_assert(q
!= heartbeat_peers
.end());
5062 dout(20) << " removing heartbeat peer osd." << n
5063 << " " << q
->second
.con_back
->get_peer_addr()
5064 << " " << (q
->second
.con_front
? q
->second
.con_front
->get_peer_addr() : entity_addr_t())
5066 q
->second
.clear_mark_down();
5067 heartbeat_peers
.erase(q
);
5070 void OSD::need_heartbeat_peer_update()
5074 dout(20) << "need_heartbeat_peer_update" << dendl
;
5075 heartbeat_set_peers_need_update();
5078 void OSD::maybe_update_heartbeat_peers()
5080 ceph_assert(ceph_mutex_is_locked(osd_lock
));
5082 if (is_waiting_for_healthy() || is_active()) {
5083 utime_t now
= ceph_clock_now();
5084 if (last_heartbeat_resample
== utime_t()) {
5085 last_heartbeat_resample
= now
;
5086 heartbeat_set_peers_need_update();
5087 } else if (!heartbeat_peers_need_update()) {
5088 utime_t dur
= now
- last_heartbeat_resample
;
5089 if (dur
> cct
->_conf
->osd_heartbeat_grace
) {
5090 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur
<< " seconds" << dendl
;
5091 heartbeat_set_peers_need_update();
5092 last_heartbeat_resample
= now
;
5093 // automatically clean up any stale heartbeat peers
5094 // if we are unhealthy, then clean all
5095 reset_heartbeat_peers(is_waiting_for_healthy());
5100 if (!heartbeat_peers_need_update())
5102 heartbeat_clear_peers_need_update();
5104 std::lock_guard
l(heartbeat_lock
);
5106 dout(10) << "maybe_update_heartbeat_peers updating" << dendl
;
5109 // build heartbeat from set
5113 for (auto& pg
: pgs
) {
5114 pg
->with_heartbeat_peers([&](int peer
) {
5115 if (get_osdmap()->is_up(peer
)) {
5116 _add_heartbeat_peer(peer
);
5122 // include next and previous up osds to ensure we have a fully-connected set
5123 set
<int> want
, extras
;
5124 const int next
= get_osdmap()->get_next_up_osd_after(whoami
);
5127 int prev
= get_osdmap()->get_previous_up_osd_before(whoami
);
5128 if (prev
>= 0 && prev
!= next
)
5131 // make sure we have at least **min_down** osds coming from different
5132 // subtree level (e.g., hosts) for fast failure detection.
5133 auto min_down
= cct
->_conf
.get_val
<uint64_t>("mon_osd_min_down_reporters");
5134 auto subtree
= cct
->_conf
.get_val
<string
>("mon_osd_reporter_subtree_level");
5135 auto limit
= std::max(min_down
, (uint64_t)cct
->_conf
->osd_heartbeat_min_peers
);
5136 get_osdmap()->get_random_up_osds_by_subtree(
5137 whoami
, subtree
, limit
, want
, &want
);
5139 for (set
<int>::iterator p
= want
.begin(); p
!= want
.end(); ++p
) {
5140 dout(10) << " adding neighbor peer osd." << *p
<< dendl
;
5142 _add_heartbeat_peer(*p
);
5145 // remove down peers; enumerate extras
5146 map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5147 while (p
!= heartbeat_peers
.end()) {
5148 if (!get_osdmap()->is_up(p
->first
)) {
5151 _remove_heartbeat_peer(o
);
5154 if (p
->second
.epoch
< get_osdmap_epoch()) {
5155 extras
.insert(p
->first
);
5161 for (int n
= next
; n
>= 0; ) {
5162 if ((int)heartbeat_peers
.size() >= cct
->_conf
->osd_heartbeat_min_peers
)
5164 if (!extras
.count(n
) && !want
.count(n
) && n
!= whoami
) {
5165 dout(10) << " adding random peer osd." << n
<< dendl
;
5167 _add_heartbeat_peer(n
);
5169 n
= get_osdmap()->get_next_up_osd_after(n
);
5171 break; // came full circle; stop
5175 for (set
<int>::iterator p
= extras
.begin();
5176 (int)heartbeat_peers
.size() > cct
->_conf
->osd_heartbeat_min_peers
&& p
!= extras
.end();
5180 _remove_heartbeat_peer(*p
);
5183 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers
.size() << " peers, extras " << extras
<< dendl
;
5185 // clean up stale failure pending
5186 for (auto it
= failure_pending
.begin(); it
!= failure_pending
.end();) {
5187 if (heartbeat_peers
.count(it
->first
) == 0) {
5188 send_still_alive(get_osdmap_epoch(), it
->first
, it
->second
.second
);
5189 failure_pending
.erase(it
++);
5196 void OSD::reset_heartbeat_peers(bool all
)
5198 ceph_assert(ceph_mutex_is_locked(osd_lock
));
5199 dout(10) << "reset_heartbeat_peers" << dendl
;
5200 utime_t stale
= ceph_clock_now();
5201 stale
-= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_stale");
5202 std::lock_guard
l(heartbeat_lock
);
5203 for (auto it
= heartbeat_peers
.begin(); it
!= heartbeat_peers
.end();) {
5204 auto& [peer
, hi
] = *it
;
5205 if (all
|| hi
.is_stale(stale
)) {
5206 hi
.clear_mark_down();
5207 // stop sending failure_report to mon too
5208 failure_queue
.erase(peer
);
5209 failure_pending
.erase(peer
);
5210 it
= heartbeat_peers
.erase(it
);
5217 void OSD::handle_osd_ping(MOSDPing
*m
)
5219 if (superblock
.cluster_fsid
!= m
->fsid
) {
5220 dout(20) << "handle_osd_ping from " << m
->get_source_inst()
5221 << " bad fsid " << m
->fsid
<< " != " << superblock
.cluster_fsid
5227 int from
= m
->get_source().num();
5229 heartbeat_lock
.lock();
5230 if (is_stopping()) {
5231 heartbeat_lock
.unlock();
5236 utime_t now
= ceph_clock_now();
5237 auto mnow
= service
.get_mnow();
5238 ConnectionRef
con(m
->get_connection());
5239 OSDMapRef curmap
= service
.get_osdmap();
5241 heartbeat_lock
.unlock();
5246 auto sref
= con
->get_priv();
5247 Session
*s
= static_cast<Session
*>(sref
.get());
5249 heartbeat_lock
.unlock();
5255 s
->stamps
= service
.get_hb_stamps(from
);
5260 case MOSDPing::PING
:
5262 if (cct
->_conf
->osd_debug_drop_ping_probability
> 0) {
5263 auto heartbeat_drop
= debug_heartbeat_drops_remaining
.find(from
);
5264 if (heartbeat_drop
!= debug_heartbeat_drops_remaining
.end()) {
5265 if (heartbeat_drop
->second
== 0) {
5266 debug_heartbeat_drops_remaining
.erase(heartbeat_drop
);
5268 --heartbeat_drop
->second
;
5269 dout(5) << "Dropping heartbeat from " << from
5270 << ", " << heartbeat_drop
->second
5271 << " remaining to drop" << dendl
;
5274 } else if (cct
->_conf
->osd_debug_drop_ping_probability
>
5275 ((((double)(rand()%100))/100.0))) {
5277 debug_heartbeat_drops_remaining
.insert(std::make_pair(from
,
5278 cct
->_conf
->osd_debug_drop_ping_duration
)).first
;
5279 dout(5) << "Dropping heartbeat from " << from
5280 << ", " << heartbeat_drop
->second
5281 << " remaining to drop" << dendl
;
5286 ceph::signedspan sender_delta_ub
{};
5287 s
->stamps
->got_ping(
5293 dout(20) << __func__
<< " new stamps " << *s
->stamps
<< dendl
;
5295 if (!cct
->get_heartbeat_map()->is_healthy()) {
5296 dout(10) << "internal heartbeat not healthy, dropping ping request"
5301 Message
*r
= new MOSDPing(monc
->get_fsid(),
5302 curmap
->get_epoch(),
5303 MOSDPing::PING_REPLY
,
5307 service
.get_up_epoch(),
5308 cct
->_conf
->osd_heartbeat_min_size
,
5310 con
->send_message(r
);
5312 if (curmap
->is_up(from
)) {
5314 ConnectionRef cluster_con
= service
.get_con_osd_cluster(
5315 from
, curmap
->get_epoch());
5317 service
.maybe_share_map(cluster_con
.get(), curmap
, m
->map_epoch
);
5320 } else if (!curmap
->exists(from
) ||
5321 curmap
->get_down_at(from
) > m
->map_epoch
) {
5322 // tell them they have died
5323 Message
*r
= new MOSDPing(monc
->get_fsid(),
5324 curmap
->get_epoch(),
5329 service
.get_up_epoch(),
5330 cct
->_conf
->osd_heartbeat_min_size
);
5331 con
->send_message(r
);
5336 case MOSDPing::PING_REPLY
:
5338 map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.find(from
);
5339 if (i
!= heartbeat_peers
.end()) {
5340 auto acked
= i
->second
.ping_history
.find(m
->ping_stamp
);
5341 if (acked
!= i
->second
.ping_history
.end()) {
5342 int &unacknowledged
= acked
->second
.second
;
5343 if (con
== i
->second
.con_back
) {
5344 dout(25) << "handle_osd_ping got reply from osd." << from
5345 << " first_tx " << i
->second
.first_tx
5346 << " last_tx " << i
->second
.last_tx
5347 << " last_rx_back " << i
->second
.last_rx_back
5349 << " last_rx_front " << i
->second
.last_rx_front
5351 i
->second
.last_rx_back
= now
;
5352 ceph_assert(unacknowledged
> 0);
5354 // if there is no front con, set both stamps.
5355 if (i
->second
.con_front
== NULL
) {
5356 i
->second
.last_rx_front
= now
;
5357 ceph_assert(unacknowledged
> 0);
5360 } else if (con
== i
->second
.con_front
) {
5361 dout(25) << "handle_osd_ping got reply from osd." << from
5362 << " first_tx " << i
->second
.first_tx
5363 << " last_tx " << i
->second
.last_tx
5364 << " last_rx_back " << i
->second
.last_rx_back
5365 << " last_rx_front " << i
->second
.last_rx_front
5368 i
->second
.last_rx_front
= now
;
5369 ceph_assert(unacknowledged
> 0);
5373 if (unacknowledged
== 0) {
5374 // succeeded in getting all replies
5375 dout(25) << "handle_osd_ping got all replies from osd." << from
5376 << " , erase pending ping(sent at " << m
->ping_stamp
<< ")"
5377 << " and older pending ping(s)"
5380 #define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
5381 ++i
->second
.hb_average_count
;
5382 uint32_t back_pingtime
= ROUND_S_TO_USEC(i
->second
.last_rx_back
- m
->ping_stamp
);
5383 i
->second
.hb_total_back
+= back_pingtime
;
5384 if (back_pingtime
< i
->second
.hb_min_back
)
5385 i
->second
.hb_min_back
= back_pingtime
;
5386 if (back_pingtime
> i
->second
.hb_max_back
)
5387 i
->second
.hb_max_back
= back_pingtime
;
5388 uint32_t front_pingtime
= ROUND_S_TO_USEC(i
->second
.last_rx_front
- m
->ping_stamp
);
5389 i
->second
.hb_total_front
+= front_pingtime
;
5390 if (front_pingtime
< i
->second
.hb_min_front
)
5391 i
->second
.hb_min_front
= front_pingtime
;
5392 if (front_pingtime
> i
->second
.hb_max_front
)
5393 i
->second
.hb_max_front
= front_pingtime
;
5395 ceph_assert(i
->second
.hb_interval_start
!= utime_t());
5396 if (i
->second
.hb_interval_start
== utime_t())
5397 i
->second
.hb_interval_start
= now
;
5398 int64_t hb_avg_time_period
= 60;
5399 if (cct
->_conf
.get_val
<int64_t>("debug_heartbeat_testing_span")) {
5400 hb_avg_time_period
= cct
->_conf
.get_val
<int64_t>("debug_heartbeat_testing_span");
5402 if (now
- i
->second
.hb_interval_start
>= utime_t(hb_avg_time_period
, 0)) {
5403 uint32_t back_avg
= i
->second
.hb_total_back
/ i
->second
.hb_average_count
;
5404 uint32_t back_min
= i
->second
.hb_min_back
;
5405 uint32_t back_max
= i
->second
.hb_max_back
;
5406 uint32_t front_avg
= i
->second
.hb_total_front
/ i
->second
.hb_average_count
;
5407 uint32_t front_min
= i
->second
.hb_min_front
;
5408 uint32_t front_max
= i
->second
.hb_max_front
;
5410 // Reset for new interval
5411 i
->second
.hb_average_count
= 0;
5412 i
->second
.hb_interval_start
= now
;
5413 i
->second
.hb_total_back
= i
->second
.hb_max_back
= 0;
5414 i
->second
.hb_min_back
= UINT_MAX
;
5415 i
->second
.hb_total_front
= i
->second
.hb_max_front
= 0;
5416 i
->second
.hb_min_front
= UINT_MAX
;
5418 // Record per osd interace ping times
5419 // Based on osd_heartbeat_interval ignoring that it is randomly short than this interval
5420 if (i
->second
.hb_back_pingtime
.size() == 0) {
5421 ceph_assert(i
->second
.hb_front_pingtime
.size() == 0);
5422 for (unsigned k
= 0 ; k
< hb_vector_size
; ++k
) {
5423 i
->second
.hb_back_pingtime
.push_back(back_avg
);
5424 i
->second
.hb_back_min
.push_back(back_min
);
5425 i
->second
.hb_back_max
.push_back(back_max
);
5426 i
->second
.hb_front_pingtime
.push_back(front_avg
);
5427 i
->second
.hb_front_min
.push_back(front_min
);
5428 i
->second
.hb_front_max
.push_back(front_max
);
5429 ++i
->second
.hb_index
;
5432 int index
= i
->second
.hb_index
& (hb_vector_size
- 1);
5433 i
->second
.hb_back_pingtime
[index
] = back_avg
;
5434 i
->second
.hb_back_min
[index
] = back_min
;
5435 i
->second
.hb_back_max
[index
] = back_max
;
5436 i
->second
.hb_front_pingtime
[index
] = front_avg
;
5437 i
->second
.hb_front_min
[index
] = front_min
;
5438 i
->second
.hb_front_max
[index
] = front_max
;
5439 ++i
->second
.hb_index
;
5443 std::lock_guard
l(service
.stat_lock
);
5444 service
.osd_stat
.hb_pingtime
[from
].last_update
= now
.sec();
5445 service
.osd_stat
.hb_pingtime
[from
].back_last
= back_pingtime
;
5448 uint32_t min
= UINT_MAX
;
5452 uint32_t size
= (uint32_t)i
->second
.hb_back_pingtime
.size();
5453 for (int32_t k
= size
- 1 ; k
>= 0; --k
) {
5455 int index
= (i
->second
.hb_index
+ k
) % size
;
5456 total
+= i
->second
.hb_back_pingtime
[index
];
5457 if (i
->second
.hb_back_min
[index
] < min
)
5458 min
= i
->second
.hb_back_min
[index
];
5459 if (i
->second
.hb_back_max
[index
] > max
)
5460 max
= i
->second
.hb_back_max
[index
];
5461 if (count
== 1 || count
== 5 || count
== 15) {
5462 service
.osd_stat
.hb_pingtime
[from
].back_pingtime
[which
] = total
/ count
;
5463 service
.osd_stat
.hb_pingtime
[from
].back_min
[which
] = min
;
5464 service
.osd_stat
.hb_pingtime
[from
].back_max
[which
] = max
;
5471 if (i
->second
.con_front
!= NULL
) {
5472 service
.osd_stat
.hb_pingtime
[from
].front_last
= front_pingtime
;
5479 for (int32_t k
= size
- 1 ; k
>= 0; --k
) {
5481 int index
= (i
->second
.hb_index
+ k
) % size
;
5482 total
+= i
->second
.hb_front_pingtime
[index
];
5483 if (i
->second
.hb_front_min
[index
] < min
)
5484 min
= i
->second
.hb_front_min
[index
];
5485 if (i
->second
.hb_front_max
[index
] > max
)
5486 max
= i
->second
.hb_front_max
[index
];
5487 if (count
== 1 || count
== 5 || count
== 15) {
5488 service
.osd_stat
.hb_pingtime
[from
].front_pingtime
[which
] = total
/ count
;
5489 service
.osd_stat
.hb_pingtime
[from
].front_min
[which
] = min
;
5490 service
.osd_stat
.hb_pingtime
[from
].front_max
[which
] = max
;
5499 std::lock_guard
l(service
.stat_lock
);
5500 service
.osd_stat
.hb_pingtime
[from
].back_last
= back_pingtime
;
5501 if (i
->second
.con_front
!= NULL
)
5502 service
.osd_stat
.hb_pingtime
[from
].front_last
= front_pingtime
;
5504 i
->second
.ping_history
.erase(i
->second
.ping_history
.begin(), ++acked
);
5507 if (i
->second
.is_healthy(now
)) {
5508 // Cancel false reports
5509 auto failure_queue_entry
= failure_queue
.find(from
);
5510 if (failure_queue_entry
!= failure_queue
.end()) {
5511 dout(10) << "handle_osd_ping canceling queued "
5512 << "failure report for osd." << from
<< dendl
;
5513 failure_queue
.erase(failure_queue_entry
);
5516 auto failure_pending_entry
= failure_pending
.find(from
);
5517 if (failure_pending_entry
!= failure_pending
.end()) {
5518 dout(10) << "handle_osd_ping canceling in-flight "
5519 << "failure report for osd." << from
<< dendl
;
5520 send_still_alive(curmap
->get_epoch(),
5522 failure_pending_entry
->second
.second
);
5523 failure_pending
.erase(failure_pending_entry
);
5527 // old replies, deprecated by newly sent pings.
5528 dout(10) << "handle_osd_ping no pending ping(sent at " << m
->ping_stamp
5529 << ") is found, treat as covered by newly sent pings "
5536 curmap
->is_up(from
)) {
5538 ConnectionRef cluster_con
= service
.get_con_osd_cluster(
5539 from
, curmap
->get_epoch());
5541 service
.maybe_share_map(cluster_con
.get(), curmap
, m
->map_epoch
);
5546 s
->stamps
->got_ping_reply(
5550 dout(20) << __func__
<< " new stamps " << *s
->stamps
<< dendl
;
5554 case MOSDPing::YOU_DIED
:
5555 dout(10) << "handle_osd_ping " << m
->get_source_inst()
5556 << " says i am down in " << m
->map_epoch
<< dendl
;
5557 osdmap_subscribe(curmap
->get_epoch()+1, false);
5561 heartbeat_lock
.unlock();
5565 void OSD::heartbeat_entry()
5567 std::unique_lock
l(heartbeat_lock
);
5570 while (!heartbeat_stop
) {
5574 if (cct
->_conf
.get_val
<bool>("debug_disable_randomized_ping")) {
5575 wait
= (float)cct
->_conf
->osd_heartbeat_interval
;
5577 wait
= .5 + ((float)(rand() % 10)/10.0) * (float)cct
->_conf
->osd_heartbeat_interval
;
5579 auto w
= ceph::make_timespan(wait
);
5580 dout(30) << "heartbeat_entry sleeping for " << wait
<< dendl
;
5581 heartbeat_cond
.wait_for(l
, w
);
5584 dout(30) << "heartbeat_entry woke up" << dendl
;
5588 void OSD::heartbeat_check()
5590 ceph_assert(ceph_mutex_is_locked(heartbeat_lock
));
5591 utime_t now
= ceph_clock_now();
5593 // check for incoming heartbeats (move me elsewhere?)
5594 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5595 p
!= heartbeat_peers
.end();
5598 if (p
->second
.first_tx
== utime_t()) {
5599 dout(25) << "heartbeat_check we haven't sent ping to osd." << p
->first
5600 << " yet, skipping" << dendl
;
5604 dout(25) << "heartbeat_check osd." << p
->first
5605 << " first_tx " << p
->second
.first_tx
5606 << " last_tx " << p
->second
.last_tx
5607 << " last_rx_back " << p
->second
.last_rx_back
5608 << " last_rx_front " << p
->second
.last_rx_front
5610 if (p
->second
.is_unhealthy(now
)) {
5611 utime_t oldest_deadline
= p
->second
.ping_history
.begin()->second
.first
;
5612 if (p
->second
.last_rx_back
== utime_t() ||
5613 p
->second
.last_rx_front
== utime_t()) {
5614 derr
<< "heartbeat_check: no reply from "
5615 << p
->second
.con_front
->get_peer_addr().get_sockaddr()
5616 << " osd." << p
->first
5617 << " ever on either front or back, first ping sent "
5618 << p
->second
.first_tx
5619 << " (oldest deadline " << oldest_deadline
<< ")"
5622 failure_queue
[p
->first
] = p
->second
.first_tx
;
5624 derr
<< "heartbeat_check: no reply from "
5625 << p
->second
.con_front
->get_peer_addr().get_sockaddr()
5626 << " osd." << p
->first
<< " since back " << p
->second
.last_rx_back
5627 << " front " << p
->second
.last_rx_front
5628 << " (oldest deadline " << oldest_deadline
<< ")"
5631 failure_queue
[p
->first
] = std::min(p
->second
.last_rx_back
, p
->second
.last_rx_front
);
5637 void OSD::heartbeat()
5639 ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock
));
5640 dout(30) << "heartbeat" << dendl
;
5644 int hb_interval
= cct
->_conf
->osd_heartbeat_interval
;
5645 int n_samples
= 86400;
5646 if (hb_interval
> 1) {
5647 n_samples
/= hb_interval
;
5652 if (getloadavg(loadavgs
, 1) == 1) {
5653 logger
->set(l_osd_loadavg
, 100 * loadavgs
[0]);
5654 daily_loadavg
= (daily_loadavg
* (n_samples
- 1) + loadavgs
[0]) / n_samples
;
5655 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg
<< dendl
;
5658 dout(30) << "heartbeat checking stats" << dendl
;
5660 // refresh peer list and osd stats
5661 vector
<int> hb_peers
;
5662 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5663 p
!= heartbeat_peers
.end();
5665 hb_peers
.push_back(p
->first
);
5667 auto new_stat
= service
.set_osd_stat(hb_peers
, get_num_pgs());
5668 dout(5) << __func__
<< " " << new_stat
<< dendl
;
5669 ceph_assert(new_stat
.statfs
.total
);
5672 float ratio
= service
.compute_adjusted_ratio(new_stat
, &pratio
);
5674 service
.check_full_status(ratio
, pratio
);
5676 utime_t now
= ceph_clock_now();
5677 auto mnow
= service
.get_mnow();
5678 utime_t deadline
= now
;
5679 deadline
+= cct
->_conf
->osd_heartbeat_grace
;
5682 for (map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.begin();
5683 i
!= heartbeat_peers
.end();
5685 int peer
= i
->first
;
5686 Session
*s
= static_cast<Session
*>(i
->second
.con_back
->get_priv().get());
5688 dout(30) << "heartbeat osd." << peer
<< " has no open con" << dendl
;
5691 dout(30) << "heartbeat sending ping to osd." << peer
<< dendl
;
5693 i
->second
.last_tx
= now
;
5694 if (i
->second
.first_tx
== utime_t())
5695 i
->second
.first_tx
= now
;
5696 i
->second
.ping_history
[now
] = make_pair(deadline
,
5697 HeartbeatInfo::HEARTBEAT_MAX_CONN
);
5698 if (i
->second
.hb_interval_start
== utime_t())
5699 i
->second
.hb_interval_start
= now
;
5701 std::optional
<ceph::signedspan
> delta_ub
;
5702 s
->stamps
->sent_ping(&delta_ub
);
5704 i
->second
.con_back
->send_message(
5705 new MOSDPing(monc
->get_fsid(),
5706 service
.get_osdmap_epoch(),
5711 service
.get_up_epoch(),
5712 cct
->_conf
->osd_heartbeat_min_size
,
5715 if (i
->second
.con_front
)
5716 i
->second
.con_front
->send_message(
5717 new MOSDPing(monc
->get_fsid(),
5718 service
.get_osdmap_epoch(),
5723 service
.get_up_epoch(),
5724 cct
->_conf
->osd_heartbeat_min_size
,
5728 logger
->set(l_osd_hb_to
, heartbeat_peers
.size());
5730 // hmm.. am i all alone?
5731 dout(30) << "heartbeat lonely?" << dendl
;
5732 if (heartbeat_peers
.empty()) {
5733 if (now
- last_mon_heartbeat
> cct
->_conf
->osd_mon_heartbeat_interval
&& is_active()) {
5734 last_mon_heartbeat
= now
;
5735 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl
;
5736 osdmap_subscribe(get_osdmap_epoch() + 1, false);
5740 dout(30) << "heartbeat done" << dendl
;
5743 bool OSD::heartbeat_reset(Connection
*con
)
5745 std::lock_guard
l(heartbeat_lock
);
5746 auto s
= con
->get_priv();
5747 dout(20) << __func__
<< " con " << con
<< " s " << s
.get() << dendl
;
5748 con
->set_priv(nullptr);
5750 if (is_stopping()) {
5753 auto session
= static_cast<Session
*>(s
.get());
5754 auto p
= heartbeat_peers
.find(session
->peer
);
5755 if (p
!= heartbeat_peers
.end() &&
5756 (p
->second
.con_back
== con
||
5757 p
->second
.con_front
== con
)) {
5758 dout(10) << "heartbeat_reset failed hb con " << con
<< " for osd." << p
->second
.peer
5759 << ", reopening" << dendl
;
5760 p
->second
.clear_mark_down(con
);
5761 pair
<ConnectionRef
,ConnectionRef
> newcon
= service
.get_con_osd_hb(p
->second
.peer
, p
->second
.epoch
);
5763 p
->second
.con_back
= newcon
.first
.get();
5764 p
->second
.con_back
->set_priv(s
);
5765 if (newcon
.second
) {
5766 p
->second
.con_front
= newcon
.second
.get();
5767 p
->second
.con_front
->set_priv(s
);
5769 p
->second
.ping_history
.clear();
5771 dout(10) << "heartbeat_reset failed hb con " << con
<< " for osd." << p
->second
.peer
5772 << ", raced with osdmap update, closing out peer" << dendl
;
5773 heartbeat_peers
.erase(p
);
5776 dout(10) << "heartbeat_reset closing (old) failed hb con " << con
<< dendl
;
5784 // =========================================
5788 ceph_assert(ceph_mutex_is_locked(osd_lock
));
5789 dout(10) << "tick" << dendl
;
5791 utime_t now
= ceph_clock_now();
5792 // throw out any obsolete markdown log
5793 utime_t grace
= utime_t(cct
->_conf
->osd_max_markdown_period
, 0);
5794 while (!osd_markdown_log
.empty() &&
5795 osd_markdown_log
.front() + grace
< now
)
5796 osd_markdown_log
.pop_front();
5798 if (is_active() || is_waiting_for_healthy()) {
5799 maybe_update_heartbeat_peers();
5802 if (is_waiting_for_healthy()) {
5806 if (is_waiting_for_healthy() || is_booting()) {
5807 std::lock_guard
l(heartbeat_lock
);
5808 if (now
- last_mon_heartbeat
> cct
->_conf
->osd_mon_heartbeat_interval
) {
5809 last_mon_heartbeat
= now
;
5810 dout(1) << __func__
<< " checking mon for new map" << dendl
;
5811 osdmap_subscribe(get_osdmap_epoch() + 1, false);
5817 // scrub purged_snaps every deep scrub interval
5819 const utime_t last
= superblock
.last_purged_snaps_scrub
;
5820 utime_t next
= last
;
5821 next
+= cct
->_conf
->osd_scrub_min_interval
;
5823 // use a seed that is stable for each scrub interval, but varies
5824 // by OSD to avoid any herds.
5825 rng
.seed(whoami
+ superblock
.last_purged_snaps_scrub
.sec());
5826 double r
= (rng() % 1024) / 1024;
5828 cct
->_conf
->osd_scrub_min_interval
*
5829 cct
->_conf
->osd_scrub_interval_randomize_ratio
* r
;
5830 if (next
< ceph_clock_now()) {
5831 dout(20) << __func__
<< " last_purged_snaps_scrub " << last
5832 << " next " << next
<< " ... now" << dendl
;
5833 scrub_purged_snaps();
5835 dout(20) << __func__
<< " last_purged_snaps_scrub " << last
5836 << " next " << next
<< dendl
;
5840 tick_timer
.add_event_after(get_tick_interval(), new C_Tick(this));
5843 void OSD::tick_without_osd_lock()
5845 ceph_assert(ceph_mutex_is_locked(tick_timer_lock
));
5846 dout(10) << "tick_without_osd_lock" << dendl
;
5848 logger
->set(l_osd_cached_crc
, buffer::get_cached_crc());
5849 logger
->set(l_osd_cached_crc_adjusted
, buffer::get_cached_crc_adjusted());
5850 logger
->set(l_osd_missed_crc
, buffer::get_missed_crc());
5852 // refresh osd stats
5853 struct store_statfs_t stbuf
;
5854 osd_alert_list_t alerts
;
5855 int r
= store
->statfs(&stbuf
, &alerts
);
5856 ceph_assert(r
== 0);
5857 service
.set_statfs(stbuf
, alerts
);
5859 // osd_lock is not being held, which means the OSD state
5860 // might change when doing the monitor report
5861 if (is_active() || is_waiting_for_healthy()) {
5863 std::lock_guard l
{heartbeat_lock
};
5866 map_lock
.lock_shared();
5867 std::lock_guard
l(mon_report_lock
);
5870 utime_t now
= ceph_clock_now();
5871 if (service
.need_fullness_update() ||
5872 now
- last_mon_report
> cct
->_conf
->osd_mon_report_interval
) {
5873 last_mon_report
= now
;
5877 map_lock
.unlock_shared();
5879 epoch_t max_waiting_epoch
= 0;
5880 for (auto s
: shards
) {
5881 max_waiting_epoch
= std::max(max_waiting_epoch
,
5882 s
->get_max_waiting_epoch());
5884 if (max_waiting_epoch
> get_osdmap()->get_epoch()) {
5885 dout(20) << __func__
<< " max_waiting_epoch " << max_waiting_epoch
5886 << ", requesting new map" << dendl
;
5887 osdmap_subscribe(superblock
.newest_map
+ 1, false);
5892 if (!scrub_random_backoff()) {
5895 service
.promote_throttle_recalibrate();
5896 resume_creating_pg();
5897 bool need_send_beacon
= false;
5898 const auto now
= ceph::coarse_mono_clock::now();
5900 // borrow lec lock to protect last_sent_beacon from changing
5901 std::lock_guard l
{min_last_epoch_clean_lock
};
5902 const auto elapsed
= now
- last_sent_beacon
;
5903 if (chrono::duration_cast
<chrono::seconds
>(elapsed
).count() >
5904 cct
->_conf
->osd_beacon_report_interval
) {
5905 need_send_beacon
= true;
5908 if (need_send_beacon
) {
5913 mgrc
.update_daemon_health(get_health_metrics());
5914 service
.kick_recovery_queue();
5915 tick_timer_without_osd_lock
.add_event_after(get_tick_interval(),
5916 new C_Tick_WithoutOSDLock(this));
5920 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5921 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
5922 // setomapheader <pool-id> [namespace/]<obj-name> <header>
5923 // getomap <pool> [namespace/]<obj-name>
5924 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
5925 // injectmdataerr [namespace/]<obj-name> [shardid]
5926 // injectdataerr [namespace/]<obj-name> [shardid]
5928 // set_recovery_delay [utime]
5929 void TestOpsSocketHook::test_ops(OSDService
*service
, ObjectStore
*store
,
5930 std::string_view command
,
5931 const cmdmap_t
& cmdmap
, ostream
&ss
)
5934 //Support changing the omap on a single osd by using the Admin Socket to
5935 //directly request the osd make a change.
5936 if (command
== "setomapval" || command
== "rmomapkey" ||
5937 command
== "setomapheader" || command
== "getomap" ||
5938 command
== "truncobj" || command
== "injectmdataerr" ||
5939 command
== "injectdataerr"
5943 OSDMapRef curmap
= service
->get_osdmap();
5948 cmd_getval(cmdmap
, "pool", poolstr
);
5949 pool
= curmap
->lookup_pg_pool_name(poolstr
);
5950 //If we can't find it by name then maybe id specified
5951 if (pool
< 0 && isdigit(poolstr
[0]))
5952 pool
= atoll(poolstr
.c_str());
5954 ss
<< "Invalid pool '" << poolstr
<< "''";
5958 string objname
, nspace
;
5959 cmd_getval(cmdmap
, "objname", objname
);
5960 std::size_t found
= objname
.find_first_of('/');
5961 if (found
!= string::npos
) {
5962 nspace
= objname
.substr(0, found
);
5963 objname
= objname
.substr(found
+1);
5965 object_locator_t
oloc(pool
, nspace
);
5966 r
= curmap
->object_locator_to_pg(object_t(objname
), oloc
, rawpg
);
5969 ss
<< "Invalid namespace/objname";
5974 cmd_getval(cmdmap
, "shardid", shardid
, int64_t(shard_id_t::NO_SHARD
));
5975 hobject_t
obj(object_t(objname
), string(""), CEPH_NOSNAP
, rawpg
.ps(), pool
, nspace
);
5976 ghobject_t
gobj(obj
, ghobject_t::NO_GEN
, shard_id_t(uint8_t(shardid
)));
5977 spg_t
pgid(curmap
->raw_pg_to_pg(rawpg
), shard_id_t(shardid
));
5978 if (curmap
->pg_is_ec(rawpg
)) {
5979 if ((command
!= "injectdataerr") && (command
!= "injectmdataerr")) {
5980 ss
<< "Must not call on ec pool, except injectdataerr or injectmdataerr";
5985 ObjectStore::Transaction t
;
5987 if (command
== "setomapval") {
5988 map
<string
, bufferlist
> newattrs
;
5991 cmd_getval(cmdmap
, "key", key
);
5992 cmd_getval(cmdmap
, "val", valstr
);
5995 newattrs
[key
] = val
;
5996 t
.omap_setkeys(coll_t(pgid
), ghobject_t(obj
), newattrs
);
5997 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
5999 ss
<< "error=" << r
;
6002 } else if (command
== "rmomapkey") {
6004 cmd_getval(cmdmap
, "key", key
);
6006 t
.omap_rmkey(coll_t(pgid
), ghobject_t(obj
), key
);
6007 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6009 ss
<< "error=" << r
;
6012 } else if (command
== "setomapheader") {
6013 bufferlist newheader
;
6016 cmd_getval(cmdmap
, "header", headerstr
);
6017 newheader
.append(headerstr
);
6018 t
.omap_setheader(coll_t(pgid
), ghobject_t(obj
), newheader
);
6019 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6021 ss
<< "error=" << r
;
6024 } else if (command
== "getomap") {
6025 //Debug: Output entire omap
6027 map
<string
, bufferlist
> keyvals
;
6028 auto ch
= store
->open_collection(coll_t(pgid
));
6030 ss
<< "unable to open collection for " << pgid
;
6033 r
= store
->omap_get(ch
, ghobject_t(obj
), &hdrbl
, &keyvals
);
6035 ss
<< "header=" << string(hdrbl
.c_str(), hdrbl
.length());
6036 for (map
<string
, bufferlist
>::iterator it
= keyvals
.begin();
6037 it
!= keyvals
.end(); ++it
)
6038 ss
<< " key=" << (*it
).first
<< " val="
6039 << string((*it
).second
.c_str(), (*it
).second
.length());
6041 ss
<< "error=" << r
;
6044 } else if (command
== "truncobj") {
6046 cmd_getval(cmdmap
, "len", trunclen
);
6047 t
.truncate(coll_t(pgid
), ghobject_t(obj
), trunclen
);
6048 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6050 ss
<< "error=" << r
;
6053 } else if (command
== "injectdataerr") {
6054 store
->inject_data_error(gobj
);
6056 } else if (command
== "injectmdataerr") {
6057 store
->inject_mdata_error(gobj
);
6062 if (command
== "set_recovery_delay") {
6064 cmd_getval(cmdmap
, "utime", delay
, (int64_t)0);
6067 int r
= service
->cct
->_conf
.set_val("osd_recovery_delay_start",
6070 ss
<< "set_recovery_delay: error setting "
6071 << "osd_recovery_delay_start to '" << delay
<< "': error "
6075 service
->cct
->_conf
.apply_changes(nullptr);
6076 ss
<< "set_recovery_delay: set osd_recovery_delay_start "
6077 << "to " << service
->cct
->_conf
->osd_recovery_delay_start
;
6080 if (command
== "injectfull") {
6083 OSDService::s_names state
;
6084 cmd_getval(cmdmap
, "type", type
, string("full"));
6085 cmd_getval(cmdmap
, "count", count
, (int64_t)-1);
6086 if (type
== "none" || count
== 0) {
6090 state
= service
->get_full_state(type
);
6091 if (state
== OSDService::s_names::INVALID
) {
6092 ss
<< "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
6095 service
->set_injectfull(state
, count
);
6098 ss
<< "Internal error - command=" << command
;
6101 // =========================================
6103 void OSD::ms_handle_connect(Connection
*con
)
6105 dout(10) << __func__
<< " con " << con
<< dendl
;
6106 if (con
->get_peer_type() == CEPH_ENTITY_TYPE_MON
) {
6107 std::lock_guard
l(osd_lock
);
6110 dout(10) << __func__
<< " on mon" << dendl
;
6114 } else if (is_booting()) {
6115 _send_boot(); // resend boot message
6117 map_lock
.lock_shared();
6118 std::lock_guard
l2(mon_report_lock
);
6120 utime_t now
= ceph_clock_now();
6121 last_mon_report
= now
;
6123 // resend everything, it's a new session
6126 service
.requeue_pg_temp();
6127 service
.clear_sent_ready_to_merge();
6128 service
.send_pg_temp();
6129 service
.send_ready_to_merge();
6130 service
.send_pg_created();
6134 map_lock
.unlock_shared();
6136 send_beacon(ceph::coarse_mono_clock::now());
6140 // full map requests may happen while active or pre-boot
6141 if (requested_full_first
) {
6142 rerequest_full_maps();
6147 void OSD::ms_handle_fast_connect(Connection
*con
)
6149 if (con
->get_peer_type() != CEPH_ENTITY_TYPE_MON
&&
6150 con
->get_peer_type() != CEPH_ENTITY_TYPE_MGR
) {
6151 if (auto s
= ceph::ref_cast
<Session
>(con
->get_priv()); !s
) {
6152 s
= ceph::make_ref
<Session
>(cct
, con
);
6154 dout(10) << " new session (outgoing) " << s
<< " con=" << s
->con
6155 << " addr=" << s
->con
->get_peer_addr() << dendl
;
6156 // we don't connect to clients
6157 ceph_assert(con
->get_peer_type() == CEPH_ENTITY_TYPE_OSD
);
6158 s
->entity_name
.set_type(CEPH_ENTITY_TYPE_OSD
);
6163 void OSD::ms_handle_fast_accept(Connection
*con
)
6165 if (con
->get_peer_type() != CEPH_ENTITY_TYPE_MON
&&
6166 con
->get_peer_type() != CEPH_ENTITY_TYPE_MGR
) {
6167 if (auto s
= ceph::ref_cast
<Session
>(con
->get_priv()); !s
) {
6168 s
= ceph::make_ref
<Session
>(cct
, con
);
6170 dout(10) << "new session (incoming)" << s
<< " con=" << con
6171 << " addr=" << con
->get_peer_addr()
6172 << " must have raced with connect" << dendl
;
6173 ceph_assert(con
->get_peer_type() == CEPH_ENTITY_TYPE_OSD
);
6174 s
->entity_name
.set_type(CEPH_ENTITY_TYPE_OSD
);
6179 bool OSD::ms_handle_reset(Connection
*con
)
6181 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
6182 dout(2) << "ms_handle_reset con " << con
<< " session " << session
.get() << dendl
;
6185 session
->wstate
.reset(con
);
6186 session
->con
->set_priv(nullptr);
6187 session
->con
.reset(); // break con <-> session ref cycle
6188 // note that we break session->con *before* the session_handle_reset
6189 // cleanup below. this avoids a race between us and
6190 // PG::add_backoff, Session::check_backoff, etc.
6191 session_handle_reset(session
);
6195 bool OSD::ms_handle_refused(Connection
*con
)
6197 if (!cct
->_conf
->osd_fast_fail_on_connection_refused
)
6200 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
6201 dout(2) << "ms_handle_refused con " << con
<< " session " << session
.get() << dendl
;
6204 int type
= con
->get_peer_type();
6205 // handle only OSD failures here
6206 if (monc
&& (type
== CEPH_ENTITY_TYPE_OSD
)) {
6207 OSDMapRef osdmap
= get_osdmap();
6209 int id
= osdmap
->identify_osd_on_all_channels(con
->get_peer_addr());
6210 if (id
>= 0 && osdmap
->is_up(id
)) {
6211 // I'm cheating mon heartbeat grace logic, because we know it's not going
6212 // to respawn alone. +1 so we won't hit any boundary case.
6213 monc
->send_mon_message(
6217 osdmap
->get_addrs(id
),
6218 cct
->_conf
->osd_heartbeat_grace
+ 1,
6219 osdmap
->get_epoch(),
6220 MOSDFailure::FLAG_IMMEDIATE
| MOSDFailure::FLAG_FAILED
6228 struct C_OSD_GetVersion
: public Context
{
6230 uint64_t oldest
, newest
;
6231 explicit C_OSD_GetVersion(OSD
*o
) : osd(o
), oldest(0), newest(0) {}
6232 void finish(int r
) override
{
6234 osd
->_got_mon_epochs(oldest
, newest
);
6238 void OSD::start_boot()
6240 if (!_is_healthy()) {
6241 // if we are not healthy, do not mark ourselves up (yet)
6242 dout(1) << "not healthy; waiting to boot" << dendl
;
6243 if (!is_waiting_for_healthy())
6244 start_waiting_for_healthy();
6245 // send pings sooner rather than later
6249 dout(1) << __func__
<< dendl
;
6250 set_state(STATE_PREBOOT
);
6251 dout(10) << "start_boot - have maps " << superblock
.oldest_map
6252 << ".." << superblock
.newest_map
<< dendl
;
6253 C_OSD_GetVersion
*c
= new C_OSD_GetVersion(this);
6254 monc
->get_version("osdmap", &c
->newest
, &c
->oldest
, c
);
6257 void OSD::_got_mon_epochs(epoch_t oldest
, epoch_t newest
)
6259 std::lock_guard
l(osd_lock
);
6261 _preboot(oldest
, newest
);
6265 void OSD::_preboot(epoch_t oldest
, epoch_t newest
)
6267 ceph_assert(is_preboot());
6268 dout(10) << __func__
<< " _preboot mon has osdmaps "
6269 << oldest
<< ".." << newest
<< dendl
;
6271 // ensure our local fullness awareness is accurate
6273 std::lock_guard
l(heartbeat_lock
);
6277 const auto& monmap
= monc
->monmap
;
6278 const auto osdmap
= get_osdmap();
6279 // if our map within recent history, try to add ourselves to the osdmap.
6280 if (osdmap
->get_epoch() == 0) {
6281 derr
<< "waiting for initial osdmap" << dendl
;
6282 } else if (osdmap
->is_destroyed(whoami
)) {
6283 derr
<< "osdmap says I am destroyed" << dendl
;
6284 // provide a small margin so we don't livelock seeing if we
6285 // un-destroyed ourselves.
6286 if (osdmap
->get_epoch() > newest
- 1) {
6289 } else if (osdmap
->is_noup(whoami
)) {
6290 derr
<< "osdmap NOUP flag is set, waiting for it to clear" << dendl
;
6291 } else if (!osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
6292 derr
<< "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
6294 } else if (osdmap
->require_osd_release
< ceph_release_t::luminous
) {
6295 derr
<< "osdmap require_osd_release < luminous; please upgrade to luminous"
6297 } else if (service
.need_fullness_update()) {
6298 derr
<< "osdmap fullness state needs update" << dendl
;
6300 } else if (monmap
.min_mon_release
>= ceph_release_t::octopus
&&
6301 superblock
.purged_snaps_last
< superblock
.current_epoch
) {
6302 dout(10) << __func__
<< " purged_snaps_last " << superblock
.purged_snaps_last
6303 << " < newest_map " << superblock
.current_epoch
<< dendl
;
6304 _get_purged_snaps();
6305 } else if (osdmap
->get_epoch() >= oldest
- 1 &&
6306 osdmap
->get_epoch() + cct
->_conf
->osd_map_message_max
> newest
) {
6308 // wait for pgs to fully catch up in a different thread, since
6309 // this thread might be required for splitting and merging PGs to
6311 boot_finisher
.queue(
6314 std::unique_lock
l(osd_lock
);
6316 dout(10) << __func__
<< " waiting for peering work to drain"
6319 for (auto shard
: shards
) {
6320 shard
->wait_min_pg_epoch(get_osdmap_epoch());
6331 // get all the latest maps
6332 if (osdmap
->get_epoch() + 1 >= oldest
)
6333 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
6335 osdmap_subscribe(oldest
- 1, true);
6338 void OSD::_get_purged_snaps()
6340 // NOTE: this is a naive, stateless implementaiton. it may send multiple
6341 // overlapping requests to the mon, which will be somewhat inefficient, but
6342 // it should be reliable.
6343 dout(10) << __func__
<< " purged_snaps_last " << superblock
.purged_snaps_last
6344 << ", newest_map " << superblock
.current_epoch
<< dendl
;
6345 MMonGetPurgedSnaps
*m
= new MMonGetPurgedSnaps(
6346 superblock
.purged_snaps_last
+ 1,
6347 superblock
.current_epoch
+ 1);
6348 monc
->send_mon_message(m
);
6351 void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply
*m
)
6353 dout(10) << __func__
<< " " << *m
<< dendl
;
6354 ObjectStore::Transaction t
;
6355 if (!is_preboot() ||
6356 m
->last
< superblock
.purged_snaps_last
) {
6359 SnapMapper::record_purged_snaps(cct
, store
, service
.meta_ch
,
6360 make_purged_snaps_oid(), &t
,
6362 superblock
.purged_snaps_last
= m
->last
;
6363 write_superblock(t
);
6364 store
->queue_transaction(
6367 service
.publish_superblock(superblock
);
6368 if (m
->last
< superblock
.current_epoch
) {
6369 _get_purged_snaps();
6377 void OSD::send_full_update()
6379 if (!service
.need_fullness_update())
6382 if (service
.is_full()) {
6383 state
= CEPH_OSD_FULL
;
6384 } else if (service
.is_backfillfull()) {
6385 state
= CEPH_OSD_BACKFILLFULL
;
6386 } else if (service
.is_nearfull()) {
6387 state
= CEPH_OSD_NEARFULL
;
6390 OSDMap::calc_state_set(state
, s
);
6391 dout(10) << __func__
<< " want state " << s
<< dendl
;
6392 monc
->send_mon_message(new MOSDFull(get_osdmap_epoch(), state
));
6395 void OSD::start_waiting_for_healthy()
6397 dout(1) << "start_waiting_for_healthy" << dendl
;
6398 set_state(STATE_WAITING_FOR_HEALTHY
);
6399 last_heartbeat_resample
= utime_t();
6401 // subscribe to osdmap updates, in case our peers really are known to be dead
6402 osdmap_subscribe(get_osdmap_epoch() + 1, false);
6405 bool OSD::_is_healthy()
6407 if (!cct
->get_heartbeat_map()->is_healthy()) {
6408 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl
;
6412 if (is_waiting_for_healthy()) {
6413 utime_t now
= ceph_clock_now();
6414 if (osd_markdown_log
.empty()) {
6415 dout(5) << __func__
<< " force returning true since last markdown"
6416 << " was " << cct
->_conf
->osd_max_markdown_period
6417 << "s ago" << dendl
;
6420 std::lock_guard
l(heartbeat_lock
);
6421 int num
= 0, up
= 0;
6422 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
6423 p
!= heartbeat_peers
.end();
6425 if (p
->second
.is_healthy(now
))
6429 if ((float)up
< (float)num
* cct
->_conf
->osd_heartbeat_min_healthy_ratio
) {
6430 dout(1) << "is_healthy false -- only " << up
<< "/" << num
<< " up peers (less than "
6431 << int(cct
->_conf
->osd_heartbeat_min_healthy_ratio
* 100.0) << "%)" << dendl
;
6439 void OSD::_send_boot()
6441 dout(10) << "_send_boot" << dendl
;
6442 Connection
*local_connection
=
6443 cluster_messenger
->get_loopback_connection().get();
6444 entity_addrvec_t client_addrs
= client_messenger
->get_myaddrs();
6445 entity_addrvec_t cluster_addrs
= cluster_messenger
->get_myaddrs();
6446 entity_addrvec_t hb_back_addrs
= hb_back_server_messenger
->get_myaddrs();
6447 entity_addrvec_t hb_front_addrs
= hb_front_server_messenger
->get_myaddrs();
6449 dout(20) << " initial client_addrs " << client_addrs
6450 << ", cluster_addrs " << cluster_addrs
6451 << ", hb_back_addrs " << hb_back_addrs
6452 << ", hb_front_addrs " << hb_front_addrs
6454 if (cluster_messenger
->set_addr_unknowns(client_addrs
)) {
6455 dout(10) << " assuming cluster_addrs match client_addrs "
6456 << client_addrs
<< dendl
;
6457 cluster_addrs
= cluster_messenger
->get_myaddrs();
6459 if (auto session
= local_connection
->get_priv(); !session
) {
6460 cluster_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6463 local_connection
= hb_back_server_messenger
->get_loopback_connection().get();
6464 if (hb_back_server_messenger
->set_addr_unknowns(cluster_addrs
)) {
6465 dout(10) << " assuming hb_back_addrs match cluster_addrs "
6466 << cluster_addrs
<< dendl
;
6467 hb_back_addrs
= hb_back_server_messenger
->get_myaddrs();
6469 if (auto session
= local_connection
->get_priv(); !session
) {
6470 hb_back_server_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6473 local_connection
= hb_front_server_messenger
->get_loopback_connection().get();
6474 if (hb_front_server_messenger
->set_addr_unknowns(client_addrs
)) {
6475 dout(10) << " assuming hb_front_addrs match client_addrs "
6476 << client_addrs
<< dendl
;
6477 hb_front_addrs
= hb_front_server_messenger
->get_myaddrs();
6479 if (auto session
= local_connection
->get_priv(); !session
) {
6480 hb_front_server_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6483 // we now know what our front and back addrs will be, and we are
6484 // about to tell the mon what our metadata (including numa bindings)
6485 // are, so now is a good time!
6486 set_numa_affinity();
6488 MOSDBoot
*mboot
= new MOSDBoot(
6489 superblock
, get_osdmap_epoch(), service
.get_boot_epoch(),
6490 hb_back_addrs
, hb_front_addrs
, cluster_addrs
,
6492 dout(10) << " final client_addrs " << client_addrs
6493 << ", cluster_addrs " << cluster_addrs
6494 << ", hb_back_addrs " << hb_back_addrs
6495 << ", hb_front_addrs " << hb_front_addrs
6497 _collect_metadata(&mboot
->metadata
);
6498 monc
->send_mon_message(mboot
);
6499 set_state(STATE_BOOTING
);
6502 void OSD::_collect_metadata(map
<string
,string
> *pm
)
6505 (*pm
)["osd_data"] = dev_path
;
6506 if (store
->get_type() == "filestore") {
6507 // not applicable for bluestore
6508 (*pm
)["osd_journal"] = journal_path
;
6510 (*pm
)["front_addr"] = stringify(client_messenger
->get_myaddrs());
6511 (*pm
)["back_addr"] = stringify(cluster_messenger
->get_myaddrs());
6512 (*pm
)["hb_front_addr"] = stringify(hb_front_server_messenger
->get_myaddrs());
6513 (*pm
)["hb_back_addr"] = stringify(hb_back_server_messenger
->get_myaddrs());
6516 (*pm
)["osd_objectstore"] = store
->get_type();
6517 (*pm
)["rotational"] = store_is_rotational
? "1" : "0";
6518 (*pm
)["journal_rotational"] = journal_is_rotational
? "1" : "0";
6519 (*pm
)["default_device_class"] = store
->get_default_device_class();
6520 string osdspec_affinity
;
6521 int r
= store
->read_meta("osdspec_affinity", &osdspec_affinity
);
6522 if (r
< 0 || osdspec_affinity
.empty()) {
6523 osdspec_affinity
= "";
6525 (*pm
)["osdspec_affinity"] = osdspec_affinity
;
6526 store
->collect_metadata(pm
);
6528 collect_sys_info(pm
, cct
);
6530 (*pm
)["front_iface"] = pick_iface(
6532 client_messenger
->get_myaddrs().front().get_sockaddr_storage());
6533 (*pm
)["back_iface"] = pick_iface(
6535 cluster_messenger
->get_myaddrs().front().get_sockaddr_storage());
6541 set
<string
> unknown
;
6542 for (auto nm
: { "front_iface", "back_iface" }) {
6543 if (!(*pm
)[nm
].size()) {
6548 int r
= get_iface_numa_node((*pm
)[nm
], &n
);
6550 unknown
.insert((*pm
)[nm
]);
6558 if (unknown
.size()) {
6559 (*pm
)["network_numa_unknown_ifaces"] = stringify(unknown
);
6561 if (!nodes
.empty()) {
6562 (*pm
)["network_numa_nodes"] = stringify(nodes
);
6564 if (node
>= 0 && nodes
.size() == 1 && unknown
.empty()) {
6565 (*pm
)["network_numa_node"] = stringify(node
);
6569 if (numa_node
>= 0) {
6570 (*pm
)["numa_node"] = stringify(numa_node
);
6571 (*pm
)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size
,
6575 set
<string
> devnames
;
6576 store
->get_devices(&devnames
);
6577 map
<string
,string
> errs
;
6578 get_device_metadata(devnames
, pm
, &errs
);
6579 for (auto& i
: errs
) {
6580 dout(1) << __func__
<< " " << i
.first
<< ": " << i
.second
<< dendl
;
6582 dout(10) << __func__
<< " " << *pm
<< dendl
;
6585 void OSD::queue_want_up_thru(epoch_t want
)
6587 std::shared_lock map_locker
{map_lock
};
6588 epoch_t cur
= get_osdmap()->get_up_thru(whoami
);
6589 std::lock_guard
report_locker(mon_report_lock
);
6590 if (want
> up_thru_wanted
) {
6591 dout(10) << "queue_want_up_thru now " << want
<< " (was " << up_thru_wanted
<< ")"
6592 << ", currently " << cur
6594 up_thru_wanted
= want
;
6597 dout(10) << "queue_want_up_thru want " << want
<< " <= queued " << up_thru_wanted
6598 << ", currently " << cur
6603 void OSD::send_alive()
6605 ceph_assert(ceph_mutex_is_locked(mon_report_lock
));
6606 const auto osdmap
= get_osdmap();
6607 if (!osdmap
->exists(whoami
))
6609 epoch_t up_thru
= osdmap
->get_up_thru(whoami
);
6610 dout(10) << "send_alive up_thru currently " << up_thru
<< " want " << up_thru_wanted
<< dendl
;
6611 if (up_thru_wanted
> up_thru
) {
6612 dout(10) << "send_alive want " << up_thru_wanted
<< dendl
;
6613 monc
->send_mon_message(new MOSDAlive(osdmap
->get_epoch(), up_thru_wanted
));
6617 void OSD::request_full_map(epoch_t first
, epoch_t last
)
6619 dout(10) << __func__
<< " " << first
<< ".." << last
6620 << ", previously requested "
6621 << requested_full_first
<< ".." << requested_full_last
<< dendl
;
6622 ceph_assert(ceph_mutex_is_locked(osd_lock
));
6623 ceph_assert(first
> 0 && last
> 0);
6624 ceph_assert(first
<= last
);
6625 ceph_assert(first
>= requested_full_first
); // we shouldn't ever ask for older maps
6626 if (requested_full_first
== 0) {
6628 requested_full_first
= first
;
6629 requested_full_last
= last
;
6630 } else if (last
<= requested_full_last
) {
6634 // additional request
6635 first
= requested_full_last
+ 1;
6636 requested_full_last
= last
;
6638 MMonGetOSDMap
*req
= new MMonGetOSDMap
;
6639 req
->request_full(first
, last
);
6640 monc
->send_mon_message(req
);
6643 void OSD::got_full_map(epoch_t e
)
6645 ceph_assert(requested_full_first
<= requested_full_last
);
6646 ceph_assert(ceph_mutex_is_locked(osd_lock
));
6647 if (requested_full_first
== 0) {
6648 dout(20) << __func__
<< " " << e
<< ", nothing requested" << dendl
;
6651 if (e
< requested_full_first
) {
6652 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6653 << ".." << requested_full_last
6654 << ", ignoring" << dendl
;
6657 if (e
>= requested_full_last
) {
6658 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6659 << ".." << requested_full_last
<< ", resetting" << dendl
;
6660 requested_full_first
= requested_full_last
= 0;
6664 requested_full_first
= e
+ 1;
6666 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6667 << ".." << requested_full_last
6668 << ", still need more" << dendl
;
6671 void OSD::requeue_failures()
6673 std::lock_guard
l(heartbeat_lock
);
6674 unsigned old_queue
= failure_queue
.size();
6675 unsigned old_pending
= failure_pending
.size();
6676 for (auto p
= failure_pending
.begin(); p
!= failure_pending
.end(); ) {
6677 failure_queue
[p
->first
] = p
->second
.first
;
6678 failure_pending
.erase(p
++);
6680 dout(10) << __func__
<< " " << old_queue
<< " + " << old_pending
<< " -> "
6681 << failure_queue
.size() << dendl
;
6684 void OSD::send_failures()
6686 ceph_assert(ceph_mutex_is_locked(map_lock
));
6687 ceph_assert(ceph_mutex_is_locked(mon_report_lock
));
6688 std::lock_guard
l(heartbeat_lock
);
6689 utime_t now
= ceph_clock_now();
6690 const auto osdmap
= get_osdmap();
6691 while (!failure_queue
.empty()) {
6692 int osd
= failure_queue
.begin()->first
;
6693 if (!failure_pending
.count(osd
)) {
6694 int failed_for
= (int)(double)(now
- failure_queue
.begin()->second
);
6695 monc
->send_mon_message(
6699 osdmap
->get_addrs(osd
),
6701 osdmap
->get_epoch()));
6702 failure_pending
[osd
] = make_pair(failure_queue
.begin()->second
,
6703 osdmap
->get_addrs(osd
));
6705 failure_queue
.erase(osd
);
6709 void OSD::send_still_alive(epoch_t epoch
, int osd
, const entity_addrvec_t
&addrs
)
6711 MOSDFailure
*m
= new MOSDFailure(monc
->get_fsid(), osd
, addrs
, 0, epoch
,
6712 MOSDFailure::FLAG_ALIVE
);
6713 monc
->send_mon_message(m
);
6716 void OSD::cancel_pending_failures()
6718 std::lock_guard
l(heartbeat_lock
);
6719 auto it
= failure_pending
.begin();
6720 while (it
!= failure_pending
.end()) {
6721 dout(10) << __func__
<< " canceling in-flight failure report for osd."
6722 << it
->first
<< dendl
;
6723 send_still_alive(get_osdmap_epoch(), it
->first
, it
->second
.second
);
6724 failure_pending
.erase(it
++);
6728 void OSD::send_beacon(const ceph::coarse_mono_clock::time_point
& now
)
6730 const auto& monmap
= monc
->monmap
;
6731 // send beacon to mon even if we are just connected, and the monmap is not
6732 // initialized yet by then.
6733 if (monmap
.epoch
> 0 &&
6734 monmap
.get_required_features().contains_all(
6735 ceph::features::mon::FEATURE_LUMINOUS
)) {
6736 dout(20) << __func__
<< " sending" << dendl
;
6737 MOSDBeacon
* beacon
= nullptr;
6739 std::lock_guard l
{min_last_epoch_clean_lock
};
6740 beacon
= new MOSDBeacon(get_osdmap_epoch(),
6741 min_last_epoch_clean
,
6742 superblock
.last_purged_snaps_scrub
);
6743 beacon
->pgs
= min_last_epoch_clean_pgs
;
6744 last_sent_beacon
= now
;
6746 monc
->send_mon_message(beacon
);
6748 dout(20) << __func__
<< " not sending" << dendl
;
6752 void OSD::handle_command(MCommand
*m
)
6754 ConnectionRef con
= m
->get_connection();
6755 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
6757 con
->send_message(new MCommandReply(m
, -EACCES
));
6761 if (!session
->caps
.allow_all()) {
6762 con
->send_message(new MCommandReply(m
, -EACCES
));
6766 cct
->get_admin_socket()->queue_tell_command(m
);
6771 class unlock_guard
{
6774 explicit unlock_guard(ceph::mutex
& mutex
)
6779 unlock_guard(unlock_guard
&) = delete;
6786 void OSD::scrub_purged_snaps()
6788 dout(10) << __func__
<< dendl
;
6789 ceph_assert(ceph_mutex_is_locked(osd_lock
));
6790 SnapMapper::Scrubber
s(cct
, store
, service
.meta_ch
,
6791 make_snapmapper_oid(),
6792 make_purged_snaps_oid());
6793 clog
->debug() << "purged_snaps scrub starts";
6796 if (s
.stray
.size()) {
6797 clog
->debug() << "purged_snaps scrub found " << s
.stray
.size() << " strays";
6799 clog
->debug() << "purged_snaps scrub ok";
6801 set
<pair
<spg_t
,snapid_t
>> queued
;
6802 for (auto& [pool
, snap
, hash
, shard
] : s
.stray
) {
6803 const pg_pool_t
*pi
= get_osdmap()->get_pg_pool(pool
);
6805 dout(20) << __func__
<< " pool " << pool
<< " dne" << dendl
;
6808 pg_t
pgid(pi
->raw_hash_to_pg(hash
), pool
);
6809 spg_t
spgid(pgid
, shard
);
6810 pair
<spg_t
,snapid_t
> p(spgid
, snap
);
6811 if (queued
.count(p
)) {
6812 dout(20) << __func__
<< " pg " << spgid
<< " snap " << snap
6813 << " already queued" << dendl
;
6816 PGRef pg
= lookup_lock_pg(spgid
);
6818 dout(20) << __func__
<< " pg " << spgid
<< " not found" << dendl
;
6822 dout(10) << __func__
<< " requeue pg " << spgid
<< " " << pg
<< " snap "
6824 pg
->queue_snap_retrim(snap
);
6828 if (is_stopping()) {
6831 dout(10) << __func__
<< " done queueing pgs, updating superblock" << dendl
;
6832 ObjectStore::Transaction t
;
6833 superblock
.last_purged_snaps_scrub
= ceph_clock_now();
6834 write_superblock(t
);
6835 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
6836 ceph_assert(tr
== 0);
6838 send_beacon(ceph::coarse_mono_clock::now());
6840 dout(10) << __func__
<< " done" << dendl
;
6843 void OSD::probe_smart(const string
& only_devid
, ostream
& ss
)
6845 set
<string
> devnames
;
6846 store
->get_devices(&devnames
);
6847 uint64_t smart_timeout
= cct
->_conf
.get_val
<uint64_t>(
6848 "osd_smart_report_timeout");
6850 // == typedef std::map<std::string, mValue> mObject;
6851 json_spirit::mObject json_map
;
6853 for (auto dev
: devnames
) {
6854 // smartctl works only on physical devices; filter out any logical device
6855 if (dev
.find("dm-") == 0) {
6860 string devid
= get_device_id(dev
, &err
);
6861 if (devid
.size() == 0) {
6862 dout(10) << __func__
<< " no unique id for dev " << dev
<< " ("
6863 << err
<< "), skipping" << dendl
;
6866 if (only_devid
.size() && devid
!= only_devid
) {
6870 json_spirit::mValue smart_json
;
6871 if (block_device_get_metrics(dev
, smart_timeout
,
6873 dout(10) << "block_device_get_metrics failed for /dev/" << dev
<< dendl
;
6876 json_map
[devid
] = smart_json
;
6878 json_spirit::write(json_map
, ss
, json_spirit::pretty_print
);
6881 bool OSD::heartbeat_dispatch(Message
*m
)
6883 dout(30) << "heartbeat_dispatch " << m
<< dendl
;
6884 switch (m
->get_type()) {
6887 dout(10) << "ping from " << m
->get_source_inst() << dendl
;
6892 handle_osd_ping(static_cast<MOSDPing
*>(m
));
6896 dout(0) << "dropping unexpected message " << *m
<< " from " << m
->get_source_inst() << dendl
;
6903 bool OSD::ms_dispatch(Message
*m
)
6905 dout(20) << "OSD::ms_dispatch: " << *m
<< dendl
;
6906 if (m
->get_type() == MSG_OSD_MARK_ME_DOWN
) {
6907 service
.got_stop_ack();
6915 if (is_stopping()) {
6929 void OSDService::maybe_share_map(
6931 const OSDMapRef
& osdmap
,
6932 epoch_t peer_epoch_lb
)
6934 // NOTE: we assume the caller holds something that keeps the Connection itself
6935 // pinned (e.g., an OpRequest's MessageRef).
6936 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
6941 // assume the peer has the newer of the op's sent_epoch and what
6942 // we think we sent them.
6943 session
->sent_epoch_lock
.lock();
6944 if (peer_epoch_lb
> session
->last_sent_epoch
) {
6945 dout(10) << __func__
<< " con " << con
6946 << " " << con
->get_peer_addr()
6947 << " map epoch " << session
->last_sent_epoch
6948 << " -> " << peer_epoch_lb
<< " (as per caller)" << dendl
;
6949 session
->last_sent_epoch
= peer_epoch_lb
;
6951 epoch_t last_sent_epoch
= session
->last_sent_epoch
;
6952 session
->sent_epoch_lock
.unlock();
6954 if (osdmap
->get_epoch() <= last_sent_epoch
) {
6958 send_incremental_map(last_sent_epoch
, con
, osdmap
);
6959 last_sent_epoch
= osdmap
->get_epoch();
6961 session
->sent_epoch_lock
.lock();
6962 if (session
->last_sent_epoch
< last_sent_epoch
) {
6963 dout(10) << __func__
<< " con " << con
6964 << " " << con
->get_peer_addr()
6965 << " map epoch " << session
->last_sent_epoch
6966 << " -> " << last_sent_epoch
<< " (shared)" << dendl
;
6967 session
->last_sent_epoch
= last_sent_epoch
;
6969 session
->sent_epoch_lock
.unlock();
6972 void OSD::dispatch_session_waiting(const ceph::ref_t
<Session
>& session
, OSDMapRef osdmap
)
6974 ceph_assert(ceph_mutex_is_locked(session
->session_dispatch_lock
));
6976 auto i
= session
->waiting_on_map
.begin();
6977 while (i
!= session
->waiting_on_map
.end()) {
6978 OpRequestRef op
= &(*i
);
6979 ceph_assert(ms_can_fast_dispatch(op
->get_req()));
6980 auto m
= op
->get_req
<MOSDFastDispatchOp
>();
6981 if (m
->get_min_epoch() > osdmap
->get_epoch()) {
6984 session
->waiting_on_map
.erase(i
++);
6988 if (m
->get_type() == CEPH_MSG_OSD_OP
) {
6989 pg_t actual_pgid
= osdmap
->raw_pg_to_pg(
6990 static_cast<const MOSDOp
*>(m
)->get_pg());
6991 if (!osdmap
->get_primary_shard(actual_pgid
, &pgid
)) {
6995 pgid
= m
->get_spg();
6997 enqueue_op(pgid
, std::move(op
), m
->get_map_epoch());
7000 if (session
->waiting_on_map
.empty()) {
7001 clear_session_waiting_on_map(session
);
7003 register_session_waiting_on_map(session
);
7007 void OSD::ms_fast_dispatch(Message
*m
)
7010 if (service
.is_stopping()) {
7016 switch (m
->get_type()) {
7018 dout(10) << "ping from " << m
->get_source() << dendl
;
7021 case MSG_OSD_FORCE_RECOVERY
:
7022 handle_fast_force_recovery(static_cast<MOSDForceRecovery
*>(m
));
7024 case MSG_OSD_SCRUB2
:
7025 handle_fast_scrub(static_cast<MOSDScrub2
*>(m
));
7028 case MSG_OSD_PG_CREATE2
:
7029 return handle_fast_pg_create(static_cast<MOSDPGCreate2
*>(m
));
7030 case MSG_OSD_PG_QUERY
:
7031 return handle_fast_pg_query(static_cast<MOSDPGQuery
*>(m
));
7032 case MSG_OSD_PG_NOTIFY
:
7033 return handle_fast_pg_notify(static_cast<MOSDPGNotify
*>(m
));
7034 case MSG_OSD_PG_INFO
:
7035 return handle_fast_pg_info(static_cast<MOSDPGInfo
*>(m
));
7036 case MSG_OSD_PG_REMOVE
:
7037 return handle_fast_pg_remove(static_cast<MOSDPGRemove
*>(m
));
7039 // these are single-pg messages that handle themselves
7040 case MSG_OSD_PG_LOG
:
7041 case MSG_OSD_PG_TRIM
:
7042 case MSG_OSD_PG_NOTIFY2
:
7043 case MSG_OSD_PG_QUERY2
:
7044 case MSG_OSD_PG_INFO2
:
7045 case MSG_OSD_BACKFILL_RESERVE
:
7046 case MSG_OSD_RECOVERY_RESERVE
:
7047 case MSG_OSD_PG_LEASE
:
7048 case MSG_OSD_PG_LEASE_ACK
:
7050 MOSDPeeringOp
*pm
= static_cast<MOSDPeeringOp
*>(m
);
7051 if (require_osd_peer(pm
)) {
7052 enqueue_peering_evt(
7054 PGPeeringEventRef(pm
->get_event()));
7061 OpRequestRef op
= op_tracker
.create_request
<OpRequest
, Message
*>(m
);
7064 osd_reqid_t reqid
= op
->get_reqid();
7066 tracepoint(osd
, ms_fast_dispatch
, reqid
.name
._type
,
7067 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
7071 op
->osd_trace
.init("osd op", &trace_endpoint
, &m
->trace
);
7073 // note sender epoch, min req's epoch
7074 op
->sent_epoch
= static_cast<MOSDFastDispatchOp
*>(m
)->get_map_epoch();
7075 op
->min_epoch
= static_cast<MOSDFastDispatchOp
*>(m
)->get_min_epoch();
7076 ceph_assert(op
->min_epoch
<= op
->sent_epoch
); // sanity check!
7078 service
.maybe_inject_dispatch_delay();
7080 if (m
->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT
) ||
7081 m
->get_type() != CEPH_MSG_OSD_OP
) {
7082 // queue it directly
7084 static_cast<MOSDFastDispatchOp
*>(m
)->get_spg(),
7086 static_cast<MOSDFastDispatchOp
*>(m
)->get_map_epoch());
7088 // legacy client, and this is an MOSDOp (the *only* fast dispatch
7089 // message that didn't have an explicit spg_t); we need to map
7090 // them to an spg_t while preserving delivery order.
7091 auto priv
= m
->get_connection()->get_priv();
7092 if (auto session
= static_cast<Session
*>(priv
.get()); session
) {
7093 std::lock_guard l
{session
->session_dispatch_lock
};
7095 session
->waiting_on_map
.push_back(*op
);
7096 OSDMapRef nextmap
= service
.get_nextmap_reserved();
7097 dispatch_session_waiting(session
, nextmap
);
7098 service
.release_map(nextmap
);
7101 OID_EVENT_TRACE_WITH_MSG(m
, "MS_FAST_DISPATCH_END", false);
7104 int OSD::ms_handle_authentication(Connection
*con
)
7107 auto s
= ceph::ref_cast
<Session
>(con
->get_priv());
7109 s
= ceph::make_ref
<Session
>(cct
, con
);
7111 s
->entity_name
= con
->get_peer_entity_name();
7112 dout(10) << __func__
<< " new session " << s
<< " con " << s
->con
7113 << " entity " << s
->entity_name
7114 << " addr " << con
->get_peer_addrs() << dendl
;
7116 dout(10) << __func__
<< " existing session " << s
<< " con " << s
->con
7117 << " entity " << s
->entity_name
7118 << " addr " << con
->get_peer_addrs() << dendl
;
7121 AuthCapsInfo
&caps_info
= con
->get_peer_caps_info();
7122 if (caps_info
.allow_all
) {
7123 s
->caps
.set_allow_all();
7124 } else if (caps_info
.caps
.length() > 0) {
7125 bufferlist::const_iterator p
= caps_info
.caps
.cbegin();
7130 catch (buffer::error
& e
) {
7131 dout(10) << __func__
<< " session " << s
<< " " << s
->entity_name
7132 << " failed to decode caps string" << dendl
;
7136 bool success
= s
->caps
.parse(str
);
7138 dout(10) << __func__
<< " session " << s
7139 << " " << s
->entity_name
7140 << " has caps " << s
->caps
<< " '" << str
<< "'" << dendl
;
7143 dout(10) << __func__
<< " session " << s
<< " " << s
->entity_name
7144 << " failed to parse caps '" << str
<< "'" << dendl
;
7152 void OSD::do_waiters()
7154 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7156 dout(10) << "do_waiters -- start" << dendl
;
7157 while (!finished
.empty()) {
7158 OpRequestRef next
= finished
.front();
7159 finished
.pop_front();
7162 dout(10) << "do_waiters -- finish" << dendl
;
7165 void OSD::dispatch_op(OpRequestRef op
)
7167 switch (op
->get_req()->get_type()) {
7169 case MSG_OSD_PG_CREATE
:
7170 handle_pg_create(op
);
7175 void OSD::_dispatch(Message
*m
)
7177 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7178 dout(20) << "_dispatch " << m
<< " " << *m
<< dendl
;
7180 switch (m
->get_type()) {
7181 // -- don't need OSDMap --
7183 // map and replication
7184 case CEPH_MSG_OSD_MAP
:
7185 handle_osd_map(static_cast<MOSDMap
*>(m
));
7187 case MSG_MON_GET_PURGED_SNAPS_REPLY
:
7188 handle_get_purged_snaps_reply(static_cast<MMonGetPurgedSnapsReply
*>(m
));
7193 handle_scrub(static_cast<MOSDScrub
*>(m
));
7197 handle_command(static_cast<MCommand
*>(m
));
7200 // -- need OSDMap --
7202 case MSG_OSD_PG_CREATE
:
7204 OpRequestRef op
= op_tracker
.create_request
<OpRequest
, Message
*>(m
);
7206 op
->osd_trace
.init("osd op", &trace_endpoint
, &m
->trace
);
7207 // no map? starting up?
7208 if (!get_osdmap()) {
7209 dout(7) << "no OSDMap, not booted" << dendl
;
7210 logger
->inc(l_osd_waiting_for_map
);
7211 waiting_for_osdmap
.push_back(op
);
7212 op
->mark_delayed("no osdmap");
7222 // remove me post-nautilus
7223 void OSD::handle_scrub(MOSDScrub
*m
)
7225 dout(10) << "handle_scrub " << *m
<< dendl
;
7226 if (!require_mon_or_mgr_peer(m
)) {
7230 if (m
->fsid
!= monc
->get_fsid()) {
7231 dout(0) << "handle_scrub fsid " << m
->fsid
<< " != " << monc
->get_fsid()
7240 if (!m
->scrub_pgs
.empty()) {
7242 for (auto pgid
: m
->scrub_pgs
) {
7244 if (get_osdmap()->get_primary_shard(pgid
, &pcand
) &&
7245 std::find(spgs
.begin(), spgs
.end(), pcand
) != spgs
.end()) {
7252 for (auto pgid
: spgs
) {
7253 enqueue_peering_evt(
7256 std::make_shared
<PGPeeringEvent
>(
7259 PeeringState::RequestScrub(m
->deep
, m
->repair
))));
7265 void OSD::handle_fast_scrub(MOSDScrub2
*m
)
7267 dout(10) << __func__
<< " " << *m
<< dendl
;
7268 if (!require_mon_or_mgr_peer(m
)) {
7272 if (m
->fsid
!= monc
->get_fsid()) {
7273 dout(0) << __func__
<< " fsid " << m
->fsid
<< " != " << monc
->get_fsid()
7278 for (auto pgid
: m
->scrub_pgs
) {
7279 enqueue_peering_evt(
7282 std::make_shared
<PGPeeringEvent
>(
7285 PeeringState::RequestScrub(m
->deep
, m
->repair
))));
7290 bool OSD::scrub_random_backoff()
7292 bool coin_flip
= (rand() / (double)RAND_MAX
>=
7293 cct
->_conf
->osd_scrub_backoff_ratio
);
7295 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl
;
7301 OSDService::ScrubJob::ScrubJob(CephContext
* cct
,
7302 const spg_t
& pg
, const utime_t
& timestamp
,
7303 double pool_scrub_min_interval
,
7304 double pool_scrub_max_interval
, bool must
)
7307 sched_time(timestamp
),
7310 // if not explicitly requested, postpone the scrub with a random delay
7312 double scrub_min_interval
= pool_scrub_min_interval
> 0 ?
7313 pool_scrub_min_interval
: cct
->_conf
->osd_scrub_min_interval
;
7314 double scrub_max_interval
= pool_scrub_max_interval
> 0 ?
7315 pool_scrub_max_interval
: cct
->_conf
->osd_scrub_max_interval
;
7317 sched_time
+= scrub_min_interval
;
7318 double r
= rand() / (double)RAND_MAX
;
7320 scrub_min_interval
* cct
->_conf
->osd_scrub_interval_randomize_ratio
* r
;
7321 if (scrub_max_interval
== 0) {
7322 deadline
= utime_t();
7324 deadline
+= scrub_max_interval
;
7330 bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob
& rhs
) const {
7331 if (sched_time
< rhs
.sched_time
)
7333 if (sched_time
> rhs
.sched_time
)
7335 return pgid
< rhs
.pgid
;
7338 double OSD::scrub_sleep_time(bool must_scrub
)
7341 return cct
->_conf
->osd_scrub_sleep
;
7343 utime_t now
= ceph_clock_now();
7344 if (scrub_time_permit(now
)) {
7345 return cct
->_conf
->osd_scrub_sleep
;
7347 double normal_sleep
= cct
->_conf
->osd_scrub_sleep
;
7348 double extended_sleep
= cct
->_conf
->osd_scrub_extended_sleep
;
7349 return std::max(extended_sleep
, normal_sleep
);
7352 bool OSD::scrub_time_permit(utime_t now
)
7355 time_t tt
= now
.sec();
7356 localtime_r(&tt
, &bdt
);
7358 bool day_permit
= false;
7359 if (cct
->_conf
->osd_scrub_begin_week_day
< cct
->_conf
->osd_scrub_end_week_day
) {
7360 if (bdt
.tm_wday
>= cct
->_conf
->osd_scrub_begin_week_day
&& bdt
.tm_wday
< cct
->_conf
->osd_scrub_end_week_day
) {
7364 if (bdt
.tm_wday
>= cct
->_conf
->osd_scrub_begin_week_day
|| bdt
.tm_wday
< cct
->_conf
->osd_scrub_end_week_day
) {
7370 dout(20) << __func__
<< " should run between week day " << cct
->_conf
->osd_scrub_begin_week_day
7371 << " - " << cct
->_conf
->osd_scrub_end_week_day
7372 << " now " << bdt
.tm_wday
<< " = no" << dendl
;
7376 bool time_permit
= false;
7377 if (cct
->_conf
->osd_scrub_begin_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7378 if (bdt
.tm_hour
>= cct
->_conf
->osd_scrub_begin_hour
&& bdt
.tm_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7382 if (bdt
.tm_hour
>= cct
->_conf
->osd_scrub_begin_hour
|| bdt
.tm_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7387 dout(20) << __func__
<< " should run between " << cct
->_conf
->osd_scrub_begin_hour
7388 << " - " << cct
->_conf
->osd_scrub_end_hour
7389 << " now " << bdt
.tm_hour
<< " = no" << dendl
;
7391 dout(20) << __func__
<< " should run between " << cct
->_conf
->osd_scrub_begin_hour
7392 << " - " << cct
->_conf
->osd_scrub_end_hour
7393 << " now " << bdt
.tm_hour
<< " = yes" << dendl
;
7398 bool OSD::scrub_load_below_threshold()
7401 if (getloadavg(loadavgs
, 3) != 3) {
7402 dout(10) << __func__
<< " couldn't read loadavgs\n" << dendl
;
7406 // allow scrub if below configured threshold
7407 long cpus
= sysconf(_SC_NPROCESSORS_ONLN
);
7408 double loadavg_per_cpu
= cpus
> 0 ? loadavgs
[0] / cpus
: loadavgs
[0];
7409 if (loadavg_per_cpu
< cct
->_conf
->osd_scrub_load_threshold
) {
7410 dout(20) << __func__
<< " loadavg per cpu " << loadavg_per_cpu
7411 << " < max " << cct
->_conf
->osd_scrub_load_threshold
7412 << " = yes" << dendl
;
7416 // allow scrub if below daily avg and currently decreasing
7417 if (loadavgs
[0] < daily_loadavg
&& loadavgs
[0] < loadavgs
[2]) {
7418 dout(20) << __func__
<< " loadavg " << loadavgs
[0]
7419 << " < daily_loadavg " << daily_loadavg
7420 << " and < 15m avg " << loadavgs
[2]
7421 << " = yes" << dendl
;
7425 dout(20) << __func__
<< " loadavg " << loadavgs
[0]
7426 << " >= max " << cct
->_conf
->osd_scrub_load_threshold
7427 << " and ( >= daily_loadavg " << daily_loadavg
7428 << " or >= 15m avg " << loadavgs
[2]
7429 << ") = no" << dendl
;
7433 void OSD::sched_scrub()
7435 // if not permitted, fail fast
7436 if (!service
.can_inc_scrubs()) {
7439 bool allow_requested_repair_only
= false;
7440 if (service
.is_recovery_active() && !cct
->_conf
->osd_scrub_during_recovery
) {
7441 if (!cct
->_conf
->osd_repair_during_recovery
) {
7442 dout(20) << __func__
<< " not scheduling scrubs due to active recovery" << dendl
;
7445 dout(10) << __func__
7446 << " will only schedule explicitly requested repair due to active recovery"
7448 allow_requested_repair_only
= true;
7451 utime_t now
= ceph_clock_now();
7452 bool time_permit
= scrub_time_permit(now
);
7453 bool load_is_low
= scrub_load_below_threshold();
7454 dout(20) << "sched_scrub load_is_low=" << (int)load_is_low
<< dendl
;
7456 OSDService::ScrubJob scrub
;
7457 if (service
.first_scrub_stamp(&scrub
)) {
7459 dout(30) << "sched_scrub examine " << scrub
.pgid
<< " at " << scrub
.sched_time
<< dendl
;
7461 if (scrub
.sched_time
> now
) {
7462 // save ourselves some effort
7463 dout(10) << "sched_scrub " << scrub
.pgid
<< " scheduled at " << scrub
.sched_time
7464 << " > " << now
<< dendl
;
7468 if ((scrub
.deadline
.is_zero() || scrub
.deadline
>= now
) && !(time_permit
&& load_is_low
)) {
7469 dout(10) << __func__
<< " not scheduling scrub for " << scrub
.pgid
<< " due to "
7470 << (!time_permit
? "time not permit" : "high load") << dendl
;
7474 PGRef pg
= _lookup_lock_pg(scrub
.pgid
);
7477 // This has already started, so go on to the next scrub job
7478 if (pg
->scrubber
.active
) {
7480 dout(30) << __func__
<< ": already in progress pgid " << scrub
.pgid
<< dendl
;
7483 // Skip other kinds of scrubing if only explicitly requested repairing is allowed
7484 if (allow_requested_repair_only
&& !pg
->scrubber
.must_repair
) {
7486 dout(10) << __func__
<< " skip " << scrub
.pgid
7487 << " because repairing is not explicitly requested on it"
7491 // If it is reserving, let it resolve before going to the next scrub job
7492 if (pg
->scrubber
.local_reserved
&& !pg
->scrubber
.active
) {
7494 dout(30) << __func__
<< ": reserve in progress pgid " << scrub
.pgid
<< dendl
;
7497 dout(10) << "sched_scrub scrubbing " << scrub
.pgid
<< " at " << scrub
.sched_time
7498 << (pg
->get_must_scrub() ? ", explicitly requested" :
7499 (load_is_low
? ", load_is_low" : " deadline < now"))
7501 if (pg
->sched_scrub()) {
7506 } while (service
.next_scrub_stamp(scrub
, &scrub
));
7508 dout(20) << "sched_scrub done" << dendl
;
7511 void OSD::resched_all_scrubs()
7513 dout(10) << __func__
<< ": start" << dendl
;
7514 OSDService::ScrubJob scrub
;
7515 if (service
.first_scrub_stamp(&scrub
)) {
7517 dout(20) << __func__
<< ": examine " << scrub
.pgid
<< dendl
;
7519 PGRef pg
= _lookup_lock_pg(scrub
.pgid
);
7522 if (!pg
->scrubber
.must_scrub
&& !pg
->scrubber
.need_auto
) {
7523 dout(20) << __func__
<< ": reschedule " << scrub
.pgid
<< dendl
;
7524 pg
->on_info_history_change();
7527 } while (service
.next_scrub_stamp(scrub
, &scrub
));
7529 dout(10) << __func__
<< ": done" << dendl
;
7532 MPGStats
* OSD::collect_pg_stats()
7534 // This implementation unconditionally sends every is_primary PG's
7535 // stats every time we're called. This has equivalent cost to the
7536 // previous implementation's worst case where all PGs are busy and
7537 // their stats are always enqueued for sending.
7538 std::shared_lock l
{map_lock
};
7540 osd_stat_t cur_stat
= service
.get_osd_stat();
7541 cur_stat
.os_perf_stat
= store
->get_cur_stats();
7543 auto m
= new MPGStats(monc
->get_fsid(), get_osdmap_epoch());
7544 m
->osd_stat
= cur_stat
;
7546 std::lock_guard lec
{min_last_epoch_clean_lock
};
7547 min_last_epoch_clean
= get_osdmap_epoch();
7548 min_last_epoch_clean_pgs
.clear();
7550 std::set
<int64_t> pool_set
;
7553 for (auto& pg
: pgs
) {
7554 auto pool
= pg
->pg_id
.pgid
.pool();
7555 pool_set
.emplace((int64_t)pool
);
7556 if (!pg
->is_primary()) {
7559 pg
->get_pg_stats([&](const pg_stat_t
& s
, epoch_t lec
) {
7560 m
->pg_stat
[pg
->pg_id
.pgid
] = s
;
7561 min_last_epoch_clean
= min(min_last_epoch_clean
, lec
);
7562 min_last_epoch_clean_pgs
.push_back(pg
->pg_id
.pgid
);
7566 bool per_pool_stats
= false;
7567 bool per_pool_omap_stats
= false;
7568 for (auto p
: pool_set
) {
7569 int r
= store
->pool_statfs(p
, &st
, &per_pool_omap_stats
);
7570 if (r
== -ENOTSUP
) {
7574 m
->pool_stat
[p
] = st
;
7575 per_pool_stats
= true;
7579 // indicate whether we are reporting per-pool stats
7580 m
->osd_stat
.num_osds
= 1;
7581 m
->osd_stat
.num_per_pool_osds
= per_pool_stats
? 1 : 0;
7582 m
->osd_stat
.num_per_pool_omap_osds
= per_pool_omap_stats
? 1 : 0;
7587 vector
<DaemonHealthMetric
> OSD::get_health_metrics()
7589 vector
<DaemonHealthMetric
> metrics
;
7591 utime_t oldest_secs
;
7592 const utime_t now
= ceph_clock_now();
7594 too_old
-= cct
->_conf
.get_val
<double>("osd_op_complaint_time");
7596 TrackedOpRef oldest_op
;
7597 auto count_slow_ops
= [&](TrackedOp
& op
) {
7598 if (op
.get_initiated() < too_old
) {
7600 ss
<< "slow request " << op
.get_desc()
7602 << op
.get_initiated()
7604 << op
.state_string();
7605 lgeneric_subdout(cct
,osd
,20) << ss
.str() << dendl
;
7606 clog
->warn() << ss
.str();
7608 if (!oldest_op
|| op
.get_initiated() < oldest_op
->get_initiated()) {
7616 if (op_tracker
.visit_ops_in_flight(&oldest_secs
, count_slow_ops
)) {
7618 derr
<< __func__
<< " reporting " << slow
<< " slow ops, oldest is "
7619 << oldest_op
->get_desc() << dendl
;
7621 metrics
.emplace_back(daemon_metric::SLOW_OPS
, slow
, oldest_secs
);
7623 // no news is not good news.
7624 metrics
.emplace_back(daemon_metric::SLOW_OPS
, 0, 0);
7628 std::lock_guard
l(pending_creates_lock
);
7629 auto n_primaries
= pending_creates_from_mon
;
7630 for (const auto& create
: pending_creates_from_osd
) {
7631 if (create
.second
) {
7635 metrics
.emplace_back(daemon_metric::PENDING_CREATING_PGS
, n_primaries
);
7640 // =====================================================
7643 void OSD::wait_for_new_map(OpRequestRef op
)
7646 if (waiting_for_osdmap
.empty()) {
7647 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7650 logger
->inc(l_osd_waiting_for_map
);
7651 waiting_for_osdmap
.push_back(op
);
7652 op
->mark_delayed("wait for new map");
7657 * assimilate new OSDMap(s). scan pgs, etc.
7660 void OSD::note_down_osd(int peer
)
7662 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7663 cluster_messenger
->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer
));
7665 std::lock_guard l
{heartbeat_lock
};
7666 failure_queue
.erase(peer
);
7667 failure_pending
.erase(peer
);
7668 map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.find(peer
);
7669 if (p
!= heartbeat_peers
.end()) {
7670 p
->second
.clear_mark_down();
7671 heartbeat_peers
.erase(p
);
7675 void OSD::note_up_osd(int peer
)
7677 heartbeat_set_peers_need_update();
7680 struct C_OnMapCommit
: public Context
{
7682 epoch_t first
, last
;
7684 C_OnMapCommit(OSD
*o
, epoch_t f
, epoch_t l
, MOSDMap
*m
)
7685 : osd(o
), first(f
), last(l
), msg(m
) {}
7686 void finish(int r
) override
{
7687 osd
->_committed_osd_maps(first
, last
, msg
);
7692 void OSD::osdmap_subscribe(version_t epoch
, bool force_request
)
7694 std::lock_guard
l(osdmap_subscribe_lock
);
7695 if (latest_subscribed_epoch
>= epoch
&& !force_request
)
7698 latest_subscribed_epoch
= std::max
<uint64_t>(epoch
, latest_subscribed_epoch
);
7700 if (monc
->sub_want_increment("osdmap", epoch
, CEPH_SUBSCRIBE_ONETIME
) ||
7706 void OSD::trim_maps(epoch_t oldest
, int nreceived
, bool skip_maps
)
7708 epoch_t min
= std::min(oldest
, service
.map_cache
.cached_key_lower_bound());
7709 if (min
<= superblock
.oldest_map
)
7713 ObjectStore::Transaction t
;
7714 for (epoch_t e
= superblock
.oldest_map
; e
< min
; ++e
) {
7715 dout(20) << " removing old osdmap epoch " << e
<< dendl
;
7716 t
.remove(coll_t::meta(), get_osdmap_pobject_name(e
));
7717 t
.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e
));
7718 superblock
.oldest_map
= e
+ 1;
7720 if (num
>= cct
->_conf
->osd_target_transaction_size
&& num
>= nreceived
) {
7721 service
.publish_superblock(superblock
);
7722 write_superblock(t
);
7723 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
7724 ceph_assert(tr
== 0);
7727 // skip_maps leaves us with a range of old maps if we fail to remove all
7728 // of them before moving superblock.oldest_map forward to the first map
7729 // in the incoming MOSDMap msg. so we should continue removing them in
7730 // this case, even we could do huge series of delete transactions all at
7737 service
.publish_superblock(superblock
);
7738 write_superblock(t
);
7739 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
7740 ceph_assert(tr
== 0);
7742 // we should not remove the cached maps
7743 ceph_assert(min
<= service
.map_cache
.cached_key_lower_bound());
7746 void OSD::handle_osd_map(MOSDMap
*m
)
7748 // wait for pgs to catch up
7750 // we extend the map cache pins to accomodate pgs slow to consume maps
7751 // for some period, until we hit the max_lag_factor bound, at which point
7752 // we block here to stop injesting more maps than they are able to keep
7754 epoch_t max_lag
= cct
->_conf
->osd_map_cache_size
*
7755 m_osd_pg_epoch_max_lag_factor
;
7756 ceph_assert(max_lag
> 0);
7757 epoch_t osd_min
= 0;
7758 for (auto shard
: shards
) {
7759 epoch_t min
= shard
->get_min_pg_epoch();
7760 if (osd_min
== 0 || min
< osd_min
) {
7764 epoch_t osdmap_epoch
= get_osdmap_epoch();
7766 osdmap_epoch
> max_lag
&&
7767 osdmap_epoch
- max_lag
> osd_min
) {
7768 epoch_t need
= osdmap_epoch
- max_lag
;
7769 dout(10) << __func__
<< " waiting for pgs to catch up (need " << need
7770 << " max_lag " << max_lag
<< ")" << dendl
;
7771 for (auto shard
: shards
) {
7772 epoch_t min
= shard
->get_min_pg_epoch();
7774 dout(10) << __func__
<< " waiting for pgs to consume " << need
7775 << " (shard " << shard
->shard_id
<< " min " << min
7776 << ", map cache is " << cct
->_conf
->osd_map_cache_size
7777 << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
7779 unlock_guard unlock
{osd_lock
};
7780 shard
->wait_min_pg_epoch(need
);
7786 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7787 map
<epoch_t
,OSDMapRef
> added_maps
;
7788 map
<epoch_t
,bufferlist
> added_maps_bl
;
7789 if (m
->fsid
!= monc
->get_fsid()) {
7790 dout(0) << "handle_osd_map fsid " << m
->fsid
<< " != "
7791 << monc
->get_fsid() << dendl
;
7795 if (is_initializing()) {
7796 dout(0) << "ignoring osdmap until we have initialized" << dendl
;
7801 auto session
= ceph::ref_cast
<Session
>(m
->get_connection()->get_priv());
7802 if (session
&& !(session
->entity_name
.is_mon() ||
7803 session
->entity_name
.is_osd())) {
7805 dout(10) << "got osd map from Session " << session
7806 << " which we can't take maps from (not a mon or osd)" << dendl
;
7811 // share with the objecter
7813 service
.objecter
->handle_osd_map(m
);
7815 epoch_t first
= m
->get_first();
7816 epoch_t last
= m
->get_last();
7817 dout(3) << "handle_osd_map epochs [" << first
<< "," << last
<< "], i have "
7818 << superblock
.newest_map
7819 << ", src has [" << m
->oldest_map
<< "," << m
->newest_map
<< "]"
7822 logger
->inc(l_osd_map
);
7823 logger
->inc(l_osd_mape
, last
- first
+ 1);
7824 if (first
<= superblock
.newest_map
)
7825 logger
->inc(l_osd_mape_dup
, superblock
.newest_map
- first
+ 1);
7826 if (service
.max_oldest_map
< m
->oldest_map
) {
7827 service
.max_oldest_map
= m
->oldest_map
;
7828 ceph_assert(service
.max_oldest_map
>= superblock
.oldest_map
);
7831 // make sure there is something new, here, before we bother flushing
7832 // the queues and such
7833 if (last
<= superblock
.newest_map
) {
7834 dout(10) << " no new maps here, dropping" << dendl
;
7840 bool skip_maps
= false;
7841 if (first
> superblock
.newest_map
+ 1) {
7842 dout(10) << "handle_osd_map message skips epochs "
7843 << superblock
.newest_map
+ 1 << ".." << (first
-1) << dendl
;
7844 if (m
->oldest_map
<= superblock
.newest_map
+ 1) {
7845 osdmap_subscribe(superblock
.newest_map
+ 1, false);
7849 // always try to get the full range of maps--as many as we can. this
7850 // 1- is good to have
7851 // 2- is at present the only way to ensure that we get a *full* map as
7853 if (m
->oldest_map
< first
) {
7854 osdmap_subscribe(m
->oldest_map
- 1, true);
7861 ObjectStore::Transaction t
;
7862 uint64_t txn_size
= 0;
7864 map
<epoch_t
,mempool::osdmap::map
<int64_t,snap_interval_set_t
>> purged_snaps
;
7866 // store new maps: queue for disk and put in the osdmap cache
7867 epoch_t start
= std::max(superblock
.newest_map
+ 1, first
);
7868 for (epoch_t e
= start
; e
<= last
; e
++) {
7869 if (txn_size
>= t
.get_num_bytes()) {
7870 derr
<< __func__
<< " transaction size overflowed" << dendl
;
7871 ceph_assert(txn_size
< t
.get_num_bytes());
7873 txn_size
= t
.get_num_bytes();
7874 map
<epoch_t
,bufferlist
>::iterator p
;
7875 p
= m
->maps
.find(e
);
7876 if (p
!= m
->maps
.end()) {
7877 dout(10) << "handle_osd_map got full map for epoch " << e
<< dendl
;
7878 OSDMap
*o
= new OSDMap
;
7879 bufferlist
& bl
= p
->second
;
7883 purged_snaps
[e
] = o
->get_new_purged_snaps();
7885 ghobject_t fulloid
= get_osdmap_pobject_name(e
);
7886 t
.write(coll_t::meta(), fulloid
, 0, bl
.length(), bl
);
7887 added_maps
[e
] = add_map(o
);
7888 added_maps_bl
[e
] = bl
;
7893 p
= m
->incremental_maps
.find(e
);
7894 if (p
!= m
->incremental_maps
.end()) {
7895 dout(10) << "handle_osd_map got inc map for epoch " << e
<< dendl
;
7896 bufferlist
& bl
= p
->second
;
7897 ghobject_t oid
= get_inc_osdmap_pobject_name(e
);
7898 t
.write(coll_t::meta(), oid
, 0, bl
.length(), bl
);
7900 OSDMap
*o
= new OSDMap
;
7903 bool got
= get_map_bl(e
- 1, obl
);
7905 auto p
= added_maps_bl
.find(e
- 1);
7906 ceph_assert(p
!= added_maps_bl
.end());
7912 OSDMap::Incremental inc
;
7913 auto p
= bl
.cbegin();
7916 if (o
->apply_incremental(inc
) < 0) {
7917 derr
<< "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc
.fsid
<< dendl
;
7918 ceph_abort_msg("bad fsid");
7922 o
->encode(fbl
, inc
.encode_features
| CEPH_FEATURE_RESERVED
);
7924 bool injected_failure
= false;
7925 if (cct
->_conf
->osd_inject_bad_map_crc_probability
> 0 &&
7926 (rand() % 10000) < cct
->_conf
->osd_inject_bad_map_crc_probability
*10000.0) {
7927 derr
<< __func__
<< " injecting map crc failure" << dendl
;
7928 injected_failure
= true;
7931 if ((inc
.have_crc
&& o
->get_crc() != inc
.full_crc
) || injected_failure
) {
7932 dout(2) << "got incremental " << e
7933 << " but failed to encode full with correct crc; requesting"
7935 clog
->warn() << "failed to encode map e" << e
<< " with expected crc";
7936 dout(20) << "my encoded map was:\n";
7937 fbl
.hexdump(*_dout
);
7940 request_full_map(e
, last
);
7943 // don't continue committing if we failed to enc the first inc map
7945 dout(10) << __func__
<< " bailing because last < start (" << last
<< "<" << start
<< ")" << dendl
;
7952 purged_snaps
[e
] = o
->get_new_purged_snaps();
7954 ghobject_t fulloid
= get_osdmap_pobject_name(e
);
7955 t
.write(coll_t::meta(), fulloid
, 0, fbl
.length(), fbl
);
7956 added_maps
[e
] = add_map(o
);
7957 added_maps_bl
[e
] = fbl
;
7961 ceph_abort_msg("MOSDMap lied about what maps it had?");
7964 // even if this map isn't from a mon, we may have satisfied our subscription
7965 monc
->sub_got("osdmap", last
);
7967 if (!m
->maps
.empty() && requested_full_first
) {
7968 dout(10) << __func__
<< " still missing full maps " << requested_full_first
7969 << ".." << requested_full_last
<< dendl
;
7970 rerequest_full_maps();
7973 if (superblock
.oldest_map
) {
7974 // make sure we at least keep pace with incoming maps
7975 trim_maps(m
->oldest_map
, last
- first
+ 1, skip_maps
);
7976 pg_num_history
.prune(superblock
.oldest_map
);
7979 if (!superblock
.oldest_map
|| skip_maps
)
7980 superblock
.oldest_map
= first
;
7981 superblock
.newest_map
= last
;
7982 superblock
.current_epoch
= last
;
7984 // note in the superblock that we were clean thru the prior epoch
7985 epoch_t boot_epoch
= service
.get_boot_epoch();
7986 if (boot_epoch
&& boot_epoch
>= superblock
.mounted
) {
7987 superblock
.mounted
= boot_epoch
;
7988 superblock
.clean_thru
= last
;
7991 // check for pg_num changes and deleted pools
7993 for (auto& i
: added_maps
) {
7995 if (!(lastmap
= service
.try_get_map(i
.first
- 1))) {
7996 dout(10) << __func__
<< " can't get previous map " << i
.first
- 1
7997 << " probably first start of this osd" << dendl
;
8001 ceph_assert(lastmap
->get_epoch() + 1 == i
.second
->get_epoch());
8002 for (auto& j
: lastmap
->get_pools()) {
8003 if (!i
.second
->have_pg_pool(j
.first
)) {
8004 pg_num_history
.log_pool_delete(i
.first
, j
.first
);
8005 dout(10) << __func__
<< " recording final pg_pool_t for pool "
8006 << j
.first
<< dendl
;
8007 // this information is needed by _make_pg() if have to restart before
8008 // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
8009 ghobject_t obj
= make_final_pool_info_oid(j
.first
);
8011 encode(j
.second
, bl
, CEPH_FEATURES_ALL
);
8012 string name
= lastmap
->get_pool_name(j
.first
);
8014 map
<string
,string
> profile
;
8015 if (lastmap
->get_pg_pool(j
.first
)->is_erasure()) {
8016 profile
= lastmap
->get_erasure_code_profile(
8017 lastmap
->get_pg_pool(j
.first
)->erasure_code_profile
);
8019 encode(profile
, bl
);
8020 t
.write(coll_t::meta(), obj
, 0, bl
.length(), bl
);
8021 } else if (unsigned new_pg_num
= i
.second
->get_pg_num(j
.first
);
8022 new_pg_num
!= j
.second
.get_pg_num()) {
8023 dout(10) << __func__
<< " recording pool " << j
.first
<< " pg_num "
8024 << j
.second
.get_pg_num() << " -> " << new_pg_num
<< dendl
;
8025 pg_num_history
.log_pg_num_change(i
.first
, j
.first
, new_pg_num
);
8028 for (auto& j
: i
.second
->get_pools()) {
8029 if (!lastmap
->have_pg_pool(j
.first
)) {
8030 dout(10) << __func__
<< " recording new pool " << j
.first
<< " pg_num "
8031 << j
.second
.get_pg_num() << dendl
;
8032 pg_num_history
.log_pg_num_change(i
.first
, j
.first
,
8033 j
.second
.get_pg_num());
8038 pg_num_history
.epoch
= last
;
8041 ::encode(pg_num_history
, bl
);
8042 t
.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl
.length(), bl
);
8043 dout(20) << __func__
<< " pg_num_history " << pg_num_history
<< dendl
;
8046 // record new purged_snaps
8047 if (superblock
.purged_snaps_last
== start
- 1) {
8048 SnapMapper::record_purged_snaps(cct
, store
, service
.meta_ch
,
8049 make_purged_snaps_oid(), &t
,
8051 superblock
.purged_snaps_last
= last
;
8053 dout(10) << __func__
<< " superblock purged_snaps_last is "
8054 << superblock
.purged_snaps_last
8055 << ", not recording new purged_snaps" << dendl
;
8058 // superblock and commit
8059 write_superblock(t
);
8060 t
.register_on_commit(new C_OnMapCommit(this, start
, last
, m
));
8061 store
->queue_transaction(
8064 service
.publish_superblock(superblock
);
8067 void OSD::_committed_osd_maps(epoch_t first
, epoch_t last
, MOSDMap
*m
)
8069 dout(10) << __func__
<< " " << first
<< ".." << last
<< dendl
;
8070 if (is_stopping()) {
8071 dout(10) << __func__
<< " bailing, we are shutting down" << dendl
;
8074 std::lock_guard
l(osd_lock
);
8075 if (is_stopping()) {
8076 dout(10) << __func__
<< " bailing, we are shutting down" << dendl
;
8081 ceph_assert(first
<= last
);
8083 bool do_shutdown
= false;
8084 bool do_restart
= false;
8085 bool network_error
= false;
8086 OSDMapRef osdmap
= get_osdmap();
8088 // advance through the new maps
8089 for (epoch_t cur
= first
; cur
<= last
; cur
++) {
8090 dout(10) << " advance to epoch " << cur
8091 << " (<= last " << last
8092 << " <= newest_map " << superblock
.newest_map
8095 OSDMapRef newmap
= get_map(cur
);
8096 ceph_assert(newmap
); // we just cached it above!
8098 // start blacklisting messages sent to peers that go down.
8099 service
.pre_publish_map(newmap
);
8101 // kill connections to newly down osds
8102 bool waited_for_reservations
= false;
8104 osdmap
= get_osdmap();
8105 osdmap
->get_all_osds(old
);
8106 for (set
<int>::iterator p
= old
.begin(); p
!= old
.end(); ++p
) {
8108 osdmap
->is_up(*p
) && // in old map
8109 newmap
->is_down(*p
)) { // but not the new one
8110 if (!waited_for_reservations
) {
8111 service
.await_reserved_maps();
8112 waited_for_reservations
= true;
8115 } else if (*p
!= whoami
&&
8116 osdmap
->is_down(*p
) &&
8117 newmap
->is_up(*p
)) {
8122 if (osdmap
->is_noup(whoami
) != newmap
->is_noup(whoami
)) {
8123 dout(10) << __func__
<< " NOUP flag changed in " << newmap
->get_epoch()
8126 // this captures the case where we sent the boot message while
8127 // NOUP was being set on the mon and our boot request was
8128 // dropped, and then later it is cleared. it imperfectly
8129 // handles the case where our original boot message was not
8130 // dropped and we restart even though we might have booted, but
8131 // that is harmless (boot will just take slightly longer).
8136 osdmap
= std::move(newmap
);
8140 service
.retrieve_epochs(&boot_epoch
, &up_epoch
, NULL
);
8142 osdmap
->is_up(whoami
) &&
8143 osdmap
->get_addrs(whoami
) == client_messenger
->get_myaddrs()) {
8144 up_epoch
= osdmap
->get_epoch();
8145 dout(10) << "up_epoch is " << up_epoch
<< dendl
;
8147 boot_epoch
= osdmap
->get_epoch();
8148 dout(10) << "boot_epoch is " << boot_epoch
<< dendl
;
8150 service
.set_epochs(&boot_epoch
, &up_epoch
, NULL
);
8154 epoch_t _bind_epoch
= service
.get_bind_epoch();
8155 if (osdmap
->is_up(whoami
) &&
8156 osdmap
->get_addrs(whoami
).legacy_equals(
8157 client_messenger
->get_myaddrs()) &&
8158 _bind_epoch
< osdmap
->get_up_from(whoami
)) {
8161 dout(1) << "state: booting -> active" << dendl
;
8162 set_state(STATE_ACTIVE
);
8165 // set incarnation so that osd_reqid_t's we generate for our
8166 // objecter requests are unique across restarts.
8167 service
.objecter
->set_client_incarnation(osdmap
->get_epoch());
8168 cancel_pending_failures();
8172 if (osdmap
->get_epoch() > 0 &&
8174 if (!osdmap
->exists(whoami
)) {
8175 derr
<< "map says i do not exist. shutting down." << dendl
;
8176 do_shutdown
= true; // don't call shutdown() while we have
8177 // everything paused
8178 } else if (osdmap
->is_stop(whoami
)) {
8179 derr
<< "map says i am stopped by admin. shutting down." << dendl
;
8181 } else if (!osdmap
->is_up(whoami
) ||
8182 !osdmap
->get_addrs(whoami
).legacy_equals(
8183 client_messenger
->get_myaddrs()) ||
8184 !osdmap
->get_cluster_addrs(whoami
).legacy_equals(
8185 cluster_messenger
->get_myaddrs()) ||
8186 !osdmap
->get_hb_back_addrs(whoami
).legacy_equals(
8187 hb_back_server_messenger
->get_myaddrs()) ||
8188 !osdmap
->get_hb_front_addrs(whoami
).legacy_equals(
8189 hb_front_server_messenger
->get_myaddrs())) {
8190 if (!osdmap
->is_up(whoami
)) {
8191 if (service
.is_preparing_to_stop() || service
.is_stopping()) {
8192 service
.got_stop_ack();
8194 clog
->warn() << "Monitor daemon marked osd." << whoami
<< " down, "
8195 "but it is still running";
8196 clog
->debug() << "map e" << osdmap
->get_epoch()
8197 << " wrongly marked me down at e"
8198 << osdmap
->get_down_at(whoami
);
8200 if (monc
->monmap
.min_mon_release
>= ceph_release_t::octopus
) {
8201 // note that this is best-effort...
8202 monc
->send_mon_message(
8206 osdmap
->get_epoch()));
8208 } else if (!osdmap
->get_addrs(whoami
).legacy_equals(
8209 client_messenger
->get_myaddrs())) {
8210 clog
->error() << "map e" << osdmap
->get_epoch()
8211 << " had wrong client addr (" << osdmap
->get_addrs(whoami
)
8212 << " != my " << client_messenger
->get_myaddrs() << ")";
8213 } else if (!osdmap
->get_cluster_addrs(whoami
).legacy_equals(
8214 cluster_messenger
->get_myaddrs())) {
8215 clog
->error() << "map e" << osdmap
->get_epoch()
8216 << " had wrong cluster addr ("
8217 << osdmap
->get_cluster_addrs(whoami
)
8218 << " != my " << cluster_messenger
->get_myaddrs() << ")";
8219 } else if (!osdmap
->get_hb_back_addrs(whoami
).legacy_equals(
8220 hb_back_server_messenger
->get_myaddrs())) {
8221 clog
->error() << "map e" << osdmap
->get_epoch()
8222 << " had wrong heartbeat back addr ("
8223 << osdmap
->get_hb_back_addrs(whoami
)
8224 << " != my " << hb_back_server_messenger
->get_myaddrs()
8226 } else if (!osdmap
->get_hb_front_addrs(whoami
).legacy_equals(
8227 hb_front_server_messenger
->get_myaddrs())) {
8228 clog
->error() << "map e" << osdmap
->get_epoch()
8229 << " had wrong heartbeat front addr ("
8230 << osdmap
->get_hb_front_addrs(whoami
)
8231 << " != my " << hb_front_server_messenger
->get_myaddrs()
8235 if (!service
.is_stopping()) {
8236 epoch_t up_epoch
= 0;
8237 epoch_t bind_epoch
= osdmap
->get_epoch();
8238 service
.set_epochs(NULL
,&up_epoch
, &bind_epoch
);
8242 utime_t now
= ceph_clock_now();
8243 utime_t grace
= utime_t(cct
->_conf
->osd_max_markdown_period
, 0);
8244 osd_markdown_log
.push_back(now
);
8245 if ((int)osd_markdown_log
.size() > cct
->_conf
->osd_max_markdown_count
) {
8246 derr
<< __func__
<< " marked down "
8247 << osd_markdown_log
.size()
8248 << " > osd_max_markdown_count "
8249 << cct
->_conf
->osd_max_markdown_count
8250 << " in last " << grace
<< " seconds, shutting down"
8256 start_waiting_for_healthy();
8258 set
<int> avoid_ports
;
8259 #if defined(__FreeBSD__)
8260 // prevent FreeBSD from grabbing the client_messenger port during
8261 // rebinding. In which case a cluster_meesneger will connect also
8263 client_messenger
->get_myaddrs().get_ports(&avoid_ports
);
8265 cluster_messenger
->get_myaddrs().get_ports(&avoid_ports
);
8267 int r
= cluster_messenger
->rebind(avoid_ports
);
8269 do_shutdown
= true; // FIXME: do_restart?
8270 network_error
= true;
8271 derr
<< __func__
<< " marked down:"
8272 << " rebind cluster_messenger failed" << dendl
;
8275 hb_back_server_messenger
->mark_down_all();
8276 hb_front_server_messenger
->mark_down_all();
8277 hb_front_client_messenger
->mark_down_all();
8278 hb_back_client_messenger
->mark_down_all();
8280 reset_heartbeat_peers(true);
8287 check_osdmap_features();
8292 if (is_active() || is_waiting_for_healthy())
8293 maybe_update_heartbeat_peers();
8300 if (network_error
) {
8301 cancel_pending_failures();
8303 // trigger shutdown in a different thread
8304 dout(0) << __func__
<< " shutdown OSD via async signal" << dendl
;
8305 queue_async_signal(SIGINT
);
8307 else if (m
->newest_map
&& m
->newest_map
> last
) {
8308 dout(10) << " msg say newest map is " << m
->newest_map
8309 << ", requesting more" << dendl
;
8310 osdmap_subscribe(osdmap
->get_epoch()+1, false);
8312 else if (is_preboot()) {
8313 if (m
->get_source().is_mon())
8314 _preboot(m
->oldest_map
, m
->newest_map
);
8318 else if (do_restart
)
8323 void OSD::check_osdmap_features()
8325 // adjust required feature bits?
8327 // we have to be a bit careful here, because we are accessing the
8328 // Policy structures without taking any lock. in particular, only
8329 // modify integer values that can safely be read by a racing CPU.
8330 // since we are only accessing existing Policy structures a their
8331 // current memory location, and setting or clearing bits in integer
8332 // fields, and we are the only writer, this is not a problem.
8334 const auto osdmap
= get_osdmap();
8336 Messenger::Policy p
= client_messenger
->get_default_policy();
8338 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_CLIENT
, &mask
);
8339 if ((p
.features_required
& mask
) != features
) {
8340 dout(0) << "crush map has features " << features
8341 << ", adjusting msgr requires for clients" << dendl
;
8342 p
.features_required
= (p
.features_required
& ~mask
) | features
;
8343 client_messenger
->set_default_policy(p
);
8347 Messenger::Policy p
= client_messenger
->get_policy(entity_name_t::TYPE_MON
);
8349 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_MON
, &mask
);
8350 if ((p
.features_required
& mask
) != features
) {
8351 dout(0) << "crush map has features " << features
8352 << " was " << p
.features_required
8353 << ", adjusting msgr requires for mons" << dendl
;
8354 p
.features_required
= (p
.features_required
& ~mask
) | features
;
8355 client_messenger
->set_policy(entity_name_t::TYPE_MON
, p
);
8359 Messenger::Policy p
= cluster_messenger
->get_policy(entity_name_t::TYPE_OSD
);
8361 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_OSD
, &mask
);
8363 if ((p
.features_required
& mask
) != features
) {
8364 dout(0) << "crush map has features " << features
8365 << ", adjusting msgr requires for osds" << dendl
;
8366 p
.features_required
= (p
.features_required
& ~mask
) | features
;
8367 cluster_messenger
->set_policy(entity_name_t::TYPE_OSD
, p
);
8370 if (!superblock
.compat_features
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
)) {
8371 dout(0) << __func__
<< " enabling on-disk ERASURE CODES compat feature" << dendl
;
8372 superblock
.compat_features
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
);
8373 ObjectStore::Transaction t
;
8374 write_superblock(t
);
8375 int err
= store
->queue_transaction(service
.meta_ch
, std::move(t
), NULL
);
8376 ceph_assert(err
== 0);
8380 if (osdmap
->require_osd_release
< ceph_release_t::nautilus
) {
8381 hb_front_server_messenger
->set_require_authorizer(false);
8382 hb_back_server_messenger
->set_require_authorizer(false);
8384 hb_front_server_messenger
->set_require_authorizer(true);
8385 hb_back_server_messenger
->set_require_authorizer(true);
8388 if (osdmap
->require_osd_release
!= last_require_osd_release
) {
8389 dout(1) << __func__
<< " require_osd_release " << last_require_osd_release
8390 << " -> " << to_string(osdmap
->require_osd_release
) << dendl
;
8391 store
->write_meta("require_osd_release",
8392 stringify((int)osdmap
->require_osd_release
));
8393 last_require_osd_release
= osdmap
->require_osd_release
;
8397 struct C_FinishSplits
: public Context
{
8400 C_FinishSplits(OSD
*osd
, const set
<PGRef
> &in
)
8401 : osd(osd
), pgs(in
) {}
8402 void finish(int r
) override
{
8403 osd
->_finish_splits(pgs
);
8407 void OSD::_finish_splits(set
<PGRef
>& pgs
)
8409 dout(10) << __func__
<< " " << pgs
<< dendl
;
8412 for (set
<PGRef
>::iterator i
= pgs
.begin();
8417 PeeringCtx rctx
= create_context();
8419 dout(10) << __func__
<< " " << *pg
<< dendl
;
8420 epoch_t e
= pg
->get_osdmap_epoch();
8421 pg
->handle_initialize(rctx
);
8422 pg
->queue_null(e
, e
);
8423 dispatch_context(rctx
, pg
, service
.get_osdmap());
8426 unsigned shard_index
= pg
->pg_id
.hash_to_shard(num_shards
);
8427 shards
[shard_index
]->register_and_wake_split_child(pg
);
8431 bool OSD::add_merge_waiter(OSDMapRef nextmap
, spg_t target
, PGRef src
,
8434 std::lock_guard
l(merge_lock
);
8435 auto& p
= merge_waiters
[nextmap
->get_epoch()][target
];
8436 p
[src
->pg_id
] = src
;
8437 dout(10) << __func__
<< " added merge_waiter " << src
->pg_id
8438 << " for " << target
<< ", have " << p
.size() << "/" << need
8440 return p
.size() == need
;
8443 bool OSD::advance_pg(
8446 ThreadPool::TPHandle
&handle
,
8449 if (osd_epoch
<= pg
->get_osdmap_epoch()) {
8452 ceph_assert(pg
->is_locked());
8453 OSDMapRef lastmap
= pg
->get_osdmap();
8454 ceph_assert(lastmap
->get_epoch() < osd_epoch
);
8455 set
<PGRef
> new_pgs
; // any split children
8458 unsigned old_pg_num
= lastmap
->have_pg_pool(pg
->pg_id
.pool()) ?
8459 lastmap
->get_pg_num(pg
->pg_id
.pool()) : 0;
8460 for (epoch_t next_epoch
= pg
->get_osdmap_epoch() + 1;
8461 next_epoch
<= osd_epoch
;
8463 OSDMapRef nextmap
= service
.try_get_map(next_epoch
);
8465 dout(20) << __func__
<< " missing map " << next_epoch
<< dendl
;
8469 unsigned new_pg_num
=
8470 (old_pg_num
&& nextmap
->have_pg_pool(pg
->pg_id
.pool())) ?
8471 nextmap
->get_pg_num(pg
->pg_id
.pool()) : 0;
8472 if (old_pg_num
&& new_pg_num
&& old_pg_num
!= new_pg_num
) {
8474 if (nextmap
->have_pg_pool(pg
->pg_id
.pool())) {
8476 if (pg
->pg_id
.is_merge_source(
8480 // we are merge source
8481 PGRef spg
= pg
; // carry a ref
8482 dout(1) << __func__
<< " " << pg
->pg_id
8483 << " is merge source, target is " << parent
8485 pg
->write_if_dirty(rctx
);
8486 if (!new_pgs
.empty()) {
8487 rctx
.transaction
.register_on_applied(new C_FinishSplits(this,
8491 dispatch_context(rctx
, pg
, pg
->get_osdmap(), &handle
);
8493 // release backoffs explicitly, since the on_shutdown path
8494 // aggressively tears down backoff state.
8495 if (pg
->is_primary()) {
8496 pg
->release_pg_backoffs();
8499 OSDShard
*sdata
= pg
->osd_shard
;
8501 std::lock_guard
l(sdata
->shard_lock
);
8503 sdata
->_detach_pg(pg
->pg_slot
);
8504 // update pg count now since we might not get an osdmap
8506 if (pg
->is_primary())
8507 logger
->dec(l_osd_pg_primary
);
8508 else if (pg
->is_nonprimary())
8509 logger
->dec(l_osd_pg_replica
); // misnomer
8511 logger
->dec(l_osd_pg_stray
);
8516 set
<spg_t
> children
;
8517 parent
.is_split(new_pg_num
, old_pg_num
, &children
);
8518 if (add_merge_waiter(nextmap
, parent
, pg
, children
.size())) {
8519 enqueue_peering_evt(
8522 std::make_shared
<PGPeeringEvent
>(
8523 nextmap
->get_epoch(),
8524 nextmap
->get_epoch(),
8529 } else if (pg
->pg_id
.is_merge_target(old_pg_num
, new_pg_num
)) {
8530 // we are merge target
8531 set
<spg_t
> children
;
8532 pg
->pg_id
.is_split(new_pg_num
, old_pg_num
, &children
);
8533 dout(20) << __func__
<< " " << pg
->pg_id
8534 << " is merge target, sources are " << children
8536 map
<spg_t
,PGRef
> sources
;
8538 std::lock_guard
l(merge_lock
);
8539 auto& s
= merge_waiters
[nextmap
->get_epoch()][pg
->pg_id
];
8540 unsigned need
= children
.size();
8541 dout(20) << __func__
<< " have " << s
.size() << "/"
8543 if (s
.size() == need
) {
8545 merge_waiters
[nextmap
->get_epoch()].erase(pg
->pg_id
);
8546 if (merge_waiters
[nextmap
->get_epoch()].empty()) {
8547 merge_waiters
.erase(nextmap
->get_epoch());
8551 if (!sources
.empty()) {
8552 unsigned new_pg_num
= nextmap
->get_pg_num(pg
->pg_id
.pool());
8553 unsigned split_bits
= pg
->pg_id
.get_split_bits(new_pg_num
);
8554 dout(1) << __func__
<< " merging " << pg
->pg_id
<< dendl
;
8556 sources
, rctx
, split_bits
,
8557 nextmap
->get_pg_pool(
8558 pg
->pg_id
.pool())->last_pg_merge_meta
);
8559 pg
->pg_slot
->waiting_for_merge_epoch
= 0;
8561 dout(20) << __func__
<< " not ready to merge yet" << dendl
;
8562 pg
->write_if_dirty(rctx
);
8563 if (!new_pgs
.empty()) {
8564 rctx
.transaction
.register_on_applied(new C_FinishSplits(this,
8568 dispatch_context(rctx
, pg
, pg
->get_osdmap(), &handle
);
8570 // kick source(s) to get them ready
8571 for (auto& i
: children
) {
8572 dout(20) << __func__
<< " kicking source " << i
<< dendl
;
8573 enqueue_peering_evt(
8576 std::make_shared
<PGPeeringEvent
>(
8577 nextmap
->get_epoch(),
8578 nextmap
->get_epoch(),
8588 vector
<int> newup
, newacting
;
8589 int up_primary
, acting_primary
;
8590 nextmap
->pg_to_up_acting_osds(
8592 &newup
, &up_primary
,
8593 &newacting
, &acting_primary
);
8594 pg
->handle_advance_map(
8595 nextmap
, lastmap
, newup
, up_primary
,
8596 newacting
, acting_primary
, rctx
);
8598 auto oldpool
= lastmap
->get_pools().find(pg
->pg_id
.pool());
8599 auto newpool
= nextmap
->get_pools().find(pg
->pg_id
.pool());
8600 if (oldpool
!= lastmap
->get_pools().end()
8601 && newpool
!= nextmap
->get_pools().end()) {
8602 dout(20) << __func__
8603 << " new pool opts " << newpool
->second
.opts
8604 << " old pool opts " << oldpool
->second
.opts
8607 double old_min_interval
= 0, new_min_interval
= 0;
8608 oldpool
->second
.opts
.get(pool_opts_t::SCRUB_MIN_INTERVAL
, &old_min_interval
);
8609 newpool
->second
.opts
.get(pool_opts_t::SCRUB_MIN_INTERVAL
, &new_min_interval
);
8611 double old_max_interval
= 0, new_max_interval
= 0;
8612 oldpool
->second
.opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &old_max_interval
);
8613 newpool
->second
.opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &new_max_interval
);
8615 // Assume if an interval is change from set to unset or vice versa the actual config
8616 // is different. Keep it simple even if it is possible to call resched_all_scrub()
8618 if (old_min_interval
!= new_min_interval
|| old_max_interval
!= new_max_interval
) {
8619 pg
->on_info_history_change();
8623 if (new_pg_num
&& old_pg_num
!= new_pg_num
) {
8625 set
<spg_t
> children
;
8626 if (pg
->pg_id
.is_split(
8631 pg
, children
, &new_pgs
, lastmap
, nextmap
,
8637 old_pg_num
= new_pg_num
;
8638 handle
.reset_tp_timeout();
8640 pg
->handle_activate_map(rctx
);
8644 if (!new_pgs
.empty()) {
8645 rctx
.transaction
.register_on_applied(new C_FinishSplits(this, new_pgs
));
8650 void OSD::consume_map()
8652 ceph_assert(ceph_mutex_is_locked(osd_lock
));
8653 auto osdmap
= get_osdmap();
8654 dout(7) << "consume_map version " << osdmap
->get_epoch() << dendl
;
8656 /** make sure the cluster is speaking in SORTBITWISE, because we don't
8657 * speak the older sorting version any more. Be careful not to force
8658 * a shutdown if we are merely processing old maps, though.
8660 if (!osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
) && is_active()) {
8661 derr
<< __func__
<< " SORTBITWISE flag is not set" << dendl
;
8665 service
.pre_publish_map(osdmap
);
8666 service
.await_reserved_maps();
8667 service
.publish_map(osdmap
);
8669 // prime splits and merges
8670 set
<pair
<spg_t
,epoch_t
>> newly_split
; // splits, and when
8671 set
<pair
<spg_t
,epoch_t
>> merge_pgs
; // merge participants, and when
8672 for (auto& shard
: shards
) {
8673 shard
->identify_splits_and_merges(osdmap
, &newly_split
, &merge_pgs
);
8675 if (!newly_split
.empty()) {
8676 for (auto& shard
: shards
) {
8677 shard
->prime_splits(osdmap
, &newly_split
);
8679 ceph_assert(newly_split
.empty());
8682 // prune sent_ready_to_merge
8683 service
.prune_sent_ready_to_merge(osdmap
);
8685 // FIXME, maybe: We could race against an incoming peering message
8686 // that instantiates a merge PG after identify_merges() below and
8687 // never set up its peer to complete the merge. An OSD restart
8688 // would clear it up. This is a hard race to resolve,
8689 // extraordinarily rare (we only merge PGs that are stable and
8690 // clean, so it'd have to be an imported PG to an OSD with a
8691 // slightly stale OSDMap...), so I'm ignoring it for now. We plan to
8692 // replace all of this with a seastar-based code soon anyway.
8693 if (!merge_pgs
.empty()) {
8694 // mark the pgs we already have, or create new and empty merge
8695 // participants for those we are missing. do this all under the
8696 // shard lock so we don't have to worry about racing pg creates
8698 for (auto& shard
: shards
) {
8699 shard
->prime_merges(osdmap
, &merge_pgs
);
8701 ceph_assert(merge_pgs
.empty());
8704 service
.prune_pg_created();
8706 unsigned pushes_to_free
= 0;
8707 for (auto& shard
: shards
) {
8708 shard
->consume_map(osdmap
, &pushes_to_free
);
8711 vector
<spg_t
> pgids
;
8714 // count (FIXME, probably during seastar rewrite)
8715 int num_pg_primary
= 0, num_pg_replica
= 0, num_pg_stray
= 0;
8718 for (auto& pg
: pgs
) {
8719 // FIXME (probably during seastar rewrite): this is lockless and
8720 // racy, but we don't want to take pg lock here.
8721 if (pg
->is_primary())
8723 else if (pg
->is_nonprimary())
8724 num_pg_replica
++; // misnomer
8730 // FIXME (as part of seastar rewrite): move to OSDShard
8731 std::lock_guard
l(pending_creates_lock
);
8732 for (auto pg
= pending_creates_from_osd
.begin();
8733 pg
!= pending_creates_from_osd
.end();) {
8734 if (osdmap
->get_pg_acting_role(pg
->first
, whoami
) < 0) {
8735 dout(10) << __func__
<< " pg " << pg
->first
<< " doesn't map here, "
8736 << "discarding pending_create_from_osd" << dendl
;
8737 pg
= pending_creates_from_osd
.erase(pg
);
8744 service
.maybe_inject_dispatch_delay();
8746 dispatch_sessions_waiting_on_map();
8748 service
.maybe_inject_dispatch_delay();
8750 service
.release_reserved_pushes(pushes_to_free
);
8752 // queue null events to push maps down to individual PGs
8753 for (auto pgid
: pgids
) {
8754 enqueue_peering_evt(
8757 std::make_shared
<PGPeeringEvent
>(
8758 osdmap
->get_epoch(),
8759 osdmap
->get_epoch(),
8762 logger
->set(l_osd_pg
, pgids
.size());
8763 logger
->set(l_osd_pg_primary
, num_pg_primary
);
8764 logger
->set(l_osd_pg_replica
, num_pg_replica
);
8765 logger
->set(l_osd_pg_stray
, num_pg_stray
);
8768 void OSD::activate_map()
8770 ceph_assert(ceph_mutex_is_locked(osd_lock
));
8771 auto osdmap
= get_osdmap();
8773 dout(7) << "activate_map version " << osdmap
->get_epoch() << dendl
;
8776 if (osdmap
->test_flag(CEPH_OSDMAP_NORECOVER
)) {
8777 if (!service
.recovery_is_paused()) {
8778 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl
;
8779 service
.pause_recovery();
8782 if (service
.recovery_is_paused()) {
8783 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl
;
8784 service
.unpause_recovery();
8788 service
.activate_map();
8791 take_waiters(waiting_for_osdmap
);
8794 bool OSD::require_mon_peer(const Message
*m
)
8796 if (!m
->get_connection()->peer_is_mon()) {
8797 dout(0) << "require_mon_peer received from non-mon "
8798 << m
->get_connection()->get_peer_addr()
8799 << " " << *m
<< dendl
;
8805 bool OSD::require_mon_or_mgr_peer(const Message
*m
)
8807 if (!m
->get_connection()->peer_is_mon() &&
8808 !m
->get_connection()->peer_is_mgr()) {
8809 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8810 << m
->get_connection()->get_peer_addr()
8811 << " " << *m
<< dendl
;
8817 bool OSD::require_osd_peer(const Message
*m
)
8819 if (!m
->get_connection()->peer_is_osd()) {
8820 dout(0) << "require_osd_peer received from non-osd "
8821 << m
->get_connection()->get_peer_addr()
8822 << " " << *m
<< dendl
;
8828 bool OSD::require_self_aliveness(const Message
*m
, epoch_t epoch
)
8830 epoch_t up_epoch
= service
.get_up_epoch();
8831 if (epoch
< up_epoch
) {
8832 dout(7) << "from pre-up epoch " << epoch
<< " < " << up_epoch
<< dendl
;
8837 dout(7) << "still in boot state, dropping message " << *m
<< dendl
;
8844 bool OSD::require_same_peer_instance(const Message
*m
, const OSDMapRef
& map
,
8845 bool is_fast_dispatch
)
8847 int from
= m
->get_source().num();
8849 if (map
->is_down(from
) ||
8850 (map
->get_cluster_addrs(from
) != m
->get_source_addrs())) {
8851 dout(5) << "from dead osd." << from
<< ", marking down, "
8852 << " msg was " << m
->get_source_inst().addr
8854 << (map
->is_up(from
) ?
8855 map
->get_cluster_addrs(from
) : entity_addrvec_t())
8857 ConnectionRef con
= m
->get_connection();
8859 if (auto s
= ceph::ref_cast
<Session
>(con
->get_priv()); s
) {
8860 if (!is_fast_dispatch
)
8861 s
->session_dispatch_lock
.lock();
8862 clear_session_waiting_on_map(s
);
8863 con
->set_priv(nullptr); // break ref <-> session cycle, if any
8865 if (!is_fast_dispatch
)
8866 s
->session_dispatch_lock
.unlock();
8875 * require that we have same (or newer) map, and that
8876 * the source is the pg primary.
8878 bool OSD::require_same_or_newer_map(OpRequestRef
& op
, epoch_t epoch
,
8879 bool is_fast_dispatch
)
8881 const Message
*m
= op
->get_req();
8882 const auto osdmap
= get_osdmap();
8883 dout(15) << "require_same_or_newer_map " << epoch
8884 << " (i am " << osdmap
->get_epoch() << ") " << m
<< dendl
;
8886 ceph_assert(ceph_mutex_is_locked(osd_lock
));
8888 // do they have a newer map?
8889 if (epoch
> osdmap
->get_epoch()) {
8890 dout(7) << "waiting for newer map epoch " << epoch
8891 << " > my " << osdmap
->get_epoch() << " with " << m
<< dendl
;
8892 wait_for_new_map(op
);
8896 if (!require_self_aliveness(op
->get_req(), epoch
)) {
8900 // ok, our map is same or newer.. do they still exist?
8901 if (m
->get_connection()->get_messenger() == cluster_messenger
&&
8902 !require_same_peer_instance(op
->get_req(), osdmap
, is_fast_dispatch
)) {
8913 // ----------------------------------------
8916 void OSD::split_pgs(
8918 const set
<spg_t
> &childpgids
, set
<PGRef
> *out_pgs
,
8923 unsigned pg_num
= nextmap
->get_pg_num(parent
->pg_id
.pool());
8924 parent
->update_snap_mapper_bits(parent
->get_pgid().get_split_bits(pg_num
));
8926 vector
<object_stat_sum_t
> updated_stats
;
8927 parent
->start_split_stats(childpgids
, &updated_stats
);
8929 vector
<object_stat_sum_t
>::iterator stat_iter
= updated_stats
.begin();
8930 for (set
<spg_t
>::const_iterator i
= childpgids
.begin();
8931 i
!= childpgids
.end();
8933 ceph_assert(stat_iter
!= updated_stats
.end());
8934 dout(10) << __func__
<< " splitting " << *parent
<< " into " << *i
<< dendl
;
8935 PG
* child
= _make_pg(nextmap
, *i
);
8937 out_pgs
->insert(child
);
8938 child
->ch
= store
->create_new_collection(child
->coll
);
8941 uint32_t shard_index
= i
->hash_to_shard(shards
.size());
8942 assert(NULL
!= shards
[shard_index
]);
8943 store
->set_collection_commit_queue(child
->coll
, &(shards
[shard_index
]->context_queue
));
8946 unsigned split_bits
= i
->get_split_bits(pg_num
);
8947 dout(10) << " pg_num is " << pg_num
8948 << ", m_seed " << i
->ps()
8949 << ", split_bits is " << split_bits
<< dendl
;
8950 parent
->split_colls(
8954 &child
->get_pool().info
,
8961 child
->init_collection_pool_opts();
8963 child
->finish_split_stats(*stat_iter
, rctx
.transaction
);
8966 ceph_assert(stat_iter
!= updated_stats
.end());
8967 parent
->finish_split_stats(*stat_iter
, rctx
.transaction
);
8973 void OSD::handle_pg_create(OpRequestRef op
)
8975 // NOTE: this can be removed in P release (mimic is the last version to
8976 // send MOSDPGCreate messages).
8978 auto m
= op
->get_req
<MOSDPGCreate
>();
8979 ceph_assert(m
->get_type() == MSG_OSD_PG_CREATE
);
8981 dout(10) << "handle_pg_create " << *m
<< dendl
;
8983 if (!require_mon_peer(op
->get_req())) {
8987 if (!require_same_or_newer_map(op
, m
->epoch
, false))
8992 const auto osdmap
= get_osdmap();
8993 map
<pg_t
,utime_t
>::const_iterator ci
= m
->ctimes
.begin();
8994 for (map
<pg_t
,pg_create_t
>::const_iterator p
= m
->mkpg
.begin();
8997 ceph_assert(ci
!= m
->ctimes
.end() && ci
->first
== p
->first
);
8998 epoch_t created
= p
->second
.created
;
8999 if (p
->second
.split_bits
) // Skip split pgs
9003 if (!osdmap
->have_pg_pool(on
.pool())) {
9004 dout(20) << "ignoring pg on deleted pool " << on
<< dendl
;
9008 dout(20) << "mkpg " << on
<< " e" << created
<< "@" << ci
->second
<< dendl
;
9011 bool mapped
= osdmap
->get_primary_shard(on
, &pgid
);
9012 ceph_assert(mapped
);
9014 // is it still ours?
9015 vector
<int> up
, acting
;
9016 int up_primary
= -1;
9017 int acting_primary
= -1;
9018 osdmap
->pg_to_up_acting_osds(on
, &up
, &up_primary
, &acting
, &acting_primary
);
9019 int role
= osdmap
->calc_pg_role(pg_shard_t(whoami
, pgid
.shard
), acting
);
9021 if (acting_primary
!= whoami
) {
9022 dout(10) << "mkpg " << on
<< " not acting_primary (" << acting_primary
9023 << "), my role=" << role
<< ", skipping" << dendl
;
9029 pg_history_t history
;
9030 build_initial_pg_history(pgid
, created
, ci
->second
, &history
, &pi
);
9032 // The mon won't resend unless the primary changed, so we ignore
9033 // same_interval_since. We'll pass this history with the current
9034 // epoch as the event.
9035 if (history
.same_primary_since
> m
->epoch
) {
9036 dout(10) << __func__
<< ": got obsolete pg create on pgid "
9037 << pgid
<< " from epoch " << m
->epoch
9038 << ", primary changed in " << history
.same_primary_since
9042 enqueue_peering_evt(
9045 std::make_shared
<PGPeeringEvent
>(
9046 osdmap
->get_epoch(),
9047 osdmap
->get_epoch(),
9052 osdmap
->get_epoch(),
9060 std::lock_guard
l(pending_creates_lock
);
9061 if (pending_creates_from_mon
== 0) {
9062 last_pg_create_epoch
= m
->epoch
;
9066 maybe_update_heartbeat_peers();
9070 // ----------------------------------------
9071 // peering and recovery
9073 PeeringCtx
OSD::create_context()
9075 return PeeringCtx(get_osdmap()->require_osd_release
);
9078 void OSD::dispatch_context(PeeringCtx
&ctx
, PG
*pg
, OSDMapRef curmap
,
9079 ThreadPool::TPHandle
*handle
)
9081 if (!service
.get_osdmap()->is_up(whoami
)) {
9082 dout(20) << __func__
<< " not up in osdmap" << dendl
;
9083 } else if (!is_active()) {
9084 dout(20) << __func__
<< " not active" << dendl
;
9086 for (auto& [osd
, ls
] : ctx
.message_map
) {
9087 if (!curmap
->is_up(osd
)) {
9088 dout(20) << __func__
<< " skipping down osd." << osd
<< dendl
;
9091 ConnectionRef con
= service
.get_con_osd_cluster(
9092 osd
, curmap
->get_epoch());
9094 dout(20) << __func__
<< " skipping osd." << osd
<< " (NULL con)"
9098 service
.maybe_share_map(con
.get(), curmap
);
9100 con
->send_message2(m
);
9105 if ((!ctx
.transaction
.empty() || ctx
.transaction
.has_contexts()) && pg
) {
9106 int tr
= store
->queue_transaction(
9108 std::move(ctx
.transaction
), TrackedOpRef(),
9110 ceph_assert(tr
== 0);
9114 void OSD::handle_fast_pg_create(MOSDPGCreate2
*m
)
9116 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9117 if (!require_mon_peer(m
)) {
9121 for (auto& p
: m
->pgs
) {
9122 spg_t pgid
= p
.first
;
9123 epoch_t created
= p
.second
.first
;
9124 utime_t created_stamp
= p
.second
.second
;
9125 auto q
= m
->pg_extra
.find(pgid
);
9126 if (q
== m
->pg_extra
.end()) {
9127 dout(20) << __func__
<< " " << pgid
<< " e" << created
9128 << "@" << created_stamp
9129 << " (no history or past_intervals)" << dendl
;
9130 // pre-octopus ... no pg history. this can be removed in Q release.
9131 enqueue_peering_evt(
9134 std::make_shared
<PGPeeringEvent
>(
9142 pg_history_t(created
, created_stamp
),
9147 dout(20) << __func__
<< " " << pgid
<< " e" << created
9148 << "@" << created_stamp
9149 << " history " << q
->second
.first
9150 << " pi " << q
->second
.second
<< dendl
;
9151 if (!q
->second
.second
.empty() &&
9152 m
->epoch
< q
->second
.second
.get_bounds().second
) {
9153 clog
->error() << "got pg_create on " << pgid
<< " epoch " << m
->epoch
9154 << " and unmatched past_intervals " << q
->second
.second
9155 << " (history " << q
->second
.first
<< ")";
9157 enqueue_peering_evt(
9160 std::make_shared
<PGPeeringEvent
>(
9177 std::lock_guard
l(pending_creates_lock
);
9178 if (pending_creates_from_mon
== 0) {
9179 last_pg_create_epoch
= m
->epoch
;
9186 void OSD::handle_fast_pg_query(MOSDPGQuery
*m
)
9188 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9189 if (!require_osd_peer(m
)) {
9193 int from
= m
->get_source().num();
9194 for (auto& p
: m
->pg_list
) {
9195 enqueue_peering_evt(
9198 std::make_shared
<PGPeeringEvent
>(
9199 p
.second
.epoch_sent
, p
.second
.epoch_sent
,
9202 pg_shard_t(from
, p
.second
.from
),
9204 p
.second
.epoch_sent
),
9211 void OSD::handle_fast_pg_notify(MOSDPGNotify
* m
)
9213 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9214 if (!require_osd_peer(m
)) {
9218 int from
= m
->get_source().num();
9219 for (auto& p
: m
->get_pg_list()) {
9220 spg_t
pgid(p
.info
.pgid
.pgid
, p
.to
);
9221 enqueue_peering_evt(
9224 std::make_shared
<PGPeeringEvent
>(
9228 pgid
, pg_shard_t(from
, p
.from
),
9230 m
->get_connection()->get_features()),
9243 void OSD::handle_fast_pg_info(MOSDPGInfo
* m
)
9245 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9246 if (!require_osd_peer(m
)) {
9250 int from
= m
->get_source().num();
9251 for (auto& p
: m
->pg_list
) {
9252 enqueue_peering_evt(
9253 spg_t(p
.info
.pgid
.pgid
, p
.to
),
9255 std::make_shared
<PGPeeringEvent
>(
9256 p
.epoch_sent
, p
.query_epoch
,
9258 pg_shard_t(from
, p
.from
),
9266 void OSD::handle_fast_pg_remove(MOSDPGRemove
*m
)
9268 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9269 if (!require_osd_peer(m
)) {
9273 for (auto& pgid
: m
->pg_list
) {
9274 enqueue_peering_evt(
9277 std::make_shared
<PGPeeringEvent
>(
9278 m
->get_epoch(), m
->get_epoch(),
9279 PeeringState::DeleteStart())));
9284 void OSD::handle_fast_force_recovery(MOSDForceRecovery
*m
)
9286 dout(10) << __func__
<< " " << *m
<< dendl
;
9287 if (!require_mon_or_mgr_peer(m
)) {
9291 epoch_t epoch
= get_osdmap_epoch();
9292 for (auto pgid
: m
->forced_pgs
) {
9293 if (m
->options
& OFR_BACKFILL
) {
9294 if (m
->options
& OFR_CANCEL
) {
9295 enqueue_peering_evt(
9298 std::make_shared
<PGPeeringEvent
>(
9300 PeeringState::UnsetForceBackfill())));
9302 enqueue_peering_evt(
9305 std::make_shared
<PGPeeringEvent
>(
9307 PeeringState::SetForceBackfill())));
9309 } else if (m
->options
& OFR_RECOVERY
) {
9310 if (m
->options
& OFR_CANCEL
) {
9311 enqueue_peering_evt(
9314 std::make_shared
<PGPeeringEvent
>(
9316 PeeringState::UnsetForceRecovery())));
9318 enqueue_peering_evt(
9321 std::make_shared
<PGPeeringEvent
>(
9323 PeeringState::SetForceRecovery())));
9330 void OSD::handle_pg_query_nopg(const MQuery
& q
)
9332 spg_t pgid
= q
.pgid
;
9333 dout(10) << __func__
<< " " << pgid
<< dendl
;
9335 OSDMapRef osdmap
= get_osdmap();
9336 if (!osdmap
->have_pg_pool(pgid
.pool()))
9339 dout(10) << " pg " << pgid
<< " dne" << dendl
;
9340 pg_info_t
empty(spg_t(pgid
.pgid
, q
.query
.to
));
9341 ConnectionRef con
= service
.get_con_osd_cluster(q
.from
.osd
, osdmap
->get_epoch());
9344 if (q
.query
.type
== pg_query_t::LOG
||
9345 q
.query
.type
== pg_query_t::FULLLOG
) {
9347 q
.query
.from
, q
.query
.to
,
9348 osdmap
->get_epoch(), empty
,
9349 q
.query
.epoch_sent
);
9351 vector
<pg_notify_t
> ls
;
9354 q
.query
.from
, q
.query
.to
,
9356 osdmap
->get_epoch(),
9359 m
= new MOSDPGNotify(osdmap
->get_epoch(), std::move(ls
));
9361 service
.maybe_share_map(con
.get(), osdmap
);
9362 con
->send_message(m
);
9366 void OSDService::queue_check_readable(spg_t spgid
,
9368 ceph::signedspan delay
)
9370 if (delay
== ceph::signedspan::zero()) {
9371 osd
->enqueue_peering_evt(
9374 std::make_shared
<PGPeeringEvent
>(
9376 PeeringState::CheckReadable())));
9378 mono_timer
.add_event(
9380 [this, spgid
, lpr
]() {
9381 queue_check_readable(spgid
, lpr
);
9387 // =========================================================
9390 void OSDService::_maybe_queue_recovery() {
9391 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock
));
9392 uint64_t available_pushes
;
9393 while (!awaiting_throttle
.empty() &&
9394 _recover_now(&available_pushes
)) {
9395 uint64_t to_start
= std::min(
9397 cct
->_conf
->osd_recovery_max_single_start
);
9398 _queue_for_recovery(awaiting_throttle
.front(), to_start
);
9399 awaiting_throttle
.pop_front();
9400 dout(10) << __func__
<< " starting " << to_start
9401 << ", recovery_ops_reserved " << recovery_ops_reserved
9402 << " -> " << (recovery_ops_reserved
+ to_start
) << dendl
;
9403 recovery_ops_reserved
+= to_start
;
9407 bool OSDService::_recover_now(uint64_t *available_pushes
)
9409 if (available_pushes
)
9410 *available_pushes
= 0;
9412 if (ceph_clock_now() < defer_recovery_until
) {
9413 dout(15) << __func__
<< " defer until " << defer_recovery_until
<< dendl
;
9417 if (recovery_paused
) {
9418 dout(15) << __func__
<< " paused" << dendl
;
9422 uint64_t max
= osd
->get_recovery_max_active();
9423 if (max
<= recovery_ops_active
+ recovery_ops_reserved
) {
9424 dout(15) << __func__
<< " active " << recovery_ops_active
9425 << " + reserved " << recovery_ops_reserved
9426 << " >= max " << max
<< dendl
;
9430 if (available_pushes
)
9431 *available_pushes
= max
- recovery_ops_active
- recovery_ops_reserved
;
9436 unsigned OSDService::get_target_pg_log_entries() const
9438 auto num_pgs
= osd
->get_num_pgs();
9439 auto target
= cct
->_conf
->osd_target_pg_log_entries_per_osd
;
9440 if (num_pgs
> 0 && target
> 0) {
9441 // target an even spread of our budgeted log entries across all
9442 // PGs. note that while we only get to control the entry count
9443 // for primary PGs, we'll normally be responsible for a mix of
9444 // primary and replica PGs (for the same pool(s) even), so this
9446 return std::max
<unsigned>(
9447 std::min
<unsigned>(target
/ num_pgs
,
9448 cct
->_conf
->osd_max_pg_log_entries
),
9449 cct
->_conf
->osd_min_pg_log_entries
);
9451 // fall back to a per-pg value.
9452 return cct
->_conf
->osd_min_pg_log_entries
;
9456 void OSD::do_recovery(
9457 PG
*pg
, epoch_t queued
, uint64_t reserved_pushes
,
9458 ThreadPool::TPHandle
&handle
)
9460 uint64_t started
= 0;
9463 * When the value of osd_recovery_sleep is set greater than zero, recovery
9464 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9465 * recovery event's schedule time. This is done by adding a
9466 * recovery_requeue_callback event, which re-queues the recovery op using
9467 * queue_recovery_after_sleep.
9469 float recovery_sleep
= get_osd_recovery_sleep();
9471 std::lock_guard
l(service
.sleep_lock
);
9472 if (recovery_sleep
> 0 && service
.recovery_needs_sleep
) {
9474 auto recovery_requeue_callback
= new LambdaContext([this, pgref
, queued
, reserved_pushes
](int r
) {
9475 dout(20) << "do_recovery wake up at "
9477 << ", re-queuing recovery" << dendl
;
9478 std::lock_guard
l(service
.sleep_lock
);
9479 service
.recovery_needs_sleep
= false;
9480 service
.queue_recovery_after_sleep(pgref
.get(), queued
, reserved_pushes
);
9483 // This is true for the first recovery op and when the previous recovery op
9484 // has been scheduled in the past. The next recovery op is scheduled after
9485 // completing the sleep from now.
9487 if (auto now
= ceph::real_clock::now();
9488 service
.recovery_schedule_time
< now
) {
9489 service
.recovery_schedule_time
= now
;
9491 service
.recovery_schedule_time
+= ceph::make_timespan(recovery_sleep
);
9492 service
.sleep_timer
.add_event_at(service
.recovery_schedule_time
,
9493 recovery_requeue_callback
);
9494 dout(20) << "Recovery event scheduled at "
9495 << service
.recovery_schedule_time
<< dendl
;
9502 std::lock_guard
l(service
.sleep_lock
);
9503 service
.recovery_needs_sleep
= true;
9506 if (pg
->pg_has_reset_since(queued
)) {
9510 dout(10) << "do_recovery starting " << reserved_pushes
<< " " << *pg
<< dendl
;
9511 #ifdef DEBUG_RECOVERY_OIDS
9512 dout(20) << " active was " << service
.recovery_oids
[pg
->pg_id
] << dendl
;
9515 bool do_unfound
= pg
->start_recovery_ops(reserved_pushes
, handle
, &started
);
9516 dout(10) << "do_recovery started " << started
<< "/" << reserved_pushes
9517 << " on " << *pg
<< dendl
;
9520 PeeringCtx rctx
= create_context();
9521 rctx
.handle
= &handle
;
9522 pg
->find_unfound(queued
, rctx
);
9523 dispatch_context(rctx
, pg
, pg
->get_osdmap());
9528 ceph_assert(started
<= reserved_pushes
);
9529 service
.release_reserved_pushes(reserved_pushes
);
9532 void OSDService::start_recovery_op(PG
*pg
, const hobject_t
& soid
)
9534 std::lock_guard
l(recovery_lock
);
9535 dout(10) << "start_recovery_op " << *pg
<< " " << soid
9536 << " (" << recovery_ops_active
<< "/"
9537 << osd
->get_recovery_max_active() << " rops)"
9539 recovery_ops_active
++;
9541 #ifdef DEBUG_RECOVERY_OIDS
9542 dout(20) << " active was " << recovery_oids
[pg
->pg_id
] << dendl
;
9543 ceph_assert(recovery_oids
[pg
->pg_id
].count(soid
) == 0);
9544 recovery_oids
[pg
->pg_id
].insert(soid
);
9548 void OSDService::finish_recovery_op(PG
*pg
, const hobject_t
& soid
, bool dequeue
)
9550 std::lock_guard
l(recovery_lock
);
9551 dout(10) << "finish_recovery_op " << *pg
<< " " << soid
9552 << " dequeue=" << dequeue
9553 << " (" << recovery_ops_active
<< "/"
9554 << osd
->get_recovery_max_active() << " rops)"
9558 ceph_assert(recovery_ops_active
> 0);
9559 recovery_ops_active
--;
9561 #ifdef DEBUG_RECOVERY_OIDS
9562 dout(20) << " active oids was " << recovery_oids
[pg
->pg_id
] << dendl
;
9563 ceph_assert(recovery_oids
[pg
->pg_id
].count(soid
));
9564 recovery_oids
[pg
->pg_id
].erase(soid
);
9567 _maybe_queue_recovery();
9570 bool OSDService::is_recovery_active()
9572 if (cct
->_conf
->osd_debug_pretend_recovery_active
) {
9575 return local_reserver
.has_reservation() || remote_reserver
.has_reservation();
9578 void OSDService::release_reserved_pushes(uint64_t pushes
)
9580 std::lock_guard
l(recovery_lock
);
9581 dout(10) << __func__
<< "(" << pushes
<< "), recovery_ops_reserved "
9582 << recovery_ops_reserved
<< " -> " << (recovery_ops_reserved
-pushes
)
9584 ceph_assert(recovery_ops_reserved
>= pushes
);
9585 recovery_ops_reserved
-= pushes
;
9586 _maybe_queue_recovery();
9589 // =========================================================
9592 bool OSD::op_is_discardable(const MOSDOp
*op
)
9594 // drop client request if they are not connected and can't get the
9596 if (!op
->get_connection()->is_connected()) {
9602 void OSD::enqueue_op(spg_t pg
, OpRequestRef
&& op
, epoch_t epoch
)
9604 const utime_t stamp
= op
->get_req()->get_recv_stamp();
9605 const utime_t latency
= ceph_clock_now() - stamp
;
9606 const unsigned priority
= op
->get_req()->get_priority();
9607 const int cost
= op
->get_req()->get_cost();
9608 const uint64_t owner
= op
->get_req()->get_source().num();
9610 dout(15) << "enqueue_op " << op
<< " prio " << priority
9612 << " latency " << latency
9613 << " epoch " << epoch
9614 << " " << *(op
->get_req()) << dendl
;
9615 op
->osd_trace
.event("enqueue op");
9616 op
->osd_trace
.keyval("priority", priority
);
9617 op
->osd_trace
.keyval("cost", cost
);
9618 op
->mark_queued_for_pg();
9619 logger
->tinc(l_osd_op_before_queue_op_lat
, latency
);
9622 unique_ptr
<OpSchedulerItem::OpQueueable
>(new PGOpItem(pg
, std::move(op
))),
9623 cost
, priority
, stamp
, owner
, epoch
));
9626 void OSD::enqueue_peering_evt(spg_t pgid
, PGPeeringEventRef evt
)
9628 dout(15) << __func__
<< " " << pgid
<< " " << evt
->get_desc() << dendl
;
9631 unique_ptr
<OpSchedulerItem::OpQueueable
>(new PGPeeringItem(pgid
, evt
)),
9633 cct
->_conf
->osd_peering_op_priority
,
9636 evt
->get_epoch_sent()));
9640 * NOTE: dequeue called in worker thread, with pg lock
9642 void OSD::dequeue_op(
9643 PGRef pg
, OpRequestRef op
,
9644 ThreadPool::TPHandle
&handle
)
9646 const Message
*m
= op
->get_req();
9649 OID_EVENT_TRACE_WITH_MSG(m
, "DEQUEUE_OP_BEGIN", false);
9651 utime_t now
= ceph_clock_now();
9652 op
->set_dequeued_time(now
);
9654 utime_t latency
= now
- m
->get_recv_stamp();
9655 dout(10) << "dequeue_op " << op
<< " prio " << m
->get_priority()
9656 << " cost " << m
->get_cost()
9657 << " latency " << latency
9659 << " pg " << *pg
<< dendl
;
9661 logger
->tinc(l_osd_op_before_dequeue_op_lat
, latency
);
9663 service
.maybe_share_map(m
->get_connection().get(),
9667 if (pg
->is_deleting())
9670 op
->mark_reached_pg();
9671 op
->osd_trace
.event("dequeue_op");
9673 pg
->do_request(op
, handle
);
9676 dout(10) << "dequeue_op " << op
<< " finish" << dendl
;
9677 OID_EVENT_TRACE_WITH_MSG(m
, "DEQUEUE_OP_END", false);
9681 void OSD::dequeue_peering_evt(
9684 PGPeeringEventRef evt
,
9685 ThreadPool::TPHandle
& handle
)
9687 PeeringCtx rctx
= create_context();
9688 auto curmap
= sdata
->get_osdmap();
9689 bool need_up_thru
= false;
9690 epoch_t same_interval_since
= 0;
9692 if (const MQuery
*q
= dynamic_cast<const MQuery
*>(evt
->evt
.get())) {
9693 handle_pg_query_nopg(*q
);
9695 derr
<< __func__
<< " unrecognized pg-less event " << evt
->get_desc() << dendl
;
9698 } else if (advance_pg(curmap
->get_epoch(), pg
, handle
, rctx
)) {
9699 pg
->do_peering_event(evt
, rctx
);
9700 if (pg
->is_deleted()) {
9704 dispatch_context(rctx
, pg
, curmap
, &handle
);
9705 need_up_thru
= pg
->get_need_up_thru();
9706 same_interval_since
= pg
->get_same_interval_since();
9711 queue_want_up_thru(same_interval_since
);
9714 service
.send_pg_temp();
9717 void OSD::dequeue_delete(
9721 ThreadPool::TPHandle
& handle
)
9723 dequeue_peering_evt(
9727 std::make_shared
<PGPeeringEvent
>(
9729 PeeringState::DeleteSome())),
9735 // --------------------------------
9737 const char** OSD::get_tracked_conf_keys() const
9739 static const char* KEYS
[] = {
9740 "osd_max_backfills",
9741 "osd_min_recovery_priority",
9742 "osd_max_trimming_pgs",
9743 "osd_op_complaint_time",
9744 "osd_op_log_threshold",
9745 "osd_op_history_size",
9746 "osd_op_history_duration",
9747 "osd_op_history_slow_op_size",
9748 "osd_op_history_slow_op_threshold",
9749 "osd_enable_op_tracker",
9750 "osd_map_cache_size",
9751 "osd_pg_epoch_max_lag_factor",
9752 "osd_pg_epoch_persisted_max_stale",
9753 // clog & admin clog
9756 "clog_to_syslog_facility",
9757 "clog_to_syslog_level",
9758 "osd_objectstore_fuse",
9760 "clog_to_graylog_host",
9761 "clog_to_graylog_port",
9764 "osd_recovery_delay_start",
9765 "osd_client_message_size_cap",
9766 "osd_client_message_cap",
9767 "osd_heartbeat_min_size",
9768 "osd_heartbeat_interval",
9769 "osd_object_clean_region_max_num_intervals",
9770 "osd_scrub_min_interval",
9771 "osd_scrub_max_interval",
9777 void OSD::handle_conf_change(const ConfigProxy
& conf
,
9778 const std::set
<std::string
> &changed
)
9780 std::lock_guard l
{osd_lock
};
9781 if (changed
.count("osd_max_backfills")) {
9782 service
.local_reserver
.set_max(cct
->_conf
->osd_max_backfills
);
9783 service
.remote_reserver
.set_max(cct
->_conf
->osd_max_backfills
);
9785 if (changed
.count("osd_min_recovery_priority")) {
9786 service
.local_reserver
.set_min_priority(cct
->_conf
->osd_min_recovery_priority
);
9787 service
.remote_reserver
.set_min_priority(cct
->_conf
->osd_min_recovery_priority
);
9789 if (changed
.count("osd_max_trimming_pgs")) {
9790 service
.snap_reserver
.set_max(cct
->_conf
->osd_max_trimming_pgs
);
9792 if (changed
.count("osd_op_complaint_time") ||
9793 changed
.count("osd_op_log_threshold")) {
9794 op_tracker
.set_complaint_and_threshold(cct
->_conf
->osd_op_complaint_time
,
9795 cct
->_conf
->osd_op_log_threshold
);
9797 if (changed
.count("osd_op_history_size") ||
9798 changed
.count("osd_op_history_duration")) {
9799 op_tracker
.set_history_size_and_duration(cct
->_conf
->osd_op_history_size
,
9800 cct
->_conf
->osd_op_history_duration
);
9802 if (changed
.count("osd_op_history_slow_op_size") ||
9803 changed
.count("osd_op_history_slow_op_threshold")) {
9804 op_tracker
.set_history_slow_op_size_and_threshold(cct
->_conf
->osd_op_history_slow_op_size
,
9805 cct
->_conf
->osd_op_history_slow_op_threshold
);
9807 if (changed
.count("osd_enable_op_tracker")) {
9808 op_tracker
.set_tracking(cct
->_conf
->osd_enable_op_tracker
);
9810 if (changed
.count("osd_map_cache_size")) {
9811 service
.map_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
9812 service
.map_bl_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
9813 service
.map_bl_inc_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
9815 if (changed
.count("clog_to_monitors") ||
9816 changed
.count("clog_to_syslog") ||
9817 changed
.count("clog_to_syslog_level") ||
9818 changed
.count("clog_to_syslog_facility") ||
9819 changed
.count("clog_to_graylog") ||
9820 changed
.count("clog_to_graylog_host") ||
9821 changed
.count("clog_to_graylog_port") ||
9822 changed
.count("host") ||
9823 changed
.count("fsid")) {
9824 update_log_config();
9826 if (changed
.count("osd_pg_epoch_max_lag_factor")) {
9827 m_osd_pg_epoch_max_lag_factor
= conf
.get_val
<double>(
9828 "osd_pg_epoch_max_lag_factor");
9832 if (changed
.count("osd_objectstore_fuse")) {
9834 enable_disable_fuse(false);
9839 if (changed
.count("osd_recovery_delay_start")) {
9840 service
.defer_recovery(cct
->_conf
->osd_recovery_delay_start
);
9841 service
.kick_recovery_queue();
9844 if (changed
.count("osd_client_message_cap")) {
9845 uint64_t newval
= cct
->_conf
->osd_client_message_cap
;
9846 Messenger::Policy pol
= client_messenger
->get_policy(entity_name_t::TYPE_CLIENT
);
9847 if (pol
.throttler_messages
&& newval
> 0) {
9848 pol
.throttler_messages
->reset_max(newval
);
9851 if (changed
.count("osd_client_message_size_cap")) {
9852 uint64_t newval
= cct
->_conf
->osd_client_message_size_cap
;
9853 Messenger::Policy pol
= client_messenger
->get_policy(entity_name_t::TYPE_CLIENT
);
9854 if (pol
.throttler_bytes
&& newval
> 0) {
9855 pol
.throttler_bytes
->reset_max(newval
);
9858 if (changed
.count("osd_object_clean_region_max_num_intervals")) {
9859 ObjectCleanRegions::set_max_num_intervals(cct
->_conf
->osd_object_clean_region_max_num_intervals
);
9862 if (changed
.count("osd_scrub_min_interval") ||
9863 changed
.count("osd_scrub_max_interval")) {
9864 resched_all_scrubs();
9865 dout(0) << __func__
<< ": scrub interval change" << dendl
;
9870 void OSD::update_log_config()
9872 map
<string
,string
> log_to_monitors
;
9873 map
<string
,string
> log_to_syslog
;
9874 map
<string
,string
> log_channel
;
9875 map
<string
,string
> log_prio
;
9876 map
<string
,string
> log_to_graylog
;
9877 map
<string
,string
> log_to_graylog_host
;
9878 map
<string
,string
> log_to_graylog_port
;
9882 if (parse_log_client_options(cct
, log_to_monitors
, log_to_syslog
,
9883 log_channel
, log_prio
, log_to_graylog
,
9884 log_to_graylog_host
, log_to_graylog_port
,
9886 clog
->update_config(log_to_monitors
, log_to_syslog
,
9887 log_channel
, log_prio
, log_to_graylog
,
9888 log_to_graylog_host
, log_to_graylog_port
,
9890 derr
<< "log_to_monitors " << log_to_monitors
<< dendl
;
9893 void OSD::check_config()
9895 // some sanity checks
9896 if (cct
->_conf
->osd_map_cache_size
<= (int)cct
->_conf
->osd_pg_epoch_persisted_max_stale
+ 2) {
9897 clog
->warn() << "osd_map_cache_size (" << cct
->_conf
->osd_map_cache_size
<< ")"
9898 << " is not > osd_pg_epoch_persisted_max_stale ("
9899 << cct
->_conf
->osd_pg_epoch_persisted_max_stale
<< ")";
9901 if (cct
->_conf
->osd_object_clean_region_max_num_intervals
< 0) {
9902 clog
->warn() << "osd_object_clean_region_max_num_intervals ("
9903 << cct
->_conf
->osd_object_clean_region_max_num_intervals
9908 // --------------------------------
9910 void OSD::get_latest_osdmap()
9912 dout(10) << __func__
<< " -- start" << dendl
;
9915 service
.objecter
->wait_for_latest_osdmap(&cond
);
9918 dout(10) << __func__
<< " -- finish" << dendl
;
9921 // --------------------------------
9923 void OSD::set_perf_queries(const ConfigPayload
&config_payload
) {
9924 const OSDConfigPayload
&osd_config_payload
= boost::get
<OSDConfigPayload
>(config_payload
);
9925 const std::map
<OSDPerfMetricQuery
, OSDPerfMetricLimits
> &queries
= osd_config_payload
.config
;
9926 dout(10) << "setting " << queries
.size() << " queries" << dendl
;
9928 std::list
<OSDPerfMetricQuery
> supported_queries
;
9929 for (auto &it
: queries
) {
9930 auto &query
= it
.first
;
9931 if (!query
.key_descriptor
.empty()) {
9932 supported_queries
.push_back(query
);
9935 if (supported_queries
.size() < queries
.size()) {
9936 dout(1) << queries
.size() - supported_queries
.size()
9937 << " unsupported queries" << dendl
;
9940 std::lock_guard locker
{m_perf_queries_lock
};
9941 m_perf_queries
= supported_queries
;
9942 m_perf_limits
= queries
;
9944 std::vector
<PGRef
> pgs
;
9946 for (auto& pg
: pgs
) {
9947 std::scoped_lock l
{*pg
};
9948 pg
->set_dynamic_perf_stats_queries(supported_queries
);
9952 MetricPayload
OSD::get_perf_reports() {
9953 OSDMetricPayload payload
;
9954 std::map
<OSDPerfMetricQuery
, OSDPerfMetricReport
> &reports
= payload
.report
;
9956 std::vector
<PGRef
> pgs
;
9958 DynamicPerfStats dps
;
9959 for (auto& pg
: pgs
) {
9960 // m_perf_queries can be modified only in set_perf_queries by mgr client
9961 // request, and it is protected by by mgr client's lock, which is held
9962 // when set_perf_queries/get_perf_reports are called, so we may not hold
9963 // m_perf_queries_lock here.
9964 DynamicPerfStats
pg_dps(m_perf_queries
);
9966 pg
->get_dynamic_perf_stats(&pg_dps
);
9970 dps
.add_to_reports(m_perf_limits
, &reports
);
9971 dout(20) << "reports for " << reports
.size() << " queries" << dendl
;
9976 // =============================================================
9979 #define dout_context cct
9981 #define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
9983 void OSDShard::_attach_pg(OSDShardPGSlot
*slot
, PG
*pg
)
9985 dout(10) << pg
->pg_id
<< " " << pg
<< dendl
;
9987 pg
->osd_shard
= this;
9991 slot
->epoch
= pg
->get_osdmap_epoch();
9992 pg_slots_by_epoch
.insert(*slot
);
9995 void OSDShard::_detach_pg(OSDShardPGSlot
*slot
)
9997 dout(10) << slot
->pg
->pg_id
<< " " << slot
->pg
<< dendl
;
9998 slot
->pg
->osd_shard
= nullptr;
9999 slot
->pg
->pg_slot
= nullptr;
10000 slot
->pg
= nullptr;
10001 osd
->dec_num_pgs();
10003 pg_slots_by_epoch
.erase(pg_slots_by_epoch
.iterator_to(*slot
));
10005 if (waiting_for_min_pg_epoch
) {
10006 min_pg_epoch_cond
.notify_all();
10010 void OSDShard::update_pg_epoch(OSDShardPGSlot
*slot
, epoch_t e
)
10012 std::lock_guard
l(shard_lock
);
10013 dout(30) << "min was " << pg_slots_by_epoch
.begin()->epoch
10014 << " on " << pg_slots_by_epoch
.begin()->pg
->pg_id
<< dendl
;
10015 pg_slots_by_epoch
.erase(pg_slots_by_epoch
.iterator_to(*slot
));
10016 dout(20) << slot
->pg
->pg_id
<< " " << slot
->epoch
<< " -> " << e
<< dendl
;
10018 pg_slots_by_epoch
.insert(*slot
);
10019 dout(30) << "min is now " << pg_slots_by_epoch
.begin()->epoch
10020 << " on " << pg_slots_by_epoch
.begin()->pg
->pg_id
<< dendl
;
10021 if (waiting_for_min_pg_epoch
) {
10022 min_pg_epoch_cond
.notify_all();
10026 epoch_t
OSDShard::get_min_pg_epoch()
10028 std::lock_guard
l(shard_lock
);
10029 auto p
= pg_slots_by_epoch
.begin();
10030 if (p
== pg_slots_by_epoch
.end()) {
10036 void OSDShard::wait_min_pg_epoch(epoch_t need
)
10038 std::unique_lock l
{shard_lock
};
10039 ++waiting_for_min_pg_epoch
;
10040 min_pg_epoch_cond
.wait(l
, [need
, this] {
10041 if (pg_slots_by_epoch
.empty()) {
10043 } else if (pg_slots_by_epoch
.begin()->epoch
>= need
) {
10046 dout(10) << need
<< " waiting on "
10047 << pg_slots_by_epoch
.begin()->epoch
<< dendl
;
10051 --waiting_for_min_pg_epoch
;
10054 epoch_t
OSDShard::get_max_waiting_epoch()
10056 std::lock_guard
l(shard_lock
);
10058 for (auto& i
: pg_slots
) {
10059 if (!i
.second
->waiting_peering
.empty()) {
10060 r
= std::max(r
, i
.second
->waiting_peering
.rbegin()->first
);
10066 void OSDShard::consume_map(
10067 const OSDMapRef
& new_osdmap
,
10068 unsigned *pushes_to_free
)
10070 std::lock_guard
l(shard_lock
);
10071 OSDMapRef old_osdmap
;
10073 std::lock_guard
l(osdmap_lock
);
10074 old_osdmap
= std::move(shard_osdmap
);
10075 shard_osdmap
= new_osdmap
;
10077 dout(10) << new_osdmap
->get_epoch()
10078 << " (was " << (old_osdmap
? old_osdmap
->get_epoch() : 0) << ")"
10080 bool queued
= false;
10083 auto p
= pg_slots
.begin();
10084 while (p
!= pg_slots
.end()) {
10085 OSDShardPGSlot
*slot
= p
->second
.get();
10086 const spg_t
& pgid
= p
->first
;
10087 dout(20) << __func__
<< " " << pgid
<< dendl
;
10088 if (!slot
->waiting_for_split
.empty()) {
10089 dout(20) << __func__
<< " " << pgid
10090 << " waiting for split " << slot
->waiting_for_split
<< dendl
;
10094 if (slot
->waiting_for_merge_epoch
> new_osdmap
->get_epoch()) {
10095 dout(20) << __func__
<< " " << pgid
10096 << " waiting for merge by epoch " << slot
->waiting_for_merge_epoch
10101 if (!slot
->waiting_peering
.empty()) {
10102 epoch_t first
= slot
->waiting_peering
.begin()->first
;
10103 if (first
<= new_osdmap
->get_epoch()) {
10104 dout(20) << __func__
<< " " << pgid
10105 << " pending_peering first epoch " << first
10106 << " <= " << new_osdmap
->get_epoch() << ", requeueing" << dendl
;
10107 _wake_pg_slot(pgid
, slot
);
10113 if (!slot
->waiting
.empty()) {
10114 if (new_osdmap
->is_up_acting_osd_shard(pgid
, osd
->get_nodeid())) {
10115 dout(20) << __func__
<< " " << pgid
<< " maps to us, keeping"
10120 while (!slot
->waiting
.empty() &&
10121 slot
->waiting
.front().get_map_epoch() <= new_osdmap
->get_epoch()) {
10122 auto& qi
= slot
->waiting
.front();
10123 dout(20) << __func__
<< " " << pgid
10124 << " waiting item " << qi
10125 << " epoch " << qi
.get_map_epoch()
10126 << " <= " << new_osdmap
->get_epoch()
10128 << (qi
.get_map_epoch() < new_osdmap
->get_epoch() ? "stale" :
10130 << ", dropping" << dendl
;
10131 *pushes_to_free
+= qi
.get_reserved_pushes();
10132 slot
->waiting
.pop_front();
10135 if (slot
->waiting
.empty() &&
10136 slot
->num_running
== 0 &&
10137 slot
->waiting_for_split
.empty() &&
10139 dout(20) << __func__
<< " " << pgid
<< " empty, pruning" << dendl
;
10140 p
= pg_slots
.erase(p
);
10147 std::lock_guard l
{sdata_wait_lock
};
10148 sdata_cond
.notify_one();
10152 void OSDShard::_wake_pg_slot(
10154 OSDShardPGSlot
*slot
)
10156 dout(20) << __func__
<< " " << pgid
10157 << " to_process " << slot
->to_process
10158 << " waiting " << slot
->waiting
10159 << " waiting_peering " << slot
->waiting_peering
<< dendl
;
10160 for (auto i
= slot
->to_process
.rbegin();
10161 i
!= slot
->to_process
.rend();
10163 scheduler
->enqueue_front(std::move(*i
));
10165 slot
->to_process
.clear();
10166 for (auto i
= slot
->waiting
.rbegin();
10167 i
!= slot
->waiting
.rend();
10169 scheduler
->enqueue_front(std::move(*i
));
10171 slot
->waiting
.clear();
10172 for (auto i
= slot
->waiting_peering
.rbegin();
10173 i
!= slot
->waiting_peering
.rend();
10175 // this is overkill; we requeue everything, even if some of these
10176 // items are waiting for maps we don't have yet. FIXME, maybe,
10177 // someday, if we decide this inefficiency matters
10178 for (auto j
= i
->second
.rbegin(); j
!= i
->second
.rend(); ++j
) {
10179 scheduler
->enqueue_front(std::move(*j
));
10182 slot
->waiting_peering
.clear();
10183 ++slot
->requeue_seq
;
10186 void OSDShard::identify_splits_and_merges(
10187 const OSDMapRef
& as_of_osdmap
,
10188 set
<pair
<spg_t
,epoch_t
>> *split_pgs
,
10189 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
10191 std::lock_guard
l(shard_lock
);
10192 if (shard_osdmap
) {
10193 for (auto& i
: pg_slots
) {
10194 const spg_t
& pgid
= i
.first
;
10195 auto *slot
= i
.second
.get();
10197 osd
->service
.identify_splits_and_merges(
10198 shard_osdmap
, as_of_osdmap
, pgid
,
10199 split_pgs
, merge_pgs
);
10200 } else if (!slot
->waiting_for_split
.empty()) {
10201 osd
->service
.identify_splits_and_merges(
10202 shard_osdmap
, as_of_osdmap
, pgid
,
10203 split_pgs
, nullptr);
10205 dout(20) << __func__
<< " slot " << pgid
10206 << " has no pg and waiting_for_split " << dendl
;
10212 void OSDShard::prime_splits(const OSDMapRef
& as_of_osdmap
,
10213 set
<pair
<spg_t
,epoch_t
>> *pgids
)
10215 std::lock_guard
l(shard_lock
);
10216 _prime_splits(pgids
);
10217 if (shard_osdmap
->get_epoch() > as_of_osdmap
->get_epoch()) {
10218 set
<pair
<spg_t
,epoch_t
>> newer_children
;
10219 for (auto i
: *pgids
) {
10220 osd
->service
.identify_splits_and_merges(
10221 as_of_osdmap
, shard_osdmap
, i
.first
,
10222 &newer_children
, nullptr);
10224 newer_children
.insert(pgids
->begin(), pgids
->end());
10225 dout(10) << "as_of_osdmap " << as_of_osdmap
->get_epoch() << " < shard "
10226 << shard_osdmap
->get_epoch() << ", new children " << newer_children
10228 _prime_splits(&newer_children
);
10229 // note: we don't care what is left over here for other shards.
10230 // if this shard is ahead of us and one isn't, e.g., one thread is
10231 // calling into prime_splits via _process (due to a newly created
10232 // pg) and this shard has a newer map due to a racing consume_map,
10233 // then any grandchildren left here will be identified (or were
10234 // identified) when the slower shard's osdmap is advanced.
10235 // _prime_splits() will tolerate the case where the pgid is
10240 void OSDShard::_prime_splits(set
<pair
<spg_t
,epoch_t
>> *pgids
)
10242 dout(10) << *pgids
<< dendl
;
10243 auto p
= pgids
->begin();
10244 while (p
!= pgids
->end()) {
10245 unsigned shard_index
= p
->first
.hash_to_shard(osd
->num_shards
);
10246 if (shard_index
== shard_id
) {
10247 auto r
= pg_slots
.emplace(p
->first
, nullptr);
10249 dout(10) << "priming slot " << p
->first
<< " e" << p
->second
<< dendl
;
10250 r
.first
->second
= make_unique
<OSDShardPGSlot
>();
10251 r
.first
->second
->waiting_for_split
.insert(p
->second
);
10254 ceph_assert(q
!= pg_slots
.end());
10255 dout(10) << "priming (existing) slot " << p
->first
<< " e" << p
->second
10257 q
->second
->waiting_for_split
.insert(p
->second
);
10259 p
= pgids
->erase(p
);
10266 void OSDShard::prime_merges(const OSDMapRef
& as_of_osdmap
,
10267 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
10269 std::lock_guard
l(shard_lock
);
10270 dout(20) << __func__
<< " checking shard " << shard_id
10271 << " for remaining merge pgs " << merge_pgs
<< dendl
;
10272 auto p
= merge_pgs
->begin();
10273 while (p
!= merge_pgs
->end()) {
10274 spg_t pgid
= p
->first
;
10275 epoch_t epoch
= p
->second
;
10276 unsigned shard_index
= pgid
.hash_to_shard(osd
->num_shards
);
10277 if (shard_index
!= shard_id
) {
10281 OSDShardPGSlot
*slot
;
10282 auto r
= pg_slots
.emplace(pgid
, nullptr);
10284 r
.first
->second
= make_unique
<OSDShardPGSlot
>();
10286 slot
= r
.first
->second
.get();
10289 dout(20) << __func__
<< " have merge participant pg " << pgid
10290 << " " << slot
->pg
<< dendl
;
10291 } else if (!slot
->waiting_for_split
.empty() &&
10292 *slot
->waiting_for_split
.begin() < epoch
) {
10293 dout(20) << __func__
<< " pending split on merge participant pg " << pgid
10294 << " " << slot
->waiting_for_split
<< dendl
;
10296 dout(20) << __func__
<< " creating empty merge participant " << pgid
10297 << " for merge in " << epoch
<< dendl
;
10298 // leave history zeroed; PG::merge_from() will fill it in.
10299 pg_history_t history
;
10300 PGCreateInfo
cinfo(pgid
, epoch
- 1,
10301 history
, PastIntervals(), false);
10302 PGRef pg
= osd
->handle_pg_create_info(shard_osdmap
, &cinfo
);
10303 _attach_pg(r
.first
->second
.get(), pg
.get());
10304 _wake_pg_slot(pgid
, slot
);
10307 // mark slot for merge
10308 dout(20) << __func__
<< " marking merge participant " << pgid
<< dendl
;
10309 slot
->waiting_for_merge_epoch
= epoch
;
10310 p
= merge_pgs
->erase(p
);
10314 void OSDShard::register_and_wake_split_child(PG
*pg
)
10318 std::lock_guard
l(shard_lock
);
10319 dout(10) << pg
->pg_id
<< " " << pg
<< dendl
;
10320 auto p
= pg_slots
.find(pg
->pg_id
);
10321 ceph_assert(p
!= pg_slots
.end());
10322 auto *slot
= p
->second
.get();
10323 dout(20) << pg
->pg_id
<< " waiting_for_split " << slot
->waiting_for_split
10325 ceph_assert(!slot
->pg
);
10326 ceph_assert(!slot
->waiting_for_split
.empty());
10327 _attach_pg(slot
, pg
);
10329 epoch
= pg
->get_osdmap_epoch();
10330 ceph_assert(slot
->waiting_for_split
.count(epoch
));
10331 slot
->waiting_for_split
.erase(epoch
);
10332 if (slot
->waiting_for_split
.empty()) {
10333 _wake_pg_slot(pg
->pg_id
, slot
);
10335 dout(10) << __func__
<< " still waiting for split on "
10336 << slot
->waiting_for_split
<< dendl
;
10340 // kick child to ensure it pulls up to the latest osdmap
10341 osd
->enqueue_peering_evt(
10344 std::make_shared
<PGPeeringEvent
>(
10349 std::lock_guard l
{sdata_wait_lock
};
10350 sdata_cond
.notify_one();
10353 void OSDShard::unprime_split_children(spg_t parent
, unsigned old_pg_num
)
10355 std::lock_guard
l(shard_lock
);
10356 vector
<spg_t
> to_delete
;
10357 for (auto& i
: pg_slots
) {
10358 if (i
.first
!= parent
&&
10359 i
.first
.get_ancestor(old_pg_num
) == parent
) {
10360 dout(10) << __func__
<< " parent " << parent
<< " clearing " << i
.first
10362 _wake_pg_slot(i
.first
, i
.second
.get());
10363 to_delete
.push_back(i
.first
);
10366 for (auto pgid
: to_delete
) {
10367 pg_slots
.erase(pgid
);
10371 OSDShard::OSDShard(
10378 shard_name(string("OSDShard.") + stringify(id
)),
10379 sdata_wait_lock_name(shard_name
+ "::sdata_wait_lock"),
10380 sdata_wait_lock
{make_mutex(sdata_wait_lock_name
)},
10381 osdmap_lock
{make_mutex(shard_name
+ "::osdmap_lock")},
10382 shard_lock_name(shard_name
+ "::shard_lock"),
10383 shard_lock
{make_mutex(shard_lock_name
)},
10384 scheduler(ceph::osd::scheduler::make_scheduler(cct
)),
10385 context_queue(sdata_wait_lock
, sdata_cond
)
10387 dout(0) << "using op scheduler " << *scheduler
<< dendl
;
10391 // =============================================================
10393 #undef dout_context
10394 #define dout_context osd->cct
10396 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10398 void OSD::ShardedOpWQ::_add_slot_waiter(
10400 OSDShardPGSlot
*slot
,
10401 OpSchedulerItem
&& qi
)
10403 if (qi
.is_peering()) {
10404 dout(20) << __func__
<< " " << pgid
10405 << " peering, item epoch is "
10406 << qi
.get_map_epoch()
10407 << ", will wait on " << qi
<< dendl
;
10408 slot
->waiting_peering
[qi
.get_map_epoch()].push_back(std::move(qi
));
10410 dout(20) << __func__
<< " " << pgid
10411 << " item epoch is "
10412 << qi
.get_map_epoch()
10413 << ", will wait on " << qi
<< dendl
;
10414 slot
->waiting
.push_back(std::move(qi
));
10419 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
// Worker-thread entry point for the sharded op queue.  Each thread services
// shard (thread_index % num_shards): wait for work, dequeue one
// OpSchedulerItem, resolve the per-PG slot ordering races, then run the item
// (under the PG lock when a PG is attached, or inline for pg-less events).
// NOTE(review): this chunk was re-wrapped from a line-shattered source; a few
// interior lines (braces, returns, pg->lock()/unlock()) were restored from
// context -- verify against the upstream file before merging.
void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
{
  uint32_t shard_index = thread_index % osd->num_shards;
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);

  // If all threads of shards do oncommits, there is a out-of-order
  // problem.  So we choose the thread which has the smallest
  // thread_index(thread_index < num_shards) of shard to do oncommit
  bool is_smallest_thread_index = thread_index < osd->num_shards;

  // wait for work; shard_lock is dropped while blocked on the shard's
  // condition variable so enqueuers can make progress.
  sdata->shard_lock.lock();
  if (sdata->scheduler->empty() &&
      (!is_smallest_thread_index || sdata->context_queue.empty())) {
    std::unique_lock wait_lock{sdata->sdata_wait_lock};
    if (is_smallest_thread_index && !sdata->context_queue.empty()) {
      // we raced with a context_queue addition, don't wait
      wait_lock.unlock();
    } else if (!sdata->stop_waiting) {
      dout(20) << __func__ << " empty q, waiting" << dendl;
      // suspend the heartbeat timeout while legitimately idle
      osd->cct->get_heartbeat_map()->clear_timeout(hb);
      sdata->shard_lock.unlock();
      sdata->sdata_cond.wait(wait_lock);
      wait_lock.unlock();
      sdata->shard_lock.lock();
      if (sdata->scheduler->empty() &&
          !(is_smallest_thread_index && !sdata->context_queue.empty())) {
        // woke up but another thread consumed the work; go back around
        sdata->shard_lock.unlock();
        return;
      }
      // found a work item; reapply default wq timeouts
      osd->cct->get_heartbeat_map()->reset_timeout(hb,
        timeout_interval, suicide_interval);
    } else {
      // stop_waiting is set (shard is being drained); don't block
      dout(20) << __func__ << " need return immediately" << dendl;
      wait_lock.unlock();
      sdata->shard_lock.unlock();
      return;
    }
  }

  // only the lowest-indexed thread of the shard drains oncommit contexts,
  // preserving completion ordering (see comment above).
  list<Context *> oncommits;
  if (is_smallest_thread_index) {
    sdata->context_queue.move_to(oncommits);
  }

  if (sdata->scheduler->empty()) {
    if (osd->is_stopping()) {
      sdata->shard_lock.unlock();
      for (auto c : oncommits) {
        dout(10) << __func__ << " discarding in-flight oncommit " << c
                 << dendl;
        delete c;
      }
      return;    // OSD shutdown, discard.
    }
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
    return;
  }

  OpSchedulerItem item = sdata->scheduler->dequeue();
  if (osd->is_stopping()) {
    sdata->shard_lock.unlock();
    for (auto c : oncommits) {
      dout(10) << __func__ << " discarding in-flight oncommit " << c
               << dendl;
      delete c;
    }
    return;    // OSD shutdown, discard.
  }

  // find or create the per-PG ordering slot for this item's token
  const auto token = item.get_ordering_token();
  auto r = sdata->pg_slots.emplace(token, nullptr);
  if (r.second) {
    r.first->second = make_unique<OSDShardPGSlot>();
  }
  OSDShardPGSlot *slot = r.first->second.get();
  dout(20) << __func__ << " " << token
           << (r.second ? " (new)" : "")
           << " to_process " << slot->to_process
           << " waiting " << slot->waiting
           << " waiting_peering " << slot->waiting_peering
           << dendl;
  slot->to_process.push_back(std::move(item));
  dout(20) << __func__ << " " << slot->to_process.back()
           << " queued" << dendl;

 retry_pg:
  PGRef pg = slot->pg;

  // lock pg (if we have it)
  if (pg) {
    // note the requeue seq now...
    uint64_t requeue_seq = slot->requeue_seq;
    ++slot->num_running;

    // drop shard_lock across pg->lock(); the PG lock must be taken first
    sdata->shard_lock.unlock();
    osd->service.maybe_inject_dispatch_delay();
    pg->lock();
    osd->service.maybe_inject_dispatch_delay();
    sdata->shard_lock.lock();

    auto q = sdata->pg_slots.find(token);
    if (q == sdata->pg_slots.end()) {
      // this can happen if we race with pg removal.
      dout(20) << __func__ << " slot " << token << " no longer there"
               << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    slot = q->second.get();
    --slot->num_running;

    if (slot->to_process.empty()) {
      // raced with _wake_pg_slot or consume_map
      dout(20) << __func__ << " " << token
               << " nothing queued" << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    if (requeue_seq != slot->requeue_seq) {
      dout(20) << __func__ << " " << token
               << " requeue_seq " << slot->requeue_seq << " > our "
               << requeue_seq << ", we raced with _wake_pg_slot"
               << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    if (slot->pg != pg) {
      // this can happen if we race with pg removal.
      dout(20) << __func__ << " slot " << token << " no longer attached to "
               << pg << dendl;
      pg->unlock();
      goto retry_pg;
    }
  }

  dout(20) << __func__ << " " << token
           << " to_process " << slot->to_process
           << " waiting " << slot->waiting
           << " waiting_peering " << slot->waiting_peering << dendl;

  ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
                                 suicide_interval);

  // take next item
  auto qi = std::move(slot->to_process.front());
  slot->to_process.pop_front();
  dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
  set<pair<spg_t,epoch_t>> new_children;
  OSDMapRef osdmap;

  while (!pg) {
    // should this pg shard exist on this osd in this (or a later) epoch?
    osdmap = sdata->shard_osdmap;
    const PGCreateInfo *create_info = qi.creates_pg();
    if (!slot->waiting_for_split.empty()) {
      dout(20) << __func__ << " " << token
               << " splitting " << slot->waiting_for_split << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
      dout(20) << __func__ << " " << token
               << " map " << qi.get_map_epoch() << " > "
               << osdmap->get_epoch() << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else if (qi.is_peering()) {
      if (!qi.peering_requires_pg()) {
        // for pg-less events, we run them under the ordering lock, since
        // we don't have the pg lock to keep them ordered.
        qi.run(osd, sdata, pg, tp_handle);
      } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
        if (create_info) {
          if (create_info->by_mon &&
              osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
            dout(20) << __func__ << " " << token
                     << " no pg, no longer primary, ignoring mon create on "
                     << qi << dendl;
          } else {
            dout(20) << __func__ << " " << token
                     << " no pg, should create on " << qi << dendl;
            pg = osd->handle_pg_create_info(osdmap, create_info);
            if (pg) {
              // we created the pg! drop out and continue "normally"!
              sdata->_attach_pg(slot, pg.get());
              sdata->_wake_pg_slot(token, slot);

              // identify split children between create epoch and shard epoch.
              osd->service.identify_splits_and_merges(
                pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
              sdata->_prime_splits(&new_children);
              // distribute remaining split children to other shards below!
            } else {
              dout(20) << __func__ << " ignored create on " << qi << dendl;
            }
          }
        } else {
          dout(20) << __func__ << " " << token
                   << " no pg, peering, !create, discarding " << qi << dendl;
        }
      } else {
        dout(20) << __func__ << " " << token
                 << " no pg, peering, doesn't map here e"
                 << osdmap->get_epoch()
                 << ", discarding " << qi
                 << dendl;
      }
    } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
      dout(20) << __func__ << " " << token
               << " no pg, should exist e" << osdmap->get_epoch()
               << ", will wait on " << qi << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else {
      dout(20) << __func__ << " " << token
               << " no pg, shouldn't exist e" << osdmap->get_epoch()
               << ", dropping " << qi << dendl;
      // share map with client?
      if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
        osd->service.maybe_share_map((*_op)->get_req()->get_connection().get(),
                                     sdata->shard_osdmap,
                                     (*_op)->sent_epoch);
      }
      // give back any recovery pushes this dropped item had reserved
      unsigned pushes_to_free = qi.get_reserved_pushes();
      if (pushes_to_free > 0) {
        sdata->shard_lock.unlock();
        osd->service.release_reserved_pushes(pushes_to_free);
        handle_oncommits(oncommits);
        return;
      }
    }
    // every pg-less path (waited, ran inline, or dropped) ends the call here
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
    return;
  }
  if (qi.is_peering()) {
    OSDMapRef osdmap = sdata->shard_osdmap;
    if (qi.get_map_epoch() > osdmap->get_epoch()) {
      // peering event from the future; park it until the map catches up
      _add_slot_waiter(token, slot, std::move(qi));
      sdata->shard_lock.unlock();
      pg->unlock();
      handle_oncommits(oncommits);
      return;
    }
  }
  sdata->shard_lock.unlock();

  // hand any split children discovered during pg creation to their shards
  if (!new_children.empty()) {
    for (auto shard : osd->shards) {
      shard->prime_splits(osdmap, &new_children);
    }
    ceph_assert(new_children.empty());
  }

  // osd_opwq_process marks the point at which an operation has been dequeued
  // and will begin to be handled by a worker thread.
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    tracepoint(osd, opwq_process_start, reqid.name._type,
               reqid.name._num, reqid.tid, reqid.inc);
  }

  // very verbose (level 30) dump of the queue state as json
  lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
  Formatter *f = Formatter::create("json");
  f->open_object_section("q");
  dump(f);
  f->close_section();
  f->flush(*_dout);
  delete f;
  *_dout << dendl;

  // run the item with the pg locked
  qi.run(osd, sdata, pg, tp_handle);

  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    tracepoint(osd, opwq_process_finish, reqid.name._type,
               reqid.name._num, reqid.tid, reqid.inc);
  }

  handle_oncommits(oncommits);
}
10716 void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem
&& item
) {
10717 uint32_t shard_index
=
10718 item
.get_ordering_token().hash_to_shard(osd
->shards
.size());
10720 dout(20) << __func__
<< " " << item
<< dendl
;
10722 OSDShard
* sdata
= osd
->shards
[shard_index
];
10723 assert (NULL
!= sdata
);
10727 std::lock_guard l
{sdata
->shard_lock
};
10728 empty
= sdata
->scheduler
->empty();
10729 sdata
->scheduler
->enqueue(std::move(item
));
10733 std::lock_guard l
{sdata
->sdata_wait_lock
};
10734 sdata
->sdata_cond
.notify_all();
// Re-queue an item at the *front* of its shard's scheduler (used for
// requeues), preserving ordering relative to items that _process may have
// already moved onto the slot's to_process list.
void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
{
  auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);
  sdata->shard_lock.lock();
  auto p = sdata->pg_slots.find(item.get_ordering_token());
  if (p != sdata->pg_slots.end() &&
      !p->second->to_process.empty()) {
    // we may be racing with _process, which has dequeued a new item
    // from scheduler, put it on to_process, and is now busy taking the
    // pg lock.  ensure this old requeued item is ordered before any
    // such newer item in to_process.
    //
    // i.e. swap: the old item goes to the front of to_process, and the
    // newest to_process item is pushed back into the scheduler instead.
    p->second->to_process.push_front(std::move(item));
    item = std::move(p->second->to_process.back());
    p->second->to_process.pop_back();
    dout(20) << __func__
             << " " << p->second->to_process.front()
             << " shuffled w/ " << item << dendl;
  } else {
    dout(20) << __func__ << " " << item << dendl;
  }
  sdata->scheduler->enqueue_front(std::move(item));
  sdata->shard_lock.unlock();
  // wake one worker to pick up the requeued item
  std::lock_guard l{sdata->sdata_wait_lock};
  sdata->sdata_cond.notify_one();
}
10767 namespace osd_cmds
{
// Admin "heap" command handler: drive the tcmalloc heap profiler.
// Writes human-readable output to 'os'.  Returns 0 on success,
// -EOPNOTSUPP when not built against tcmalloc, -EINVAL on a malformed
// command.  'f' is unused here but kept for the command-handler signature.
// NOTE(review): the error/success return lines were restored from context
// (not visible in this shattered chunk) -- confirm against upstream.
int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f,
         std::ostream& os)
{
  if (!ceph_using_tcmalloc()) {
    os << "could not issue heap profiler command -- not using tcmalloc!";
    return -EOPNOTSUPP;
  }

  string cmd;
  if (!cmd_getval(cmdmap, "heapcmd", cmd)) {
    os << "unable to get value for command \"" << cmd << "\"";
    return -EINVAL;
  }

  std::vector<std::string> cmd_vec;
  get_str_vec(cmd, cmd_vec);

  // optional extra argument (e.g. a dump filename) for the profiler command
  string val;
  if (cmd_getval(cmdmap, "value", val)) {
    cmd_vec.push_back(val);
  }

  ceph_heap_profiler_handle_command(cmd_vec, os);

  return 0;
}
10796 }} // namespace ceph::osd_cmds