1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2017 OVH
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
27 #include <boost/scoped_ptr.hpp>
28 #include <boost/range/adaptor/reversed.hpp>
30 #ifdef HAVE_SYS_PARAM_H
31 #include <sys/param.h>
34 #ifdef HAVE_SYS_MOUNT_H
35 #include <sys/mount.h>
40 #include "include/types.h"
41 #include "include/compat.h"
42 #include "include/random.h"
47 #include "osdc/Objecter.h"
49 #include "common/errno.h"
50 #include "common/ceph_argparse.h"
51 #include "common/ceph_releases.h"
52 #include "common/ceph_time.h"
53 #include "common/version.h"
54 #include "common/pick_address.h"
55 #include "common/blkdev.h"
56 #include "common/numa.h"
58 #include "os/ObjectStore.h"
60 #include "os/FuseStore.h"
63 #include "PrimaryLogPG.h"
65 #include "msg/Messenger.h"
66 #include "msg/Message.h"
68 #include "mon/MonClient.h"
70 #include "messages/MLog.h"
72 #include "messages/MGenericMessage.h"
73 #include "messages/MOSDPing.h"
74 #include "messages/MOSDFailure.h"
75 #include "messages/MOSDMarkMeDown.h"
76 #include "messages/MOSDMarkMeDead.h"
77 #include "messages/MOSDFull.h"
78 #include "messages/MOSDOp.h"
79 #include "messages/MOSDOpReply.h"
80 #include "messages/MOSDBackoff.h"
81 #include "messages/MOSDBeacon.h"
82 #include "messages/MOSDRepOp.h"
83 #include "messages/MOSDRepOpReply.h"
84 #include "messages/MOSDBoot.h"
85 #include "messages/MOSDPGTemp.h"
86 #include "messages/MOSDPGReadyToMerge.h"
88 #include "messages/MOSDMap.h"
89 #include "messages/MMonGetOSDMap.h"
90 #include "messages/MOSDPGNotify.h"
91 #include "messages/MOSDPGNotify2.h"
92 #include "messages/MOSDPGQuery.h"
93 #include "messages/MOSDPGQuery2.h"
94 #include "messages/MOSDPGLog.h"
95 #include "messages/MOSDPGRemove.h"
96 #include "messages/MOSDPGInfo.h"
97 #include "messages/MOSDPGInfo2.h"
98 #include "messages/MOSDPGCreate.h"
99 #include "messages/MOSDPGCreate2.h"
100 #include "messages/MOSDPGScan.h"
101 #include "messages/MBackfillReserve.h"
102 #include "messages/MRecoveryReserve.h"
103 #include "messages/MOSDForceRecovery.h"
104 #include "messages/MOSDECSubOpWrite.h"
105 #include "messages/MOSDECSubOpWriteReply.h"
106 #include "messages/MOSDECSubOpRead.h"
107 #include "messages/MOSDECSubOpReadReply.h"
108 #include "messages/MOSDPGCreated.h"
109 #include "messages/MOSDPGUpdateLogMissing.h"
110 #include "messages/MOSDPGUpdateLogMissingReply.h"
112 #include "messages/MOSDPeeringOp.h"
114 #include "messages/MOSDAlive.h"
116 #include "messages/MOSDScrub.h"
117 #include "messages/MOSDScrub2.h"
118 #include "messages/MOSDRepScrub.h"
120 #include "messages/MCommand.h"
121 #include "messages/MCommandReply.h"
123 #include "messages/MPGStats.h"
124 #include "messages/MPGStatsAck.h"
126 #include "messages/MWatchNotify.h"
127 #include "messages/MOSDPGPush.h"
128 #include "messages/MOSDPGPushReply.h"
129 #include "messages/MOSDPGPull.h"
131 #include "messages/MMonGetPurgedSnaps.h"
132 #include "messages/MMonGetPurgedSnapsReply.h"
134 #include "common/perf_counters.h"
135 #include "common/Timer.h"
136 #include "common/LogClient.h"
137 #include "common/AsyncReserver.h"
138 #include "common/HeartbeatMap.h"
139 #include "common/admin_socket.h"
140 #include "common/ceph_context.h"
142 #include "global/signal_handler.h"
143 #include "global/pidfile.h"
145 #include "include/color.h"
146 #include "perfglue/cpu_profiler.h"
147 #include "perfglue/heap_profiler.h"
149 #include "osd/OpRequest.h"
151 #include "auth/AuthAuthorizeHandler.h"
152 #include "auth/RotatingKeyRing.h"
154 #include "objclass/objclass.h"
156 #include "common/cmdparse.h"
157 #include "include/str_list.h"
158 #include "include/util.h"
160 #include "include/ceph_assert.h"
161 #include "common/config.h"
162 #include "common/EventTrace.h"
164 #include "json_spirit/json_spirit_reader.h"
165 #include "json_spirit/json_spirit_writer.h"
168 #define TRACEPOINT_DEFINE
169 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
170 #include "tracing/osd.h"
171 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
172 #undef TRACEPOINT_DEFINE
174 #define tracepoint(...)
177 #define dout_context cct
178 #define dout_subsys ceph_subsys_osd
180 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
182 using namespace ceph::osd::scheduler
;
183 using TOPNSPC::common::cmd_getval
;
185 static ostream
& _prefix(std::ostream
* _dout
, int whoami
, epoch_t epoch
) {
186 return *_dout
<< "osd." << whoami
<< " " << epoch
<< " ";
189 //Initial features in new superblock.
190 //Features here are also automatically upgraded
191 CompatSet
OSD::get_osd_initial_compat_set() {
192 CompatSet::FeatureSet ceph_osd_feature_compat
;
193 CompatSet::FeatureSet ceph_osd_feature_ro_compat
;
194 CompatSet::FeatureSet ceph_osd_feature_incompat
;
195 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE
);
196 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO
);
197 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC
);
198 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC
);
199 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES
);
200 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL
);
201 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO
);
202 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO
);
203 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG
);
204 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER
);
205 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS
);
206 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA
);
207 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING
);
208 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO
);
209 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES
);
210 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2
);
211 return CompatSet(ceph_osd_feature_compat
, ceph_osd_feature_ro_compat
,
212 ceph_osd_feature_incompat
);
215 //Features are added here that this OSD supports.
216 CompatSet
OSD::get_osd_compat_set() {
217 CompatSet compat
= get_osd_initial_compat_set();
218 //Any features here can be set in code, but not in initial superblock
219 compat
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
);
// OSDService aggregates the state shared between the OSD and its PGs:
// messengers, timers, recovery/backfill reservers, map caches, the objecter
// and tiering-agent bookkeeping.  Everything is wired up from the owning OSD.
// NOTE(review): this extract appears to be missing some physical lines of the
// original constructor (e.g. the body's opening brace, the 'cct' initializer
// and the declaration of 'str') — verify against the complete file.
OSDService::OSDService(OSD *osd) :
  whoami(osd->whoami), store(osd->store),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  recoverystate_perf(osd->recoverystate_perf),
  // cached config values, constructed from the conf handle + option name
  osd_max_object_size(cct->_conf, "osd_max_object_size"),
  osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
  publish_lock{ceph::make_mutex("OSDService::publish_lock")},
  pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
  agent_valid_iterator(false),
  flush_mode_high_count(0),
  agent_stop_flag(false),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  objecter(make_unique<Objecter>(osd->client_messenger->cct,
		  osd->objecter_messenger,
		  osd->monc, nullptr)),
  m_objecter_finishers(cct->_conf->osd_objecter_finishers),
  watch_timer(osd->client_messenger->cct, watch_lock),
  recovery_request_timer(cct, recovery_request_lock, false),
  sleep_timer(cct, sleep_lock, false),
  reserver_finisher(cct),
  // local/remote reservers throttle backfill; snap_reserver throttles trims
  local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		 cct->_conf->osd_min_recovery_priority),
  remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		  cct->_conf->osd_min_recovery_priority),
  snap_reserver(cct, &reserver_finisher,
		cct->_conf->osd_max_trimming_pgs),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  cur_ratio(0), physical_ratio(0),
  boot_epoch(0), up_epoch(0), bind_epoch(0)
  // create one Finisher per configured objecter finisher shard
  for (int i = 0; i < m_objecter_finishers; i++) {
    str << "objecter-finisher-" << i;
    auto fin = make_unique<Finisher>(osd->client_messenger->cct, str.str(), "finisher");
    objecter_finishers.push_back(std::move(fin));
287 void OSDService::add_pgid(spg_t pgid
, PG
*pg
){
288 std::lock_guard
l(pgid_lock
);
289 if (!pgid_tracker
.count(pgid
)) {
292 pgid_tracker
[pgid
]++;
294 void OSDService::remove_pgid(spg_t pgid
, PG
*pg
)
296 std::lock_guard
l(pgid_lock
);
297 ceph_assert(pgid_tracker
.count(pgid
));
298 ceph_assert(pgid_tracker
[pgid
] > 0);
299 pgid_tracker
[pgid
]--;
300 if (pgid_tracker
[pgid
] == 0) {
301 pgid_tracker
.erase(pgid
);
302 live_pgs
.erase(pgid
);
305 void OSDService::dump_live_pgids()
307 std::lock_guard
l(pgid_lock
);
308 derr
<< "live pgids:" << dendl
;
309 for (map
<spg_t
, int>::const_iterator i
= pgid_tracker
.cbegin();
310 i
!= pgid_tracker
.cend();
312 derr
<< "\t" << *i
<< dendl
;
313 live_pgs
[i
->first
]->dump_live_ids();
319 ceph::signedspan
OSDService::get_mnow()
321 return ceph::mono_clock::now() - osd
->startup_time
;
// Walk the recorded pg_num history of pgid's pool across the epoch span
// [old_map, new_map] and emit (pg, epoch) pairs into *split_children for every
// split descendant and into *merge_pgs for every merge source/target.  A work
// queue is used because a fabricated merge target may itself split later.
// NOTE(review): this extract is missing several physical lines of the original
// (the old_map/new_map/pgid parameters, local declarations such as
// 'queue'/'did'/'children'/'parent', loop increments and various closing
// braces) — compare with the complete file before editing.
void OSDService::identify_splits_and_merges(
  set<pair<spg_t,epoch_t>> *split_children,
  set<pair<spg_t,epoch_t>> *merge_pgs)
  // nothing to do if the pool no longer exists in the old map
  if (!old_map->have_pg_pool(pgid.pool())) {
  int old_pgnum = old_map->get_pg_num(pgid.pool());
  auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
  // no recorded pg_num changes for this pool
  if (p == osd->pg_num_history.pg_nums.end()) {
  dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
	   << " to e" << new_map->get_epoch()
	   << " pg_nums " << p->second << dendl;
  queue.push_back(pgid);
  while (!queue.empty()) {
    auto cur = queue.front();
    unsigned pgnum = old_pgnum;
    // replay each pg_num change recorded within the epoch span
    for (auto q = p->second.lower_bound(old_map->get_epoch());
	 q != p->second.end() &&
	   q->first <= new_map->get_epoch();
      if (pgnum < q->second) {
	// pg_num grew: a split
	if (cur.ps() < pgnum) {
	  if (cur.is_split(pgnum, q->second, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " children " << children << dendl;
	    for (auto i : children) {
	      split_children->insert(make_pair(i, q->first));
	} else if (cur.ps() < q->second) {
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is a child" << dendl;
	  // normally we'd capture this from the parent, but it's
	  // possible the parent doesn't exist yet (it will be
	  // fabricated to allow an intervening merge). note this PG
	  // as a split child here to be sure we catch it.
	  split_children->insert(make_pair(cur, q->first));
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is post-split, skipping" << dendl;
      } else if (merge_pgs) {
	// pg_num shrank: a merge (only computed if the caller asked)
	if (cur.ps() >= q->second) {
	  if (cur.ps() < pgnum) {
	    if (cur.is_merge_source(pgnum, q->second, &parent)) {
	      // enumerate the other sources merging into 'parent'
	      parent.is_split(q->second, pgnum, &children);
	      dout(20) << __func__ << " " << cur << " e" << q->first
		       << " pg_num " << pgnum << " -> " << q->second
		       << " is merge source, target " << parent
		       << ", source(s) " << children << dendl;
	      merge_pgs->insert(make_pair(parent, q->first));
	      if (!did.count(parent)) {
		// queue (and re-scan) parent in case it might not exist yet
		// and there are some future splits pending on it
		queue.push_back(parent);
	      for (auto c : children) {
		merge_pgs->insert(make_pair(c, q->first));
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is beyond old pgnum, skipping" << dendl;
	  // cur is the merge target: record it and all of its sources
	  if (cur.is_split(q->second, pgnum, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " is merge target, source " << children << dendl;
	    for (auto c : children) {
	      merge_pgs->insert(make_pair(c, q->first));
	  merge_pgs->insert(make_pair(cur, q->first));
431 void OSDService::need_heartbeat_peer_update()
433 osd
->need_heartbeat_peer_update();
436 HeartbeatStampsRef
OSDService::get_hb_stamps(unsigned peer
)
438 std::lock_guard
l(hb_stamp_lock
);
439 if (peer
>= hb_stamps
.size()) {
440 hb_stamps
.resize(peer
+ 1);
442 if (!hb_stamps
[peer
]) {
443 hb_stamps
[peer
] = ceph::make_ref
<HeartbeatStamps
>(peer
);
445 return hb_stamps
[peer
];
// Enqueue a peering event for spgid at the given epoch — presumably a
// lease-renewal event given the name; the event payload lines are not visible
// in this extract (NOTE(review): confirm against the complete file).
void OSDService::queue_renew_lease(epoch_t epoch, spg_t spgid)
  osd->enqueue_peering_evt(
    std::make_shared<PGPeeringEvent>(
458 void OSDService::start_shutdown()
461 std::lock_guard
l(agent_timer_lock
);
462 agent_timer
.shutdown();
466 std::lock_guard
l(sleep_lock
);
467 sleep_timer
.shutdown();
471 std::lock_guard
l(recovery_request_lock
);
472 recovery_request_timer
.shutdown();
476 void OSDService::shutdown_reserver()
478 reserver_finisher
.wait_for_empty();
479 reserver_finisher
.stop();
482 void OSDService::shutdown()
484 mono_timer
.suspend();
487 std::lock_guard
l(watch_lock
);
488 watch_timer
.shutdown();
491 objecter
->shutdown();
492 for (auto& f
: objecter_finishers
) {
497 publish_map(OSDMapRef());
498 next_osdmap
= OSDMapRef();
501 void OSDService::init()
503 reserver_finisher
.start();
504 for (auto& f
: objecter_finishers
) {
507 objecter
->set_client_incarnation(0);
509 // deprioritize objecter in daemonperf output
510 objecter
->get_logger()->set_prio_adjust(-3);
516 agent_thread
.create("osd_srv_agent");
518 if (cct
->_conf
->osd_recovery_delay_start
)
519 defer_recovery(cct
->_conf
->osd_recovery_delay_start
);
522 void OSDService::final_init()
524 objecter
->start(osdmap
.get());
527 void OSDService::activate_map()
529 // wake/unwake the tiering agent
530 std::lock_guard l
{agent_lock
};
532 !osdmap
->test_flag(CEPH_OSDMAP_NOTIERAGENT
) &&
534 agent_cond
.notify_all();
537 void OSDService::request_osdmap_update(epoch_t e
)
539 osd
->osdmap_subscribe(e
, false);
543 class AgentTimeoutCB
: public Context
{
546 explicit AgentTimeoutCB(PGRef _pg
) : pg(_pg
) {}
547 void finish(int) override
{
548 pg
->agent_choose_mode_restart();
// Tiering-agent worker thread.  Repeatedly picks the highest-priority tier in
// agent_queue, computes a flush quota from agent_ops vs the configured max
// (lower cap when no PG is in high-flush mode), and calls agent_work() on one
// PG at a time with agent_lock released; PGs with no work get a delayed
// agent_choose_mode_restart() via AgentTimeoutCB.
// NOTE(review): this extract is missing some physical lines of the original
// (the thread's opening brace, 'continue' statements after the waits, the
// head of the dout that prints " tiers ...", and several closing braces) —
// compare with the complete file before editing.
void OSDService::agent_entry()
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock agent_locker{agent_lock};
  while (!agent_stop_flag) {
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      agent_cond.wait(agent_locker);
    // highest key in the map is the most urgent tier
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
	     << " tiers " << agent_queue.size()
	     << ", top is " << level
	     << " with pgs " << top.size()
	     << ", ops " << agent_ops << "/"
	     << cct->_conf->osd_agent_max_ops
	     << (agent_active ? " active" : " NOT ACTIVE")
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    // quota = remaining op budget; lower budget while no PG needs high flush
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.wait(agent_locker);
    // round-robin across the PGs of the top tier via a persistent iterator
    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
	     << " agent_ops " << agent_ops
	     << " flush_quota " << agent_flush_quota << dendl;
    // drop the lock while doing per-PG work
    agent_locker.unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->pg_id
	       << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
	       << " seconds" << dendl;

      osd->logger->inc(l_osd_tier_delay);
      // Queue a timer to call agent_choose_mode for this pg in 5 seconds
      std::lock_guard timer_locker{agent_timer_lock};
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
  dout(10) << __func__ << " finish" << dendl;
608 void OSDService::agent_stop()
611 std::lock_guard
l(agent_lock
);
613 // By this time all ops should be cancelled
614 ceph_assert(agent_ops
== 0);
615 // By this time all PGs are shutdown and dequeued
616 if (!agent_queue
.empty()) {
617 set
<PGRef
>& top
= agent_queue
.rbegin()->second
;
618 derr
<< "agent queue not empty, for example " << (*top
.begin())->get_pgid() << dendl
;
619 ceph_abort_msg("agent queue not empty");
622 agent_stop_flag
= true;
623 agent_cond
.notify_all();
628 // -------------------------------------
// Periodically retune promote_probability_millis (probability, in 1/1000ths,
// of promoting an object into the cache tier) so the observed promote rate
// converges on the configured object/sec and bytes/sec targets, then set hard
// per-tick promote caps to damp stampedes.
// NOTE(review): this extract is missing some physical lines of the original
// (the declarations of 'new_prob'/'actual'/'ratio', a couple of dendl
// terminators and closing braces, and the assignments in the single-target
// branches) — compare with the complete file before editing.
void OSDService::promote_throttle_recalibrate()
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;

  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;

  unsigned min_prob = 1;

  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
	   << obj << " objects and " << byte_u_t(bytes) << "; target "
	   << target_obj_sec << " obj/sec or "
	   << byte_u_t(target_bytes_sec) << "/sec"

  // calculate what the probability *should* be, given the targets
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    avg_size = std::max<uint64_t>(bytes / obj, 1);
    // po/pb: probability implied by the object-rate / byte-rate target
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
	     << avg_size << dendl;
    if (target_obj_sec && target_bytes_sec)
      new_prob = std::min(po, pb);
    else if (target_obj_sec)
    else if (target_bytes_sec)
  dout(20) << __func__ << " new_prob " << new_prob << dendl;

  // correct for persistent skew between target rate and actual rate, adjust
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  // clamp into [min_prob, 1000]
  new_prob = std::max(new_prob, min_prob);
  new_prob = std::min(new_prob, 1000u);

  // smooth: move halfway toward the new value, then clamp again
  prob = (prob + new_prob) / 2;
  prob = std::max(prob, min_prob);
  prob = std::min(prob, 1000u);
  dout(10) << __func__ << " actual " << actual
	   << ", actual/prob ratio " << ratio
	   << ", adjusted new_prob " << new_prob
	   << ", prob " << promote_probability_millis << " -> " << prob
  promote_probability_millis = prob;

  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
701 // -------------------------------------
703 float OSDService::get_failsafe_full_ratio()
705 float full_ratio
= cct
->_conf
->osd_failsafe_full_ratio
;
706 if (full_ratio
> 1.0) full_ratio
/= 100.0;
// Map the current usage ratios onto a fullness state.  'ratio' is the
// (backfill-adjusted) usage, 'pratio' the raw physical usage; 'inject' is set
// to a marker string when an injected fullness state is in force.  The four
// thresholds are clamped to be monotonically non-decreasing
// (nearfull <= backfillfull <= full <= failsafe).
// NOTE(review): this extract is missing some physical lines of the original
// (the function's opening brace, the early-return body for a missing/epoch-0
// map, the pre-luminous branch body, and the per-threshold return statements)
// — compare with the complete file before editing.
OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
  // The OSDMap ratios take precendence. So if the failsafe is .95 and
  // the admin sets the cluster full to .96, the failsafe moves up to .96
  // too. (Not that having failsafe == full is ideal, but it's better than
  // dropping writes before the clusters appears full.)
  OSDMapRef osdmap = get_osdmap();
  if (!osdmap || osdmap->get_epoch() == 0) {
  float nearfull_ratio = osdmap->get_nearfull_ratio();
  float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
  float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
  float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);

  if (osdmap->require_osd_release < ceph_release_t::luminous) {
    // use the failsafe for nearfull and full; the mon isn't using the
    // flags anyway because we're mid-upgrade.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  } else if (full_ratio <= 0 ||
	     backfillfull_ratio <= 0 ||
	     nearfull_ratio <= 0) {
    derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
    // use failsafe flag. ick. the monitor did something wrong or the user
    // did something stupid.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;

  // injected state (for testing) takes precedence over measured ratios
  if (injectfull_state > NONE && injectfull) {
    inject = "(Injected)";
    return injectfull_state;
  } else if (pratio > failsafe_ratio) {
  } else if (ratio > full_ratio) {
  } else if (ratio > backfillfull_ratio) {
  } else if (pratio > nearfull_ratio) {
// Recompute and record the OSD's fullness state from the latest usage ratios,
// logging a cluster-log error when the FAILSAFE state is entered or left
// (since FAILSAFE means updates are being dropped).
// NOTE(review): this extract is missing some physical lines of the original
// (the opening brace, the cur_ratio assignment, the declarations of
// 'inject'/'new_state', a dendl terminator and closing braces) — compare with
// the complete file before editing.
void OSDService::check_full_status(float ratio, float pratio)
  std::lock_guard l(full_status_lock);

  physical_ratio = pratio;

  new_state = recalc_full_state(ratio, pratio, inject);

  dout(20) << __func__ << " cur ratio " << ratio
	   << ", physical ratio " << pratio
	   << ", new state " << get_full_state_name(new_state)

  // warn loudly on any FAILSAFE transition
  if (cur_state != new_state) {
    dout(10) << __func__ << " " << get_full_state_name(cur_state)
	     << " -> " << get_full_state_name(new_state) << dendl;
    if (new_state == FAILSAFE) {
      clog->error() << "full status failsafe engaged, dropping updates, now "
		    << (int)roundf(ratio * 100) << "% full";
    } else if (cur_state == FAILSAFE) {
      clog->error() << "full status failsafe disengaged, no longer dropping "
		    << "updates, now " << (int)roundf(ratio * 100) << "% full";
    cur_state = new_state;
// Decide whether our locally computed fullness state disagrees with the
// FULL/BACKFILLFULL/NEARFULL flags the osdmap currently records for this OSD
// — if so the mon needs to be told (via an MOSDFull beacon).
// NOTE(review): this extract is heavily truncated — the branch bodies that
// compare the map flags against is_full()/is_backfillfull()/is_nearfull() and
// the final return are not visible; compare with the complete file.
bool OSDService::need_fullness_update()
  OSDMapRef osdmap = get_osdmap();
  if (osdmap->exists(whoami)) {
    if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
    } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
    } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
  else if (is_backfillfull())
  else if (is_nearfull())
// Testing hook: report an injected fullness of at least 'type' while the
// injectfull counter/flag is armed.
// NOTE(review): this extract is missing the lines that decrement a positive
// injectfull count, the return statements and closing braces — compare with
// the complete file before editing.
bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
  if (injectfull && injectfull_state >= type) {
    // injectfull is either a count of the number of times to return failsafe full
    // or if -1 then always return full
    ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD ("
		       << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"
827 bool OSDService::_check_full(DoutPrefixProvider
*dpp
, s_names type
) const
829 std::lock_guard
l(full_status_lock
);
831 if (_check_inject_full(dpp
, type
))
834 if (cur_state
>= type
)
835 ldpp_dout(dpp
, 10) << __func__
<< " current usage is " << cur_ratio
836 << " physical " << physical_ratio
<< dendl
;
838 return cur_state
>= type
;
841 bool OSDService::_tentative_full(DoutPrefixProvider
*dpp
, s_names type
, uint64_t adjust_used
, osd_stat_t adjusted_stat
)
843 ldpp_dout(dpp
, 20) << __func__
<< " type " << get_full_state_name(type
) << " adjust_used " << (adjust_used
>> 10) << "KiB" << dendl
;
845 std::lock_guard
l(full_status_lock
);
846 if (_check_inject_full(dpp
, type
)) {
852 float ratio
= compute_adjusted_ratio(adjusted_stat
, &pratio
, adjust_used
);
855 s_names tentative_state
= recalc_full_state(ratio
, pratio
, notused
);
857 if (tentative_state
>= type
)
858 ldpp_dout(dpp
, 10) << __func__
<< " tentative usage is " << ratio
<< dendl
;
860 return tentative_state
>= type
;
863 bool OSDService::check_failsafe_full(DoutPrefixProvider
*dpp
) const
865 return _check_full(dpp
, FAILSAFE
);
868 bool OSDService::check_full(DoutPrefixProvider
*dpp
) const
870 return _check_full(dpp
, FULL
);
873 bool OSDService::tentative_backfill_full(DoutPrefixProvider
*dpp
, uint64_t adjust_used
, osd_stat_t stats
)
875 return _tentative_full(dpp
, BACKFILLFULL
, adjust_used
, stats
);
878 bool OSDService::check_backfill_full(DoutPrefixProvider
*dpp
) const
880 return _check_full(dpp
, BACKFILLFULL
);
883 bool OSDService::check_nearfull(DoutPrefixProvider
*dpp
) const
885 return _check_full(dpp
, NEARFULL
);
888 bool OSDService::is_failsafe_full() const
890 std::lock_guard
l(full_status_lock
);
891 return cur_state
== FAILSAFE
;
894 bool OSDService::is_full() const
896 std::lock_guard
l(full_status_lock
);
897 return cur_state
>= FULL
;
900 bool OSDService::is_backfillfull() const
902 std::lock_guard
l(full_status_lock
);
903 return cur_state
>= BACKFILLFULL
;
906 bool OSDService::is_nearfull() const
908 std::lock_guard
l(full_status_lock
);
909 return cur_state
>= NEARFULL
;
912 void OSDService::set_injectfull(s_names type
, int64_t count
)
914 std::lock_guard
l(full_status_lock
);
915 injectfull_state
= type
;
// Record the store's statfs result (plus any store alerts) into osd_stat and
// the perf counters.  When fake_statfs_for_testing is set, total/available are
// synthesized so many OSDs can share one partition in tests.
// NOTE(review): this extract is missing some physical lines of the original
// (the opening brace, the PG iteration around 'total_num_bytes += ...', a
// dendl terminator and several closing braces) — compare with the complete
// file before editing.
void OSDService::set_statfs(const struct store_statfs_t &stbuf,
			    osd_alert_list_t& alerts)
  uint64_t bytes = stbuf.total;
  uint64_t avail = stbuf.available;
  uint64_t used = stbuf.get_used_raw();

  // For testing fake statfs values so it doesn't matter if all
  // OSDs are using the same partition.
  if (cct->_conf->fake_statfs_for_testing) {
    uint64_t total_num_bytes = 0;
    total_num_bytes += p->get_stats_num_bytes();
    bytes = cct->_conf->fake_statfs_for_testing;
    if (total_num_bytes < bytes)
      avail = bytes - total_num_bytes;
    dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
	    << " adjust available " << avail
    used = bytes - avail;

  osd->logger->set(l_osd_stat_bytes, bytes);
  osd->logger->set(l_osd_stat_bytes_used, used);
  osd->logger->set(l_osd_stat_bytes_avail, avail);

  std::lock_guard l(stat_lock);
  osd_stat.statfs = stbuf;
  // replace any previous alerts from this OSD's object store
  osd_stat.os_alerts.clear();
  osd_stat.os_alerts[whoami].swap(alerts);
  if (cct->_conf->fake_statfs_for_testing) {
    osd_stat.statfs.total = bytes;
    osd_stat.statfs.available = avail;
    // For testing don't want used to go negative, so clear reserved
    osd_stat.statfs.internally_reserved = 0;
// Refresh osd_stat with the current heartbeat peer list, op-age histogram and
// PG count, and expire one stale heartbeat ping-time entry per call (entries
// older than osd_mon_heartbeat_stat_stale seconds).
// NOTE(review): this extract is truncated — the remaining parameters of the
// signature, the opening brace, the loop's break/continue statements and the
// function's return are not visible; also note the range-for appears to erase
// from the container it iterates, which upstream handles by breaking out —
// compare with the complete file before editing.
osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
  utime_t now = ceph_clock_now();
  auto stale_time = g_conf().get_val<int64_t>("osd_mon_heartbeat_stat_stale");
  std::lock_guard l(stat_lock);
  osd_stat.hb_peers.swap(hb_peers);
  osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
  osd_stat.num_pgs = num_pgs;
  // Clean entries that aren't updated
  // This is called often enough that we can just remove 1 at a time
  for (auto i : osd_stat.hb_pingtime) {
    if (i.second.last_update == 0)
    if (stale_time && now.sec() - i.second.last_update > stale_time) {
      dout(20) << __func__ << " time out heartbeat for osd " << i.first
	       << " last_update " << i.second.last_update << dendl;
      osd_stat.hb_pingtime.erase(i.first);
986 void OSDService::inc_osd_stat_repaired()
988 std::lock_guard
l(stat_lock
);
989 osd_stat
.num_shards_repaired
++;
// Compute the usage ratio this OSD would have after consuming adjust_used
// additional bytes, also folding in each PG's pending backfill data via
// pg_stat_adjust().  *pratio receives the unadjusted (physical) ratio.
// NOTE(review): this extract is missing some physical lines of the original
// (the opening brace, the '*pratio =' assignment head, the 'else' before
// zeroing 'available', the declaration of 'pgs', and closing braces) —
// compare with the complete file before editing.
float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
					 uint64_t adjust_used)
    ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);

  // deduct the hypothetical extra usage, clamping available at zero
  dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl;
  if (new_stat.statfs.available > adjust_used)
    new_stat.statfs.available -= adjust_used;
  new_stat.statfs.available = 0;
  dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;

  // Check all pgs and adjust kb_used to include all pending backfill data
  int backfill_adjusted = 0;
  osd->_get_pgs(&pgs);
  for (auto p : pgs) {
    backfill_adjusted += p->pg_stat_adjust(&new_stat);
  if (backfill_adjusted) {
    dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
  return ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);
1021 void OSDService::send_message_osd_cluster(int peer
, Message
*m
, epoch_t from_epoch
)
1023 OSDMapRef next_map
= get_nextmap_reserved();
1024 // service map is always newer/newest
1025 ceph_assert(from_epoch
<= next_map
->get_epoch());
1027 if (next_map
->is_down(peer
) ||
1028 next_map
->get_info(peer
).up_from
> from_epoch
) {
1030 release_map(next_map
);
1033 ConnectionRef peer_con
;
1034 if (peer
== whoami
) {
1035 peer_con
= osd
->cluster_messenger
->get_loopback_connection();
1037 peer_con
= osd
->cluster_messenger
->connect_to_osd(
1038 next_map
->get_cluster_addrs(peer
), false, true);
1040 maybe_share_map(peer_con
.get(), next_map
);
1041 peer_con
->send_message(m
);
1042 release_map(next_map
);
1045 void OSDService::send_message_osd_cluster(std::vector
<std::pair
<int, Message
*>>& messages
, epoch_t from_epoch
)
1047 OSDMapRef next_map
= get_nextmap_reserved();
1048 // service map is always newer/newest
1049 ceph_assert(from_epoch
<= next_map
->get_epoch());
1051 for (auto& iter
: messages
) {
1052 if (next_map
->is_down(iter
.first
) ||
1053 next_map
->get_info(iter
.first
).up_from
> from_epoch
) {
1057 ConnectionRef peer_con
;
1058 if (iter
.first
== whoami
) {
1059 peer_con
= osd
->cluster_messenger
->get_loopback_connection();
1061 peer_con
= osd
->cluster_messenger
->connect_to_osd(
1062 next_map
->get_cluster_addrs(iter
.first
), false, true);
1064 maybe_share_map(peer_con
.get(), next_map
);
1065 peer_con
->send_message(iter
.second
);
1067 release_map(next_map
);
1069 ConnectionRef
OSDService::get_con_osd_cluster(int peer
, epoch_t from_epoch
)
1071 OSDMapRef next_map
= get_nextmap_reserved();
1072 // service map is always newer/newest
1073 ceph_assert(from_epoch
<= next_map
->get_epoch());
1075 if (next_map
->is_down(peer
) ||
1076 next_map
->get_info(peer
).up_from
> from_epoch
) {
1077 release_map(next_map
);
1081 if (peer
== whoami
) {
1082 con
= osd
->cluster_messenger
->get_loopback_connection();
1084 con
= osd
->cluster_messenger
->connect_to_osd(
1085 next_map
->get_cluster_addrs(peer
), false, true);
1087 release_map(next_map
);
1091 pair
<ConnectionRef
,ConnectionRef
> OSDService::get_con_osd_hb(int peer
, epoch_t from_epoch
)
1093 OSDMapRef next_map
= get_nextmap_reserved();
1094 // service map is always newer/newest
1095 ceph_assert(from_epoch
<= next_map
->get_epoch());
1097 pair
<ConnectionRef
,ConnectionRef
> ret
;
1098 if (next_map
->is_down(peer
) ||
1099 next_map
->get_info(peer
).up_from
> from_epoch
) {
1100 release_map(next_map
);
1103 ret
.first
= osd
->hb_back_client_messenger
->connect_to_osd(
1104 next_map
->get_hb_back_addrs(peer
));
1105 ret
.second
= osd
->hb_front_client_messenger
->connect_to_osd(
1106 next_map
->get_hb_front_addrs(peer
));
1107 release_map(next_map
);
1111 entity_name_t
OSDService::get_cluster_msgr_name() const
1113 return cluster_messenger
->get_myname();
// Record that we want the mon to set pg_temp for pgid to 'want', unless an
// identical request is already pending.
// NOTE(review): this extract is truncated — the trailing 'bool forced'
// parameter, the opening brace, the forced-mismatch condition and closing
// braces are not visible; compare with the complete file before editing.
void OSDService::queue_want_pg_temp(pg_t pgid,
				    const vector<int>& want,
  std::lock_guard l(pg_temp_lock);
  auto p = pg_temp_pending.find(pgid);
  // only (re)queue if nothing equivalent is already in flight
  if (p == pg_temp_pending.end() ||
      p->second.acting != want ||
    pg_temp_wanted[pgid] = {want, forced};
1129 void OSDService::remove_want_pg_temp(pg_t pgid
)
1131 std::lock_guard
l(pg_temp_lock
);
1132 pg_temp_wanted
.erase(pgid
);
1133 pg_temp_pending
.erase(pgid
);
1136 void OSDService::_sent_pg_temp()
1138 #ifdef HAVE_STDLIB_MAP_SPLICING
1139 pg_temp_pending
.merge(pg_temp_wanted
);
1141 pg_temp_pending
.insert(make_move_iterator(begin(pg_temp_wanted
)),
1142 make_move_iterator(end(pg_temp_wanted
)));
1144 pg_temp_wanted
.clear();
1147 void OSDService::requeue_pg_temp()
1149 std::lock_guard
l(pg_temp_lock
);
1150 // wanted overrides pending. note that remove_want_pg_temp
1151 // clears the item out of both.
1152 unsigned old_wanted
= pg_temp_wanted
.size();
1153 unsigned old_pending
= pg_temp_pending
.size();
1155 pg_temp_wanted
.swap(pg_temp_pending
);
1156 dout(10) << __func__
<< " " << old_wanted
<< " + " << old_pending
<< " -> "
1157 << pg_temp_wanted
.size() << dendl
;
1160 std::ostream
& operator<<(std::ostream
& out
,
1161 const OSDService::pg_temp_t
& pg_temp
)
1163 out
<< pg_temp
.acting
;
1164 if (pg_temp
.forced
) {
1170 void OSDService::send_pg_temp()
1172 std::lock_guard
l(pg_temp_lock
);
1173 if (pg_temp_wanted
.empty())
1175 dout(10) << "send_pg_temp " << pg_temp_wanted
<< dendl
;
1176 MOSDPGTemp
*ms
[2] = {nullptr, nullptr};
1177 for (auto& [pgid
, pg_temp
] : pg_temp_wanted
) {
1178 auto& m
= ms
[pg_temp
.forced
];
1180 m
= new MOSDPGTemp(osdmap
->get_epoch());
1181 m
->forced
= pg_temp
.forced
;
1183 m
->pg_temp
.emplace(pgid
, pg_temp
.acting
);
1187 monc
->send_mon_message(m
);
1193 void OSDService::send_pg_created(pg_t pgid
)
1195 std::lock_guard
l(pg_created_lock
);
1196 dout(20) << __func__
<< dendl
;
1197 auto o
= get_osdmap();
1198 if (o
->require_osd_release
>= ceph_release_t::luminous
) {
1199 pg_created
.insert(pgid
);
1200 monc
->send_mon_message(new MOSDPGCreated(pgid
));
1204 void OSDService::send_pg_created()
1206 std::lock_guard
l(pg_created_lock
);
1207 dout(20) << __func__
<< dendl
;
1208 auto o
= get_osdmap();
1209 if (o
->require_osd_release
>= ceph_release_t::luminous
) {
1210 for (auto pgid
: pg_created
) {
1211 monc
->send_mon_message(new MOSDPGCreated(pgid
));
1216 void OSDService::prune_pg_created()
1218 std::lock_guard
l(pg_created_lock
);
1219 dout(20) << __func__
<< dendl
;
1220 auto o
= get_osdmap();
1221 auto i
= pg_created
.begin();
1222 while (i
!= pg_created
.end()) {
1223 auto p
= o
->get_pg_pool(i
->pool());
1224 if (!p
|| !p
->has_flag(pg_pool_t::FLAG_CREATING
)) {
1225 dout(20) << __func__
<< " pruning " << *i
<< dendl
;
1226 i
= pg_created
.erase(i
);
1228 dout(20) << __func__
<< " keeping " << *i
<< dendl
;
1235 // --------------------------------------
1238 bool OSDService::can_inc_scrubs()
1240 bool can_inc
= false;
1241 std::lock_guard
l(sched_scrub_lock
);
1243 if (scrubs_local
+ scrubs_remote
< cct
->_conf
->osd_max_scrubs
) {
1244 dout(20) << __func__
<< " == true " << scrubs_local
<< " local + " << scrubs_remote
1245 << " remote < max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1248 dout(20) << __func__
<< " == false " << scrubs_local
<< " local + " << scrubs_remote
1249 << " remote >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1255 bool OSDService::inc_scrubs_local()
1257 bool result
= false;
1258 std::lock_guard l
{sched_scrub_lock
};
1259 if (scrubs_local
+ scrubs_remote
< cct
->_conf
->osd_max_scrubs
) {
1260 dout(20) << __func__
<< " " << scrubs_local
<< " -> " << (scrubs_local
+1)
1261 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", remote " << scrubs_remote
<< ")" << dendl
;
1265 dout(20) << __func__
<< " " << scrubs_local
<< " local + " << scrubs_remote
<< " remote >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1270 void OSDService::dec_scrubs_local()
1272 std::lock_guard l
{sched_scrub_lock
};
1273 dout(20) << __func__
<< " " << scrubs_local
<< " -> " << (scrubs_local
-1)
1274 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", remote " << scrubs_remote
<< ")" << dendl
;
1276 ceph_assert(scrubs_local
>= 0);
1279 bool OSDService::inc_scrubs_remote()
1281 bool result
= false;
1282 std::lock_guard l
{sched_scrub_lock
};
1283 if (scrubs_local
+ scrubs_remote
< cct
->_conf
->osd_max_scrubs
) {
1284 dout(20) << __func__
<< " " << scrubs_remote
<< " -> " << (scrubs_remote
+1)
1285 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", local " << scrubs_local
<< ")" << dendl
;
1289 dout(20) << __func__
<< " " << scrubs_local
<< " local + " << scrubs_remote
<< " remote >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1294 void OSDService::dec_scrubs_remote()
1296 std::lock_guard l
{sched_scrub_lock
};
1297 dout(20) << __func__
<< " " << scrubs_remote
<< " -> " << (scrubs_remote
-1)
1298 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", local " << scrubs_local
<< ")" << dendl
;
1300 ceph_assert(scrubs_remote
>= 0);
1303 void OSDService::dump_scrub_reservations(Formatter
*f
)
1305 std::lock_guard l
{sched_scrub_lock
};
1306 f
->dump_int("scrubs_local", scrubs_local
);
1307 f
->dump_int("scrubs_remote", scrubs_remote
);
1308 f
->dump_int("osd_max_scrubs", cct
->_conf
->osd_max_scrubs
);
1311 void OSDService::retrieve_epochs(epoch_t
*_boot_epoch
, epoch_t
*_up_epoch
,
1312 epoch_t
*_bind_epoch
) const
1314 std::lock_guard
l(epoch_lock
);
1316 *_boot_epoch
= boot_epoch
;
1318 *_up_epoch
= up_epoch
;
1320 *_bind_epoch
= bind_epoch
;
1323 void OSDService::set_epochs(const epoch_t
*_boot_epoch
, const epoch_t
*_up_epoch
,
1324 const epoch_t
*_bind_epoch
)
1326 std::lock_guard
l(epoch_lock
);
1328 ceph_assert(*_boot_epoch
== 0 || *_boot_epoch
>= boot_epoch
);
1329 boot_epoch
= *_boot_epoch
;
1332 ceph_assert(*_up_epoch
== 0 || *_up_epoch
>= up_epoch
);
1333 up_epoch
= *_up_epoch
;
1336 ceph_assert(*_bind_epoch
== 0 || *_bind_epoch
>= bind_epoch
);
1337 bind_epoch
= *_bind_epoch
;
1341 bool OSDService::prepare_to_stop()
1343 std::unique_lock
l(is_stopping_lock
);
1344 if (get_state() != NOT_STOPPING
)
1347 OSDMapRef osdmap
= get_osdmap();
1348 if (osdmap
&& osdmap
->is_up(whoami
)) {
1349 dout(0) << __func__
<< " telling mon we are shutting down" << dendl
;
1350 set_state(PREPARING_TO_STOP
);
1351 monc
->send_mon_message(
1355 osdmap
->get_addrs(whoami
),
1356 osdmap
->get_epoch(),
1359 const auto timeout
= ceph::make_timespan(cct
->_conf
->osd_mon_shutdown_timeout
);
1360 is_stopping_cond
.wait_for(l
, timeout
,
1361 [this] { return get_state() == STOPPING
; });
1363 dout(0) << __func__
<< " starting shutdown" << dendl
;
1364 set_state(STOPPING
);
1368 void OSDService::got_stop_ack()
1370 std::scoped_lock
l(is_stopping_lock
);
1371 if (get_state() == PREPARING_TO_STOP
) {
1372 dout(0) << __func__
<< " starting shutdown" << dendl
;
1373 set_state(STOPPING
);
1374 is_stopping_cond
.notify_all();
1376 dout(10) << __func__
<< " ignoring msg" << dendl
;
1380 MOSDMap
*OSDService::build_incremental_map_msg(epoch_t since
, epoch_t to
,
1381 OSDSuperblock
& sblock
)
1383 MOSDMap
*m
= new MOSDMap(monc
->get_fsid(),
1384 osdmap
->get_encoding_features());
1385 m
->oldest_map
= max_oldest_map
;
1386 m
->newest_map
= sblock
.newest_map
;
1388 int max
= cct
->_conf
->osd_map_message_max
;
1389 ssize_t max_bytes
= cct
->_conf
->osd_map_message_max_bytes
;
1391 if (since
< m
->oldest_map
) {
1392 // we don't have the next map the target wants, so start with a
1395 dout(10) << __func__
<< " oldest map " << max_oldest_map
<< " > since "
1396 << since
<< ", starting with full map" << dendl
;
1397 since
= m
->oldest_map
;
1398 if (!get_map_bl(since
, bl
)) {
1399 derr
<< __func__
<< " missing full map " << since
<< dendl
;
1403 max_bytes
-= bl
.length();
1404 m
->maps
[since
].claim(bl
);
1406 for (epoch_t e
= since
+ 1; e
<= to
; ++e
) {
1408 if (get_inc_map_bl(e
, bl
)) {
1409 m
->incremental_maps
[e
].claim(bl
);
1411 dout(10) << __func__
<< " missing incremental map " << e
<< dendl
;
1412 if (!get_map_bl(e
, bl
)) {
1413 derr
<< __func__
<< " also missing full map " << e
<< dendl
;
1416 m
->maps
[e
].claim(bl
);
1419 max_bytes
-= bl
.length();
1420 if (max
<= 0 || max_bytes
<= 0) {
1427 if (!m
->maps
.empty() ||
1428 !m
->incremental_maps
.empty()) {
1429 // send what we have so far
1434 if (get_inc_map_bl(m
->newest_map
, bl
)) {
1435 m
->incremental_maps
[m
->newest_map
].claim(bl
);
1437 derr
<< __func__
<< " unable to load latest map " << m
->newest_map
<< dendl
;
1438 if (!get_map_bl(m
->newest_map
, bl
)) {
1439 derr
<< __func__
<< " unable to load latest full map " << m
->newest_map
1443 m
->maps
[m
->newest_map
].claim(bl
);
1448 void OSDService::send_map(MOSDMap
*m
, Connection
*con
)
1450 con
->send_message(m
);
1453 void OSDService::send_incremental_map(epoch_t since
, Connection
*con
,
1454 const OSDMapRef
& osdmap
)
1456 epoch_t to
= osdmap
->get_epoch();
1457 dout(10) << "send_incremental_map " << since
<< " -> " << to
1458 << " to " << con
<< " " << con
->get_peer_addr() << dendl
;
1462 OSDSuperblock
sblock(get_superblock());
1463 if (since
< sblock
.oldest_map
) {
1464 // just send latest full map
1465 MOSDMap
*m
= new MOSDMap(monc
->get_fsid(),
1466 osdmap
->get_encoding_features());
1467 m
->oldest_map
= max_oldest_map
;
1468 m
->newest_map
= sblock
.newest_map
;
1469 get_map_bl(to
, m
->maps
[to
]);
1474 if (to
> since
&& (int64_t)(to
- since
) > cct
->_conf
->osd_map_share_max_epochs
) {
1475 dout(10) << " " << (to
- since
) << " > max " << cct
->_conf
->osd_map_share_max_epochs
1476 << ", only sending most recent" << dendl
;
1477 since
= to
- cct
->_conf
->osd_map_share_max_epochs
;
1480 m
= build_incremental_map_msg(since
, to
, sblock
);
1485 bool OSDService::_get_map_bl(epoch_t e
, bufferlist
& bl
)
1487 bool found
= map_bl_cache
.lookup(e
, &bl
);
1490 logger
->inc(l_osd_map_bl_cache_hit
);
1494 logger
->inc(l_osd_map_bl_cache_miss
);
1495 found
= store
->read(meta_ch
,
1496 OSD::get_osdmap_pobject_name(e
), 0, 0, bl
,
1497 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
) >= 0;
1504 bool OSDService::get_inc_map_bl(epoch_t e
, bufferlist
& bl
)
1506 std::lock_guard
l(map_cache_lock
);
1507 bool found
= map_bl_inc_cache
.lookup(e
, &bl
);
1510 logger
->inc(l_osd_map_bl_cache_hit
);
1514 logger
->inc(l_osd_map_bl_cache_miss
);
1515 found
= store
->read(meta_ch
,
1516 OSD::get_inc_osdmap_pobject_name(e
), 0, 0, bl
,
1517 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
) >= 0;
1519 _add_map_inc_bl(e
, bl
);
1524 void OSDService::_add_map_bl(epoch_t e
, bufferlist
& bl
)
1526 dout(10) << "add_map_bl " << e
<< " " << bl
.length() << " bytes" << dendl
;
1527 // cache a contiguous buffer
1528 if (bl
.get_num_buffers() > 1) {
1531 bl
.try_assign_to_mempool(mempool::mempool_osd_mapbl
);
1532 map_bl_cache
.add(e
, bl
);
1535 void OSDService::_add_map_inc_bl(epoch_t e
, bufferlist
& bl
)
1537 dout(10) << "add_map_inc_bl " << e
<< " " << bl
.length() << " bytes" << dendl
;
1538 // cache a contiguous buffer
1539 if (bl
.get_num_buffers() > 1) {
1542 bl
.try_assign_to_mempool(mempool::mempool_osd_mapbl
);
1543 map_bl_inc_cache
.add(e
, bl
);
1546 OSDMapRef
OSDService::_add_map(OSDMap
*o
)
1548 epoch_t e
= o
->get_epoch();
1550 if (cct
->_conf
->osd_map_dedup
) {
1551 // Dedup against an existing map at a nearby epoch
1552 OSDMapRef for_dedup
= map_cache
.lower_bound(e
);
1554 OSDMap::dedup(for_dedup
.get(), o
);
1558 OSDMapRef l
= map_cache
.add(e
, o
, &existed
);
1565 OSDMapRef
OSDService::try_get_map(epoch_t epoch
)
1567 std::lock_guard
l(map_cache_lock
);
1568 OSDMapRef retval
= map_cache
.lookup(epoch
);
1570 dout(30) << "get_map " << epoch
<< " -cached" << dendl
;
1572 logger
->inc(l_osd_map_cache_hit
);
1577 logger
->inc(l_osd_map_cache_miss
);
1578 epoch_t lb
= map_cache
.cached_key_lower_bound();
1580 dout(30) << "get_map " << epoch
<< " - miss, below lower bound" << dendl
;
1581 logger
->inc(l_osd_map_cache_miss_low
);
1582 logger
->inc(l_osd_map_cache_miss_low_avg
, lb
- epoch
);
1586 OSDMap
*map
= new OSDMap
;
1588 dout(20) << "get_map " << epoch
<< " - loading and decoding " << map
<< dendl
;
1590 if (!_get_map_bl(epoch
, bl
) || bl
.length() == 0) {
1591 derr
<< "failed to load OSD map for epoch " << epoch
<< ", got " << bl
.length() << " bytes" << dendl
;
1597 dout(20) << "get_map " << epoch
<< " - return initial " << map
<< dendl
;
1599 return _add_map(map
);
1605 void OSDService::reply_op_error(OpRequestRef op
, int err
)
1607 reply_op_error(op
, err
, eversion_t(), 0, {});
1610 void OSDService::reply_op_error(OpRequestRef op
, int err
, eversion_t v
,
1612 vector
<pg_log_op_return_item_t
> op_returns
)
1614 auto m
= op
->get_req
<MOSDOp
>();
1615 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1617 flags
= m
->get_flags() & (CEPH_OSD_FLAG_ACK
|CEPH_OSD_FLAG_ONDISK
);
1619 MOSDOpReply
*reply
= new MOSDOpReply(m
, err
, osdmap
->get_epoch(), flags
,
1620 !m
->has_flag(CEPH_OSD_FLAG_RETURNVEC
));
1621 reply
->set_reply_versions(v
, uv
);
1622 reply
->set_op_returns(op_returns
);
1623 m
->get_connection()->send_message(reply
);
1626 void OSDService::handle_misdirected_op(PG
*pg
, OpRequestRef op
)
1628 if (!cct
->_conf
->osd_debug_misdirected_ops
) {
1632 auto m
= op
->get_req
<MOSDOp
>();
1633 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1635 ceph_assert(m
->get_map_epoch() >= pg
->get_history().same_primary_since
);
1637 if (pg
->is_ec_pg()) {
1639 * OSD recomputes op target based on current OSDMap. With an EC pg, we
1640 * can get this result:
1641 * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
1642 * [CRUSH_ITEM_NONE, 2, 3]/3
1643 * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
1645 * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
1647 * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
1650 * We can't compute the op target based on the sending map epoch due to
1651 * splitting. The simplest thing is to detect such cases here and drop
1652 * them without an error (the client will resend anyway).
1654 ceph_assert(m
->get_map_epoch() <= superblock
.newest_map
);
1655 OSDMapRef opmap
= try_get_map(m
->get_map_epoch());
1657 dout(7) << __func__
<< ": " << *pg
<< " no longer have map for "
1658 << m
->get_map_epoch() << ", dropping" << dendl
;
1661 pg_t _pgid
= m
->get_raw_pg();
1663 if ((m
->get_flags() & CEPH_OSD_FLAG_PGOP
) == 0)
1664 _pgid
= opmap
->raw_pg_to_pg(_pgid
);
1665 if (opmap
->get_primary_shard(_pgid
, &pgid
) &&
1666 pgid
.shard
!= pg
->pg_id
.shard
) {
1667 dout(7) << __func__
<< ": " << *pg
<< " primary changed since "
1668 << m
->get_map_epoch() << ", dropping" << dendl
;
1673 dout(7) << *pg
<< " misdirected op in " << m
->get_map_epoch() << dendl
;
1674 clog
->warn() << m
->get_source_inst() << " misdirected " << m
->get_reqid()
1675 << " pg " << m
->get_raw_pg()
1676 << " to osd." << whoami
1677 << " not " << pg
->get_acting()
1678 << " in e" << m
->get_map_epoch() << "/" << osdmap
->get_epoch();
1681 void OSDService::enqueue_back(OpSchedulerItem
&& qi
)
1683 osd
->op_shardedwq
.queue(std::move(qi
));
1686 void OSDService::enqueue_front(OpSchedulerItem
&& qi
)
1688 osd
->op_shardedwq
.queue_front(std::move(qi
));
1691 void OSDService::queue_recovery_context(
1693 GenContext
<ThreadPool::TPHandle
&> *c
)
1695 epoch_t e
= get_osdmap_epoch();
1698 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1699 new PGRecoveryContext(pg
->get_pgid(), c
, e
)),
1700 cct
->_conf
->osd_recovery_cost
,
1701 cct
->_conf
->osd_recovery_priority
,
1707 void OSDService::queue_for_snap_trim(PG
*pg
)
1709 dout(10) << "queueing " << *pg
<< " for snaptrim" << dendl
;
1712 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1713 new PGSnapTrim(pg
->get_pgid(), pg
->get_osdmap_epoch())),
1714 cct
->_conf
->osd_snap_trim_cost
,
1715 cct
->_conf
->osd_snap_trim_priority
,
1718 pg
->get_osdmap_epoch()));
1721 void OSDService::queue_for_scrub(PG
*pg
, bool with_high_priority
)
1723 unsigned scrub_queue_priority
= pg
->scrubber
.priority
;
1724 if (with_high_priority
&& scrub_queue_priority
< cct
->_conf
->osd_client_op_priority
) {
1725 scrub_queue_priority
= cct
->_conf
->osd_client_op_priority
;
1727 const auto epoch
= pg
->get_osdmap_epoch();
1730 unique_ptr
<OpSchedulerItem::OpQueueable
>(new PGScrub(pg
->get_pgid(), epoch
)),
1731 cct
->_conf
->osd_scrub_cost
,
1732 scrub_queue_priority
,
1738 void OSDService::queue_for_pg_delete(spg_t pgid
, epoch_t e
)
1740 dout(10) << __func__
<< " on " << pgid
<< " e " << e
<< dendl
;
1743 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1744 new PGDelete(pgid
, e
)),
1745 cct
->_conf
->osd_pg_delete_cost
,
1746 cct
->_conf
->osd_pg_delete_priority
,
1752 bool OSDService::try_finish_pg_delete(PG
*pg
, unsigned old_pg_num
)
1754 return osd
->try_finish_pg_delete(pg
, old_pg_num
);
1759 void OSDService::set_ready_to_merge_source(PG
*pg
, eversion_t version
)
1761 std::lock_guard
l(merge_lock
);
1762 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1763 ready_to_merge_source
[pg
->pg_id
.pgid
] = version
;
1764 assert(not_ready_to_merge_source
.count(pg
->pg_id
.pgid
) == 0);
1765 _send_ready_to_merge();
1768 void OSDService::set_ready_to_merge_target(PG
*pg
,
1770 epoch_t last_epoch_started
,
1771 epoch_t last_epoch_clean
)
1773 std::lock_guard
l(merge_lock
);
1774 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1775 ready_to_merge_target
.insert(make_pair(pg
->pg_id
.pgid
,
1778 last_epoch_clean
)));
1779 assert(not_ready_to_merge_target
.count(pg
->pg_id
.pgid
) == 0);
1780 _send_ready_to_merge();
1783 void OSDService::set_not_ready_to_merge_source(pg_t source
)
1785 std::lock_guard
l(merge_lock
);
1786 dout(10) << __func__
<< " " << source
<< dendl
;
1787 not_ready_to_merge_source
.insert(source
);
1788 assert(ready_to_merge_source
.count(source
) == 0);
1789 _send_ready_to_merge();
1792 void OSDService::set_not_ready_to_merge_target(pg_t target
, pg_t source
)
1794 std::lock_guard
l(merge_lock
);
1795 dout(10) << __func__
<< " " << target
<< " source " << source
<< dendl
;
1796 not_ready_to_merge_target
[target
] = source
;
1797 assert(ready_to_merge_target
.count(target
) == 0);
1798 _send_ready_to_merge();
1801 void OSDService::send_ready_to_merge()
1803 std::lock_guard
l(merge_lock
);
1804 _send_ready_to_merge();
1807 void OSDService::_send_ready_to_merge()
1809 dout(20) << __func__
1810 << " ready_to_merge_source " << ready_to_merge_source
1811 << " not_ready_to_merge_source " << not_ready_to_merge_source
1812 << " ready_to_merge_target " << ready_to_merge_target
1813 << " not_ready_to_merge_target " << not_ready_to_merge_target
1814 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1816 for (auto src
: not_ready_to_merge_source
) {
1817 if (sent_ready_to_merge_source
.count(src
) == 0) {
1818 monc
->send_mon_message(new MOSDPGReadyToMerge(
1822 osdmap
->get_epoch()));
1823 sent_ready_to_merge_source
.insert(src
);
1826 for (auto p
: not_ready_to_merge_target
) {
1827 if (sent_ready_to_merge_source
.count(p
.second
) == 0) {
1828 monc
->send_mon_message(new MOSDPGReadyToMerge(
1832 osdmap
->get_epoch()));
1833 sent_ready_to_merge_source
.insert(p
.second
);
1836 for (auto src
: ready_to_merge_source
) {
1837 if (not_ready_to_merge_source
.count(src
.first
) ||
1838 not_ready_to_merge_target
.count(src
.first
.get_parent())) {
1841 auto p
= ready_to_merge_target
.find(src
.first
.get_parent());
1842 if (p
!= ready_to_merge_target
.end() &&
1843 sent_ready_to_merge_source
.count(src
.first
) == 0) {
1844 monc
->send_mon_message(new MOSDPGReadyToMerge(
1845 src
.first
, // source pgid
1846 src
.second
, // src version
1847 std::get
<0>(p
->second
), // target version
1848 std::get
<1>(p
->second
), // PG's last_epoch_started
1849 std::get
<2>(p
->second
), // PG's last_epoch_clean
1851 osdmap
->get_epoch()));
1852 sent_ready_to_merge_source
.insert(src
.first
);
1857 void OSDService::clear_ready_to_merge(PG
*pg
)
1859 std::lock_guard
l(merge_lock
);
1860 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1861 ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
1862 ready_to_merge_target
.erase(pg
->pg_id
.pgid
);
1863 not_ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
1864 not_ready_to_merge_target
.erase(pg
->pg_id
.pgid
);
1865 sent_ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
1868 void OSDService::clear_sent_ready_to_merge()
1870 std::lock_guard
l(merge_lock
);
1871 sent_ready_to_merge_source
.clear();
1874 void OSDService::prune_sent_ready_to_merge(const OSDMapRef
& osdmap
)
1876 std::lock_guard
l(merge_lock
);
1877 auto i
= sent_ready_to_merge_source
.begin();
1878 while (i
!= sent_ready_to_merge_source
.end()) {
1879 if (!osdmap
->pg_exists(*i
)) {
1880 dout(10) << __func__
<< " " << *i
<< dendl
;
1881 i
= sent_ready_to_merge_source
.erase(i
);
1890 void OSDService::_queue_for_recovery(
1891 std::pair
<epoch_t
, PGRef
> p
,
1892 uint64_t reserved_pushes
)
1894 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock
));
1897 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1899 p
.second
->get_pgid(), p
.first
, reserved_pushes
)),
1900 cct
->_conf
->osd_recovery_cost
,
1901 cct
->_conf
->osd_recovery_priority
,
1907 // ====================================================================
1911 #define dout_prefix *_dout
1913 // Commands shared between OSD's console and admin console:
1915 namespace osd_cmds
{
1917 int heap(CephContext
& cct
, const cmdmap_t
& cmdmap
, Formatter
& f
, std::ostream
& os
);
1919 }} // namespace ceph::osd_cmds
1921 int OSD::mkfs(CephContext
*cct
, ObjectStore
*store
, uuid_d fsid
, int whoami
, string osdspec_affinity
)
1927 ObjectStore::CollectionHandle ch
;
1929 // if we are fed a uuid for this osd, use it.
1930 store
->set_fsid(cct
->_conf
->osd_uuid
);
1932 ret
= store
->mkfs();
1934 derr
<< "OSD::mkfs: ObjectStore::mkfs failed with error "
1935 << cpp_strerror(ret
) << dendl
;
1939 store
->set_cache_shards(1); // doesn't matter for mkfs!
1941 ret
= store
->mount();
1943 derr
<< "OSD::mkfs: couldn't mount ObjectStore: error "
1944 << cpp_strerror(ret
) << dendl
;
1948 ch
= store
->open_collection(coll_t::meta());
1950 ret
= store
->read(ch
, OSD_SUPERBLOCK_GOBJECT
, 0, 0, sbbl
);
1952 derr
<< "OSD::mkfs: have meta collection but no superblock" << dendl
;
1955 /* if we already have superblock, check content of superblock */
1956 dout(0) << " have superblock" << dendl
;
1957 auto p
= sbbl
.cbegin();
1959 if (whoami
!= sb
.whoami
) {
1960 derr
<< "provided osd id " << whoami
<< " != superblock's " << sb
.whoami
1965 if (fsid
!= sb
.cluster_fsid
) {
1966 derr
<< "provided cluster fsid " << fsid
1967 << " != superblock's " << sb
.cluster_fsid
<< dendl
;
1972 // create superblock
1973 sb
.cluster_fsid
= fsid
;
1974 sb
.osd_fsid
= store
->get_fsid();
1976 sb
.compat_features
= get_osd_initial_compat_set();
1981 ObjectStore::CollectionHandle ch
= store
->create_new_collection(
1983 ObjectStore::Transaction t
;
1984 t
.create_collection(coll_t::meta(), 0);
1985 t
.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT
, 0, bl
.length(), bl
);
1986 ret
= store
->queue_transaction(ch
, std::move(t
));
1988 derr
<< "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
1989 << "queue_transaction returned " << cpp_strerror(ret
) << dendl
;
1995 ret
= write_meta(cct
, store
, sb
.cluster_fsid
, sb
.osd_fsid
, whoami
, osdspec_affinity
);
1997 derr
<< "OSD::mkfs: failed to write fsid file: error "
1998 << cpp_strerror(ret
) << dendl
;
2012 int OSD::write_meta(CephContext
*cct
, ObjectStore
*store
, uuid_d
& cluster_fsid
, uuid_d
& osd_fsid
, int whoami
, string
& osdspec_affinity
)
2017 snprintf(val
, sizeof(val
), "%s", CEPH_OSD_ONDISK_MAGIC
);
2018 r
= store
->write_meta("magic", val
);
2022 snprintf(val
, sizeof(val
), "%d", whoami
);
2023 r
= store
->write_meta("whoami", val
);
2027 cluster_fsid
.print(val
);
2028 r
= store
->write_meta("ceph_fsid", val
);
2032 string key
= cct
->_conf
.get_val
<string
>("key");
2034 r
= store
->write_meta("osd_key", key
);
2038 string keyfile
= cct
->_conf
.get_val
<string
>("keyfile");
2039 if (!keyfile
.empty()) {
2042 r
= keybl
.read_file(keyfile
.c_str(), &err
);
2044 derr
<< __func__
<< " failed to read keyfile " << keyfile
<< ": "
2045 << err
<< ": " << cpp_strerror(r
) << dendl
;
2048 r
= store
->write_meta("osd_key", keybl
.to_str());
2053 if (!osdspec_affinity
.empty()) {
2054 r
= store
->write_meta("osdspec_affinity", osdspec_affinity
.c_str());
2059 r
= store
->write_meta("ready", "ready");
2066 int OSD::peek_meta(ObjectStore
*store
,
2068 uuid_d
*cluster_fsid
,
2071 ceph_release_t
*require_osd_release
)
2075 int r
= store
->read_meta("magic", &val
);
2080 r
= store
->read_meta("whoami", &val
);
2083 *whoami
= atoi(val
.c_str());
2085 r
= store
->read_meta("ceph_fsid", &val
);
2088 r
= cluster_fsid
->parse(val
.c_str());
2092 r
= store
->read_meta("fsid", &val
);
2094 *osd_fsid
= uuid_d();
2096 r
= osd_fsid
->parse(val
.c_str());
2101 r
= store
->read_meta("require_osd_release", &val
);
2103 *require_osd_release
= ceph_release_from_name(val
);
2111 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2115 OSD::OSD(CephContext
*cct_
, ObjectStore
*store_
,
2117 Messenger
*internal_messenger
,
2118 Messenger
*external_messenger
,
2119 Messenger
*hb_client_front
,
2120 Messenger
*hb_client_back
,
2121 Messenger
*hb_front_serverm
,
2122 Messenger
*hb_back_serverm
,
2123 Messenger
*osdc_messenger
,
2125 const std::string
&dev
, const std::string
&jdev
) :
2127 tick_timer(cct
, osd_lock
),
2128 tick_timer_without_osd_lock(cct
, tick_timer_lock
),
2129 gss_ktfile_client(cct
->_conf
.get_val
<std::string
>("gss_ktab_client_file")),
2130 cluster_messenger(internal_messenger
),
2131 client_messenger(external_messenger
),
2132 objecter_messenger(osdc_messenger
),
2134 mgrc(cct_
, client_messenger
, &mc
->monmap
),
2136 recoverystate_perf(NULL
),
2138 log_client(cct
, client_messenger
, &mc
->monmap
, LogClient::NO_FLAGS
),
2139 clog(log_client
.create_channel()),
2141 dev_path(dev
), journal_path(jdev
),
2142 store_is_rotational(store
->is_rotational()),
2143 trace_endpoint("0.0.0.0", 0, "osd"),
2145 m_osd_pg_epoch_max_lag_factor(cct
->_conf
.get_val
<double>(
2146 "osd_pg_epoch_max_lag_factor")),
2147 osd_compat(get_osd_compat_set()),
2148 osd_op_tp(cct
, "OSD::osd_op_tp", "tp_osd_tp",
2149 get_num_op_threads()),
2150 heartbeat_stop(false),
2151 heartbeat_need_update(true),
2152 hb_front_client_messenger(hb_client_front
),
2153 hb_back_client_messenger(hb_client_back
),
2154 hb_front_server_messenger(hb_front_serverm
),
2155 hb_back_server_messenger(hb_back_serverm
),
2157 heartbeat_thread(this),
2158 heartbeat_dispatcher(this),
2159 op_tracker(cct
, cct
->_conf
->osd_enable_op_tracker
,
2160 cct
->_conf
->osd_num_op_tracker_shard
),
2161 test_ops_hook(NULL
),
2164 cct
->_conf
->osd_op_thread_timeout
,
2165 cct
->_conf
->osd_op_thread_suicide_timeout
,
2167 last_pg_create_epoch(0),
2170 requested_full_first(0),
2171 requested_full_last(0),
2175 if (!gss_ktfile_client
.empty()) {
2176 // Assert we can export environment variable
2178 The default client keytab is used, if it is present and readable,
2179 to automatically obtain initial credentials for GSSAPI client
2180 applications. The principal name of the first entry in the client
2181 keytab is used by default when obtaining initial credentials.
2182 1. The KRB5_CLIENT_KTNAME environment variable.
2183 2. The default_client_keytab_name profile variable in [libdefaults].
2184 3. The hardcoded default, DEFCKTNAME.
2186 const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
2187 gss_ktfile_client
.c_str(), 1));
2188 ceph_assert(set_result
== 0);
2191 monc
->set_messenger(client_messenger
);
2192 op_tracker
.set_complaint_and_threshold(cct
->_conf
->osd_op_complaint_time
,
2193 cct
->_conf
->osd_op_log_threshold
);
2194 op_tracker
.set_history_size_and_duration(cct
->_conf
->osd_op_history_size
,
2195 cct
->_conf
->osd_op_history_duration
);
2196 op_tracker
.set_history_slow_op_size_and_threshold(cct
->_conf
->osd_op_history_slow_op_size
,
2197 cct
->_conf
->osd_op_history_slow_op_threshold
);
2198 ObjectCleanRegions::set_max_num_intervals(cct
->_conf
->osd_object_clean_region_max_num_intervals
);
2200 std::stringstream ss
;
2201 ss
<< "osd." << whoami
;
2202 trace_endpoint
.copy_name(ss
.str());
2205 // initialize shards
2206 num_shards
= get_num_op_shards();
2207 for (uint32_t i
= 0; i
< num_shards
; i
++) {
2208 OSDShard
*one_shard
= new OSDShard(
2212 shards
.push_back(one_shard
);
2218 while (!shards
.empty()) {
2219 delete shards
.back();
2222 cct
->get_perfcounters_collection()->remove(recoverystate_perf
);
2223 cct
->get_perfcounters_collection()->remove(logger
);
2224 delete recoverystate_perf
;
2229 double OSD::get_tick_interval() const
2231 // vary +/- 5% to avoid scrub scheduling livelocks
2232 constexpr auto delta
= 0.05;
2233 return (OSD_TICK_INTERVAL
*
2234 ceph::util::generate_random_number(1.0 - delta
, 1.0 + delta
));
2237 void OSD::handle_signal(int signum
)
2239 ceph_assert(signum
== SIGINT
|| signum
== SIGTERM
);
2240 derr
<< "*** Got signal " << sig_str(signum
) << " ***" << dendl
;
2246 std::lock_guard
lock(osd_lock
);
2250 if (store
->test_mount_in_use()) {
2251 derr
<< "OSD::pre_init: object store '" << dev_path
<< "' is "
2252 << "currently in use. (Is ceph-osd already running?)" << dendl
;
2256 cct
->_conf
.add_observer(this);
2260 int OSD::set_numa_affinity()
2262 // storage numa node
2263 int store_node
= -1;
2264 store
->get_numa_node(&store_node
, nullptr, nullptr);
2265 if (store_node
>= 0) {
2266 dout(1) << __func__
<< " storage numa node " << store_node
<< dendl
;
2269 // check network numa node(s)
2270 int front_node
= -1, back_node
= -1;
2271 string front_iface
= pick_iface(
2273 client_messenger
->get_myaddrs().front().get_sockaddr_storage());
2274 string back_iface
= pick_iface(
2276 cluster_messenger
->get_myaddrs().front().get_sockaddr_storage());
2277 int r
= get_iface_numa_node(front_iface
, &front_node
);
2278 if (r
>= 0 && front_node
>= 0) {
2279 dout(1) << __func__
<< " public network " << front_iface
<< " numa node "
2280 << front_node
<< dendl
;
2281 r
= get_iface_numa_node(back_iface
, &back_node
);
2282 if (r
>= 0 && back_node
>= 0) {
2283 dout(1) << __func__
<< " cluster network " << back_iface
<< " numa node "
2284 << back_node
<< dendl
;
2285 if (front_node
== back_node
&&
2286 front_node
== store_node
) {
2287 dout(1) << " objectstore and network numa nodes all match" << dendl
;
2288 if (g_conf().get_val
<bool>("osd_numa_auto_affinity")) {
2289 numa_node
= front_node
;
2291 } else if (front_node
!= back_node
) {
2292 dout(1) << __func__
<< " public and cluster network numa nodes do not match"
2295 dout(1) << __func__
<< " objectstore and network numa nodes do not match"
2298 } else if (back_node
== -2) {
2299 dout(1) << __func__
<< " cluster network " << back_iface
2300 << " ports numa nodes do not match" << dendl
;
2302 derr
<< __func__
<< " unable to identify cluster interface '" << back_iface
2303 << "' numa node: " << cpp_strerror(r
) << dendl
;
2305 } else if (front_node
== -2) {
2306 dout(1) << __func__
<< " public network " << front_iface
2307 << " ports numa nodes do not match" << dendl
;
2309 derr
<< __func__
<< " unable to identify public interface '" << front_iface
2310 << "' numa node: " << cpp_strerror(r
) << dendl
;
2312 if (int node
= g_conf().get_val
<int64_t>("osd_numa_node"); node
>= 0) {
2313 // this takes precedence over the automagic logic above
2316 if (numa_node
>= 0) {
2317 int r
= get_numa_node_cpu_set(numa_node
, &numa_cpu_set_size
, &numa_cpu_set
);
2319 dout(1) << __func__
<< " unable to determine numa node " << numa_node
2320 << " CPUs" << dendl
;
2323 dout(1) << __func__
<< " setting numa affinity to node " << numa_node
2325 << cpu_set_to_str_list(numa_cpu_set_size
, &numa_cpu_set
)
2327 r
= set_cpu_affinity_all_threads(numa_cpu_set_size
, &numa_cpu_set
);
2330 derr
<< __func__
<< " failed to set numa affinity: " << cpp_strerror(r
)
2336 dout(1) << __func__
<< " not setting numa affinity" << dendl
;
2343 class OSDSocketHook
: public AdminSocketHook
{
2346 explicit OSDSocketHook(OSD
*o
) : osd(o
) {}
2347 int call(std::string_view prefix
, const cmdmap_t
& cmdmap
,
2350 bufferlist
& out
) override
{
2351 ceph_abort("should use async hook");
2354 std::string_view prefix
,
2355 const cmdmap_t
& cmdmap
,
2357 const bufferlist
& inbl
,
2358 std::function
<void(int,const std::string
&,bufferlist
&)> on_finish
) override
{
2360 osd
->asok_command(prefix
, cmdmap
, f
, inbl
, on_finish
);
2361 } catch (const TOPNSPC::common::bad_cmd_get
& e
) {
2363 on_finish(-EINVAL
, e
.what(), empty
);
2368 std::set
<int64_t> OSD::get_mapped_pools()
2370 std::set
<int64_t> pools
;
2371 std::vector
<spg_t
> pgids
;
2373 for (const auto &pgid
: pgids
) {
2374 pools
.insert(pgid
.pool());
2379 void OSD::asok_command(
2380 std::string_view prefix
, const cmdmap_t
& cmdmap
,
2382 const bufferlist
& inbl
,
2383 std::function
<void(int,const std::string
&,bufferlist
&)> on_finish
)
2386 stringstream ss
; // stderr error message stream
2387 bufferlist outbl
; // if empty at end, we'll dump formatter as output
2389 // --- PG commands are routed here to PG::do_command ---
2390 if (prefix
== "pg" ||
2391 prefix
== "query" ||
2392 prefix
== "mark_unfound_lost" ||
2393 prefix
== "list_unfound" ||
2394 prefix
== "scrub" ||
2395 prefix
== "deep_scrub"
2399 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
2400 ss
<< "no pgid specified";
2404 if (!pgid
.parse(pgidstr
.c_str())) {
2405 ss
<< "couldn't parse pgid '" << pgidstr
<< "'";
2411 if (get_osdmap()->get_primary_shard(pgid
, &pcand
) &&
2412 (pg
= _lookup_lock_pg(pcand
))) {
2413 if (pg
->is_primary()) {
2414 cmdmap_t new_cmdmap
= cmdmap
;
2416 pg
->do_command(prefix
, new_cmdmap
, inbl
, on_finish
);
2418 return; // the pg handler calls on_finish directly
2419 } catch (const TOPNSPC::common::bad_cmd_get
& e
) {
2426 ss
<< "not primary for pgid " << pgid
;
2427 // do not reply; they will get newer maps and realize they
2434 ss
<< "i don't have pgid " << pgid
;
2439 // --- OSD commands follow ---
2441 else if (prefix
== "status") {
2442 lock_guard
l(osd_lock
);
2443 f
->open_object_section("status");
2444 f
->dump_stream("cluster_fsid") << superblock
.cluster_fsid
;
2445 f
->dump_stream("osd_fsid") << superblock
.osd_fsid
;
2446 f
->dump_unsigned("whoami", superblock
.whoami
);
2447 f
->dump_string("state", get_state_name(get_state()));
2448 f
->dump_unsigned("oldest_map", superblock
.oldest_map
);
2449 f
->dump_unsigned("newest_map", superblock
.newest_map
);
2450 f
->dump_unsigned("num_pgs", num_pgs
);
2452 } else if (prefix
== "flush_journal") {
2453 store
->flush_journal();
2454 } else if (prefix
== "dump_ops_in_flight" ||
2456 prefix
== "dump_blocked_ops" ||
2457 prefix
== "dump_historic_ops" ||
2458 prefix
== "dump_historic_ops_by_duration" ||
2459 prefix
== "dump_historic_slow_ops") {
2461 const string error_str
= "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2462 even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2463 will start to track new ops received afterwards.";
2465 set
<string
> filters
;
2466 vector
<string
> filter_str
;
2467 if (cmd_getval(cmdmap
, "filterstr", filter_str
)) {
2468 copy(filter_str
.begin(), filter_str
.end(),
2469 inserter(filters
, filters
.end()));
2472 if (prefix
== "dump_ops_in_flight" ||
2474 if (!op_tracker
.dump_ops_in_flight(f
, false, filters
)) {
2480 if (prefix
== "dump_blocked_ops") {
2481 if (!op_tracker
.dump_ops_in_flight(f
, true, filters
)) {
2487 if (prefix
== "dump_historic_ops") {
2488 if (!op_tracker
.dump_historic_ops(f
, false, filters
)) {
2494 if (prefix
== "dump_historic_ops_by_duration") {
2495 if (!op_tracker
.dump_historic_ops(f
, true, filters
)) {
2501 if (prefix
== "dump_historic_slow_ops") {
2502 if (!op_tracker
.dump_historic_slow_ops(f
, filters
)) {
2508 } else if (prefix
== "dump_op_pq_state") {
2509 f
->open_object_section("pq");
2510 op_shardedwq
.dump(f
);
2512 } else if (prefix
== "dump_blacklist") {
2513 list
<pair
<entity_addr_t
,utime_t
> > bl
;
2514 OSDMapRef curmap
= service
.get_osdmap();
2516 f
->open_array_section("blacklist");
2517 curmap
->get_blacklist(&bl
);
2518 for (list
<pair
<entity_addr_t
,utime_t
> >::iterator it
= bl
.begin();
2519 it
!= bl
.end(); ++it
) {
2520 f
->open_object_section("entry");
2521 f
->open_object_section("entity_addr_t");
2523 f
->close_section(); //entity_addr_t
2524 it
->second
.localtime(f
->dump_stream("expire_time"));
2525 f
->close_section(); //entry
2527 f
->close_section(); //blacklist
2528 } else if (prefix
== "dump_watchers") {
2529 list
<obj_watch_item_t
> watchers
;
2533 for (auto& pg
: pgs
) {
2534 list
<obj_watch_item_t
> pg_watchers
;
2535 pg
->get_watchers(&pg_watchers
);
2536 watchers
.splice(watchers
.end(), pg_watchers
);
2539 f
->open_array_section("watchers");
2540 for (list
<obj_watch_item_t
>::iterator it
= watchers
.begin();
2541 it
!= watchers
.end(); ++it
) {
2543 f
->open_object_section("watch");
2545 f
->dump_string("namespace", it
->obj
.nspace
);
2546 f
->dump_string("object", it
->obj
.oid
.name
);
2548 f
->open_object_section("entity_name");
2549 it
->wi
.name
.dump(f
);
2550 f
->close_section(); //entity_name_t
2552 f
->dump_unsigned("cookie", it
->wi
.cookie
);
2553 f
->dump_unsigned("timeout", it
->wi
.timeout_seconds
);
2555 f
->open_object_section("entity_addr_t");
2556 it
->wi
.addr
.dump(f
);
2557 f
->close_section(); //entity_addr_t
2559 f
->close_section(); //watch
2562 f
->close_section(); //watchers
2563 } else if (prefix
== "dump_recovery_reservations") {
2564 f
->open_object_section("reservations");
2565 f
->open_object_section("local_reservations");
2566 service
.local_reserver
.dump(f
);
2568 f
->open_object_section("remote_reservations");
2569 service
.remote_reserver
.dump(f
);
2572 } else if (prefix
== "dump_scrub_reservations") {
2573 f
->open_object_section("scrub_reservations");
2574 service
.dump_scrub_reservations(f
);
2576 } else if (prefix
== "get_latest_osdmap") {
2577 get_latest_osdmap();
2578 } else if (prefix
== "set_heap_property") {
2582 bool success
= false;
2583 if (!cmd_getval(cmdmap
, "property", property
)) {
2584 error
= "unable to get property";
2586 } else if (!cmd_getval(cmdmap
, "value", value
)) {
2587 error
= "unable to get value";
2589 } else if (value
< 0) {
2590 error
= "negative value not allowed";
2592 } else if (!ceph_heap_set_numeric_property(property
.c_str(), (size_t)value
)) {
2593 error
= "invalid property";
2598 f
->open_object_section("result");
2599 f
->dump_string("error", error
);
2600 f
->dump_bool("success", success
);
2602 } else if (prefix
== "get_heap_property") {
2606 bool success
= false;
2607 if (!cmd_getval(cmdmap
, "property", property
)) {
2608 error
= "unable to get property";
2610 } else if (!ceph_heap_get_numeric_property(property
.c_str(), &value
)) {
2611 error
= "invalid property";
2616 f
->open_object_section("result");
2617 f
->dump_string("error", error
);
2618 f
->dump_bool("success", success
);
2619 f
->dump_int("value", value
);
2621 } else if (prefix
== "dump_objectstore_kv_stats") {
2622 store
->get_db_statistics(f
);
2623 } else if (prefix
== "dump_scrubs") {
2624 service
.dumps_scrub(f
);
2625 } else if (prefix
== "calc_objectstore_db_histogram") {
2626 store
->generate_db_histogram(f
);
2627 } else if (prefix
== "flush_store_cache") {
2628 store
->flush_cache(&ss
);
2629 } else if (prefix
== "dump_pgstate_history") {
2630 f
->open_object_section("pgstate_history");
2631 f
->open_array_section("pgs");
2634 for (auto& pg
: pgs
) {
2635 f
->open_object_section("pg");
2636 f
->dump_stream("pg") << pg
->pg_id
;
2637 f
->dump_string("currently", pg
->get_current_state());
2638 pg
->dump_pgstate_history(f
);
2643 } else if (prefix
== "compact") {
2644 dout(1) << "triggering manual compaction" << dendl
;
2645 auto start
= ceph::coarse_mono_clock::now();
2647 auto end
= ceph::coarse_mono_clock::now();
2648 double duration
= std::chrono::duration
<double>(end
-start
).count();
2649 dout(1) << "finished manual compaction in "
2651 << " seconds" << dendl
;
2652 f
->open_object_section("compact_result");
2653 f
->dump_float("elapsed_time", duration
);
2655 } else if (prefix
== "get_mapped_pools") {
2656 f
->open_array_section("mapped_pools");
2657 set
<int64_t> poollist
= get_mapped_pools();
2658 for (auto pool
: poollist
) {
2659 f
->dump_int("pool_id", pool
);
2662 } else if (prefix
== "smart") {
2664 cmd_getval(cmdmap
, "devid", devid
);
2666 probe_smart(devid
, out
);
2667 outbl
.append(out
.str());
2668 } else if (prefix
== "list_devices") {
2669 set
<string
> devnames
;
2670 store
->get_devices(&devnames
);
2671 f
->open_array_section("list_devices");
2672 for (auto dev
: devnames
) {
2673 if (dev
.find("dm-") == 0) {
2677 f
->open_object_section("device");
2678 f
->dump_string("device", "/dev/" + dev
);
2679 f
->dump_string("device_id", get_device_id(dev
, &err
));
2683 } else if (prefix
== "send_beacon") {
2684 lock_guard
l(osd_lock
);
2686 send_beacon(ceph::coarse_mono_clock::now());
2690 else if (prefix
== "cluster_log") {
2692 cmd_getval(cmdmap
, "message", msg
);
2695 ss
<< "ignoring empty log message";
2698 string message
= msg
.front();
2699 for (vector
<string
>::iterator a
= ++msg
.begin(); a
!= msg
.end(); ++a
)
2700 message
+= " " + *a
;
2702 cmd_getval(cmdmap
, "level", lvl
);
2703 clog_type level
= string_to_clog_type(lvl
);
2706 ss
<< "unknown level '" << lvl
<< "'";
2709 clog
->do_log(level
, message
);
2712 else if (prefix
== "bench") {
2715 int64_t osize
, onum
;
2716 // default count 1G, size 4MB
2717 cmd_getval(cmdmap
, "count", count
, (int64_t)1 << 30);
2718 cmd_getval(cmdmap
, "size", bsize
, (int64_t)4 << 20);
2719 cmd_getval(cmdmap
, "object_size", osize
, (int64_t)0);
2720 cmd_getval(cmdmap
, "object_num", onum
, (int64_t)0);
2722 uint32_t duration
= cct
->_conf
->osd_bench_duration
;
2724 if (bsize
> (int64_t) cct
->_conf
->osd_bench_max_block_size
) {
2725 // let us limit the block size because the next checks rely on it
2726 // having a sane value. If we allow any block size to be set things
2727 // can still go sideways.
2728 ss
<< "block 'size' values are capped at "
2729 << byte_u_t(cct
->_conf
->osd_bench_max_block_size
) << ". If you wish to use"
2730 << " a higher value, please adjust 'osd_bench_max_block_size'";
2733 } else if (bsize
< (int64_t) (1 << 20)) {
2734 // entering the realm of small block sizes.
2735 // limit the count to a sane value, assuming a configurable amount of
2736 // IOPS and duration, so that the OSD doesn't get hung up on this,
2737 // preventing timeouts from going off
2739 bsize
* duration
* cct
->_conf
->osd_bench_small_size_max_iops
;
2740 if (count
> max_count
) {
2741 ss
<< "'count' values greater than " << max_count
2742 << " for a block size of " << byte_u_t(bsize
) << ", assuming "
2743 << cct
->_conf
->osd_bench_small_size_max_iops
<< " IOPS,"
2744 << " for " << duration
<< " seconds,"
2745 << " can cause ill effects on osd. "
2746 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
2747 << " value if you wish to use a higher 'count'.";
2752 // 1MB block sizes are big enough so that we get more stuff done.
2753 // However, to avoid the osd from getting hung on this and having
2754 // timers being triggered, we are going to limit the count assuming
2755 // a configurable throughput and duration.
2756 // NOTE: max_count is the total amount of bytes that we believe we
2757 // will be able to write during 'duration' for the given
2758 // throughput. The block size hardly impacts this unless it's
2759 // way too big. Given we already check how big the block size
2760 // is, it's safe to assume everything will check out.
2762 cct
->_conf
->osd_bench_large_size_max_throughput
* duration
;
2763 if (count
> max_count
) {
2764 ss
<< "'count' values greater than " << max_count
2765 << " for a block size of " << byte_u_t(bsize
) << ", assuming "
2766 << byte_u_t(cct
->_conf
->osd_bench_large_size_max_throughput
) << "/s,"
2767 << " for " << duration
<< " seconds,"
2768 << " can cause ill effects on osd. "
2769 << " Please adjust 'osd_bench_large_size_max_throughput'"
2770 << " with a higher value if you wish to use a higher 'count'.";
2776 if (osize
&& bsize
> osize
)
2779 dout(1) << " bench count " << count
2780 << " bsize " << byte_u_t(bsize
) << dendl
;
2782 ObjectStore::Transaction cleanupt
;
2784 if (osize
&& onum
) {
2786 bufferptr
bp(osize
);
2788 bl
.push_back(std::move(bp
));
2789 bl
.rebuild_page_aligned();
2790 for (int i
=0; i
<onum
; ++i
) {
2792 snprintf(nm
, sizeof(nm
), "disk_bw_test_%d", i
);
2794 hobject_t
soid(sobject_t(oid
, 0));
2795 ObjectStore::Transaction t
;
2796 t
.write(coll_t(), ghobject_t(soid
), 0, osize
, bl
);
2797 store
->queue_transaction(service
.meta_ch
, std::move(t
), NULL
);
2798 cleanupt
.remove(coll_t(), ghobject_t(soid
));
2803 bufferptr
bp(bsize
);
2805 bl
.push_back(std::move(bp
));
2806 bl
.rebuild_page_aligned();
2810 if (!service
.meta_ch
->flush_commit(&waiter
)) {
2815 utime_t start
= ceph_clock_now();
2816 for (int64_t pos
= 0; pos
< count
; pos
+= bsize
) {
2818 unsigned offset
= 0;
2819 if (onum
&& osize
) {
2820 snprintf(nm
, sizeof(nm
), "disk_bw_test_%d", (int)(rand() % onum
));
2821 offset
= rand() % (osize
/ bsize
) * bsize
;
2823 snprintf(nm
, sizeof(nm
), "disk_bw_test_%lld", (long long)pos
);
2826 hobject_t
soid(sobject_t(oid
, 0));
2827 ObjectStore::Transaction t
;
2828 t
.write(coll_t::meta(), ghobject_t(soid
), offset
, bsize
, bl
);
2829 store
->queue_transaction(service
.meta_ch
, std::move(t
), NULL
);
2830 if (!onum
|| !osize
)
2831 cleanupt
.remove(coll_t::meta(), ghobject_t(soid
));
2836 if (!service
.meta_ch
->flush_commit(&waiter
)) {
2840 utime_t end
= ceph_clock_now();
2843 store
->queue_transaction(service
.meta_ch
, std::move(cleanupt
), NULL
);
2846 if (!service
.meta_ch
->flush_commit(&waiter
)) {
2851 double elapsed
= end
- start
;
2852 double rate
= count
/ elapsed
;
2853 double iops
= rate
/ bsize
;
2854 f
->open_object_section("osd_bench_results");
2855 f
->dump_int("bytes_written", count
);
2856 f
->dump_int("blocksize", bsize
);
2857 f
->dump_float("elapsed_sec", elapsed
);
2858 f
->dump_float("bytes_per_sec", rate
);
2859 f
->dump_float("iops", iops
);
2863 else if (prefix
== "flush_pg_stats") {
2864 mgrc
.send_pgstats();
2865 f
->dump_unsigned("stat_seq", service
.get_osd_stat_seq());
2868 else if (prefix
== "heap") {
2869 ret
= ceph::osd_cmds::heap(*cct
, cmdmap
, *f
, ss
);
2872 else if (prefix
== "debug dump_missing") {
2873 f
->open_array_section("pgs");
2876 for (auto& pg
: pgs
) {
2877 string s
= stringify(pg
->pg_id
);
2878 f
->open_array_section(s
.c_str());
2880 pg
->dump_missing(f
);
2887 else if (prefix
== "debug kick_recovery_wq") {
2889 cmd_getval(cmdmap
, "delay", delay
);
2892 ret
= cct
->_conf
.set_val("osd_recovery_delay_start", oss
.str().c_str());
2894 ss
<< "kick_recovery_wq: error setting "
2895 << "osd_recovery_delay_start to '" << delay
<< "': error "
2899 cct
->_conf
.apply_changes(nullptr);
2900 ss
<< "kicking recovery queue. set osd_recovery_delay_start "
2901 << "to " << cct
->_conf
->osd_recovery_delay_start
;
2904 else if (prefix
== "cpu_profiler") {
2907 cmd_getval(cmdmap
, "arg", arg
);
2908 vector
<string
> argvec
;
2909 get_str_vec(arg
, argvec
);
2910 cpu_profiler_handle_command(argvec
, ds
);
2911 outbl
.append(ds
.str());
2914 else if (prefix
== "dump_pg_recovery_stats") {
2915 lock_guard
l(osd_lock
);
2916 pg_recovery_stats
.dump_formatted(f
);
2919 else if (prefix
== "reset_pg_recovery_stats") {
2920 lock_guard
l(osd_lock
);
2921 pg_recovery_stats
.reset();
2924 else if (prefix
== "perf histogram dump") {
2926 std::string counter
;
2927 cmd_getval(cmdmap
, "logger", logger
);
2928 cmd_getval(cmdmap
, "counter", counter
);
2929 cct
->get_perfcounters_collection()->dump_formatted_histograms(
2930 f
, false, logger
, counter
);
2933 else if (prefix
== "cache drop") {
2934 lock_guard
l(osd_lock
);
2935 dout(20) << "clearing all caches" << dendl
;
2936 // Clear the objectstore's cache - onode and buffer for Bluestore,
2937 // system's pagecache for Filestore
2938 ret
= store
->flush_cache(&ss
);
2940 ss
<< "Error flushing objectstore cache: " << cpp_strerror(ret
);
2943 // Clear the objectcontext cache (per PG)
2946 for (auto& pg
: pgs
) {
2951 else if (prefix
== "cache status") {
2952 lock_guard
l(osd_lock
);
2953 int obj_ctx_count
= 0;
2956 for (auto& pg
: pgs
) {
2957 obj_ctx_count
+= pg
->get_cache_obj_count();
2959 f
->open_object_section("cache_status");
2960 f
->dump_int("object_ctx", obj_ctx_count
);
2961 store
->dump_cache_stats(f
);
2965 else if (prefix
== "scrub_purged_snaps") {
2966 lock_guard
l(osd_lock
);
2967 scrub_purged_snaps();
2970 else if (prefix
== "dump_osd_network") {
2971 lock_guard
l(osd_lock
);
2973 if (!(cmd_getval(cmdmap
, "value", value
))) {
2974 // Convert milliseconds to microseconds
2975 value
= static_cast<double>(g_conf().get_val
<double>(
2976 "mon_warn_on_slow_ping_time")) * 1000;
2978 double ratio
= g_conf().get_val
<double>("mon_warn_on_slow_ping_ratio");
2979 value
= g_conf().get_val
<int64_t>("osd_heartbeat_grace");
2980 value
*= 1000000 * ratio
; // Seconds of grace to microseconds at ratio
2983 // Convert user input to microseconds
2986 if (value
< 0) value
= 0;
2988 struct osd_ping_time_t
{
2992 std::array
<uint32_t,3> times
;
2993 std::array
<uint32_t,3> min
;
2994 std::array
<uint32_t,3> max
;
2996 uint32_t last_update
;
2998 bool operator<(const osd_ping_time_t
& rhs
) const {
2999 if (pingtime
< rhs
.pingtime
)
3001 if (pingtime
> rhs
.pingtime
)
3011 set
<osd_ping_time_t
> sorted
;
3012 // Get pingtimes under lock and not on the stack
3013 map
<int, osd_stat_t::Interfaces
> *pingtimes
= new map
<int, osd_stat_t::Interfaces
>;
3014 service
.get_hb_pingtime(pingtimes
);
3015 for (auto j
: *pingtimes
) {
3016 if (j
.second
.last_update
== 0)
3018 osd_ping_time_t item
;
3019 item
.pingtime
= std::max(j
.second
.back_pingtime
[0], j
.second
.back_pingtime
[1]);
3020 item
.pingtime
= std::max(item
.pingtime
, j
.second
.back_pingtime
[2]);
3021 if (item
.pingtime
>= value
) {
3023 item
.times
[0] = j
.second
.back_pingtime
[0];
3024 item
.times
[1] = j
.second
.back_pingtime
[1];
3025 item
.times
[2] = j
.second
.back_pingtime
[2];
3026 item
.min
[0] = j
.second
.back_min
[0];
3027 item
.min
[1] = j
.second
.back_min
[1];
3028 item
.min
[2] = j
.second
.back_min
[2];
3029 item
.max
[0] = j
.second
.back_max
[0];
3030 item
.max
[1] = j
.second
.back_max
[1];
3031 item
.max
[2] = j
.second
.back_max
[2];
3032 item
.last
= j
.second
.back_last
;
3034 item
.last_update
= j
.second
.last_update
;
3035 sorted
.emplace(item
);
3037 if (j
.second
.front_last
== 0)
3039 item
.pingtime
= std::max(j
.second
.front_pingtime
[0], j
.second
.front_pingtime
[1]);
3040 item
.pingtime
= std::max(item
.pingtime
, j
.second
.front_pingtime
[2]);
3041 if (item
.pingtime
>= value
) {
3043 item
.times
[0] = j
.second
.front_pingtime
[0];
3044 item
.times
[1] = j
.second
.front_pingtime
[1];
3045 item
.times
[2] = j
.second
.front_pingtime
[2];
3046 item
.min
[0] = j
.second
.front_min
[0];
3047 item
.min
[1] = j
.second
.front_min
[1];
3048 item
.min
[2] = j
.second
.front_min
[2];
3049 item
.max
[0] = j
.second
.front_max
[0];
3050 item
.max
[1] = j
.second
.front_max
[1];
3051 item
.max
[2] = j
.second
.front_max
[2];
3052 item
.last
= j
.second
.front_last
;
3053 item
.last_update
= j
.second
.last_update
;
3055 sorted
.emplace(item
);
3060 // Network ping times (1min 5min 15min)
3061 f
->open_object_section("network_ping_times");
3062 f
->dump_int("threshold", value
/ 1000);
3063 f
->open_array_section("entries");
3064 for (auto &sitem
: boost::adaptors::reverse(sorted
)) {
3065 ceph_assert(sitem
.pingtime
>= value
);
3066 f
->open_object_section("entry");
3068 const time_t lu(sitem
.last_update
);
3070 string
lustr(ctime_r(&lu
, buffer
));
3071 lustr
.pop_back(); // Remove trailing \n
3072 auto stale
= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_stale");
3073 f
->dump_string("last update", lustr
);
3074 f
->dump_bool("stale", ceph_clock_now().sec() - sitem
.last_update
> stale
);
3075 f
->dump_int("from osd", whoami
);
3076 f
->dump_int("to osd", sitem
.to
);
3077 f
->dump_string("interface", (sitem
.back
? "back" : "front"));
3078 f
->open_object_section("average");
3079 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.times
[0],3).c_str());
3080 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.times
[1],3).c_str());
3081 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.times
[2],3).c_str());
3082 f
->close_section(); // average
3083 f
->open_object_section("min");
3084 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.max
[0],3).c_str());
3085 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.max
[1],3).c_str());
3086 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.max
[2],3).c_str());
3087 f
->close_section(); // min
3088 f
->open_object_section("max");
3089 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.max
[0],3).c_str());
3090 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.max
[1],3).c_str());
3091 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.max
[2],3).c_str());
3092 f
->close_section(); // max
3093 f
->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem
.last
,3).c_str());
3094 f
->close_section(); // entry
3096 f
->close_section(); // entries
3097 f
->close_section(); // network_ping_times
3099 ceph_abort_msg("broken asok registration");
3103 on_finish(ret
, ss
.str(), outbl
);
3106 class TestOpsSocketHook
: public AdminSocketHook
{
3107 OSDService
*service
;
3110 TestOpsSocketHook(OSDService
*s
, ObjectStore
*st
) : service(s
), store(st
) {}
3111 int call(std::string_view command
, const cmdmap_t
& cmdmap
,
3113 std::ostream
& errss
,
3114 bufferlist
& out
) override
{
3118 test_ops(service
, store
, command
, cmdmap
, outss
);
3120 } catch (const TOPNSPC::common::bad_cmd_get
& e
) {
3126 void test_ops(OSDService
*service
, ObjectStore
*store
,
3127 std::string_view command
, const cmdmap_t
& cmdmap
, ostream
&ss
);
3131 class OSD::C_Tick
: public Context
{
3134 explicit C_Tick(OSD
*o
) : osd(o
) {}
3135 void finish(int r
) override
{
3140 class OSD::C_Tick_WithoutOSDLock
: public Context
{
3143 explicit C_Tick_WithoutOSDLock(OSD
*o
) : osd(o
) {}
3144 void finish(int r
) override
{
3145 osd
->tick_without_osd_lock();
3149 int OSD::enable_disable_fuse(bool stop
)
3153 string mntpath
= cct
->_conf
->osd_data
+ "/fuse";
3154 if (fuse_store
&& (stop
|| !cct
->_conf
->osd_objectstore_fuse
)) {
3155 dout(1) << __func__
<< " disabling" << dendl
;
3159 r
= ::rmdir(mntpath
.c_str());
3162 derr
<< __func__
<< " failed to rmdir " << mntpath
<< ": "
3163 << cpp_strerror(r
) << dendl
;
3168 if (!fuse_store
&& cct
->_conf
->osd_objectstore_fuse
) {
3169 dout(1) << __func__
<< " enabling" << dendl
;
3170 r
= ::mkdir(mntpath
.c_str(), 0700);
3173 if (r
< 0 && r
!= -EEXIST
) {
3174 derr
<< __func__
<< " unable to create " << mntpath
<< ": "
3175 << cpp_strerror(r
) << dendl
;
3178 fuse_store
= new FuseStore(store
, mntpath
);
3179 r
= fuse_store
->start();
3181 derr
<< __func__
<< " unable to start fuse: " << cpp_strerror(r
) << dendl
;
3187 #endif // HAVE_LIBFUSE
3191 size_t OSD::get_num_cache_shards()
3193 return cct
->_conf
.get_val
<Option::size_t>("osd_num_cache_shards");
3196 int OSD::get_num_op_shards()
3198 if (cct
->_conf
->osd_op_num_shards
)
3199 return cct
->_conf
->osd_op_num_shards
;
3200 if (store_is_rotational
)
3201 return cct
->_conf
->osd_op_num_shards_hdd
;
3203 return cct
->_conf
->osd_op_num_shards_ssd
;
3206 int OSD::get_num_op_threads()
3208 if (cct
->_conf
->osd_op_num_threads_per_shard
)
3209 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard
;
3210 if (store_is_rotational
)
3211 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard_hdd
;
3213 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard_ssd
;
3216 float OSD::get_osd_recovery_sleep()
3218 if (cct
->_conf
->osd_recovery_sleep
)
3219 return cct
->_conf
->osd_recovery_sleep
;
3220 if (!store_is_rotational
&& !journal_is_rotational
)
3221 return cct
->_conf
->osd_recovery_sleep_ssd
;
3222 else if (store_is_rotational
&& !journal_is_rotational
)
3223 return cct
->_conf
.get_val
<double>("osd_recovery_sleep_hybrid");
3225 return cct
->_conf
->osd_recovery_sleep_hdd
;
3228 float OSD::get_osd_delete_sleep()
3230 float osd_delete_sleep
= cct
->_conf
.get_val
<double>("osd_delete_sleep");
3231 if (osd_delete_sleep
> 0)
3232 return osd_delete_sleep
;
3233 if (!store_is_rotational
&& !journal_is_rotational
)
3234 return cct
->_conf
.get_val
<double>("osd_delete_sleep_ssd");
3235 if (store_is_rotational
&& !journal_is_rotational
)
3236 return cct
->_conf
.get_val
<double>("osd_delete_sleep_hybrid");
3237 return cct
->_conf
.get_val
<double>("osd_delete_sleep_hdd");
3240 int OSD::get_recovery_max_active()
3242 if (cct
->_conf
->osd_recovery_max_active
)
3243 return cct
->_conf
->osd_recovery_max_active
;
3244 if (store_is_rotational
)
3245 return cct
->_conf
->osd_recovery_max_active_hdd
;
3247 return cct
->_conf
->osd_recovery_max_active_ssd
;
3250 float OSD::get_osd_snap_trim_sleep()
3252 float osd_snap_trim_sleep
= cct
->_conf
.get_val
<double>("osd_snap_trim_sleep");
3253 if (osd_snap_trim_sleep
> 0)
3254 return osd_snap_trim_sleep
;
3255 if (!store_is_rotational
&& !journal_is_rotational
)
3256 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_ssd");
3257 if (store_is_rotational
&& !journal_is_rotational
)
3258 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_hybrid");
3259 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_hdd");
3265 CompatSet initial
, diff
;
3266 std::lock_guard
lock(osd_lock
);
3271 tick_timer_without_osd_lock
.init();
3272 service
.recovery_request_timer
.init();
3273 service
.sleep_timer
.init();
3275 boot_finisher
.start();
3279 store
->read_meta("require_osd_release", &val
);
3280 last_require_osd_release
= ceph_release_from_name(val
);
3284 dout(2) << "init " << dev_path
3285 << " (looks like " << (store_is_rotational
? "hdd" : "ssd") << ")"
3287 dout(2) << "journal " << journal_path
<< dendl
;
3288 ceph_assert(store
); // call pre_init() first!
3290 store
->set_cache_shards(get_num_cache_shards());
3292 int r
= store
->mount();
3294 derr
<< "OSD:init: unable to mount object store" << dendl
;
3297 journal_is_rotational
= store
->is_journal_rotational();
3298 dout(2) << "journal looks like " << (journal_is_rotational
? "hdd" : "ssd")
3301 enable_disable_fuse(false);
3303 dout(2) << "boot" << dendl
;
3305 service
.meta_ch
= store
->open_collection(coll_t::meta());
3307 // initialize the daily loadavg with current 15min loadavg
3309 if (getloadavg(loadavgs
, 3) == 3) {
3310 daily_loadavg
= loadavgs
[2];
3312 derr
<< "OSD::init() : couldn't read loadavgs\n" << dendl
;
3313 daily_loadavg
= 1.0;
3316 int rotating_auth_attempts
= 0;
3317 auto rotating_auth_timeout
=
3318 g_conf().get_val
<int64_t>("rotating_keys_bootstrap_timeout");
3320 // sanity check long object name handling
3323 l
.oid
.name
= string(cct
->_conf
->osd_max_object_name_len
, 'n');
3324 l
.set_key(string(cct
->_conf
->osd_max_object_name_len
, 'k'));
3325 l
.nspace
= string(cct
->_conf
->osd_max_object_namespace_len
, 's');
3326 r
= store
->validate_hobject_key(l
);
3328 derr
<< "backend (" << store
->get_type() << ") is unable to support max "
3329 << "object name[space] len" << dendl
;
3330 derr
<< " osd max object name len = "
3331 << cct
->_conf
->osd_max_object_name_len
<< dendl
;
3332 derr
<< " osd max object namespace len = "
3333 << cct
->_conf
->osd_max_object_namespace_len
<< dendl
;
3334 derr
<< cpp_strerror(r
) << dendl
;
3335 if (cct
->_conf
->osd_check_max_object_name_len_on_startup
) {
3338 derr
<< "osd_check_max_object_name_len_on_startup = false, starting anyway"
3341 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl
;
3346 r
= read_superblock();
3348 derr
<< "OSD::init() : unable to read osd superblock" << dendl
;
3353 if (osd_compat
.compare(superblock
.compat_features
) < 0) {
3354 derr
<< "The disk uses features unsupported by the executable." << dendl
;
3355 derr
<< " ondisk features " << superblock
.compat_features
<< dendl
;
3356 derr
<< " daemon features " << osd_compat
<< dendl
;
3358 if (osd_compat
.writeable(superblock
.compat_features
)) {
3359 CompatSet diff
= osd_compat
.unsupported(superblock
.compat_features
);
3360 derr
<< "it is still writeable, though. Missing features: " << diff
<< dendl
;
3365 CompatSet diff
= osd_compat
.unsupported(superblock
.compat_features
);
3366 derr
<< "Cannot write to disk! Missing features: " << diff
<< dendl
;
3372 assert_warn(whoami
== superblock
.whoami
);
3373 if (whoami
!= superblock
.whoami
) {
3374 derr
<< "OSD::init: superblock says osd"
3375 << superblock
.whoami
<< " but I am osd." << whoami
<< dendl
;
3380 startup_time
= ceph::mono_clock::now();
3382 // load up "current" osdmap
3383 assert_warn(!get_osdmap());
3385 derr
<< "OSD::init: unable to read current osdmap" << dendl
;
3389 osdmap
= get_map(superblock
.current_epoch
);
3392 // make sure we don't have legacy pgs deleting
3395 int r
= store
->list_collections(ls
);
3396 ceph_assert(r
>= 0);
3399 if (c
.is_pg(&pgid
) &&
3400 !osdmap
->have_pg_pool(pgid
.pool())) {
3401 ghobject_t oid
= make_final_pool_info_oid(pgid
.pool());
3402 if (!store
->exists(service
.meta_ch
, oid
)) {
3403 derr
<< __func__
<< " missing pg_pool_t for deleted pool "
3404 << pgid
.pool() << " for pg " << pgid
3405 << "; please downgrade to luminous and allow "
3406 << "pg deletion to complete before upgrading" << dendl
;
3413 initial
= get_osd_initial_compat_set();
3414 diff
= superblock
.compat_features
.unsupported(initial
);
3415 if (superblock
.compat_features
.merge(initial
)) {
3416 // Are we adding SNAPMAPPER2?
3417 if (diff
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2
)) {
3418 dout(1) << __func__
<< " upgrade snap_mapper (first start as octopus)"
3420 auto ch
= service
.meta_ch
;
3421 auto hoid
= make_snapmapper_oid();
3422 unsigned max
= cct
->_conf
->osd_target_transaction_size
;
3423 r
= SnapMapper::convert_legacy(cct
, store
, ch
, hoid
, max
);
3427 // We need to persist the new compat_set before we
3429 dout(5) << "Upgrading superblock adding: " << diff
<< dendl
;
3430 ObjectStore::Transaction t
;
3431 write_superblock(t
);
3432 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3437 // make sure snap mapper object exists
3438 if (!store
->exists(service
.meta_ch
, OSD::make_snapmapper_oid())) {
3439 dout(10) << "init creating/touching snapmapper object" << dendl
;
3440 ObjectStore::Transaction t
;
3441 t
.touch(coll_t::meta(), OSD::make_snapmapper_oid());
3442 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3446 if (!store
->exists(service
.meta_ch
, OSD::make_purged_snaps_oid())) {
3447 dout(10) << "init creating/touching purged_snaps object" << dendl
;
3448 ObjectStore::Transaction t
;
3449 t
.touch(coll_t::meta(), OSD::make_purged_snaps_oid());
3450 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3455 if (cct
->_conf
->osd_open_classes_on_start
) {
3456 int r
= ClassHandler::get_instance().open_all_classes();
3458 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r
) << dendl
;
3461 check_osdmap_features();
3463 create_recoverystate_perf();
3466 epoch_t bind_epoch
= osdmap
->get_epoch();
3467 service
.set_epochs(NULL
, NULL
, &bind_epoch
);
3470 clear_temp_objects();
3472 // initialize osdmap references in sharded wq
3473 for (auto& shard
: shards
) {
3474 std::lock_guard
l(shard
->osdmap_lock
);
3475 shard
->shard_osdmap
= osdmap
;
3478 // load up pgs (as they previously existed)
3481 dout(2) << "superblock: I am osd." << superblock
.whoami
<< dendl
;
3487 struct store_statfs_t stbuf
;
3488 osd_alert_list_t alerts
;
3489 int r
= store
->statfs(&stbuf
, &alerts
);
3490 ceph_assert(r
== 0);
3491 service
.set_statfs(stbuf
, alerts
);
3494 // client_messenger auth_client is already set up by monc.
3495 for (auto m
: { cluster_messenger
,
3497 hb_front_client_messenger
,
3498 hb_back_client_messenger
,
3499 hb_front_server_messenger
,
3500 hb_back_server_messenger
} ) {
3501 m
->set_auth_client(monc
);
3503 for (auto m
: { client_messenger
,
3505 hb_front_server_messenger
,
3506 hb_back_server_messenger
}) {
3507 m
->set_auth_server(monc
);
3509 monc
->set_handle_authentication_dispatcher(this);
3511 monc
->set_want_keys(CEPH_ENTITY_TYPE_MON
| CEPH_ENTITY_TYPE_OSD
3512 | CEPH_ENTITY_TYPE_MGR
);
3517 mgrc
.set_pgstats_cb([this](){ return collect_pg_stats(); });
3518 mgrc
.set_perf_metric_query_cb(
3519 [this](const ConfigPayload
&config_payload
) {
3520 set_perf_queries(config_payload
);
3523 return get_perf_reports();
3527 // tell monc about log_client so it will know about mon session resets
3528 monc
->set_log_client(&log_client
);
3529 update_log_config();
3532 client_messenger
->add_dispatcher_tail(&mgrc
);
3533 client_messenger
->add_dispatcher_tail(this);
3534 cluster_messenger
->add_dispatcher_head(this);
3536 hb_front_client_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3537 hb_back_client_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3538 hb_front_server_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3539 hb_back_server_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3541 objecter_messenger
->add_dispatcher_head(service
.objecter
.get());
3544 service
.publish_map(osdmap
);
3545 service
.publish_superblock(superblock
);
3546 service
.max_oldest_map
= superblock
.oldest_map
;
3548 for (auto& shard
: shards
) {
3549 // put PGs in a temporary set because we may modify pg_slots
3550 // unordered_map below.
3552 for (auto& i
: shard
->pg_slots
) {
3553 PGRef pg
= i
.second
->pg
;
3559 for (auto pg
: pgs
) {
3560 std::scoped_lock l
{*pg
};
3561 set
<pair
<spg_t
,epoch_t
>> new_children
;
3562 set
<pair
<spg_t
,epoch_t
>> merge_pgs
;
3563 service
.identify_splits_and_merges(pg
->get_osdmap(), osdmap
, pg
->pg_id
,
3564 &new_children
, &merge_pgs
);
3565 if (!new_children
.empty()) {
3566 for (auto shard
: shards
) {
3567 shard
->prime_splits(osdmap
, &new_children
);
3569 assert(new_children
.empty());
3571 if (!merge_pgs
.empty()) {
3572 for (auto shard
: shards
) {
3573 shard
->prime_merges(osdmap
, &merge_pgs
);
3575 assert(merge_pgs
.empty());
3582 // start the heartbeat
3583 heartbeat_thread
.create("osd_srv_heartbt");
3586 tick_timer
.add_event_after(get_tick_interval(),
3589 std::lock_guard
l(tick_timer_lock
);
3590 tick_timer_without_osd_lock
.add_event_after(get_tick_interval(),
3591 new C_Tick_WithoutOSDLock(this));
3596 r
= monc
->authenticate();
3598 derr
<< __func__
<< " authentication failed: " << cpp_strerror(r
)
3603 while (monc
->wait_auth_rotating(rotating_auth_timeout
) < 0) {
3604 derr
<< "unable to obtain rotating service keys; retrying" << dendl
;
3605 ++rotating_auth_attempts
;
3606 if (rotating_auth_attempts
> g_conf()->max_rotating_auth_attempts
) {
3607 derr
<< __func__
<< " wait_auth_rotating timed out" << dendl
;
3612 r
= update_crush_device_class();
3614 derr
<< __func__
<< " unable to update_crush_device_class: "
3615 << cpp_strerror(r
) << dendl
;
3619 r
= update_crush_location();
3621 derr
<< __func__
<< " unable to update_crush_location: "
3622 << cpp_strerror(r
) << dendl
;
3630 // start objecter *after* we have authenticated, so that we don't ignore
3631 // the OSDMaps it requests.
3632 service
.final_init();
3636 dout(10) << "ensuring pgs have consumed prior maps" << dendl
;
3639 dout(0) << "done with init, starting boot process" << dendl
;
3641 // subscribe to any pg creations
3642 monc
->sub_want("osd_pg_creates", last_pg_create_epoch
, 0);
3644 // MgrClient needs this (it doesn't have MonClient reference itself)
3645 monc
->sub_want("mgrmap", 0, 0);
3647 // we don't need to ask for an osdmap here; objecter will
3648 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
3657 enable_disable_fuse(true);
3664 void OSD::final_init()
3666 AdminSocket
*admin_socket
= cct
->get_admin_socket();
3667 asok_hook
= new OSDSocketHook(this);
3668 int r
= admin_socket
->register_command("status", asok_hook
,
3669 "high-level status of OSD");
3670 ceph_assert(r
== 0);
3671 r
= admin_socket
->register_command("flush_journal",
3673 "flush the journal to permanent store");
3674 ceph_assert(r
== 0);
3675 r
= admin_socket
->register_command("dump_ops_in_flight " \
3676 "name=filterstr,type=CephString,n=N,req=false",
3678 "show the ops currently in flight");
3679 ceph_assert(r
== 0);
3680 r
= admin_socket
->register_command("ops " \
3681 "name=filterstr,type=CephString,n=N,req=false",
3683 "show the ops currently in flight");
3684 ceph_assert(r
== 0);
3685 r
= admin_socket
->register_command("dump_blocked_ops " \
3686 "name=filterstr,type=CephString,n=N,req=false",
3688 "show the blocked ops currently in flight");
3689 ceph_assert(r
== 0);
3690 r
= admin_socket
->register_command("dump_historic_ops " \
3691 "name=filterstr,type=CephString,n=N,req=false",
3694 ceph_assert(r
== 0);
3695 r
= admin_socket
->register_command("dump_historic_slow_ops " \
3696 "name=filterstr,type=CephString,n=N,req=false",
3698 "show slowest recent ops");
3699 ceph_assert(r
== 0);
3700 r
= admin_socket
->register_command("dump_historic_ops_by_duration " \
3701 "name=filterstr,type=CephString,n=N,req=false",
3703 "show slowest recent ops, sorted by duration");
3704 ceph_assert(r
== 0);
3705 r
= admin_socket
->register_command("dump_op_pq_state",
3707 "dump op priority queue state");
3708 ceph_assert(r
== 0);
3709 r
= admin_socket
->register_command("dump_blacklist",
3711 "dump blacklisted clients and times");
3712 ceph_assert(r
== 0);
3713 r
= admin_socket
->register_command("dump_watchers",
3715 "show clients which have active watches,"
3716 " and on which objects");
3717 ceph_assert(r
== 0);
3718 r
= admin_socket
->register_command("dump_recovery_reservations",
3720 "show recovery reservations");
3721 ceph_assert(r
== 0);
3722 r
= admin_socket
->register_command("dump_scrub_reservations",
3724 "show scrub reservations");
3725 ceph_assert(r
== 0);
3726 r
= admin_socket
->register_command("get_latest_osdmap",
3728 "force osd to update the latest map from "
3730 ceph_assert(r
== 0);
3732 r
= admin_socket
->register_command("set_heap_property " \
3733 "name=property,type=CephString " \
3734 "name=value,type=CephInt",
3736 "update malloc extension heap property");
3737 ceph_assert(r
== 0);
3739 r
= admin_socket
->register_command("get_heap_property " \
3740 "name=property,type=CephString",
3742 "get malloc extension heap property");
3743 ceph_assert(r
== 0);
3745 r
= admin_socket
->register_command("dump_objectstore_kv_stats",
3747 "print statistics of kvdb which used by bluestore");
3748 ceph_assert(r
== 0);
3750 r
= admin_socket
->register_command("dump_scrubs",
3752 "print scheduled scrubs");
3753 ceph_assert(r
== 0);
3755 r
= admin_socket
->register_command("calc_objectstore_db_histogram",
3757 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
3758 ceph_assert(r
== 0);
3760 r
= admin_socket
->register_command("flush_store_cache",
3762 "Flush bluestore internal cache");
3763 ceph_assert(r
== 0);
3764 r
= admin_socket
->register_command("dump_pgstate_history",
3766 "show recent state history");
3767 ceph_assert(r
== 0);
3769 r
= admin_socket
->register_command("compact",
3771 "Commpact object store's omap."
3772 " WARNING: Compaction probably slows your requests");
3773 ceph_assert(r
== 0);
3775 r
= admin_socket
->register_command("get_mapped_pools",
3777 "dump pools whose PG(s) are mapped to this OSD.");
3779 ceph_assert(r
== 0);
3781 r
= admin_socket
->register_command("smart name=devid,type=CephString,req=false",
3783 "probe OSD devices for SMART data.");
3785 ceph_assert(r
== 0);
3787 r
= admin_socket
->register_command("list_devices",
3789 "list OSD devices.");
3790 r
= admin_socket
->register_command("send_beacon",
3792 "send OSD beacon to mon immediately");
3794 r
= admin_socket
->register_command(
3795 "dump_osd_network name=value,type=CephInt,req=false", asok_hook
,
3796 "Dump osd heartbeat network ping times");
3797 ceph_assert(r
== 0);
3799 test_ops_hook
= new TestOpsSocketHook(&(this->service
), this->store
);
3800 // Note: pools are CephString instead of CephPoolname because
3801 // these commands traditionally support both pool names and numbers
3802 r
= admin_socket
->register_command(
3804 "name=pool,type=CephString " \
3805 "name=objname,type=CephObjectname " \
3806 "name=key,type=CephString "\
3807 "name=val,type=CephString",
3810 ceph_assert(r
== 0);
3811 r
= admin_socket
->register_command(
3813 "name=pool,type=CephString " \
3814 "name=objname,type=CephObjectname " \
3815 "name=key,type=CephString",
3818 ceph_assert(r
== 0);
3819 r
= admin_socket
->register_command(
3821 "name=pool,type=CephString " \
3822 "name=objname,type=CephObjectname " \
3823 "name=header,type=CephString",
3826 ceph_assert(r
== 0);
3828 r
= admin_socket
->register_command(
3830 "name=pool,type=CephString " \
3831 "name=objname,type=CephObjectname",
3833 "output entire object map");
3834 ceph_assert(r
== 0);
3836 r
= admin_socket
->register_command(
3838 "name=pool,type=CephString " \
3839 "name=objname,type=CephObjectname " \
3840 "name=len,type=CephInt",
3842 "truncate object to length");
3843 ceph_assert(r
== 0);
3845 r
= admin_socket
->register_command(
3847 "name=pool,type=CephString " \
3848 "name=objname,type=CephObjectname " \
3849 "name=shardid,type=CephInt,req=false,range=0|255",
3851 "inject data error to an object");
3852 ceph_assert(r
== 0);
3854 r
= admin_socket
->register_command(
3856 "name=pool,type=CephString " \
3857 "name=objname,type=CephObjectname " \
3858 "name=shardid,type=CephInt,req=false,range=0|255",
3860 "inject metadata error to an object");
3861 ceph_assert(r
== 0);
3862 r
= admin_socket
->register_command(
3863 "set_recovery_delay " \
3864 "name=utime,type=CephInt,req=false",
3866 "Delay osd recovery by specified seconds");
3867 ceph_assert(r
== 0);
3868 r
= admin_socket
->register_command(
3870 "name=type,type=CephString,req=false " \
3871 "name=count,type=CephInt,req=false ",
3873 "Inject a full disk (optional count times)");
3874 ceph_assert(r
== 0);
3875 r
= admin_socket
->register_command(
3877 "name=count,type=CephInt,req=false " \
3878 "name=size,type=CephInt,req=false " \
3879 "name=object_size,type=CephInt,req=false " \
3880 "name=object_num,type=CephInt,req=false ",
3882 "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
3883 "(default count=1G default size=4MB). Results in log.");
3884 ceph_assert(r
== 0);
3885 r
= admin_socket
->register_command(
3887 "name=level,type=CephChoices,strings=error,warning,info,debug " \
3888 "name=message,type=CephString,n=N",
3890 "log a message to the cluster log");
3891 ceph_assert(r
== 0);
3892 r
= admin_socket
->register_command(
3896 ceph_assert(r
== 0);
3897 r
= admin_socket
->register_command(
3899 "name=heapcmd,type=CephChoices,strings=" \
3900 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
3901 "name=value,type=CephString,req=false",
3903 "show heap usage info (available only if compiled with tcmalloc)");
3904 ceph_assert(r
== 0);
3905 r
= admin_socket
->register_command(
3906 "debug dump_missing " \
3907 "name=filename,type=CephFilepath",
3909 "dump missing objects to a named file");
3910 ceph_assert(r
== 0);
3911 r
= admin_socket
->register_command(
3912 "debug kick_recovery_wq " \
3913 "name=delay,type=CephInt,range=0",
3915 "set osd_recovery_delay_start to <val>");
3916 ceph_assert(r
== 0);
3917 r
= admin_socket
->register_command(
3919 "name=arg,type=CephChoices,strings=status|flush",
3921 "run cpu profiling on daemon");
3922 ceph_assert(r
== 0);
3923 r
= admin_socket
->register_command(
3924 "dump_pg_recovery_stats",
3926 "dump pg recovery statistics");
3927 ceph_assert(r
== 0);
3928 r
= admin_socket
->register_command(
3929 "reset_pg_recovery_stats",
3931 "reset pg recovery statistics");
3932 ceph_assert(r
== 0);
3933 r
= admin_socket
->register_command(
3936 "Drop all OSD caches");
3937 ceph_assert(r
== 0);
3938 r
= admin_socket
->register_command(
3941 "Get OSD caches statistics");
3942 ceph_assert(r
== 0);
3943 r
= admin_socket
->register_command(
3944 "scrub_purged_snaps",
3946 "Scrub purged_snaps vs snapmapper index");
3947 ceph_assert(r
== 0);
3949 // -- pg commands --
3950 // old form: ceph pg <pgid> command ...
3951 r
= admin_socket
->register_command(
3953 "name=pgid,type=CephPgid " \
3954 "name=cmd,type=CephChoices,strings=query",
3957 ceph_assert(r
== 0);
3958 r
= admin_socket
->register_command(
3960 "name=pgid,type=CephPgid " \
3961 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
3962 "name=mulcmd,type=CephChoices,strings=revert|delete",
3965 ceph_assert(r
== 0);
3966 r
= admin_socket
->register_command(
3968 "name=pgid,type=CephPgid " \
3969 "name=cmd,type=CephChoices,strings=list_unfound " \
3970 "name=offset,type=CephString,req=false",
3973 ceph_assert(r
== 0);
3974 r
= admin_socket
->register_command(
3976 "name=pgid,type=CephPgid " \
3977 "name=cmd,type=CephChoices,strings=scrub " \
3978 "name=time,type=CephInt,req=false",
3981 ceph_assert(r
== 0);
3982 r
= admin_socket
->register_command(
3984 "name=pgid,type=CephPgid " \
3985 "name=cmd,type=CephChoices,strings=deep_scrub " \
3986 "name=time,type=CephInt,req=false",
3989 ceph_assert(r
== 0);
3990 // new form: tell <pgid> <cmd> for both cli and rest
3991 r
= admin_socket
->register_command(
3994 "show details of a specific pg");
3995 ceph_assert(r
== 0);
3996 r
= admin_socket
->register_command(
3997 "mark_unfound_lost " \
3998 "name=pgid,type=CephPgid,req=false " \
3999 "name=mulcmd,type=CephChoices,strings=revert|delete",
4001 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available");
4002 ceph_assert(r
== 0);
4003 r
= admin_socket
->register_command(
4005 "name=pgid,type=CephPgid,req=false " \
4006 "name=offset,type=CephString,req=false",
4008 "list unfound objects on this pg, perhaps starting at an offset given in JSON");
4009 ceph_assert(r
== 0);
4010 r
= admin_socket
->register_command(
4012 "name=pgid,type=CephPgid,req=false " \
4013 "name=time,type=CephInt,req=false",
4015 "Trigger a scheduled scrub ");
4016 ceph_assert(r
== 0);
4017 r
= admin_socket
->register_command(
4019 "name=pgid,type=CephPgid,req=false " \
4020 "name=time,type=CephInt,req=false",
4022 "Trigger a scheduled deep scrub ");
4023 ceph_assert(r
== 0);
4026 void OSD::create_logger()
4028 dout(10) << "create_logger" << dendl
;
4030 logger
= build_osd_logger(cct
);
4031 cct
->get_perfcounters_collection()->add(logger
);
4034 void OSD::create_recoverystate_perf()
4036 dout(10) << "create_recoverystate_perf" << dendl
;
4038 recoverystate_perf
= build_recoverystate_perf(cct
);
4039 cct
->get_perfcounters_collection()->add(recoverystate_perf
);
4044 if (cct
->_conf
->osd_fast_shutdown
) {
4045 derr
<< "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl
;
4046 if (cct
->_conf
->osd_fast_shutdown_notify_mon
)
4047 service
.prepare_to_stop();
4052 if (!service
.prepare_to_stop())
4053 return 0; // already shutting down
4055 if (is_stopping()) {
4059 dout(0) << "shutdown" << dendl
;
4061 set_state(STATE_STOPPING
);
4064 if (cct
->_conf
.get_val
<bool>("osd_debug_shutdown")) {
4065 cct
->_conf
.set_val("debug_osd", "100");
4066 cct
->_conf
.set_val("debug_journal", "100");
4067 cct
->_conf
.set_val("debug_filestore", "100");
4068 cct
->_conf
.set_val("debug_bluestore", "100");
4069 cct
->_conf
.set_val("debug_ms", "100");
4070 cct
->_conf
.apply_changes(nullptr);
4073 // stop MgrClient earlier as it's more like an internal consumer of OSD
4076 service
.start_shutdown();
4078 // stop sending work to pgs. this just prevents any new work in _process
4079 // from racing with on_shutdown and potentially entering the pg after.
4080 op_shardedwq
.drain();
4086 for (auto pg
: pgs
) {
4091 // drain op queue again (in case PGs requeued something)
4092 op_shardedwq
.drain();
4094 finished
.clear(); // zap waiters (bleh, this is messy)
4095 waiting_for_osdmap
.clear();
4098 // unregister commands
4099 cct
->get_admin_socket()->unregister_commands(asok_hook
);
4103 cct
->get_admin_socket()->unregister_commands(test_ops_hook
);
4104 delete test_ops_hook
;
4105 test_ops_hook
= NULL
;
4110 std::lock_guard l
{heartbeat_lock
};
4111 heartbeat_stop
= true;
4112 heartbeat_cond
.notify_all();
4113 heartbeat_peers
.clear();
4115 heartbeat_thread
.join();
4117 hb_back_server_messenger
->mark_down_all();
4118 hb_front_server_messenger
->mark_down_all();
4119 hb_front_client_messenger
->mark_down_all();
4120 hb_back_client_messenger
->mark_down_all();
4124 dout(10) << "op sharded tp stopped" << dendl
;
4126 dout(10) << "stopping agent" << dendl
;
4127 service
.agent_stop();
4129 boot_finisher
.wait_for_empty();
4133 boot_finisher
.stop();
4134 reset_heartbeat_peers(true);
4136 tick_timer
.shutdown();
4139 std::lock_guard
l(tick_timer_lock
);
4140 tick_timer_without_osd_lock
.shutdown();
4143 // note unmount epoch
4144 dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl
;
4145 superblock
.mounted
= service
.get_boot_epoch();
4146 superblock
.clean_thru
= get_osdmap_epoch();
4147 ObjectStore::Transaction t
;
4148 write_superblock(t
);
4149 int r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
4151 derr
<< "OSD::shutdown: error writing superblock: "
4152 << cpp_strerror(r
) << dendl
;
4156 service
.shutdown_reserver();
4159 #ifdef PG_DEBUG_REFS
4160 service
.dump_live_pgids();
4164 _get_pgs(&pgs
, true);
4168 for (auto& pg
: pgs
) {
4169 if (pg
->is_deleted()) {
4172 dout(20) << " kicking pg " << pg
<< dendl
;
4174 if (pg
->get_num_ref() != 1) {
4175 derr
<< "pgid " << pg
->get_pgid() << " has ref count of "
4176 << pg
->get_num_ref() << dendl
;
4177 #ifdef PG_DEBUG_REFS
4178 pg
->dump_live_ids();
4180 if (cct
->_conf
->osd_shutdown_pgref_assert
) {
4188 #ifdef PG_DEBUG_REFS
4189 service
.dump_live_pgids();
4193 cct
->_conf
.remove_observer(this);
4196 service
.meta_ch
.reset();
4198 dout(10) << "syncing store" << dendl
;
4199 enable_disable_fuse(true);
4201 if (cct
->_conf
->osd_journal_flush_on_shutdown
) {
4202 dout(10) << "flushing journal" << dendl
;
4203 store
->flush_journal();
4209 std::unique_lock l
{map_lock
};
4210 set_osdmap(OSDMapRef());
4212 for (auto s
: shards
) {
4213 std::lock_guard
l(s
->osdmap_lock
);
4214 s
->shard_osdmap
= OSDMapRef();
4218 std::lock_guard
lock(osd_lock
);
4222 dout(10) << "Store synced" << dendl
;
4224 op_tracker
.on_shutdown();
4226 ClassHandler::get_instance().shutdown();
4227 client_messenger
->shutdown();
4228 cluster_messenger
->shutdown();
4229 hb_front_client_messenger
->shutdown();
4230 hb_back_client_messenger
->shutdown();
4231 objecter_messenger
->shutdown();
4232 hb_front_server_messenger
->shutdown();
4233 hb_back_server_messenger
->shutdown();
4238 int OSD::mon_cmd_maybe_osd_create(string
&cmd
)
4240 bool created
= false;
4242 dout(10) << __func__
<< " cmd: " << cmd
<< dendl
;
4243 vector
<string
> vcmd
{cmd
};
4247 monc
->start_mon_command(vcmd
, inbl
, NULL
, &outs
, &w
);
4250 if (r
== -ENOENT
&& !created
) {
4251 string newcmd
= "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami
)
4252 + ", \"uuid\": \"" + stringify(superblock
.osd_fsid
) + "\"}";
4253 vector
<string
> vnewcmd
{newcmd
};
4257 monc
->start_mon_command(vnewcmd
, inbl
, NULL
, &outs
, &w
);
4260 derr
<< __func__
<< " fail: osd does not exist and created failed: "
4261 << cpp_strerror(r
) << dendl
;
4267 derr
<< __func__
<< " fail: '" << outs
<< "': " << cpp_strerror(r
) << dendl
;
4276 int OSD::update_crush_location()
4278 if (!cct
->_conf
->osd_crush_update_on_start
) {
4279 dout(10) << __func__
<< " osd_crush_update_on_start = false" << dendl
;
4284 if (cct
->_conf
->osd_crush_initial_weight
>= 0) {
4285 snprintf(weight
, sizeof(weight
), "%.4lf", cct
->_conf
->osd_crush_initial_weight
);
4287 struct store_statfs_t st
;
4288 osd_alert_list_t alerts
;
4289 int r
= store
->statfs(&st
, &alerts
);
4291 derr
<< "statfs: " << cpp_strerror(r
) << dendl
;
4294 snprintf(weight
, sizeof(weight
), "%.4lf",
4297 double(1ull << 40 /* TB */)));
4300 dout(10) << __func__
<< " crush location is " << cct
->crush_location
<< dendl
;
4303 string("{\"prefix\": \"osd crush create-or-move\", ") +
4304 string("\"id\": ") + stringify(whoami
) + ", " +
4305 string("\"weight\":") + weight
+ ", " +
4306 string("\"args\": [") + stringify(cct
->crush_location
) + "]}";
4307 return mon_cmd_maybe_osd_create(cmd
);
4310 int OSD::update_crush_device_class()
4312 if (!cct
->_conf
->osd_class_update_on_start
) {
4313 dout(10) << __func__
<< " osd_class_update_on_start = false" << dendl
;
4317 string device_class
;
4318 int r
= store
->read_meta("crush_device_class", &device_class
);
4319 if (r
< 0 || device_class
.empty()) {
4320 device_class
= store
->get_default_device_class();
4323 if (device_class
.empty()) {
4324 dout(20) << __func__
<< " no device class stored locally" << dendl
;
4329 string("{\"prefix\": \"osd crush set-device-class\", ") +
4330 string("\"class\": \"") + device_class
+ string("\", ") +
4331 string("\"ids\": [\"") + stringify(whoami
) + string("\"]}");
4333 r
= mon_cmd_maybe_osd_create(cmd
);
4335 // good, already bound to a device-class
4342 void OSD::write_superblock(ObjectStore::Transaction
& t
)
4344 dout(10) << "write_superblock " << superblock
<< dendl
;
4346 //hack: at minimum it's using the baseline feature set
4347 if (!superblock
.compat_features
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE
))
4348 superblock
.compat_features
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE
);
4351 encode(superblock
, bl
);
4352 t
.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT
, 0, bl
.length(), bl
);
4355 int OSD::read_superblock()
4358 int r
= store
->read(service
.meta_ch
, OSD_SUPERBLOCK_GOBJECT
, 0, 0, bl
);
4362 auto p
= bl
.cbegin();
4363 decode(superblock
, p
);
4365 dout(10) << "read_superblock " << superblock
<< dendl
;
4370 void OSD::clear_temp_objects()
4372 dout(10) << __func__
<< dendl
;
4374 store
->list_collections(ls
);
4375 for (vector
<coll_t
>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
4377 if (!p
->is_pg(&pgid
))
4380 // list temp objects
4381 dout(20) << " clearing temps in " << *p
<< " pgid " << pgid
<< dendl
;
4383 vector
<ghobject_t
> temps
;
4386 vector
<ghobject_t
> objects
;
4387 auto ch
= store
->open_collection(*p
);
4389 store
->collection_list(ch
, next
, ghobject_t::get_max(),
4390 store
->get_ideal_list_max(),
4392 if (objects
.empty())
4394 vector
<ghobject_t
>::iterator q
;
4395 for (q
= objects
.begin(); q
!= objects
.end(); ++q
) {
4396 // Hammer set pool for temps to -1, so check for clean-up
4397 if (q
->hobj
.is_temp() || (q
->hobj
.pool
== -1)) {
4398 temps
.push_back(*q
);
4403 // If we saw a non-temp object and hit the break above we can
4404 // break out of the while loop too.
4405 if (q
!= objects
.end())
4408 if (!temps
.empty()) {
4409 ObjectStore::Transaction t
;
4411 for (vector
<ghobject_t
>::iterator q
= temps
.begin(); q
!= temps
.end(); ++q
) {
4412 dout(20) << " removing " << *p
<< " object " << *q
<< dendl
;
4414 if (++removed
> cct
->_conf
->osd_target_transaction_size
) {
4415 store
->queue_transaction(service
.meta_ch
, std::move(t
));
4416 t
= ObjectStore::Transaction();
4421 store
->queue_transaction(service
.meta_ch
, std::move(t
));
4427 void OSD::recursive_remove_collection(CephContext
* cct
,
4428 ObjectStore
*store
, spg_t pgid
,
4434 make_snapmapper_oid());
4436 ObjectStore::CollectionHandle ch
= store
->open_collection(tmp
);
4437 ObjectStore::Transaction t
;
4438 SnapMapper
mapper(cct
, &driver
, 0, 0, 0, pgid
.shard
);
4441 int max
= cct
->_conf
->osd_target_transaction_size
;
4442 vector
<ghobject_t
> objects
;
4443 objects
.reserve(max
);
4446 store
->collection_list(ch
, next
, ghobject_t::get_max(),
4447 max
, &objects
, &next
);
4448 generic_dout(10) << __func__
<< " " << objects
<< dendl
;
4449 if (objects
.empty())
4451 for (auto& p
: objects
) {
4452 OSDriver::OSTransaction
_t(driver
.get_transaction(&t
));
4453 int r
= mapper
.remove_oid(p
.hobj
, &_t
);
4454 if (r
!= 0 && r
!= -ENOENT
)
4458 int r
= store
->queue_transaction(ch
, std::move(t
));
4459 ceph_assert(r
== 0);
4460 t
= ObjectStore::Transaction();
4462 t
.remove_collection(tmp
);
4463 int r
= store
->queue_transaction(ch
, std::move(t
));
4464 ceph_assert(r
== 0);
4467 if (!ch
->flush_commit(&waiter
)) {
4473 // ======================================================
4477 OSDMapRef createmap
,
4480 dout(10) << __func__
<< " " << pgid
<< dendl
;
4482 map
<string
,string
> ec_profile
;
4484 if (createmap
->have_pg_pool(pgid
.pool())) {
4485 pi
= *createmap
->get_pg_pool(pgid
.pool());
4486 name
= createmap
->get_pool_name(pgid
.pool());
4487 if (pi
.is_erasure()) {
4488 ec_profile
= createmap
->get_erasure_code_profile(pi
.erasure_code_profile
);
4491 // pool was deleted; grab final pg_pool_t off disk.
4492 ghobject_t oid
= make_final_pool_info_oid(pgid
.pool());
4494 int r
= store
->read(service
.meta_ch
, oid
, 0, 0, bl
);
4496 derr
<< __func__
<< " missing pool " << pgid
.pool() << " tombstone"
4500 ceph_assert(r
>= 0);
4501 auto p
= bl
.cbegin();
4504 if (p
.end()) { // dev release v13.0.2 did not include ec_profile
4505 derr
<< __func__
<< " missing ec_profile from pool " << pgid
.pool()
4506 << " tombstone" << dendl
;
4509 decode(ec_profile
, p
);
4511 PGPool
pool(cct
, createmap
, pgid
.pool(), pi
, name
);
4513 if (pi
.type
== pg_pool_t::TYPE_REPLICATED
||
4514 pi
.type
== pg_pool_t::TYPE_ERASURE
)
4515 pg
= new PrimaryLogPG(&service
, createmap
, pool
, ec_profile
, pgid
);
4521 void OSD::_get_pgs(vector
<PGRef
> *v
, bool clear_too
)
4524 v
->reserve(get_num_pgs());
4525 for (auto& s
: shards
) {
4526 std::lock_guard
l(s
->shard_lock
);
4527 for (auto& j
: s
->pg_slots
) {
4529 !j
.second
->pg
->is_deleted()) {
4530 v
->push_back(j
.second
->pg
);
4532 s
->_detach_pg(j
.second
.get());
4539 void OSD::_get_pgids(vector
<spg_t
> *v
)
4542 v
->reserve(get_num_pgs());
4543 for (auto& s
: shards
) {
4544 std::lock_guard
l(s
->shard_lock
);
4545 for (auto& j
: s
->pg_slots
) {
4547 !j
.second
->pg
->is_deleted()) {
4548 v
->push_back(j
.first
);
4554 void OSD::register_pg(PGRef pg
)
4556 spg_t pgid
= pg
->get_pgid();
4557 uint32_t shard_index
= pgid
.hash_to_shard(num_shards
);
4558 auto sdata
= shards
[shard_index
];
4559 std::lock_guard
l(sdata
->shard_lock
);
4560 auto r
= sdata
->pg_slots
.emplace(pgid
, make_unique
<OSDShardPGSlot
>());
4561 ceph_assert(r
.second
);
4562 auto *slot
= r
.first
->second
.get();
4563 dout(20) << __func__
<< " " << pgid
<< " " << pg
<< dendl
;
4564 sdata
->_attach_pg(slot
, pg
.get());
4567 bool OSD::try_finish_pg_delete(PG
*pg
, unsigned old_pg_num
)
4569 auto sdata
= pg
->osd_shard
;
4572 std::lock_guard
l(sdata
->shard_lock
);
4573 auto p
= sdata
->pg_slots
.find(pg
->pg_id
);
4574 if (p
== sdata
->pg_slots
.end() ||
4576 dout(20) << __func__
<< " " << pg
->pg_id
<< " not found" << dendl
;
4579 if (p
->second
->waiting_for_merge_epoch
) {
4580 dout(20) << __func__
<< " " << pg
->pg_id
<< " waiting for merge" << dendl
;
4583 dout(20) << __func__
<< " " << pg
->pg_id
<< " " << pg
<< dendl
;
4584 sdata
->_detach_pg(p
->second
.get());
4587 for (auto shard
: shards
) {
4588 shard
->unprime_split_children(pg
->pg_id
, old_pg_num
);
4591 // update pg count now since we might not get an osdmap any time soon.
4592 if (pg
->is_primary())
4593 service
.logger
->dec(l_osd_pg_primary
);
4594 else if (pg
->is_nonprimary())
4595 service
.logger
->dec(l_osd_pg_replica
); // misnomver
4597 service
.logger
->dec(l_osd_pg_stray
);
4602 PGRef
OSD::_lookup_pg(spg_t pgid
)
4604 uint32_t shard_index
= pgid
.hash_to_shard(num_shards
);
4605 auto sdata
= shards
[shard_index
];
4606 std::lock_guard
l(sdata
->shard_lock
);
4607 auto p
= sdata
->pg_slots
.find(pgid
);
4608 if (p
== sdata
->pg_slots
.end()) {
4611 return p
->second
->pg
;
4614 PGRef
OSD::_lookup_lock_pg(spg_t pgid
)
4616 PGRef pg
= _lookup_pg(pgid
);
4621 if (!pg
->is_deleted()) {
4628 PGRef
OSD::lookup_lock_pg(spg_t pgid
)
4630 return _lookup_lock_pg(pgid
);
4633 void OSD::load_pgs()
4635 ceph_assert(ceph_mutex_is_locked(osd_lock
));
4636 dout(0) << "load_pgs" << dendl
;
4639 auto pghist
= make_pg_num_history_oid();
4641 int r
= store
->read(service
.meta_ch
, pghist
, 0, 0, bl
, 0);
4642 if (r
>= 0 && bl
.length() > 0) {
4643 auto p
= bl
.cbegin();
4644 decode(pg_num_history
, p
);
4646 dout(20) << __func__
<< " pg_num_history " << pg_num_history
<< dendl
;
4650 int r
= store
->list_collections(ls
);
4652 derr
<< "failed to list pgs: " << cpp_strerror(-r
) << dendl
;
4656 for (vector
<coll_t
>::iterator it
= ls
.begin();
4660 if (it
->is_temp(&pgid
) ||
4661 (it
->is_pg(&pgid
) && PG::_has_removal_flag(store
, pgid
))) {
4662 dout(10) << "load_pgs " << *it
4663 << " removing, legacy or flagged for removal pg" << dendl
;
4664 recursive_remove_collection(cct
, store
, pgid
, *it
);
4668 if (!it
->is_pg(&pgid
)) {
4669 dout(10) << "load_pgs ignoring unrecognized " << *it
<< dendl
;
4673 dout(10) << "pgid " << pgid
<< " coll " << coll_t(pgid
) << dendl
;
4674 epoch_t map_epoch
= 0;
4675 int r
= PG::peek_map_epoch(store
, pgid
, &map_epoch
);
4677 derr
<< __func__
<< " unable to peek at " << pgid
<< " metadata, skipping"
4683 if (map_epoch
> 0) {
4684 OSDMapRef pgosdmap
= service
.try_get_map(map_epoch
);
4686 if (!get_osdmap()->have_pg_pool(pgid
.pool())) {
4687 derr
<< __func__
<< ": could not find map for epoch " << map_epoch
4688 << " on pg " << pgid
<< ", but the pool is not present in the "
4689 << "current map, so this is probably a result of bug 10617. "
4690 << "Skipping the pg for now, you can use ceph-objectstore-tool "
4691 << "to clean it up later." << dendl
;
4694 derr
<< __func__
<< ": have pgid " << pgid
<< " at epoch "
4695 << map_epoch
<< ", but missing map. Crashing."
4697 ceph_abort_msg("Missing map in load_pgs");
4700 pg
= _make_pg(pgosdmap
, pgid
);
4702 pg
= _make_pg(get_osdmap(), pgid
);
4705 recursive_remove_collection(cct
, store
, pgid
, *it
);
4709 // there can be no waiters here, so we don't call _wake_pg_slot
4712 pg
->ch
= store
->open_collection(pg
->coll
);
4714 // read pg state, log
4715 pg
->read_state(store
);
4718 dout(10) << "load_pgs " << *it
<< " deleting dne" << dendl
;
4721 recursive_remove_collection(cct
, store
, pgid
, *it
);
4725 uint32_t shard_index
= pgid
.hash_to_shard(shards
.size());
4726 assert(NULL
!= shards
[shard_index
]);
4727 store
->set_collection_commit_queue(pg
->coll
, &(shards
[shard_index
]->context_queue
));
4730 pg
->reg_next_scrub();
4732 dout(10) << __func__
<< " loaded " << *pg
<< dendl
;
4738 dout(0) << __func__
<< " opened " << num
<< " pgs" << dendl
;
4742 PGRef
OSD::handle_pg_create_info(const OSDMapRef
& osdmap
,
4743 const PGCreateInfo
*info
)
4745 spg_t pgid
= info
->pgid
;
4747 if (maybe_wait_for_max_pg(osdmap
, pgid
, info
->by_mon
)) {
4748 dout(10) << __func__
<< " hit max pg, dropping" << dendl
;
4752 PeeringCtx rctx
= create_context();
4754 OSDMapRef startmap
= get_map(info
->epoch
);
4757 int64_t pool_id
= pgid
.pgid
.pool();
4758 const pg_pool_t
*pool
= osdmap
->get_pg_pool(pool_id
);
4760 dout(10) << __func__
<< " ignoring " << pgid
<< ", pool dne" << dendl
;
4763 if (osdmap
->require_osd_release
>= ceph_release_t::nautilus
&&
4764 !pool
->has_flag(pg_pool_t::FLAG_CREATING
)) {
4765 // this ensures we do not process old creating messages after the
4766 // pool's initial pgs have been created (and pg are subsequently
4767 // allowed to split or merge).
4768 dout(20) << __func__
<< " dropping " << pgid
4769 << "create, pool does not have CREATING flag set" << dendl
;
4774 int up_primary
, acting_primary
;
4775 vector
<int> up
, acting
;
4776 startmap
->pg_to_up_acting_osds(
4777 pgid
.pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
4779 const pg_pool_t
* pp
= startmap
->get_pg_pool(pgid
.pool());
4780 if (pp
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
) &&
4781 store
->get_type() != "bluestore") {
4782 clog
->warn() << "pg " << pgid
4783 << " is at risk of silent data corruption: "
4784 << "the pool allows ec overwrites but is not stored in "
4785 << "bluestore, so deep scrubbing will not detect bitrot";
4787 create_pg_collection(
4788 rctx
.transaction
, pgid
, pgid
.get_split_bits(pp
->get_pg_num()));
4789 init_pg_ondisk(rctx
.transaction
, pgid
, pp
);
4791 int role
= startmap
->calc_pg_role(pg_shard_t(whoami
, pgid
.shard
), acting
);
4793 PGRef pg
= _make_pg(startmap
, pgid
);
4794 pg
->ch
= store
->create_new_collection(pg
->coll
);
4797 uint32_t shard_index
= pgid
.hash_to_shard(shards
.size());
4798 assert(NULL
!= shards
[shard_index
]);
4799 store
->set_collection_commit_queue(pg
->coll
, &(shards
[shard_index
]->context_queue
));
4804 // we are holding the shard lock
4805 ceph_assert(!pg
->is_deleted());
4814 info
->past_intervals
,
4818 pg
->init_collection_pool_opts();
4820 if (pg
->is_primary()) {
4821 std::lock_guard locker
{m_perf_queries_lock
};
4822 pg
->set_dynamic_perf_stats_queries(m_perf_queries
);
4825 pg
->handle_initialize(rctx
);
4826 pg
->handle_activate_map(rctx
);
4828 dispatch_context(rctx
, pg
.get(), osdmap
, nullptr);
4830 dout(10) << __func__
<< " new pg " << *pg
<< dendl
;
4834 bool OSD::maybe_wait_for_max_pg(const OSDMapRef
& osdmap
,
4838 const auto max_pgs_per_osd
=
4839 (cct
->_conf
.get_val
<uint64_t>("mon_max_pg_per_osd") *
4840 cct
->_conf
.get_val
<double>("osd_max_pg_per_osd_hard_ratio"));
4842 if (num_pgs
< max_pgs_per_osd
) {
4846 std::lock_guard
l(pending_creates_lock
);
4847 if (is_mon_create
) {
4848 pending_creates_from_mon
++;
4850 bool is_primary
= osdmap
->get_pg_acting_role(pgid
, whoami
) == 0;
4851 pending_creates_from_osd
.emplace(pgid
, is_primary
);
4853 dout(1) << __func__
<< " withhold creation of pg " << pgid
4854 << ": " << num_pgs
<< " >= "<< max_pgs_per_osd
<< dendl
;
4858 // to re-trigger a peering, we have to twiddle the pg mapping a little bit,
4859 // see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
4860 // to up set if pg_temp is empty. so an empty pg_temp won't work.
4861 static vector
<int32_t> twiddle(const vector
<int>& acting
) {
4862 if (acting
.size() > 1) {
4865 vector
<int32_t> twiddled(acting
.begin(), acting
.end());
4866 twiddled
.push_back(-1);
4871 void OSD::resume_creating_pg()
4873 bool do_sub_pg_creates
= false;
4874 bool have_pending_creates
= false;
4876 const auto max_pgs_per_osd
=
4877 (cct
->_conf
.get_val
<uint64_t>("mon_max_pg_per_osd") *
4878 cct
->_conf
.get_val
<double>("osd_max_pg_per_osd_hard_ratio"));
4879 if (max_pgs_per_osd
<= num_pgs
) {
4880 // this could happen if admin decreases this setting before a PG is removed
4883 unsigned spare_pgs
= max_pgs_per_osd
- num_pgs
;
4884 std::lock_guard
l(pending_creates_lock
);
4885 if (pending_creates_from_mon
> 0) {
4886 dout(20) << __func__
<< " pending_creates_from_mon "
4887 << pending_creates_from_mon
<< dendl
;
4888 do_sub_pg_creates
= true;
4889 if (pending_creates_from_mon
>= spare_pgs
) {
4890 spare_pgs
= pending_creates_from_mon
= 0;
4892 spare_pgs
-= pending_creates_from_mon
;
4893 pending_creates_from_mon
= 0;
4896 auto pg
= pending_creates_from_osd
.cbegin();
4897 while (spare_pgs
> 0 && pg
!= pending_creates_from_osd
.cend()) {
4898 dout(20) << __func__
<< " pg " << pg
->first
<< dendl
;
4900 get_osdmap()->pg_to_up_acting_osds(pg
->first
.pgid
, nullptr, nullptr, &acting
, nullptr);
4901 service
.queue_want_pg_temp(pg
->first
.pgid
, twiddle(acting
), true);
4902 pg
= pending_creates_from_osd
.erase(pg
);
4903 do_sub_pg_creates
= true;
4906 have_pending_creates
= (pending_creates_from_mon
> 0 ||
4907 !pending_creates_from_osd
.empty());
4910 bool do_renew_subs
= false;
4911 if (do_sub_pg_creates
) {
4912 if (monc
->sub_want("osd_pg_creates", last_pg_create_epoch
, 0)) {
4913 dout(4) << __func__
<< ": resolicit pg creates from mon since "
4914 << last_pg_create_epoch
<< dendl
;
4915 do_renew_subs
= true;
4918 version_t start
= get_osdmap_epoch() + 1;
4919 if (have_pending_creates
) {
4920 // don't miss any new osdmap deleting PGs
4921 if (monc
->sub_want("osdmap", start
, 0)) {
4922 dout(4) << __func__
<< ": resolicit osdmap from mon since "
4924 do_renew_subs
= true;
4926 } else if (do_sub_pg_creates
) {
4927 // no need to subscribe the osdmap continuously anymore
4928 // once the pgtemp and/or mon_subscribe(pg_creates) is sent
4929 if (monc
->sub_want_increment("osdmap", start
, CEPH_SUBSCRIBE_ONETIME
)) {
4930 dout(4) << __func__
<< ": re-subscribe osdmap(onetime) since "
4932 do_renew_subs
= true;
4936 if (do_renew_subs
) {
4940 service
.send_pg_temp();
4943 void OSD::build_initial_pg_history(
4946 utime_t created_stamp
,
4950 dout(10) << __func__
<< " " << pgid
<< " created " << created
<< dendl
;
4951 *h
= pg_history_t(created
, created_stamp
);
4953 OSDMapRef lastmap
= service
.get_map(created
);
4954 int up_primary
, acting_primary
;
4955 vector
<int> up
, acting
;
4956 lastmap
->pg_to_up_acting_osds(
4957 pgid
.pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
4959 ostringstream debug
;
4960 for (epoch_t e
= created
+ 1; e
<= get_osdmap_epoch(); ++e
) {
4961 OSDMapRef osdmap
= service
.get_map(e
);
4962 int new_up_primary
, new_acting_primary
;
4963 vector
<int> new_up
, new_acting
;
4964 osdmap
->pg_to_up_acting_osds(
4965 pgid
.pgid
, &new_up
, &new_up_primary
, &new_acting
, &new_acting_primary
);
4967 // this is a bit imprecise, but sufficient?
4968 struct min_size_predicate_t
: public IsPGRecoverablePredicate
{
4969 const pg_pool_t
*pi
;
4970 bool operator()(const set
<pg_shard_t
> &have
) const {
4971 return have
.size() >= pi
->min_size
;
4973 explicit min_size_predicate_t(const pg_pool_t
*i
) : pi(i
) {}
4974 } min_size_predicate(osdmap
->get_pg_pool(pgid
.pgid
.pool()));
4976 bool new_interval
= PastIntervals::check_new_interval(
4983 h
->same_interval_since
,
4984 h
->last_epoch_clean
,
4992 h
->same_interval_since
= e
;
4994 h
->same_up_since
= e
;
4996 if (acting_primary
!= new_acting_primary
) {
4997 h
->same_primary_since
= e
;
4999 if (pgid
.pgid
.is_split(lastmap
->get_pg_num(pgid
.pgid
.pool()),
5000 osdmap
->get_pg_num(pgid
.pgid
.pool()),
5002 h
->last_epoch_split
= e
;
5005 acting
= new_acting
;
5006 up_primary
= new_up_primary
;
5007 acting_primary
= new_acting_primary
;
5011 dout(20) << __func__
<< " " << debug
.str() << dendl
;
5012 dout(10) << __func__
<< " " << *h
<< " " << *pi
5013 << " [" << (pi
->empty() ? pair
<epoch_t
,epoch_t
>(0,0) :
5014 pi
->get_bounds()) << ")"
5018 void OSD::_add_heartbeat_peer(int p
)
5024 map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.find(p
);
5025 if (i
== heartbeat_peers
.end()) {
5026 pair
<ConnectionRef
,ConnectionRef
> cons
= service
.get_con_osd_hb(p
, get_osdmap_epoch());
5029 assert(cons
.second
);
5031 hi
= &heartbeat_peers
[p
];
5034 auto stamps
= service
.get_hb_stamps(p
);
5036 auto sb
= ceph::make_ref
<Session
>(cct
, cons
.first
.get());
5038 sb
->stamps
= stamps
;
5039 hi
->hb_interval_start
= ceph_clock_now();
5040 hi
->con_back
= cons
.first
.get();
5041 hi
->con_back
->set_priv(sb
);
5043 auto sf
= ceph::make_ref
<Session
>(cct
, cons
.second
.get());
5045 sf
->stamps
= stamps
;
5046 hi
->con_front
= cons
.second
.get();
5047 hi
->con_front
->set_priv(sf
);
5049 dout(10) << "_add_heartbeat_peer: new peer osd." << p
5050 << " " << hi
->con_back
->get_peer_addr()
5051 << " " << hi
->con_front
->get_peer_addr()
5056 hi
->epoch
= get_osdmap_epoch();
5059 void OSD::_remove_heartbeat_peer(int n
)
5061 map
<int,HeartbeatInfo
>::iterator q
= heartbeat_peers
.find(n
);
5062 ceph_assert(q
!= heartbeat_peers
.end());
5063 dout(20) << " removing heartbeat peer osd." << n
5064 << " " << q
->second
.con_back
->get_peer_addr()
5065 << " " << (q
->second
.con_front
? q
->second
.con_front
->get_peer_addr() : entity_addr_t())
5067 q
->second
.clear_mark_down();
5068 heartbeat_peers
.erase(q
);
5071 void OSD::need_heartbeat_peer_update()
5075 dout(20) << "need_heartbeat_peer_update" << dendl
;
5076 heartbeat_set_peers_need_update();
5079 void OSD::maybe_update_heartbeat_peers()
5081 ceph_assert(ceph_mutex_is_locked(osd_lock
));
5083 if (is_waiting_for_healthy() || is_active()) {
5084 utime_t now
= ceph_clock_now();
5085 if (last_heartbeat_resample
== utime_t()) {
5086 last_heartbeat_resample
= now
;
5087 heartbeat_set_peers_need_update();
5088 } else if (!heartbeat_peers_need_update()) {
5089 utime_t dur
= now
- last_heartbeat_resample
;
5090 if (dur
> cct
->_conf
->osd_heartbeat_grace
) {
5091 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur
<< " seconds" << dendl
;
5092 heartbeat_set_peers_need_update();
5093 last_heartbeat_resample
= now
;
5094 // automatically clean up any stale heartbeat peers
5095 // if we are unhealthy, then clean all
5096 reset_heartbeat_peers(is_waiting_for_healthy());
5101 if (!heartbeat_peers_need_update())
5103 heartbeat_clear_peers_need_update();
5105 std::lock_guard
l(heartbeat_lock
);
5107 dout(10) << "maybe_update_heartbeat_peers updating" << dendl
;
5110 // build heartbeat from set
5114 for (auto& pg
: pgs
) {
5115 pg
->with_heartbeat_peers([&](int peer
) {
5116 if (get_osdmap()->is_up(peer
)) {
5117 _add_heartbeat_peer(peer
);
5123 // include next and previous up osds to ensure we have a fully-connected set
5124 set
<int> want
, extras
;
5125 const int next
= get_osdmap()->get_next_up_osd_after(whoami
);
5128 int prev
= get_osdmap()->get_previous_up_osd_before(whoami
);
5129 if (prev
>= 0 && prev
!= next
)
5132 // make sure we have at least **min_down** osds coming from different
5133 // subtree level (e.g., hosts) for fast failure detection.
5134 auto min_down
= cct
->_conf
.get_val
<uint64_t>("mon_osd_min_down_reporters");
5135 auto subtree
= cct
->_conf
.get_val
<string
>("mon_osd_reporter_subtree_level");
5136 auto limit
= std::max(min_down
, (uint64_t)cct
->_conf
->osd_heartbeat_min_peers
);
5137 get_osdmap()->get_random_up_osds_by_subtree(
5138 whoami
, subtree
, limit
, want
, &want
);
5140 for (set
<int>::iterator p
= want
.begin(); p
!= want
.end(); ++p
) {
5141 dout(10) << " adding neighbor peer osd." << *p
<< dendl
;
5143 _add_heartbeat_peer(*p
);
5146 // remove down peers; enumerate extras
5147 map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5148 while (p
!= heartbeat_peers
.end()) {
5149 if (!get_osdmap()->is_up(p
->first
)) {
5152 _remove_heartbeat_peer(o
);
5155 if (p
->second
.epoch
< get_osdmap_epoch()) {
5156 extras
.insert(p
->first
);
5162 for (int n
= next
; n
>= 0; ) {
5163 if ((int)heartbeat_peers
.size() >= cct
->_conf
->osd_heartbeat_min_peers
)
5165 if (!extras
.count(n
) && !want
.count(n
) && n
!= whoami
) {
5166 dout(10) << " adding random peer osd." << n
<< dendl
;
5168 _add_heartbeat_peer(n
);
5170 n
= get_osdmap()->get_next_up_osd_after(n
);
5172 break; // came full circle; stop
5176 for (set
<int>::iterator p
= extras
.begin();
5177 (int)heartbeat_peers
.size() > cct
->_conf
->osd_heartbeat_min_peers
&& p
!= extras
.end();
5181 _remove_heartbeat_peer(*p
);
5184 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers
.size() << " peers, extras " << extras
<< dendl
;
5186 // clean up stale failure pending
5187 for (auto it
= failure_pending
.begin(); it
!= failure_pending
.end();) {
5188 if (heartbeat_peers
.count(it
->first
) == 0) {
5189 send_still_alive(get_osdmap_epoch(), it
->first
, it
->second
.second
);
5190 failure_pending
.erase(it
++);
5197 void OSD::reset_heartbeat_peers(bool all
)
5199 ceph_assert(ceph_mutex_is_locked(osd_lock
));
5200 dout(10) << "reset_heartbeat_peers" << dendl
;
5201 utime_t stale
= ceph_clock_now();
5202 stale
-= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_stale");
5203 std::lock_guard
l(heartbeat_lock
);
5204 for (auto it
= heartbeat_peers
.begin(); it
!= heartbeat_peers
.end();) {
5205 auto& [peer
, hi
] = *it
;
5206 if (all
|| hi
.is_stale(stale
)) {
5207 hi
.clear_mark_down();
5208 // stop sending failure_report to mon too
5209 failure_queue
.erase(peer
);
5210 failure_pending
.erase(peer
);
5211 it
= heartbeat_peers
.erase(it
);
5218 void OSD::handle_osd_ping(MOSDPing
*m
)
5220 if (superblock
.cluster_fsid
!= m
->fsid
) {
5221 dout(20) << "handle_osd_ping from " << m
->get_source_inst()
5222 << " bad fsid " << m
->fsid
<< " != " << superblock
.cluster_fsid
5228 int from
= m
->get_source().num();
5230 heartbeat_lock
.lock();
5231 if (is_stopping()) {
5232 heartbeat_lock
.unlock();
5237 utime_t now
= ceph_clock_now();
5238 auto mnow
= service
.get_mnow();
5239 ConnectionRef
con(m
->get_connection());
5240 OSDMapRef curmap
= service
.get_osdmap();
5242 heartbeat_lock
.unlock();
5247 auto sref
= con
->get_priv();
5248 Session
*s
= static_cast<Session
*>(sref
.get());
5250 heartbeat_lock
.unlock();
5256 s
->stamps
= service
.get_hb_stamps(from
);
5261 case MOSDPing::PING
:
5263 if (cct
->_conf
->osd_debug_drop_ping_probability
> 0) {
5264 auto heartbeat_drop
= debug_heartbeat_drops_remaining
.find(from
);
5265 if (heartbeat_drop
!= debug_heartbeat_drops_remaining
.end()) {
5266 if (heartbeat_drop
->second
== 0) {
5267 debug_heartbeat_drops_remaining
.erase(heartbeat_drop
);
5269 --heartbeat_drop
->second
;
5270 dout(5) << "Dropping heartbeat from " << from
5271 << ", " << heartbeat_drop
->second
5272 << " remaining to drop" << dendl
;
5275 } else if (cct
->_conf
->osd_debug_drop_ping_probability
>
5276 ((((double)(rand()%100))/100.0))) {
5278 debug_heartbeat_drops_remaining
.insert(std::make_pair(from
,
5279 cct
->_conf
->osd_debug_drop_ping_duration
)).first
;
5280 dout(5) << "Dropping heartbeat from " << from
5281 << ", " << heartbeat_drop
->second
5282 << " remaining to drop" << dendl
;
5287 ceph::signedspan sender_delta_ub
{};
5288 s
->stamps
->got_ping(
5294 dout(20) << __func__
<< " new stamps " << *s
->stamps
<< dendl
;
5296 if (!cct
->get_heartbeat_map()->is_healthy()) {
5297 dout(10) << "internal heartbeat not healthy, dropping ping request"
5302 Message
*r
= new MOSDPing(monc
->get_fsid(),
5303 curmap
->get_epoch(),
5304 MOSDPing::PING_REPLY
,
5308 service
.get_up_epoch(),
5309 cct
->_conf
->osd_heartbeat_min_size
,
5311 con
->send_message(r
);
5313 if (curmap
->is_up(from
)) {
5315 ConnectionRef cluster_con
= service
.get_con_osd_cluster(
5316 from
, curmap
->get_epoch());
5318 service
.maybe_share_map(cluster_con
.get(), curmap
, m
->map_epoch
);
5321 } else if (!curmap
->exists(from
) ||
5322 curmap
->get_down_at(from
) > m
->map_epoch
) {
5323 // tell them they have died
5324 Message
*r
= new MOSDPing(monc
->get_fsid(),
5325 curmap
->get_epoch(),
5330 service
.get_up_epoch(),
5331 cct
->_conf
->osd_heartbeat_min_size
);
5332 con
->send_message(r
);
5337 case MOSDPing::PING_REPLY
:
5339 map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.find(from
);
5340 if (i
!= heartbeat_peers
.end()) {
5341 auto acked
= i
->second
.ping_history
.find(m
->ping_stamp
);
5342 if (acked
!= i
->second
.ping_history
.end()) {
5343 int &unacknowledged
= acked
->second
.second
;
5344 if (con
== i
->second
.con_back
) {
5345 dout(25) << "handle_osd_ping got reply from osd." << from
5346 << " first_tx " << i
->second
.first_tx
5347 << " last_tx " << i
->second
.last_tx
5348 << " last_rx_back " << i
->second
.last_rx_back
5350 << " last_rx_front " << i
->second
.last_rx_front
5352 i
->second
.last_rx_back
= now
;
5353 ceph_assert(unacknowledged
> 0);
5355 // if there is no front con, set both stamps.
5356 if (i
->second
.con_front
== NULL
) {
5357 i
->second
.last_rx_front
= now
;
5358 ceph_assert(unacknowledged
> 0);
5361 } else if (con
== i
->second
.con_front
) {
5362 dout(25) << "handle_osd_ping got reply from osd." << from
5363 << " first_tx " << i
->second
.first_tx
5364 << " last_tx " << i
->second
.last_tx
5365 << " last_rx_back " << i
->second
.last_rx_back
5366 << " last_rx_front " << i
->second
.last_rx_front
5369 i
->second
.last_rx_front
= now
;
5370 ceph_assert(unacknowledged
> 0);
5374 if (unacknowledged
== 0) {
5375 // succeeded in getting all replies
5376 dout(25) << "handle_osd_ping got all replies from osd." << from
5377 << " , erase pending ping(sent at " << m
->ping_stamp
<< ")"
5378 << " and older pending ping(s)"
5381 #define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
5382 ++i
->second
.hb_average_count
;
5383 uint32_t back_pingtime
= ROUND_S_TO_USEC(i
->second
.last_rx_back
- m
->ping_stamp
);
5384 i
->second
.hb_total_back
+= back_pingtime
;
5385 if (back_pingtime
< i
->second
.hb_min_back
)
5386 i
->second
.hb_min_back
= back_pingtime
;
5387 if (back_pingtime
> i
->second
.hb_max_back
)
5388 i
->second
.hb_max_back
= back_pingtime
;
5389 uint32_t front_pingtime
= ROUND_S_TO_USEC(i
->second
.last_rx_front
- m
->ping_stamp
);
5390 i
->second
.hb_total_front
+= front_pingtime
;
5391 if (front_pingtime
< i
->second
.hb_min_front
)
5392 i
->second
.hb_min_front
= front_pingtime
;
5393 if (front_pingtime
> i
->second
.hb_max_front
)
5394 i
->second
.hb_max_front
= front_pingtime
;
5396 ceph_assert(i
->second
.hb_interval_start
!= utime_t());
5397 if (i
->second
.hb_interval_start
== utime_t())
5398 i
->second
.hb_interval_start
= now
;
5399 int64_t hb_avg_time_period
= 60;
5400 if (cct
->_conf
.get_val
<int64_t>("debug_heartbeat_testing_span")) {
5401 hb_avg_time_period
= cct
->_conf
.get_val
<int64_t>("debug_heartbeat_testing_span");
5403 if (now
- i
->second
.hb_interval_start
>= utime_t(hb_avg_time_period
, 0)) {
5404 uint32_t back_avg
= i
->second
.hb_total_back
/ i
->second
.hb_average_count
;
5405 uint32_t back_min
= i
->second
.hb_min_back
;
5406 uint32_t back_max
= i
->second
.hb_max_back
;
5407 uint32_t front_avg
= i
->second
.hb_total_front
/ i
->second
.hb_average_count
;
5408 uint32_t front_min
= i
->second
.hb_min_front
;
5409 uint32_t front_max
= i
->second
.hb_max_front
;
5411 // Reset for new interval
5412 i
->second
.hb_average_count
= 0;
5413 i
->second
.hb_interval_start
= now
;
5414 i
->second
.hb_total_back
= i
->second
.hb_max_back
= 0;
5415 i
->second
.hb_min_back
= UINT_MAX
;
5416 i
->second
.hb_total_front
= i
->second
.hb_max_front
= 0;
5417 i
->second
.hb_min_front
= UINT_MAX
;
5419 // Record per osd interace ping times
5420 // Based on osd_heartbeat_interval ignoring that it is randomly short than this interval
5421 if (i
->second
.hb_back_pingtime
.size() == 0) {
5422 ceph_assert(i
->second
.hb_front_pingtime
.size() == 0);
5423 for (unsigned k
= 0 ; k
< hb_vector_size
; ++k
) {
5424 i
->second
.hb_back_pingtime
.push_back(back_avg
);
5425 i
->second
.hb_back_min
.push_back(back_min
);
5426 i
->second
.hb_back_max
.push_back(back_max
);
5427 i
->second
.hb_front_pingtime
.push_back(front_avg
);
5428 i
->second
.hb_front_min
.push_back(front_min
);
5429 i
->second
.hb_front_max
.push_back(front_max
);
5430 ++i
->second
.hb_index
;
5433 int index
= i
->second
.hb_index
& (hb_vector_size
- 1);
5434 i
->second
.hb_back_pingtime
[index
] = back_avg
;
5435 i
->second
.hb_back_min
[index
] = back_min
;
5436 i
->second
.hb_back_max
[index
] = back_max
;
5437 i
->second
.hb_front_pingtime
[index
] = front_avg
;
5438 i
->second
.hb_front_min
[index
] = front_min
;
5439 i
->second
.hb_front_max
[index
] = front_max
;
5440 ++i
->second
.hb_index
;
5444 std::lock_guard
l(service
.stat_lock
);
5445 service
.osd_stat
.hb_pingtime
[from
].last_update
= now
.sec();
5446 service
.osd_stat
.hb_pingtime
[from
].back_last
= back_pingtime
;
5449 uint32_t min
= UINT_MAX
;
5453 uint32_t size
= (uint32_t)i
->second
.hb_back_pingtime
.size();
5454 for (int32_t k
= size
- 1 ; k
>= 0; --k
) {
5456 int index
= (i
->second
.hb_index
+ k
) % size
;
5457 total
+= i
->second
.hb_back_pingtime
[index
];
5458 if (i
->second
.hb_back_min
[index
] < min
)
5459 min
= i
->second
.hb_back_min
[index
];
5460 if (i
->second
.hb_back_max
[index
] > max
)
5461 max
= i
->second
.hb_back_max
[index
];
5462 if (count
== 1 || count
== 5 || count
== 15) {
5463 service
.osd_stat
.hb_pingtime
[from
].back_pingtime
[which
] = total
/ count
;
5464 service
.osd_stat
.hb_pingtime
[from
].back_min
[which
] = min
;
5465 service
.osd_stat
.hb_pingtime
[from
].back_max
[which
] = max
;
5472 if (i
->second
.con_front
!= NULL
) {
5473 service
.osd_stat
.hb_pingtime
[from
].front_last
= front_pingtime
;
5480 for (int32_t k
= size
- 1 ; k
>= 0; --k
) {
5482 int index
= (i
->second
.hb_index
+ k
) % size
;
5483 total
+= i
->second
.hb_front_pingtime
[index
];
5484 if (i
->second
.hb_front_min
[index
] < min
)
5485 min
= i
->second
.hb_front_min
[index
];
5486 if (i
->second
.hb_front_max
[index
] > max
)
5487 max
= i
->second
.hb_front_max
[index
];
5488 if (count
== 1 || count
== 5 || count
== 15) {
5489 service
.osd_stat
.hb_pingtime
[from
].front_pingtime
[which
] = total
/ count
;
5490 service
.osd_stat
.hb_pingtime
[from
].front_min
[which
] = min
;
5491 service
.osd_stat
.hb_pingtime
[from
].front_max
[which
] = max
;
5500 std::lock_guard
l(service
.stat_lock
);
5501 service
.osd_stat
.hb_pingtime
[from
].back_last
= back_pingtime
;
5502 if (i
->second
.con_front
!= NULL
)
5503 service
.osd_stat
.hb_pingtime
[from
].front_last
= front_pingtime
;
5505 i
->second
.ping_history
.erase(i
->second
.ping_history
.begin(), ++acked
);
5508 if (i
->second
.is_healthy(now
)) {
5509 // Cancel false reports
5510 auto failure_queue_entry
= failure_queue
.find(from
);
5511 if (failure_queue_entry
!= failure_queue
.end()) {
5512 dout(10) << "handle_osd_ping canceling queued "
5513 << "failure report for osd." << from
<< dendl
;
5514 failure_queue
.erase(failure_queue_entry
);
5517 auto failure_pending_entry
= failure_pending
.find(from
);
5518 if (failure_pending_entry
!= failure_pending
.end()) {
5519 dout(10) << "handle_osd_ping canceling in-flight "
5520 << "failure report for osd." << from
<< dendl
;
5521 send_still_alive(curmap
->get_epoch(),
5523 failure_pending_entry
->second
.second
);
5524 failure_pending
.erase(failure_pending_entry
);
5528 // old replies, deprecated by newly sent pings.
5529 dout(10) << "handle_osd_ping no pending ping(sent at " << m
->ping_stamp
5530 << ") is found, treat as covered by newly sent pings "
5537 curmap
->is_up(from
)) {
5539 ConnectionRef cluster_con
= service
.get_con_osd_cluster(
5540 from
, curmap
->get_epoch());
5542 service
.maybe_share_map(cluster_con
.get(), curmap
, m
->map_epoch
);
5547 s
->stamps
->got_ping_reply(
5551 dout(20) << __func__
<< " new stamps " << *s
->stamps
<< dendl
;
5555 case MOSDPing::YOU_DIED
:
5556 dout(10) << "handle_osd_ping " << m
->get_source_inst()
5557 << " says i am down in " << m
->map_epoch
<< dendl
;
5558 osdmap_subscribe(curmap
->get_epoch()+1, false);
5562 heartbeat_lock
.unlock();
5566 void OSD::heartbeat_entry()
5568 std::unique_lock
l(heartbeat_lock
);
5571 while (!heartbeat_stop
) {
5575 if (cct
->_conf
.get_val
<bool>("debug_disable_randomized_ping")) {
5576 wait
= (float)cct
->_conf
->osd_heartbeat_interval
;
5578 wait
= .5 + ((float)(rand() % 10)/10.0) * (float)cct
->_conf
->osd_heartbeat_interval
;
5580 auto w
= ceph::make_timespan(wait
);
5581 dout(30) << "heartbeat_entry sleeping for " << wait
<< dendl
;
5582 heartbeat_cond
.wait_for(l
, w
);
5585 dout(30) << "heartbeat_entry woke up" << dendl
;
5589 void OSD::heartbeat_check()
5591 ceph_assert(ceph_mutex_is_locked(heartbeat_lock
));
5592 utime_t now
= ceph_clock_now();
5594 // check for incoming heartbeats (move me elsewhere?)
5595 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5596 p
!= heartbeat_peers
.end();
5599 if (p
->second
.first_tx
== utime_t()) {
5600 dout(25) << "heartbeat_check we haven't sent ping to osd." << p
->first
5601 << " yet, skipping" << dendl
;
5605 dout(25) << "heartbeat_check osd." << p
->first
5606 << " first_tx " << p
->second
.first_tx
5607 << " last_tx " << p
->second
.last_tx
5608 << " last_rx_back " << p
->second
.last_rx_back
5609 << " last_rx_front " << p
->second
.last_rx_front
5611 if (p
->second
.is_unhealthy(now
)) {
5612 utime_t oldest_deadline
= p
->second
.ping_history
.begin()->second
.first
;
5613 if (p
->second
.last_rx_back
== utime_t() ||
5614 p
->second
.last_rx_front
== utime_t()) {
5615 derr
<< "heartbeat_check: no reply from "
5616 << p
->second
.con_front
->get_peer_addr().get_sockaddr()
5617 << " osd." << p
->first
5618 << " ever on either front or back, first ping sent "
5619 << p
->second
.first_tx
5620 << " (oldest deadline " << oldest_deadline
<< ")"
5623 failure_queue
[p
->first
] = p
->second
.first_tx
;
5625 derr
<< "heartbeat_check: no reply from "
5626 << p
->second
.con_front
->get_peer_addr().get_sockaddr()
5627 << " osd." << p
->first
<< " since back " << p
->second
.last_rx_back
5628 << " front " << p
->second
.last_rx_front
5629 << " (oldest deadline " << oldest_deadline
<< ")"
5632 failure_queue
[p
->first
] = std::min(p
->second
.last_rx_back
, p
->second
.last_rx_front
);
5638 void OSD::heartbeat()
5640 ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock
));
5641 dout(30) << "heartbeat" << dendl
;
5645 int hb_interval
= cct
->_conf
->osd_heartbeat_interval
;
5646 int n_samples
= 86400;
5647 if (hb_interval
> 1) {
5648 n_samples
/= hb_interval
;
5653 if (getloadavg(loadavgs
, 1) == 1) {
5654 logger
->set(l_osd_loadavg
, 100 * loadavgs
[0]);
5655 daily_loadavg
= (daily_loadavg
* (n_samples
- 1) + loadavgs
[0]) / n_samples
;
5656 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg
<< dendl
;
5659 dout(30) << "heartbeat checking stats" << dendl
;
5661 // refresh peer list and osd stats
5662 vector
<int> hb_peers
;
5663 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5664 p
!= heartbeat_peers
.end();
5666 hb_peers
.push_back(p
->first
);
5668 auto new_stat
= service
.set_osd_stat(hb_peers
, get_num_pgs());
5669 dout(5) << __func__
<< " " << new_stat
<< dendl
;
5670 ceph_assert(new_stat
.statfs
.total
);
5673 float ratio
= service
.compute_adjusted_ratio(new_stat
, &pratio
);
5675 service
.check_full_status(ratio
, pratio
);
5677 utime_t now
= ceph_clock_now();
5678 auto mnow
= service
.get_mnow();
5679 utime_t deadline
= now
;
5680 deadline
+= cct
->_conf
->osd_heartbeat_grace
;
5683 for (map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.begin();
5684 i
!= heartbeat_peers
.end();
5686 int peer
= i
->first
;
5687 Session
*s
= static_cast<Session
*>(i
->second
.con_back
->get_priv().get());
5689 dout(30) << "heartbeat osd." << peer
<< " has no open con" << dendl
;
5692 dout(30) << "heartbeat sending ping to osd." << peer
<< dendl
;
5694 i
->second
.last_tx
= now
;
5695 if (i
->second
.first_tx
== utime_t())
5696 i
->second
.first_tx
= now
;
5697 i
->second
.ping_history
[now
] = make_pair(deadline
,
5698 HeartbeatInfo::HEARTBEAT_MAX_CONN
);
5699 if (i
->second
.hb_interval_start
== utime_t())
5700 i
->second
.hb_interval_start
= now
;
5702 std::optional
<ceph::signedspan
> delta_ub
;
5703 s
->stamps
->sent_ping(&delta_ub
);
5705 i
->second
.con_back
->send_message(
5706 new MOSDPing(monc
->get_fsid(),
5707 service
.get_osdmap_epoch(),
5712 service
.get_up_epoch(),
5713 cct
->_conf
->osd_heartbeat_min_size
,
5716 if (i
->second
.con_front
)
5717 i
->second
.con_front
->send_message(
5718 new MOSDPing(monc
->get_fsid(),
5719 service
.get_osdmap_epoch(),
5724 service
.get_up_epoch(),
5725 cct
->_conf
->osd_heartbeat_min_size
,
5729 logger
->set(l_osd_hb_to
, heartbeat_peers
.size());
5731 // hmm.. am i all alone?
5732 dout(30) << "heartbeat lonely?" << dendl
;
5733 if (heartbeat_peers
.empty()) {
5734 if (now
- last_mon_heartbeat
> cct
->_conf
->osd_mon_heartbeat_interval
&& is_active()) {
5735 last_mon_heartbeat
= now
;
5736 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl
;
5737 osdmap_subscribe(get_osdmap_epoch() + 1, false);
5741 dout(30) << "heartbeat done" << dendl
;
5744 bool OSD::heartbeat_reset(Connection
*con
)
5746 std::lock_guard
l(heartbeat_lock
);
5747 auto s
= con
->get_priv();
5748 dout(20) << __func__
<< " con " << con
<< " s " << s
.get() << dendl
;
5749 con
->set_priv(nullptr);
5751 if (is_stopping()) {
5754 auto session
= static_cast<Session
*>(s
.get());
5755 auto p
= heartbeat_peers
.find(session
->peer
);
5756 if (p
!= heartbeat_peers
.end() &&
5757 (p
->second
.con_back
== con
||
5758 p
->second
.con_front
== con
)) {
5759 dout(10) << "heartbeat_reset failed hb con " << con
<< " for osd." << p
->second
.peer
5760 << ", reopening" << dendl
;
5761 p
->second
.clear_mark_down(con
);
5762 pair
<ConnectionRef
,ConnectionRef
> newcon
= service
.get_con_osd_hb(p
->second
.peer
, p
->second
.epoch
);
5764 p
->second
.con_back
= newcon
.first
.get();
5765 p
->second
.con_back
->set_priv(s
);
5766 if (newcon
.second
) {
5767 p
->second
.con_front
= newcon
.second
.get();
5768 p
->second
.con_front
->set_priv(s
);
5770 p
->second
.ping_history
.clear();
5772 dout(10) << "heartbeat_reset failed hb con " << con
<< " for osd." << p
->second
.peer
5773 << ", raced with osdmap update, closing out peer" << dendl
;
5774 heartbeat_peers
.erase(p
);
5777 dout(10) << "heartbeat_reset closing (old) failed hb con " << con
<< dendl
;
5785 // =========================================
5789 ceph_assert(ceph_mutex_is_locked(osd_lock
));
5790 dout(10) << "tick" << dendl
;
5792 utime_t now
= ceph_clock_now();
5793 // throw out any obsolete markdown log
5794 utime_t grace
= utime_t(cct
->_conf
->osd_max_markdown_period
, 0);
5795 while (!osd_markdown_log
.empty() &&
5796 osd_markdown_log
.front() + grace
< now
)
5797 osd_markdown_log
.pop_front();
5799 if (is_active() || is_waiting_for_healthy()) {
5800 maybe_update_heartbeat_peers();
5803 if (is_waiting_for_healthy()) {
5807 if (is_waiting_for_healthy() || is_booting()) {
5808 std::lock_guard
l(heartbeat_lock
);
5809 if (now
- last_mon_heartbeat
> cct
->_conf
->osd_mon_heartbeat_interval
) {
5810 last_mon_heartbeat
= now
;
5811 dout(1) << __func__
<< " checking mon for new map" << dendl
;
5812 osdmap_subscribe(get_osdmap_epoch() + 1, false);
5818 // scrub purged_snaps every deep scrub interval
5820 const utime_t last
= superblock
.last_purged_snaps_scrub
;
5821 utime_t next
= last
;
5822 next
+= cct
->_conf
->osd_scrub_min_interval
;
5824 // use a seed that is stable for each scrub interval, but varies
5825 // by OSD to avoid any herds.
5826 rng
.seed(whoami
+ superblock
.last_purged_snaps_scrub
.sec());
5827 double r
= (rng() % 1024) / 1024;
5829 cct
->_conf
->osd_scrub_min_interval
*
5830 cct
->_conf
->osd_scrub_interval_randomize_ratio
* r
;
5831 if (next
< ceph_clock_now()) {
5832 dout(20) << __func__
<< " last_purged_snaps_scrub " << last
5833 << " next " << next
<< " ... now" << dendl
;
5834 scrub_purged_snaps();
5836 dout(20) << __func__
<< " last_purged_snaps_scrub " << last
5837 << " next " << next
<< dendl
;
5841 tick_timer
.add_event_after(get_tick_interval(), new C_Tick(this));
5844 void OSD::tick_without_osd_lock()
5846 ceph_assert(ceph_mutex_is_locked(tick_timer_lock
));
5847 dout(10) << "tick_without_osd_lock" << dendl
;
5849 logger
->set(l_osd_cached_crc
, buffer::get_cached_crc());
5850 logger
->set(l_osd_cached_crc_adjusted
, buffer::get_cached_crc_adjusted());
5851 logger
->set(l_osd_missed_crc
, buffer::get_missed_crc());
5853 // refresh osd stats
5854 struct store_statfs_t stbuf
;
5855 osd_alert_list_t alerts
;
5856 int r
= store
->statfs(&stbuf
, &alerts
);
5857 ceph_assert(r
== 0);
5858 service
.set_statfs(stbuf
, alerts
);
5860 // osd_lock is not being held, which means the OSD state
5861 // might change when doing the monitor report
5862 if (is_active() || is_waiting_for_healthy()) {
5864 std::lock_guard l
{heartbeat_lock
};
5867 map_lock
.lock_shared();
5868 std::lock_guard
l(mon_report_lock
);
5871 utime_t now
= ceph_clock_now();
5872 if (service
.need_fullness_update() ||
5873 now
- last_mon_report
> cct
->_conf
->osd_mon_report_interval
) {
5874 last_mon_report
= now
;
5878 map_lock
.unlock_shared();
5880 epoch_t max_waiting_epoch
= 0;
5881 for (auto s
: shards
) {
5882 max_waiting_epoch
= std::max(max_waiting_epoch
,
5883 s
->get_max_waiting_epoch());
5885 if (max_waiting_epoch
> get_osdmap()->get_epoch()) {
5886 dout(20) << __func__
<< " max_waiting_epoch " << max_waiting_epoch
5887 << ", requesting new map" << dendl
;
5888 osdmap_subscribe(superblock
.newest_map
+ 1, false);
5893 if (!scrub_random_backoff()) {
5896 service
.promote_throttle_recalibrate();
5897 resume_creating_pg();
5898 bool need_send_beacon
= false;
5899 const auto now
= ceph::coarse_mono_clock::now();
5901 // borrow lec lock to pretect last_sent_beacon from changing
5902 std::lock_guard l
{min_last_epoch_clean_lock
};
5903 const auto elapsed
= now
- last_sent_beacon
;
5904 if (chrono::duration_cast
<chrono::seconds
>(elapsed
).count() >
5905 cct
->_conf
->osd_beacon_report_interval
) {
5906 need_send_beacon
= true;
5909 if (need_send_beacon
) {
5914 mgrc
.update_daemon_health(get_health_metrics());
5915 service
.kick_recovery_queue();
5916 tick_timer_without_osd_lock
.add_event_after(get_tick_interval(),
5917 new C_Tick_WithoutOSDLock(this));
5921 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5922 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
5923 // setomapheader <pool-id> [namespace/]<obj-name> <header>
5924 // getomap <pool> [namespace/]<obj-name>
5925 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
5926 // injectmdataerr [namespace/]<obj-name> [shardid]
5927 // injectdataerr [namespace/]<obj-name> [shardid]
5929 // set_recovery_delay [utime]
5930 void TestOpsSocketHook::test_ops(OSDService
*service
, ObjectStore
*store
,
5931 std::string_view command
,
5932 const cmdmap_t
& cmdmap
, ostream
&ss
)
5935 //Support changing the omap on a single osd by using the Admin Socket to
5936 //directly request the osd make a change.
5937 if (command
== "setomapval" || command
== "rmomapkey" ||
5938 command
== "setomapheader" || command
== "getomap" ||
5939 command
== "truncobj" || command
== "injectmdataerr" ||
5940 command
== "injectdataerr"
5944 OSDMapRef curmap
= service
->get_osdmap();
5949 cmd_getval(cmdmap
, "pool", poolstr
);
5950 pool
= curmap
->lookup_pg_pool_name(poolstr
);
5951 //If we can't find it by name then maybe id specified
5952 if (pool
< 0 && isdigit(poolstr
[0]))
5953 pool
= atoll(poolstr
.c_str());
5955 ss
<< "Invalid pool '" << poolstr
<< "''";
5959 string objname
, nspace
;
5960 cmd_getval(cmdmap
, "objname", objname
);
5961 std::size_t found
= objname
.find_first_of('/');
5962 if (found
!= string::npos
) {
5963 nspace
= objname
.substr(0, found
);
5964 objname
= objname
.substr(found
+1);
5966 object_locator_t
oloc(pool
, nspace
);
5967 r
= curmap
->object_locator_to_pg(object_t(objname
), oloc
, rawpg
);
5970 ss
<< "Invalid namespace/objname";
5975 cmd_getval(cmdmap
, "shardid", shardid
, int64_t(shard_id_t::NO_SHARD
));
5976 hobject_t
obj(object_t(objname
), string(""), CEPH_NOSNAP
, rawpg
.ps(), pool
, nspace
);
5977 ghobject_t
gobj(obj
, ghobject_t::NO_GEN
, shard_id_t(uint8_t(shardid
)));
5978 spg_t
pgid(curmap
->raw_pg_to_pg(rawpg
), shard_id_t(shardid
));
5979 if (curmap
->pg_is_ec(rawpg
)) {
5980 if ((command
!= "injectdataerr") && (command
!= "injectmdataerr")) {
5981 ss
<< "Must not call on ec pool, except injectdataerr or injectmdataerr";
5986 ObjectStore::Transaction t
;
5988 if (command
== "setomapval") {
5989 map
<string
, bufferlist
> newattrs
;
5992 cmd_getval(cmdmap
, "key", key
);
5993 cmd_getval(cmdmap
, "val", valstr
);
5996 newattrs
[key
] = val
;
5997 t
.omap_setkeys(coll_t(pgid
), ghobject_t(obj
), newattrs
);
5998 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6000 ss
<< "error=" << r
;
6003 } else if (command
== "rmomapkey") {
6005 cmd_getval(cmdmap
, "key", key
);
6007 t
.omap_rmkey(coll_t(pgid
), ghobject_t(obj
), key
);
6008 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6010 ss
<< "error=" << r
;
6013 } else if (command
== "setomapheader") {
6014 bufferlist newheader
;
6017 cmd_getval(cmdmap
, "header", headerstr
);
6018 newheader
.append(headerstr
);
6019 t
.omap_setheader(coll_t(pgid
), ghobject_t(obj
), newheader
);
6020 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6022 ss
<< "error=" << r
;
6025 } else if (command
== "getomap") {
6026 //Debug: Output entire omap
6028 map
<string
, bufferlist
> keyvals
;
6029 auto ch
= store
->open_collection(coll_t(pgid
));
6031 ss
<< "unable to open collection for " << pgid
;
6034 r
= store
->omap_get(ch
, ghobject_t(obj
), &hdrbl
, &keyvals
);
6036 ss
<< "header=" << string(hdrbl
.c_str(), hdrbl
.length());
6037 for (map
<string
, bufferlist
>::iterator it
= keyvals
.begin();
6038 it
!= keyvals
.end(); ++it
)
6039 ss
<< " key=" << (*it
).first
<< " val="
6040 << string((*it
).second
.c_str(), (*it
).second
.length());
6042 ss
<< "error=" << r
;
6045 } else if (command
== "truncobj") {
6047 cmd_getval(cmdmap
, "len", trunclen
);
6048 t
.truncate(coll_t(pgid
), ghobject_t(obj
), trunclen
);
6049 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6051 ss
<< "error=" << r
;
6054 } else if (command
== "injectdataerr") {
6055 store
->inject_data_error(gobj
);
6057 } else if (command
== "injectmdataerr") {
6058 store
->inject_mdata_error(gobj
);
6063 if (command
== "set_recovery_delay") {
6065 cmd_getval(cmdmap
, "utime", delay
, (int64_t)0);
6068 int r
= service
->cct
->_conf
.set_val("osd_recovery_delay_start",
6071 ss
<< "set_recovery_delay: error setting "
6072 << "osd_recovery_delay_start to '" << delay
<< "': error "
6076 service
->cct
->_conf
.apply_changes(nullptr);
6077 ss
<< "set_recovery_delay: set osd_recovery_delay_start "
6078 << "to " << service
->cct
->_conf
->osd_recovery_delay_start
;
6081 if (command
== "injectfull") {
6084 OSDService::s_names state
;
6085 cmd_getval(cmdmap
, "type", type
, string("full"));
6086 cmd_getval(cmdmap
, "count", count
, (int64_t)-1);
6087 if (type
== "none" || count
== 0) {
6091 state
= service
->get_full_state(type
);
6092 if (state
== OSDService::s_names::INVALID
) {
6093 ss
<< "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
6096 service
->set_injectfull(state
, count
);
6099 ss
<< "Internal error - command=" << command
;
6102 // =========================================
6104 void OSD::ms_handle_connect(Connection
*con
)
6106 dout(10) << __func__
<< " con " << con
<< dendl
;
6107 if (con
->get_peer_type() == CEPH_ENTITY_TYPE_MON
) {
6108 std::lock_guard
l(osd_lock
);
6111 dout(10) << __func__
<< " on mon" << dendl
;
6115 } else if (is_booting()) {
6116 _send_boot(); // resend boot message
6118 map_lock
.lock_shared();
6119 std::lock_guard
l2(mon_report_lock
);
6121 utime_t now
= ceph_clock_now();
6122 last_mon_report
= now
;
6124 // resend everything, it's a new session
6127 service
.requeue_pg_temp();
6128 service
.clear_sent_ready_to_merge();
6129 service
.send_pg_temp();
6130 service
.send_ready_to_merge();
6131 service
.send_pg_created();
6135 map_lock
.unlock_shared();
6137 send_beacon(ceph::coarse_mono_clock::now());
6141 // full map requests may happen while active or pre-boot
6142 if (requested_full_first
) {
6143 rerequest_full_maps();
6148 void OSD::ms_handle_fast_connect(Connection
*con
)
6150 if (con
->get_peer_type() != CEPH_ENTITY_TYPE_MON
&&
6151 con
->get_peer_type() != CEPH_ENTITY_TYPE_MGR
) {
6152 if (auto s
= ceph::ref_cast
<Session
>(con
->get_priv()); !s
) {
6153 s
= ceph::make_ref
<Session
>(cct
, con
);
6155 dout(10) << " new session (outgoing) " << s
<< " con=" << s
->con
6156 << " addr=" << s
->con
->get_peer_addr() << dendl
;
6157 // we don't connect to clients
6158 ceph_assert(con
->get_peer_type() == CEPH_ENTITY_TYPE_OSD
);
6159 s
->entity_name
.set_type(CEPH_ENTITY_TYPE_OSD
);
6164 void OSD::ms_handle_fast_accept(Connection
*con
)
6166 if (con
->get_peer_type() != CEPH_ENTITY_TYPE_MON
&&
6167 con
->get_peer_type() != CEPH_ENTITY_TYPE_MGR
) {
6168 if (auto s
= ceph::ref_cast
<Session
>(con
->get_priv()); !s
) {
6169 s
= ceph::make_ref
<Session
>(cct
, con
);
6171 dout(10) << "new session (incoming)" << s
<< " con=" << con
6172 << " addr=" << con
->get_peer_addr()
6173 << " must have raced with connect" << dendl
;
6174 ceph_assert(con
->get_peer_type() == CEPH_ENTITY_TYPE_OSD
);
6175 s
->entity_name
.set_type(CEPH_ENTITY_TYPE_OSD
);
6180 bool OSD::ms_handle_reset(Connection
*con
)
6182 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
6183 dout(2) << "ms_handle_reset con " << con
<< " session " << session
.get() << dendl
;
6186 session
->wstate
.reset(con
);
6187 session
->con
->set_priv(nullptr);
6188 session
->con
.reset(); // break con <-> session ref cycle
6189 // note that we break session->con *before* the session_handle_reset
6190 // cleanup below. this avoids a race between us and
6191 // PG::add_backoff, Session::check_backoff, etc.
6192 session_handle_reset(session
);
6196 bool OSD::ms_handle_refused(Connection
*con
)
6198 if (!cct
->_conf
->osd_fast_fail_on_connection_refused
)
6201 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
6202 dout(2) << "ms_handle_refused con " << con
<< " session " << session
.get() << dendl
;
6205 int type
= con
->get_peer_type();
6206 // handle only OSD failures here
6207 if (monc
&& (type
== CEPH_ENTITY_TYPE_OSD
)) {
6208 OSDMapRef osdmap
= get_osdmap();
6210 int id
= osdmap
->identify_osd_on_all_channels(con
->get_peer_addr());
6211 if (id
>= 0 && osdmap
->is_up(id
)) {
6212 // I'm cheating mon heartbeat grace logic, because we know it's not going
6213 // to respawn alone. +1 so we won't hit any boundary case.
6214 monc
->send_mon_message(
6218 osdmap
->get_addrs(id
),
6219 cct
->_conf
->osd_heartbeat_grace
+ 1,
6220 osdmap
->get_epoch(),
6221 MOSDFailure::FLAG_IMMEDIATE
| MOSDFailure::FLAG_FAILED
6229 struct C_OSD_GetVersion
: public Context
{
6231 uint64_t oldest
, newest
;
6232 explicit C_OSD_GetVersion(OSD
*o
) : osd(o
), oldest(0), newest(0) {}
6233 void finish(int r
) override
{
6235 osd
->_got_mon_epochs(oldest
, newest
);
6239 void OSD::start_boot()
6241 if (!_is_healthy()) {
6242 // if we are not healthy, do not mark ourselves up (yet)
6243 dout(1) << "not healthy; waiting to boot" << dendl
;
6244 if (!is_waiting_for_healthy())
6245 start_waiting_for_healthy();
6246 // send pings sooner rather than later
6250 dout(1) << __func__
<< dendl
;
6251 set_state(STATE_PREBOOT
);
6252 dout(10) << "start_boot - have maps " << superblock
.oldest_map
6253 << ".." << superblock
.newest_map
<< dendl
;
6254 C_OSD_GetVersion
*c
= new C_OSD_GetVersion(this);
6255 monc
->get_version("osdmap", &c
->newest
, &c
->oldest
, c
);
6258 void OSD::_got_mon_epochs(epoch_t oldest
, epoch_t newest
)
6260 std::lock_guard
l(osd_lock
);
6262 _preboot(oldest
, newest
);
6266 void OSD::_preboot(epoch_t oldest
, epoch_t newest
)
6268 ceph_assert(is_preboot());
6269 dout(10) << __func__
<< " _preboot mon has osdmaps "
6270 << oldest
<< ".." << newest
<< dendl
;
6272 // ensure our local fullness awareness is accurate
6274 std::lock_guard
l(heartbeat_lock
);
6278 const auto& monmap
= monc
->monmap
;
6279 const auto osdmap
= get_osdmap();
6280 // if our map within recent history, try to add ourselves to the osdmap.
6281 if (osdmap
->get_epoch() == 0) {
6282 derr
<< "waiting for initial osdmap" << dendl
;
6283 } else if (osdmap
->is_destroyed(whoami
)) {
6284 derr
<< "osdmap says I am destroyed" << dendl
;
6285 // provide a small margin so we don't livelock seeing if we
6286 // un-destroyed ourselves.
6287 if (osdmap
->get_epoch() > newest
- 1) {
6290 } else if (osdmap
->is_noup(whoami
)) {
6291 derr
<< "osdmap NOUP flag is set, waiting for it to clear" << dendl
;
6292 } else if (!osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
6293 derr
<< "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
6295 } else if (osdmap
->require_osd_release
< ceph_release_t::luminous
) {
6296 derr
<< "osdmap require_osd_release < luminous; please upgrade to luminous"
6298 } else if (service
.need_fullness_update()) {
6299 derr
<< "osdmap fullness state needs update" << dendl
;
6301 } else if (monmap
.min_mon_release
>= ceph_release_t::octopus
&&
6302 superblock
.purged_snaps_last
< superblock
.current_epoch
) {
6303 dout(10) << __func__
<< " purged_snaps_last " << superblock
.purged_snaps_last
6304 << " < newest_map " << superblock
.current_epoch
<< dendl
;
6305 _get_purged_snaps();
6306 } else if (osdmap
->get_epoch() >= oldest
- 1 &&
6307 osdmap
->get_epoch() + cct
->_conf
->osd_map_message_max
> newest
) {
6309 // wait for pgs to fully catch up in a different thread, since
6310 // this thread might be required for splitting and merging PGs to
6312 boot_finisher
.queue(
6315 std::unique_lock
l(osd_lock
);
6317 dout(10) << __func__
<< " waiting for peering work to drain"
6320 for (auto shard
: shards
) {
6321 shard
->wait_min_pg_epoch(get_osdmap_epoch());
6332 // get all the latest maps
6333 if (osdmap
->get_epoch() + 1 >= oldest
)
6334 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
6336 osdmap_subscribe(oldest
- 1, true);
6339 void OSD::_get_purged_snaps()
6341 // NOTE: this is a naive, stateless implementaiton. it may send multiple
6342 // overlapping requests to the mon, which will be somewhat inefficient, but
6343 // it should be reliable.
6344 dout(10) << __func__
<< " purged_snaps_last " << superblock
.purged_snaps_last
6345 << ", newest_map " << superblock
.current_epoch
<< dendl
;
6346 MMonGetPurgedSnaps
*m
= new MMonGetPurgedSnaps(
6347 superblock
.purged_snaps_last
+ 1,
6348 superblock
.current_epoch
+ 1);
6349 monc
->send_mon_message(m
);
6352 void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply
*m
)
6354 dout(10) << __func__
<< " " << *m
<< dendl
;
6355 ObjectStore::Transaction t
;
6356 if (!is_preboot() ||
6357 m
->last
< superblock
.purged_snaps_last
) {
6360 SnapMapper::record_purged_snaps(cct
, store
, service
.meta_ch
,
6361 make_purged_snaps_oid(), &t
,
6363 superblock
.purged_snaps_last
= m
->last
;
6364 write_superblock(t
);
6365 store
->queue_transaction(
6368 service
.publish_superblock(superblock
);
6369 if (m
->last
< superblock
.current_epoch
) {
6370 _get_purged_snaps();
6378 void OSD::send_full_update()
6380 if (!service
.need_fullness_update())
6383 if (service
.is_full()) {
6384 state
= CEPH_OSD_FULL
;
6385 } else if (service
.is_backfillfull()) {
6386 state
= CEPH_OSD_BACKFILLFULL
;
6387 } else if (service
.is_nearfull()) {
6388 state
= CEPH_OSD_NEARFULL
;
6391 OSDMap::calc_state_set(state
, s
);
6392 dout(10) << __func__
<< " want state " << s
<< dendl
;
6393 monc
->send_mon_message(new MOSDFull(get_osdmap_epoch(), state
));
6396 void OSD::start_waiting_for_healthy()
6398 dout(1) << "start_waiting_for_healthy" << dendl
;
6399 set_state(STATE_WAITING_FOR_HEALTHY
);
6400 last_heartbeat_resample
= utime_t();
6402 // subscribe to osdmap updates, in case our peers really are known to be dead
6403 osdmap_subscribe(get_osdmap_epoch() + 1, false);
6406 bool OSD::_is_healthy()
6408 if (!cct
->get_heartbeat_map()->is_healthy()) {
6409 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl
;
6413 if (is_waiting_for_healthy()) {
6414 utime_t now
= ceph_clock_now();
6415 if (osd_markdown_log
.empty()) {
6416 dout(5) << __func__
<< " force returning true since last markdown"
6417 << " was " << cct
->_conf
->osd_max_markdown_period
6418 << "s ago" << dendl
;
6421 std::lock_guard
l(heartbeat_lock
);
6422 int num
= 0, up
= 0;
6423 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
6424 p
!= heartbeat_peers
.end();
6426 if (p
->second
.is_healthy(now
))
6430 if ((float)up
< (float)num
* cct
->_conf
->osd_heartbeat_min_healthy_ratio
) {
6431 dout(1) << "is_healthy false -- only " << up
<< "/" << num
<< " up peers (less than "
6432 << int(cct
->_conf
->osd_heartbeat_min_healthy_ratio
* 100.0) << "%)" << dendl
;
6440 void OSD::_send_boot()
6442 dout(10) << "_send_boot" << dendl
;
6443 Connection
*local_connection
=
6444 cluster_messenger
->get_loopback_connection().get();
6445 entity_addrvec_t client_addrs
= client_messenger
->get_myaddrs();
6446 entity_addrvec_t cluster_addrs
= cluster_messenger
->get_myaddrs();
6447 entity_addrvec_t hb_back_addrs
= hb_back_server_messenger
->get_myaddrs();
6448 entity_addrvec_t hb_front_addrs
= hb_front_server_messenger
->get_myaddrs();
6450 dout(20) << " initial client_addrs " << client_addrs
6451 << ", cluster_addrs " << cluster_addrs
6452 << ", hb_back_addrs " << hb_back_addrs
6453 << ", hb_front_addrs " << hb_front_addrs
6455 if (cluster_messenger
->set_addr_unknowns(client_addrs
)) {
6456 dout(10) << " assuming cluster_addrs match client_addrs "
6457 << client_addrs
<< dendl
;
6458 cluster_addrs
= cluster_messenger
->get_myaddrs();
6460 if (auto session
= local_connection
->get_priv(); !session
) {
6461 cluster_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6464 local_connection
= hb_back_server_messenger
->get_loopback_connection().get();
6465 if (hb_back_server_messenger
->set_addr_unknowns(cluster_addrs
)) {
6466 dout(10) << " assuming hb_back_addrs match cluster_addrs "
6467 << cluster_addrs
<< dendl
;
6468 hb_back_addrs
= hb_back_server_messenger
->get_myaddrs();
6470 if (auto session
= local_connection
->get_priv(); !session
) {
6471 hb_back_server_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6474 local_connection
= hb_front_server_messenger
->get_loopback_connection().get();
6475 if (hb_front_server_messenger
->set_addr_unknowns(client_addrs
)) {
6476 dout(10) << " assuming hb_front_addrs match client_addrs "
6477 << client_addrs
<< dendl
;
6478 hb_front_addrs
= hb_front_server_messenger
->get_myaddrs();
6480 if (auto session
= local_connection
->get_priv(); !session
) {
6481 hb_front_server_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6484 // we now know what our front and back addrs will be, and we are
6485 // about to tell the mon what our metadata (including numa bindings)
6486 // are, so now is a good time!
6487 set_numa_affinity();
6489 MOSDBoot
*mboot
= new MOSDBoot(
6490 superblock
, get_osdmap_epoch(), service
.get_boot_epoch(),
6491 hb_back_addrs
, hb_front_addrs
, cluster_addrs
,
6493 dout(10) << " final client_addrs " << client_addrs
6494 << ", cluster_addrs " << cluster_addrs
6495 << ", hb_back_addrs " << hb_back_addrs
6496 << ", hb_front_addrs " << hb_front_addrs
6498 _collect_metadata(&mboot
->metadata
);
6499 monc
->send_mon_message(mboot
);
6500 set_state(STATE_BOOTING
);
6503 void OSD::_collect_metadata(map
<string
,string
> *pm
)
6506 (*pm
)["osd_data"] = dev_path
;
6507 if (store
->get_type() == "filestore") {
6508 // not applicable for bluestore
6509 (*pm
)["osd_journal"] = journal_path
;
6511 (*pm
)["front_addr"] = stringify(client_messenger
->get_myaddrs());
6512 (*pm
)["back_addr"] = stringify(cluster_messenger
->get_myaddrs());
6513 (*pm
)["hb_front_addr"] = stringify(hb_front_server_messenger
->get_myaddrs());
6514 (*pm
)["hb_back_addr"] = stringify(hb_back_server_messenger
->get_myaddrs());
6517 (*pm
)["osd_objectstore"] = store
->get_type();
6518 (*pm
)["rotational"] = store_is_rotational
? "1" : "0";
6519 (*pm
)["journal_rotational"] = journal_is_rotational
? "1" : "0";
6520 (*pm
)["default_device_class"] = store
->get_default_device_class();
6521 string osdspec_affinity
;
6522 int r
= store
->read_meta("osdspec_affinity", &osdspec_affinity
);
6523 if (r
< 0 || osdspec_affinity
.empty()) {
6524 osdspec_affinity
= "";
6526 (*pm
)["osdspec_affinity"] = osdspec_affinity
;
6527 store
->collect_metadata(pm
);
6529 collect_sys_info(pm
, cct
);
6531 (*pm
)["front_iface"] = pick_iface(
6533 client_messenger
->get_myaddrs().front().get_sockaddr_storage());
6534 (*pm
)["back_iface"] = pick_iface(
6536 cluster_messenger
->get_myaddrs().front().get_sockaddr_storage());
6542 set
<string
> unknown
;
6543 for (auto nm
: { "front_iface", "back_iface" }) {
6544 if (!(*pm
)[nm
].size()) {
6549 int r
= get_iface_numa_node((*pm
)[nm
], &n
);
6551 unknown
.insert((*pm
)[nm
]);
6559 if (unknown
.size()) {
6560 (*pm
)["network_numa_unknown_ifaces"] = stringify(unknown
);
6562 if (!nodes
.empty()) {
6563 (*pm
)["network_numa_nodes"] = stringify(nodes
);
6565 if (node
>= 0 && nodes
.size() == 1 && unknown
.empty()) {
6566 (*pm
)["network_numa_node"] = stringify(node
);
6570 if (numa_node
>= 0) {
6571 (*pm
)["numa_node"] = stringify(numa_node
);
6572 (*pm
)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size
,
6576 set
<string
> devnames
;
6577 store
->get_devices(&devnames
);
6578 map
<string
,string
> errs
;
6579 get_device_metadata(devnames
, pm
, &errs
);
6580 for (auto& i
: errs
) {
6581 dout(1) << __func__
<< " " << i
.first
<< ": " << i
.second
<< dendl
;
6583 dout(10) << __func__
<< " " << *pm
<< dendl
;
6586 void OSD::queue_want_up_thru(epoch_t want
)
6588 std::shared_lock map_locker
{map_lock
};
6589 epoch_t cur
= get_osdmap()->get_up_thru(whoami
);
6590 std::lock_guard
report_locker(mon_report_lock
);
6591 if (want
> up_thru_wanted
) {
6592 dout(10) << "queue_want_up_thru now " << want
<< " (was " << up_thru_wanted
<< ")"
6593 << ", currently " << cur
6595 up_thru_wanted
= want
;
6598 dout(10) << "queue_want_up_thru want " << want
<< " <= queued " << up_thru_wanted
6599 << ", currently " << cur
6604 void OSD::send_alive()
6606 ceph_assert(ceph_mutex_is_locked(mon_report_lock
));
6607 const auto osdmap
= get_osdmap();
6608 if (!osdmap
->exists(whoami
))
6610 epoch_t up_thru
= osdmap
->get_up_thru(whoami
);
6611 dout(10) << "send_alive up_thru currently " << up_thru
<< " want " << up_thru_wanted
<< dendl
;
6612 if (up_thru_wanted
> up_thru
) {
6613 dout(10) << "send_alive want " << up_thru_wanted
<< dendl
;
6614 monc
->send_mon_message(new MOSDAlive(osdmap
->get_epoch(), up_thru_wanted
));
6618 void OSD::request_full_map(epoch_t first
, epoch_t last
)
6620 dout(10) << __func__
<< " " << first
<< ".." << last
6621 << ", previously requested "
6622 << requested_full_first
<< ".." << requested_full_last
<< dendl
;
6623 ceph_assert(ceph_mutex_is_locked(osd_lock
));
6624 ceph_assert(first
> 0 && last
> 0);
6625 ceph_assert(first
<= last
);
6626 ceph_assert(first
>= requested_full_first
); // we shouldn't ever ask for older maps
6627 if (requested_full_first
== 0) {
6629 requested_full_first
= first
;
6630 requested_full_last
= last
;
6631 } else if (last
<= requested_full_last
) {
6635 // additional request
6636 first
= requested_full_last
+ 1;
6637 requested_full_last
= last
;
6639 MMonGetOSDMap
*req
= new MMonGetOSDMap
;
6640 req
->request_full(first
, last
);
6641 monc
->send_mon_message(req
);
6644 void OSD::got_full_map(epoch_t e
)
6646 ceph_assert(requested_full_first
<= requested_full_last
);
6647 ceph_assert(ceph_mutex_is_locked(osd_lock
));
6648 if (requested_full_first
== 0) {
6649 dout(20) << __func__
<< " " << e
<< ", nothing requested" << dendl
;
6652 if (e
< requested_full_first
) {
6653 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6654 << ".." << requested_full_last
6655 << ", ignoring" << dendl
;
6658 if (e
>= requested_full_last
) {
6659 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6660 << ".." << requested_full_last
<< ", resetting" << dendl
;
6661 requested_full_first
= requested_full_last
= 0;
6665 requested_full_first
= e
+ 1;
6667 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6668 << ".." << requested_full_last
6669 << ", still need more" << dendl
;
6672 void OSD::requeue_failures()
6674 std::lock_guard
l(heartbeat_lock
);
6675 unsigned old_queue
= failure_queue
.size();
6676 unsigned old_pending
= failure_pending
.size();
6677 for (auto p
= failure_pending
.begin(); p
!= failure_pending
.end(); ) {
6678 failure_queue
[p
->first
] = p
->second
.first
;
6679 failure_pending
.erase(p
++);
6681 dout(10) << __func__
<< " " << old_queue
<< " + " << old_pending
<< " -> "
6682 << failure_queue
.size() << dendl
;
6685 void OSD::send_failures()
6687 ceph_assert(ceph_mutex_is_locked(map_lock
));
6688 ceph_assert(ceph_mutex_is_locked(mon_report_lock
));
6689 std::lock_guard
l(heartbeat_lock
);
6690 utime_t now
= ceph_clock_now();
6691 const auto osdmap
= get_osdmap();
6692 while (!failure_queue
.empty()) {
6693 int osd
= failure_queue
.begin()->first
;
6694 if (!failure_pending
.count(osd
)) {
6695 int failed_for
= (int)(double)(now
- failure_queue
.begin()->second
);
6696 monc
->send_mon_message(
6700 osdmap
->get_addrs(osd
),
6702 osdmap
->get_epoch()));
6703 failure_pending
[osd
] = make_pair(failure_queue
.begin()->second
,
6704 osdmap
->get_addrs(osd
));
6706 failure_queue
.erase(osd
);
6710 void OSD::send_still_alive(epoch_t epoch
, int osd
, const entity_addrvec_t
&addrs
)
6712 MOSDFailure
*m
= new MOSDFailure(monc
->get_fsid(), osd
, addrs
, 0, epoch
,
6713 MOSDFailure::FLAG_ALIVE
);
6714 monc
->send_mon_message(m
);
6717 void OSD::cancel_pending_failures()
6719 std::lock_guard
l(heartbeat_lock
);
6720 auto it
= failure_pending
.begin();
6721 while (it
!= failure_pending
.end()) {
6722 dout(10) << __func__
<< " canceling in-flight failure report for osd."
6723 << it
->first
<< dendl
;
6724 send_still_alive(get_osdmap_epoch(), it
->first
, it
->second
.second
);
6725 failure_pending
.erase(it
++);
6729 void OSD::send_beacon(const ceph::coarse_mono_clock::time_point
& now
)
6731 const auto& monmap
= monc
->monmap
;
6732 // send beacon to mon even if we are just connected, and the monmap is not
6733 // initialized yet by then.
6734 if (monmap
.epoch
> 0 &&
6735 monmap
.get_required_features().contains_all(
6736 ceph::features::mon::FEATURE_LUMINOUS
)) {
6737 dout(20) << __func__
<< " sending" << dendl
;
6738 MOSDBeacon
* beacon
= nullptr;
6740 std::lock_guard l
{min_last_epoch_clean_lock
};
6741 beacon
= new MOSDBeacon(get_osdmap_epoch(),
6742 min_last_epoch_clean
,
6743 superblock
.last_purged_snaps_scrub
);
6744 beacon
->pgs
= min_last_epoch_clean_pgs
;
6745 last_sent_beacon
= now
;
6747 monc
->send_mon_message(beacon
);
6749 dout(20) << __func__
<< " not sending" << dendl
;
6753 void OSD::handle_command(MCommand
*m
)
6755 ConnectionRef con
= m
->get_connection();
6756 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
6758 con
->send_message(new MCommandReply(m
, -EACCES
));
6762 if (!session
->caps
.allow_all()) {
6763 con
->send_message(new MCommandReply(m
, -EACCES
));
6767 cct
->get_admin_socket()->queue_tell_command(m
);
6772 class unlock_guard
{
6775 explicit unlock_guard(ceph::mutex
& mutex
)
6780 unlock_guard(unlock_guard
&) = delete;
6787 void OSD::scrub_purged_snaps()
6789 dout(10) << __func__
<< dendl
;
6790 ceph_assert(ceph_mutex_is_locked(osd_lock
));
6791 SnapMapper::Scrubber
s(cct
, store
, service
.meta_ch
,
6792 make_snapmapper_oid(),
6793 make_purged_snaps_oid());
6794 clog
->debug() << "purged_snaps scrub starts";
6797 if (s
.stray
.size()) {
6798 clog
->debug() << "purged_snaps scrub found " << s
.stray
.size() << " strays";
6800 clog
->debug() << "purged_snaps scrub ok";
6802 set
<pair
<spg_t
,snapid_t
>> queued
;
6803 for (auto& [pool
, snap
, hash
, shard
] : s
.stray
) {
6804 const pg_pool_t
*pi
= get_osdmap()->get_pg_pool(pool
);
6806 dout(20) << __func__
<< " pool " << pool
<< " dne" << dendl
;
6809 pg_t
pgid(pi
->raw_hash_to_pg(hash
), pool
);
6810 spg_t
spgid(pgid
, shard
);
6811 pair
<spg_t
,snapid_t
> p(spgid
, snap
);
6812 if (queued
.count(p
)) {
6813 dout(20) << __func__
<< " pg " << spgid
<< " snap " << snap
6814 << " already queued" << dendl
;
6817 PGRef pg
= lookup_lock_pg(spgid
);
6819 dout(20) << __func__
<< " pg " << spgid
<< " not found" << dendl
;
6823 dout(10) << __func__
<< " requeue pg " << spgid
<< " " << pg
<< " snap "
6825 pg
->queue_snap_retrim(snap
);
6829 if (is_stopping()) {
6832 dout(10) << __func__
<< " done queueing pgs, updating superblock" << dendl
;
6833 ObjectStore::Transaction t
;
6834 superblock
.last_purged_snaps_scrub
= ceph_clock_now();
6835 write_superblock(t
);
6836 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
6837 ceph_assert(tr
== 0);
6839 send_beacon(ceph::coarse_mono_clock::now());
6841 dout(10) << __func__
<< " done" << dendl
;
6844 void OSD::probe_smart(const string
& only_devid
, ostream
& ss
)
6846 set
<string
> devnames
;
6847 store
->get_devices(&devnames
);
6848 uint64_t smart_timeout
= cct
->_conf
.get_val
<uint64_t>(
6849 "osd_smart_report_timeout");
6851 // == typedef std::map<std::string, mValue> mObject;
6852 json_spirit::mObject json_map
;
6854 for (auto dev
: devnames
) {
6855 // smartctl works only on physical devices; filter out any logical device
6856 if (dev
.find("dm-") == 0) {
6861 string devid
= get_device_id(dev
, &err
);
6862 if (devid
.size() == 0) {
6863 dout(10) << __func__
<< " no unique id for dev " << dev
<< " ("
6864 << err
<< "), skipping" << dendl
;
6867 if (only_devid
.size() && devid
!= only_devid
) {
6871 json_spirit::mValue smart_json
;
6872 if (block_device_get_metrics(dev
, smart_timeout
,
6874 dout(10) << "block_device_get_metrics failed for /dev/" << dev
<< dendl
;
6877 json_map
[devid
] = smart_json
;
6879 json_spirit::write(json_map
, ss
, json_spirit::pretty_print
);
6882 bool OSD::heartbeat_dispatch(Message
*m
)
6884 dout(30) << "heartbeat_dispatch " << m
<< dendl
;
6885 switch (m
->get_type()) {
6888 dout(10) << "ping from " << m
->get_source_inst() << dendl
;
6893 handle_osd_ping(static_cast<MOSDPing
*>(m
));
6897 dout(0) << "dropping unexpected message " << *m
<< " from " << m
->get_source_inst() << dendl
;
6904 bool OSD::ms_dispatch(Message
*m
)
6906 dout(20) << "OSD::ms_dispatch: " << *m
<< dendl
;
6907 if (m
->get_type() == MSG_OSD_MARK_ME_DOWN
) {
6908 service
.got_stop_ack();
6916 if (is_stopping()) {
6930 void OSDService::maybe_share_map(
6932 const OSDMapRef
& osdmap
,
6933 epoch_t peer_epoch_lb
)
6935 // NOTE: we assume caller hold something that keeps the Connection itself
6936 // pinned (e.g., an OpRequest's MessageRef).
6937 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
6942 // assume the peer has the newer of the op's sent_epoch and what
6943 // we think we sent them.
6944 session
->sent_epoch_lock
.lock();
6945 if (peer_epoch_lb
> session
->last_sent_epoch
) {
6946 dout(10) << __func__
<< " con " << con
6947 << " " << con
->get_peer_addr()
6948 << " map epoch " << session
->last_sent_epoch
6949 << " -> " << peer_epoch_lb
<< " (as per caller)" << dendl
;
6950 session
->last_sent_epoch
= peer_epoch_lb
;
6952 epoch_t last_sent_epoch
= session
->last_sent_epoch
;
6953 session
->sent_epoch_lock
.unlock();
6955 if (osdmap
->get_epoch() <= last_sent_epoch
) {
6959 send_incremental_map(last_sent_epoch
, con
, osdmap
);
6960 last_sent_epoch
= osdmap
->get_epoch();
6962 session
->sent_epoch_lock
.lock();
6963 if (session
->last_sent_epoch
< last_sent_epoch
) {
6964 dout(10) << __func__
<< " con " << con
6965 << " " << con
->get_peer_addr()
6966 << " map epoch " << session
->last_sent_epoch
6967 << " -> " << last_sent_epoch
<< " (shared)" << dendl
;
6968 session
->last_sent_epoch
= last_sent_epoch
;
6970 session
->sent_epoch_lock
.unlock();
6973 void OSD::dispatch_session_waiting(const ceph::ref_t
<Session
>& session
, OSDMapRef osdmap
)
6975 ceph_assert(ceph_mutex_is_locked(session
->session_dispatch_lock
));
6977 auto i
= session
->waiting_on_map
.begin();
6978 while (i
!= session
->waiting_on_map
.end()) {
6979 OpRequestRef op
= &(*i
);
6980 ceph_assert(ms_can_fast_dispatch(op
->get_req()));
6981 auto m
= op
->get_req
<MOSDFastDispatchOp
>();
6982 if (m
->get_min_epoch() > osdmap
->get_epoch()) {
6985 session
->waiting_on_map
.erase(i
++);
6989 if (m
->get_type() == CEPH_MSG_OSD_OP
) {
6990 pg_t actual_pgid
= osdmap
->raw_pg_to_pg(
6991 static_cast<const MOSDOp
*>(m
)->get_pg());
6992 if (!osdmap
->get_primary_shard(actual_pgid
, &pgid
)) {
6996 pgid
= m
->get_spg();
6998 enqueue_op(pgid
, std::move(op
), m
->get_map_epoch());
7001 if (session
->waiting_on_map
.empty()) {
7002 clear_session_waiting_on_map(session
);
7004 register_session_waiting_on_map(session
);
7008 void OSD::ms_fast_dispatch(Message
*m
)
7011 if (service
.is_stopping()) {
7017 switch (m
->get_type()) {
7019 dout(10) << "ping from " << m
->get_source() << dendl
;
7022 case MSG_OSD_FORCE_RECOVERY
:
7023 handle_fast_force_recovery(static_cast<MOSDForceRecovery
*>(m
));
7025 case MSG_OSD_SCRUB2
:
7026 handle_fast_scrub(static_cast<MOSDScrub2
*>(m
));
7029 case MSG_OSD_PG_CREATE2
:
7030 return handle_fast_pg_create(static_cast<MOSDPGCreate2
*>(m
));
7031 case MSG_OSD_PG_QUERY
:
7032 return handle_fast_pg_query(static_cast<MOSDPGQuery
*>(m
));
7033 case MSG_OSD_PG_NOTIFY
:
7034 return handle_fast_pg_notify(static_cast<MOSDPGNotify
*>(m
));
7035 case MSG_OSD_PG_INFO
:
7036 return handle_fast_pg_info(static_cast<MOSDPGInfo
*>(m
));
7037 case MSG_OSD_PG_REMOVE
:
7038 return handle_fast_pg_remove(static_cast<MOSDPGRemove
*>(m
));
7040 // these are single-pg messages that handle themselves
7041 case MSG_OSD_PG_LOG
:
7042 case MSG_OSD_PG_TRIM
:
7043 case MSG_OSD_PG_NOTIFY2
:
7044 case MSG_OSD_PG_QUERY2
:
7045 case MSG_OSD_PG_INFO2
:
7046 case MSG_OSD_BACKFILL_RESERVE
:
7047 case MSG_OSD_RECOVERY_RESERVE
:
7048 case MSG_OSD_PG_LEASE
:
7049 case MSG_OSD_PG_LEASE_ACK
:
7051 MOSDPeeringOp
*pm
= static_cast<MOSDPeeringOp
*>(m
);
7052 if (require_osd_peer(pm
)) {
7053 enqueue_peering_evt(
7055 PGPeeringEventRef(pm
->get_event()));
7062 OpRequestRef op
= op_tracker
.create_request
<OpRequest
, Message
*>(m
);
7065 osd_reqid_t reqid
= op
->get_reqid();
7067 tracepoint(osd
, ms_fast_dispatch
, reqid
.name
._type
,
7068 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
7072 op
->osd_trace
.init("osd op", &trace_endpoint
, &m
->trace
);
7074 // note sender epoch, min req's epoch
7075 op
->sent_epoch
= static_cast<MOSDFastDispatchOp
*>(m
)->get_map_epoch();
7076 op
->min_epoch
= static_cast<MOSDFastDispatchOp
*>(m
)->get_min_epoch();
7077 ceph_assert(op
->min_epoch
<= op
->sent_epoch
); // sanity check!
7079 service
.maybe_inject_dispatch_delay();
7081 if (m
->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT
) ||
7082 m
->get_type() != CEPH_MSG_OSD_OP
) {
7083 // queue it directly
7085 static_cast<MOSDFastDispatchOp
*>(m
)->get_spg(),
7087 static_cast<MOSDFastDispatchOp
*>(m
)->get_map_epoch());
7089 // legacy client, and this is an MOSDOp (the *only* fast dispatch
7090 // message that didn't have an explicit spg_t); we need to map
7091 // them to an spg_t while preserving delivery order.
7092 auto priv
= m
->get_connection()->get_priv();
7093 if (auto session
= static_cast<Session
*>(priv
.get()); session
) {
7094 std::lock_guard l
{session
->session_dispatch_lock
};
7096 session
->waiting_on_map
.push_back(*op
);
7097 OSDMapRef nextmap
= service
.get_nextmap_reserved();
7098 dispatch_session_waiting(session
, nextmap
);
7099 service
.release_map(nextmap
);
7102 OID_EVENT_TRACE_WITH_MSG(m
, "MS_FAST_DISPATCH_END", false);
7105 int OSD::ms_handle_authentication(Connection
*con
)
7108 auto s
= ceph::ref_cast
<Session
>(con
->get_priv());
7110 s
= ceph::make_ref
<Session
>(cct
, con
);
7112 s
->entity_name
= con
->get_peer_entity_name();
7113 dout(10) << __func__
<< " new session " << s
<< " con " << s
->con
7114 << " entity " << s
->entity_name
7115 << " addr " << con
->get_peer_addrs() << dendl
;
7117 dout(10) << __func__
<< " existing session " << s
<< " con " << s
->con
7118 << " entity " << s
->entity_name
7119 << " addr " << con
->get_peer_addrs() << dendl
;
7122 AuthCapsInfo
&caps_info
= con
->get_peer_caps_info();
7123 if (caps_info
.allow_all
) {
7124 s
->caps
.set_allow_all();
7125 } else if (caps_info
.caps
.length() > 0) {
7126 bufferlist::const_iterator p
= caps_info
.caps
.cbegin();
7131 catch (buffer::error
& e
) {
7132 dout(10) << __func__
<< " session " << s
<< " " << s
->entity_name
7133 << " failed to decode caps string" << dendl
;
7137 bool success
= s
->caps
.parse(str
);
7139 dout(10) << __func__
<< " session " << s
7140 << " " << s
->entity_name
7141 << " has caps " << s
->caps
<< " '" << str
<< "'" << dendl
;
7144 dout(10) << __func__
<< " session " << s
<< " " << s
->entity_name
7145 << " failed to parse caps '" << str
<< "'" << dendl
;
7153 void OSD::do_waiters()
7155 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7157 dout(10) << "do_waiters -- start" << dendl
;
7158 while (!finished
.empty()) {
7159 OpRequestRef next
= finished
.front();
7160 finished
.pop_front();
7163 dout(10) << "do_waiters -- finish" << dendl
;
7166 void OSD::dispatch_op(OpRequestRef op
)
7168 switch (op
->get_req()->get_type()) {
7170 case MSG_OSD_PG_CREATE
:
7171 handle_pg_create(op
);
7176 void OSD::_dispatch(Message
*m
)
7178 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7179 dout(20) << "_dispatch " << m
<< " " << *m
<< dendl
;
7181 switch (m
->get_type()) {
7182 // -- don't need OSDMap --
7184 // map and replication
7185 case CEPH_MSG_OSD_MAP
:
7186 handle_osd_map(static_cast<MOSDMap
*>(m
));
7188 case MSG_MON_GET_PURGED_SNAPS_REPLY
:
7189 handle_get_purged_snaps_reply(static_cast<MMonGetPurgedSnapsReply
*>(m
));
7194 handle_scrub(static_cast<MOSDScrub
*>(m
));
7198 handle_command(static_cast<MCommand
*>(m
));
7201 // -- need OSDMap --
7203 case MSG_OSD_PG_CREATE
:
7205 OpRequestRef op
= op_tracker
.create_request
<OpRequest
, Message
*>(m
);
7207 op
->osd_trace
.init("osd op", &trace_endpoint
, &m
->trace
);
7208 // no map? starting up?
7209 if (!get_osdmap()) {
7210 dout(7) << "no OSDMap, not booted" << dendl
;
7211 logger
->inc(l_osd_waiting_for_map
);
7212 waiting_for_osdmap
.push_back(op
);
7213 op
->mark_delayed("no osdmap");
7223 // remove me post-nautilus
7224 void OSD::handle_scrub(MOSDScrub
*m
)
7226 dout(10) << "handle_scrub " << *m
<< dendl
;
7227 if (!require_mon_or_mgr_peer(m
)) {
7231 if (m
->fsid
!= monc
->get_fsid()) {
7232 dout(0) << "handle_scrub fsid " << m
->fsid
<< " != " << monc
->get_fsid()
7241 if (!m
->scrub_pgs
.empty()) {
7243 for (auto pgid
: m
->scrub_pgs
) {
7245 if (get_osdmap()->get_primary_shard(pgid
, &pcand
) &&
7246 std::find(spgs
.begin(), spgs
.end(), pcand
) != spgs
.end()) {
7253 for (auto pgid
: spgs
) {
7254 enqueue_peering_evt(
7257 std::make_shared
<PGPeeringEvent
>(
7260 PeeringState::RequestScrub(m
->deep
, m
->repair
))));
7266 void OSD::handle_fast_scrub(MOSDScrub2
*m
)
7268 dout(10) << __func__
<< " " << *m
<< dendl
;
7269 if (!require_mon_or_mgr_peer(m
)) {
7273 if (m
->fsid
!= monc
->get_fsid()) {
7274 dout(0) << __func__
<< " fsid " << m
->fsid
<< " != " << monc
->get_fsid()
7279 for (auto pgid
: m
->scrub_pgs
) {
7280 enqueue_peering_evt(
7283 std::make_shared
<PGPeeringEvent
>(
7286 PeeringState::RequestScrub(m
->deep
, m
->repair
))));
7291 bool OSD::scrub_random_backoff()
7293 bool coin_flip
= (rand() / (double)RAND_MAX
>=
7294 cct
->_conf
->osd_scrub_backoff_ratio
);
7296 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl
;
7302 OSDService::ScrubJob::ScrubJob(CephContext
* cct
,
7303 const spg_t
& pg
, const utime_t
& timestamp
,
7304 double pool_scrub_min_interval
,
7305 double pool_scrub_max_interval
, bool must
)
7308 sched_time(timestamp
),
7311 // if not explicitly requested, postpone the scrub with a random delay
7313 double scrub_min_interval
= pool_scrub_min_interval
> 0 ?
7314 pool_scrub_min_interval
: cct
->_conf
->osd_scrub_min_interval
;
7315 double scrub_max_interval
= pool_scrub_max_interval
> 0 ?
7316 pool_scrub_max_interval
: cct
->_conf
->osd_scrub_max_interval
;
7318 sched_time
+= scrub_min_interval
;
7319 double r
= rand() / (double)RAND_MAX
;
7321 scrub_min_interval
* cct
->_conf
->osd_scrub_interval_randomize_ratio
* r
;
7322 if (scrub_max_interval
== 0) {
7323 deadline
= utime_t();
7325 deadline
+= scrub_max_interval
;
7331 bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob
& rhs
) const {
7332 if (sched_time
< rhs
.sched_time
)
7334 if (sched_time
> rhs
.sched_time
)
7336 return pgid
< rhs
.pgid
;
7339 double OSD::scrub_sleep_time(bool must_scrub
)
7342 return cct
->_conf
->osd_scrub_sleep
;
7344 utime_t now
= ceph_clock_now();
7345 if (scrub_time_permit(now
)) {
7346 return cct
->_conf
->osd_scrub_sleep
;
7348 double normal_sleep
= cct
->_conf
->osd_scrub_sleep
;
7349 double extended_sleep
= cct
->_conf
->osd_scrub_extended_sleep
;
7350 return std::max(extended_sleep
, normal_sleep
);
7353 bool OSD::scrub_time_permit(utime_t now
)
7356 time_t tt
= now
.sec();
7357 localtime_r(&tt
, &bdt
);
7359 bool day_permit
= false;
7360 if (cct
->_conf
->osd_scrub_begin_week_day
< cct
->_conf
->osd_scrub_end_week_day
) {
7361 if (bdt
.tm_wday
>= cct
->_conf
->osd_scrub_begin_week_day
&& bdt
.tm_wday
< cct
->_conf
->osd_scrub_end_week_day
) {
7365 if (bdt
.tm_wday
>= cct
->_conf
->osd_scrub_begin_week_day
|| bdt
.tm_wday
< cct
->_conf
->osd_scrub_end_week_day
) {
7371 dout(20) << __func__
<< " should run between week day " << cct
->_conf
->osd_scrub_begin_week_day
7372 << " - " << cct
->_conf
->osd_scrub_end_week_day
7373 << " now " << bdt
.tm_wday
<< " = no" << dendl
;
7377 bool time_permit
= false;
7378 if (cct
->_conf
->osd_scrub_begin_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7379 if (bdt
.tm_hour
>= cct
->_conf
->osd_scrub_begin_hour
&& bdt
.tm_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7383 if (bdt
.tm_hour
>= cct
->_conf
->osd_scrub_begin_hour
|| bdt
.tm_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7388 dout(20) << __func__
<< " should run between " << cct
->_conf
->osd_scrub_begin_hour
7389 << " - " << cct
->_conf
->osd_scrub_end_hour
7390 << " now " << bdt
.tm_hour
<< " = no" << dendl
;
7392 dout(20) << __func__
<< " should run between " << cct
->_conf
->osd_scrub_begin_hour
7393 << " - " << cct
->_conf
->osd_scrub_end_hour
7394 << " now " << bdt
.tm_hour
<< " = yes" << dendl
;
7399 bool OSD::scrub_load_below_threshold()
7402 if (getloadavg(loadavgs
, 3) != 3) {
7403 dout(10) << __func__
<< " couldn't read loadavgs\n" << dendl
;
7407 // allow scrub if below configured threshold
7408 long cpus
= sysconf(_SC_NPROCESSORS_ONLN
);
7409 double loadavg_per_cpu
= cpus
> 0 ? loadavgs
[0] / cpus
: loadavgs
[0];
7410 if (loadavg_per_cpu
< cct
->_conf
->osd_scrub_load_threshold
) {
7411 dout(20) << __func__
<< " loadavg per cpu " << loadavg_per_cpu
7412 << " < max " << cct
->_conf
->osd_scrub_load_threshold
7413 << " = yes" << dendl
;
7417 // allow scrub if below daily avg and currently decreasing
7418 if (loadavgs
[0] < daily_loadavg
&& loadavgs
[0] < loadavgs
[2]) {
7419 dout(20) << __func__
<< " loadavg " << loadavgs
[0]
7420 << " < daily_loadavg " << daily_loadavg
7421 << " and < 15m avg " << loadavgs
[2]
7422 << " = yes" << dendl
;
7426 dout(20) << __func__
<< " loadavg " << loadavgs
[0]
7427 << " >= max " << cct
->_conf
->osd_scrub_load_threshold
7428 << " and ( >= daily_loadavg " << daily_loadavg
7429 << " or >= 15m avg " << loadavgs
[2]
7430 << ") = no" << dendl
;
7434 void OSD::sched_scrub()
7436 // if not permitted, fail fast
7437 if (!service
.can_inc_scrubs()) {
7440 bool allow_requested_repair_only
= false;
7441 if (service
.is_recovery_active() && !cct
->_conf
->osd_scrub_during_recovery
) {
7442 if (!cct
->_conf
->osd_repair_during_recovery
) {
7443 dout(20) << __func__
<< " not scheduling scrubs due to active recovery" << dendl
;
7446 dout(10) << __func__
7447 << " will only schedule explicitly requested repair due to active recovery"
7449 allow_requested_repair_only
= true;
7452 utime_t now
= ceph_clock_now();
7453 bool time_permit
= scrub_time_permit(now
);
7454 bool load_is_low
= scrub_load_below_threshold();
7455 dout(20) << "sched_scrub load_is_low=" << (int)load_is_low
<< dendl
;
7457 OSDService::ScrubJob scrub
;
7458 if (service
.first_scrub_stamp(&scrub
)) {
7460 dout(30) << "sched_scrub examine " << scrub
.pgid
<< " at " << scrub
.sched_time
<< dendl
;
7462 if (scrub
.sched_time
> now
) {
7463 // save ourselves some effort
7464 dout(10) << "sched_scrub " << scrub
.pgid
<< " scheduled at " << scrub
.sched_time
7465 << " > " << now
<< dendl
;
7469 if ((scrub
.deadline
.is_zero() || scrub
.deadline
>= now
) && !(time_permit
&& load_is_low
)) {
7470 dout(10) << __func__
<< " not scheduling scrub for " << scrub
.pgid
<< " due to "
7471 << (!time_permit
? "time not permit" : "high load") << dendl
;
7475 PGRef pg
= _lookup_lock_pg(scrub
.pgid
);
7478 // This has already started, so go on to the next scrub job
7479 if (pg
->scrubber
.active
) {
7481 dout(30) << __func__
<< ": already in progress pgid " << scrub
.pgid
<< dendl
;
7484 // Skip other kinds of scrubing if only explicitly requested repairing is allowed
7485 if (allow_requested_repair_only
&& !pg
->scrubber
.must_repair
) {
7487 dout(10) << __func__
<< " skip " << scrub
.pgid
7488 << " because repairing is not explicitly requested on it"
7492 // If it is reserving, let it resolve before going to the next scrub job
7493 if (pg
->scrubber
.local_reserved
&& !pg
->scrubber
.active
) {
7495 dout(30) << __func__
<< ": reserve in progress pgid " << scrub
.pgid
<< dendl
;
7498 dout(10) << "sched_scrub scrubbing " << scrub
.pgid
<< " at " << scrub
.sched_time
7499 << (pg
->get_must_scrub() ? ", explicitly requested" :
7500 (load_is_low
? ", load_is_low" : " deadline < now"))
7502 if (pg
->sched_scrub()) {
7507 } while (service
.next_scrub_stamp(scrub
, &scrub
));
7509 dout(20) << "sched_scrub done" << dendl
;
7512 void OSD::resched_all_scrubs()
7514 dout(10) << __func__
<< ": start" << dendl
;
7515 const vector
<spg_t
> pgs
= [this] {
7517 OSDService::ScrubJob job
;
7518 if (service
.first_scrub_stamp(&job
)) {
7520 pgs
.push_back(job
.pgid
);
7521 } while (service
.next_scrub_stamp(job
, &job
));
7525 for (auto& pgid
: pgs
) {
7526 dout(20) << __func__
<< ": examine " << pgid
<< dendl
;
7527 PGRef pg
= _lookup_lock_pg(pgid
);
7530 if (!pg
->scrubber
.must_scrub
&& !pg
->scrubber
.need_auto
) {
7531 dout(15) << __func__
<< ": reschedule " << pgid
<< dendl
;
7532 pg
->on_info_history_change();
7536 dout(10) << __func__
<< ": done" << dendl
;
7539 MPGStats
* OSD::collect_pg_stats()
7541 // This implementation unconditionally sends every is_primary PG's
7542 // stats every time we're called. This has equivalent cost to the
7543 // previous implementation's worst case where all PGs are busy and
7544 // their stats are always enqueued for sending.
7545 std::shared_lock l
{map_lock
};
7547 osd_stat_t cur_stat
= service
.get_osd_stat();
7548 cur_stat
.os_perf_stat
= store
->get_cur_stats();
7550 auto m
= new MPGStats(monc
->get_fsid(), get_osdmap_epoch());
7551 m
->osd_stat
= cur_stat
;
7553 std::lock_guard lec
{min_last_epoch_clean_lock
};
7554 min_last_epoch_clean
= get_osdmap_epoch();
7555 min_last_epoch_clean_pgs
.clear();
7557 std::set
<int64_t> pool_set
;
7560 for (auto& pg
: pgs
) {
7561 auto pool
= pg
->pg_id
.pgid
.pool();
7562 pool_set
.emplace((int64_t)pool
);
7563 if (!pg
->is_primary()) {
7566 pg
->get_pg_stats([&](const pg_stat_t
& s
, epoch_t lec
) {
7567 m
->pg_stat
[pg
->pg_id
.pgid
] = s
;
7568 min_last_epoch_clean
= min(min_last_epoch_clean
, lec
);
7569 min_last_epoch_clean_pgs
.push_back(pg
->pg_id
.pgid
);
7573 bool per_pool_stats
= false;
7574 bool per_pool_omap_stats
= false;
7575 for (auto p
: pool_set
) {
7576 int r
= store
->pool_statfs(p
, &st
, &per_pool_omap_stats
);
7577 if (r
== -ENOTSUP
) {
7581 m
->pool_stat
[p
] = st
;
7582 per_pool_stats
= true;
7586 // indicate whether we are reporting per-pool stats
7587 m
->osd_stat
.num_osds
= 1;
7588 m
->osd_stat
.num_per_pool_osds
= per_pool_stats
? 1 : 0;
7589 m
->osd_stat
.num_per_pool_omap_osds
= per_pool_omap_stats
? 1 : 0;
7594 vector
<DaemonHealthMetric
> OSD::get_health_metrics()
7596 vector
<DaemonHealthMetric
> metrics
;
7598 utime_t oldest_secs
;
7599 const utime_t now
= ceph_clock_now();
7601 too_old
-= cct
->_conf
.get_val
<double>("osd_op_complaint_time");
7603 TrackedOpRef oldest_op
;
7604 auto count_slow_ops
= [&](TrackedOp
& op
) {
7605 if (op
.get_initiated() < too_old
) {
7607 ss
<< "slow request " << op
.get_desc()
7609 << op
.get_initiated()
7611 << op
.state_string();
7612 lgeneric_subdout(cct
,osd
,20) << ss
.str() << dendl
;
7613 clog
->warn() << ss
.str();
7615 if (!oldest_op
|| op
.get_initiated() < oldest_op
->get_initiated()) {
7623 if (op_tracker
.visit_ops_in_flight(&oldest_secs
, count_slow_ops
)) {
7625 derr
<< __func__
<< " reporting " << slow
<< " slow ops, oldest is "
7626 << oldest_op
->get_desc() << dendl
;
7628 metrics
.emplace_back(daemon_metric::SLOW_OPS
, slow
, oldest_secs
);
7630 // no news is not good news.
7631 metrics
.emplace_back(daemon_metric::SLOW_OPS
, 0, 0);
7635 std::lock_guard
l(pending_creates_lock
);
7636 auto n_primaries
= pending_creates_from_mon
;
7637 for (const auto& create
: pending_creates_from_osd
) {
7638 if (create
.second
) {
7642 metrics
.emplace_back(daemon_metric::PENDING_CREATING_PGS
, n_primaries
);
7647 // =====================================================
7650 void OSD::wait_for_new_map(OpRequestRef op
)
7653 if (waiting_for_osdmap
.empty()) {
7654 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7657 logger
->inc(l_osd_waiting_for_map
);
7658 waiting_for_osdmap
.push_back(op
);
7659 op
->mark_delayed("wait for new map");
7664 * assimilate new OSDMap(s). scan pgs, etc.
7667 void OSD::note_down_osd(int peer
)
7669 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7670 cluster_messenger
->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer
));
7672 std::lock_guard l
{heartbeat_lock
};
7673 failure_queue
.erase(peer
);
7674 failure_pending
.erase(peer
);
7675 map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.find(peer
);
7676 if (p
!= heartbeat_peers
.end()) {
7677 p
->second
.clear_mark_down();
7678 heartbeat_peers
.erase(p
);
7682 void OSD::note_up_osd(int peer
)
7684 heartbeat_set_peers_need_update();
7687 struct C_OnMapCommit
: public Context
{
7689 epoch_t first
, last
;
7691 C_OnMapCommit(OSD
*o
, epoch_t f
, epoch_t l
, MOSDMap
*m
)
7692 : osd(o
), first(f
), last(l
), msg(m
) {}
7693 void finish(int r
) override
{
7694 osd
->_committed_osd_maps(first
, last
, msg
);
7699 void OSD::osdmap_subscribe(version_t epoch
, bool force_request
)
7701 std::lock_guard
l(osdmap_subscribe_lock
);
7702 if (latest_subscribed_epoch
>= epoch
&& !force_request
)
7705 latest_subscribed_epoch
= std::max
<uint64_t>(epoch
, latest_subscribed_epoch
);
7707 if (monc
->sub_want_increment("osdmap", epoch
, CEPH_SUBSCRIBE_ONETIME
) ||
7713 void OSD::trim_maps(epoch_t oldest
, int nreceived
, bool skip_maps
)
7715 epoch_t min
= std::min(oldest
, service
.map_cache
.cached_key_lower_bound());
7716 if (min
<= superblock
.oldest_map
)
7720 ObjectStore::Transaction t
;
7721 for (epoch_t e
= superblock
.oldest_map
; e
< min
; ++e
) {
7722 dout(20) << " removing old osdmap epoch " << e
<< dendl
;
7723 t
.remove(coll_t::meta(), get_osdmap_pobject_name(e
));
7724 t
.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e
));
7725 superblock
.oldest_map
= e
+ 1;
7727 if (num
>= cct
->_conf
->osd_target_transaction_size
&& num
>= nreceived
) {
7728 service
.publish_superblock(superblock
);
7729 write_superblock(t
);
7730 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
7731 ceph_assert(tr
== 0);
7734 // skip_maps leaves us with a range of old maps if we fail to remove all
7735 // of them before moving superblock.oldest_map forward to the first map
7736 // in the incoming MOSDMap msg. so we should continue removing them in
7737 // this case, even we could do huge series of delete transactions all at
7744 service
.publish_superblock(superblock
);
7745 write_superblock(t
);
7746 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
7747 ceph_assert(tr
== 0);
7749 // we should not remove the cached maps
7750 ceph_assert(min
<= service
.map_cache
.cached_key_lower_bound());
7753 void OSD::handle_osd_map(MOSDMap
*m
)
7755 // wait for pgs to catch up
7757 // we extend the map cache pins to accomodate pgs slow to consume maps
7758 // for some period, until we hit the max_lag_factor bound, at which point
7759 // we block here to stop injesting more maps than they are able to keep
7761 epoch_t max_lag
= cct
->_conf
->osd_map_cache_size
*
7762 m_osd_pg_epoch_max_lag_factor
;
7763 ceph_assert(max_lag
> 0);
7764 epoch_t osd_min
= 0;
7765 for (auto shard
: shards
) {
7766 epoch_t min
= shard
->get_min_pg_epoch();
7767 if (osd_min
== 0 || min
< osd_min
) {
7771 epoch_t osdmap_epoch
= get_osdmap_epoch();
7773 osdmap_epoch
> max_lag
&&
7774 osdmap_epoch
- max_lag
> osd_min
) {
7775 epoch_t need
= osdmap_epoch
- max_lag
;
7776 dout(10) << __func__
<< " waiting for pgs to catch up (need " << need
7777 << " max_lag " << max_lag
<< ")" << dendl
;
7778 for (auto shard
: shards
) {
7779 epoch_t min
= shard
->get_min_pg_epoch();
7781 dout(10) << __func__
<< " waiting for pgs to consume " << need
7782 << " (shard " << shard
->shard_id
<< " min " << min
7783 << ", map cache is " << cct
->_conf
->osd_map_cache_size
7784 << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
7786 unlock_guard unlock
{osd_lock
};
7787 shard
->wait_min_pg_epoch(need
);
7793 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7794 map
<epoch_t
,OSDMapRef
> added_maps
;
7795 map
<epoch_t
,bufferlist
> added_maps_bl
;
7796 if (m
->fsid
!= monc
->get_fsid()) {
7797 dout(0) << "handle_osd_map fsid " << m
->fsid
<< " != "
7798 << monc
->get_fsid() << dendl
;
7802 if (is_initializing()) {
7803 dout(0) << "ignoring osdmap until we have initialized" << dendl
;
7808 auto session
= ceph::ref_cast
<Session
>(m
->get_connection()->get_priv());
7809 if (session
&& !(session
->entity_name
.is_mon() ||
7810 session
->entity_name
.is_osd())) {
7812 dout(10) << "got osd map from Session " << session
7813 << " which we can't take maps from (not a mon or osd)" << dendl
;
7818 // share with the objecter
7820 service
.objecter
->handle_osd_map(m
);
7822 epoch_t first
= m
->get_first();
7823 epoch_t last
= m
->get_last();
7824 dout(3) << "handle_osd_map epochs [" << first
<< "," << last
<< "], i have "
7825 << superblock
.newest_map
7826 << ", src has [" << m
->oldest_map
<< "," << m
->newest_map
<< "]"
7829 logger
->inc(l_osd_map
);
7830 logger
->inc(l_osd_mape
, last
- first
+ 1);
7831 if (first
<= superblock
.newest_map
)
7832 logger
->inc(l_osd_mape_dup
, superblock
.newest_map
- first
+ 1);
7833 if (service
.max_oldest_map
< m
->oldest_map
) {
7834 service
.max_oldest_map
= m
->oldest_map
;
7835 ceph_assert(service
.max_oldest_map
>= superblock
.oldest_map
);
7838 // make sure there is something new, here, before we bother flushing
7839 // the queues and such
7840 if (last
<= superblock
.newest_map
) {
7841 dout(10) << " no new maps here, dropping" << dendl
;
7847 bool skip_maps
= false;
7848 if (first
> superblock
.newest_map
+ 1) {
7849 dout(10) << "handle_osd_map message skips epochs "
7850 << superblock
.newest_map
+ 1 << ".." << (first
-1) << dendl
;
7851 if (m
->oldest_map
<= superblock
.newest_map
+ 1) {
7852 osdmap_subscribe(superblock
.newest_map
+ 1, false);
7856 // always try to get the full range of maps--as many as we can. this
7857 // 1- is good to have
7858 // 2- is at present the only way to ensure that we get a *full* map as
7860 if (m
->oldest_map
< first
) {
7861 osdmap_subscribe(m
->oldest_map
- 1, true);
7868 ObjectStore::Transaction t
;
7869 uint64_t txn_size
= 0;
7871 map
<epoch_t
,mempool::osdmap::map
<int64_t,snap_interval_set_t
>> purged_snaps
;
7873 // store new maps: queue for disk and put in the osdmap cache
7874 epoch_t start
= std::max(superblock
.newest_map
+ 1, first
);
7875 for (epoch_t e
= start
; e
<= last
; e
++) {
7876 if (txn_size
>= t
.get_num_bytes()) {
7877 derr
<< __func__
<< " transaction size overflowed" << dendl
;
7878 ceph_assert(txn_size
< t
.get_num_bytes());
7880 txn_size
= t
.get_num_bytes();
7881 map
<epoch_t
,bufferlist
>::iterator p
;
7882 p
= m
->maps
.find(e
);
7883 if (p
!= m
->maps
.end()) {
7884 dout(10) << "handle_osd_map got full map for epoch " << e
<< dendl
;
7885 OSDMap
*o
= new OSDMap
;
7886 bufferlist
& bl
= p
->second
;
7890 purged_snaps
[e
] = o
->get_new_purged_snaps();
7892 ghobject_t fulloid
= get_osdmap_pobject_name(e
);
7893 t
.write(coll_t::meta(), fulloid
, 0, bl
.length(), bl
);
7894 added_maps
[e
] = add_map(o
);
7895 added_maps_bl
[e
] = bl
;
7900 p
= m
->incremental_maps
.find(e
);
7901 if (p
!= m
->incremental_maps
.end()) {
7902 dout(10) << "handle_osd_map got inc map for epoch " << e
<< dendl
;
7903 bufferlist
& bl
= p
->second
;
7904 ghobject_t oid
= get_inc_osdmap_pobject_name(e
);
7905 t
.write(coll_t::meta(), oid
, 0, bl
.length(), bl
);
7907 OSDMap
*o
= new OSDMap
;
7910 bool got
= get_map_bl(e
- 1, obl
);
7912 auto p
= added_maps_bl
.find(e
- 1);
7913 ceph_assert(p
!= added_maps_bl
.end());
7919 OSDMap::Incremental inc
;
7920 auto p
= bl
.cbegin();
7923 if (o
->apply_incremental(inc
) < 0) {
7924 derr
<< "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc
.fsid
<< dendl
;
7925 ceph_abort_msg("bad fsid");
7929 o
->encode(fbl
, inc
.encode_features
| CEPH_FEATURE_RESERVED
);
7931 bool injected_failure
= false;
7932 if (cct
->_conf
->osd_inject_bad_map_crc_probability
> 0 &&
7933 (rand() % 10000) < cct
->_conf
->osd_inject_bad_map_crc_probability
*10000.0) {
7934 derr
<< __func__
<< " injecting map crc failure" << dendl
;
7935 injected_failure
= true;
7938 if ((inc
.have_crc
&& o
->get_crc() != inc
.full_crc
) || injected_failure
) {
7939 dout(2) << "got incremental " << e
7940 << " but failed to encode full with correct crc; requesting"
7942 clog
->warn() << "failed to encode map e" << e
<< " with expected crc";
7943 dout(20) << "my encoded map was:\n";
7944 fbl
.hexdump(*_dout
);
7947 request_full_map(e
, last
);
7950 // don't continue committing if we failed to enc the first inc map
7952 dout(10) << __func__
<< " bailing because last < start (" << last
<< "<" << start
<< ")" << dendl
;
7959 purged_snaps
[e
] = o
->get_new_purged_snaps();
7961 ghobject_t fulloid
= get_osdmap_pobject_name(e
);
7962 t
.write(coll_t::meta(), fulloid
, 0, fbl
.length(), fbl
);
7963 added_maps
[e
] = add_map(o
);
7964 added_maps_bl
[e
] = fbl
;
7968 ceph_abort_msg("MOSDMap lied about what maps it had?");
7971 // even if this map isn't from a mon, we may have satisfied our subscription
7972 monc
->sub_got("osdmap", last
);
7974 if (!m
->maps
.empty() && requested_full_first
) {
7975 dout(10) << __func__
<< " still missing full maps " << requested_full_first
7976 << ".." << requested_full_last
<< dendl
;
7977 rerequest_full_maps();
7980 if (superblock
.oldest_map
) {
7981 // make sure we at least keep pace with incoming maps
7982 trim_maps(m
->oldest_map
, last
- first
+ 1, skip_maps
);
7983 pg_num_history
.prune(superblock
.oldest_map
);
7986 if (!superblock
.oldest_map
|| skip_maps
)
7987 superblock
.oldest_map
= first
;
7988 superblock
.newest_map
= last
;
7989 superblock
.current_epoch
= last
;
7991 // note in the superblock that we were clean thru the prior epoch
7992 epoch_t boot_epoch
= service
.get_boot_epoch();
7993 if (boot_epoch
&& boot_epoch
>= superblock
.mounted
) {
7994 superblock
.mounted
= boot_epoch
;
7995 superblock
.clean_thru
= last
;
7998 // check for pg_num changes and deleted pools
8000 for (auto& i
: added_maps
) {
8002 if (!(lastmap
= service
.try_get_map(i
.first
- 1))) {
8003 dout(10) << __func__
<< " can't get previous map " << i
.first
- 1
8004 << " probably first start of this osd" << dendl
;
8008 ceph_assert(lastmap
->get_epoch() + 1 == i
.second
->get_epoch());
8009 for (auto& j
: lastmap
->get_pools()) {
8010 if (!i
.second
->have_pg_pool(j
.first
)) {
8011 pg_num_history
.log_pool_delete(i
.first
, j
.first
);
8012 dout(10) << __func__
<< " recording final pg_pool_t for pool "
8013 << j
.first
<< dendl
;
8014 // this information is needed by _make_pg() if have to restart before
8015 // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
8016 ghobject_t obj
= make_final_pool_info_oid(j
.first
);
8018 encode(j
.second
, bl
, CEPH_FEATURES_ALL
);
8019 string name
= lastmap
->get_pool_name(j
.first
);
8021 map
<string
,string
> profile
;
8022 if (lastmap
->get_pg_pool(j
.first
)->is_erasure()) {
8023 profile
= lastmap
->get_erasure_code_profile(
8024 lastmap
->get_pg_pool(j
.first
)->erasure_code_profile
);
8026 encode(profile
, bl
);
8027 t
.write(coll_t::meta(), obj
, 0, bl
.length(), bl
);
8028 } else if (unsigned new_pg_num
= i
.second
->get_pg_num(j
.first
);
8029 new_pg_num
!= j
.second
.get_pg_num()) {
8030 dout(10) << __func__
<< " recording pool " << j
.first
<< " pg_num "
8031 << j
.second
.get_pg_num() << " -> " << new_pg_num
<< dendl
;
8032 pg_num_history
.log_pg_num_change(i
.first
, j
.first
, new_pg_num
);
8035 for (auto& j
: i
.second
->get_pools()) {
8036 if (!lastmap
->have_pg_pool(j
.first
)) {
8037 dout(10) << __func__
<< " recording new pool " << j
.first
<< " pg_num "
8038 << j
.second
.get_pg_num() << dendl
;
8039 pg_num_history
.log_pg_num_change(i
.first
, j
.first
,
8040 j
.second
.get_pg_num());
8045 pg_num_history
.epoch
= last
;
8048 ::encode(pg_num_history
, bl
);
8049 t
.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl
.length(), bl
);
8050 dout(20) << __func__
<< " pg_num_history " << pg_num_history
<< dendl
;
8053 // record new purged_snaps
8054 if (superblock
.purged_snaps_last
== start
- 1) {
8055 SnapMapper::record_purged_snaps(cct
, store
, service
.meta_ch
,
8056 make_purged_snaps_oid(), &t
,
8058 superblock
.purged_snaps_last
= last
;
8060 dout(10) << __func__
<< " superblock purged_snaps_last is "
8061 << superblock
.purged_snaps_last
8062 << ", not recording new purged_snaps" << dendl
;
8065 // superblock and commit
8066 write_superblock(t
);
8067 t
.register_on_commit(new C_OnMapCommit(this, start
, last
, m
));
8068 store
->queue_transaction(
8071 service
.publish_superblock(superblock
);
8074 void OSD::_committed_osd_maps(epoch_t first
, epoch_t last
, MOSDMap
*m
)
8076 dout(10) << __func__
<< " " << first
<< ".." << last
<< dendl
;
8077 if (is_stopping()) {
8078 dout(10) << __func__
<< " bailing, we are shutting down" << dendl
;
8081 std::lock_guard
l(osd_lock
);
8082 if (is_stopping()) {
8083 dout(10) << __func__
<< " bailing, we are shutting down" << dendl
;
8088 ceph_assert(first
<= last
);
8090 bool do_shutdown
= false;
8091 bool do_restart
= false;
8092 bool network_error
= false;
8093 OSDMapRef osdmap
= get_osdmap();
8095 // advance through the new maps
8096 for (epoch_t cur
= first
; cur
<= last
; cur
++) {
8097 dout(10) << " advance to epoch " << cur
8098 << " (<= last " << last
8099 << " <= newest_map " << superblock
.newest_map
8102 OSDMapRef newmap
= get_map(cur
);
8103 ceph_assert(newmap
); // we just cached it above!
8105 // start blacklisting messages sent to peers that go down.
8106 service
.pre_publish_map(newmap
);
8108 // kill connections to newly down osds
8109 bool waited_for_reservations
= false;
8111 osdmap
= get_osdmap();
8112 osdmap
->get_all_osds(old
);
8113 for (set
<int>::iterator p
= old
.begin(); p
!= old
.end(); ++p
) {
8115 osdmap
->is_up(*p
) && // in old map
8116 newmap
->is_down(*p
)) { // but not the new one
8117 if (!waited_for_reservations
) {
8118 service
.await_reserved_maps();
8119 waited_for_reservations
= true;
8122 } else if (*p
!= whoami
&&
8123 osdmap
->is_down(*p
) &&
8124 newmap
->is_up(*p
)) {
8129 if (osdmap
->is_noup(whoami
) != newmap
->is_noup(whoami
)) {
8130 dout(10) << __func__
<< " NOUP flag changed in " << newmap
->get_epoch()
8133 // this captures the case where we sent the boot message while
8134 // NOUP was being set on the mon and our boot request was
8135 // dropped, and then later it is cleared. it imperfectly
8136 // handles the case where our original boot message was not
8137 // dropped and we restart even though we might have booted, but
8138 // that is harmless (boot will just take slightly longer).
8143 osdmap
= std::move(newmap
);
8147 service
.retrieve_epochs(&boot_epoch
, &up_epoch
, NULL
);
8149 osdmap
->is_up(whoami
) &&
8150 osdmap
->get_addrs(whoami
) == client_messenger
->get_myaddrs()) {
8151 up_epoch
= osdmap
->get_epoch();
8152 dout(10) << "up_epoch is " << up_epoch
<< dendl
;
8154 boot_epoch
= osdmap
->get_epoch();
8155 dout(10) << "boot_epoch is " << boot_epoch
<< dendl
;
8157 service
.set_epochs(&boot_epoch
, &up_epoch
, NULL
);
8161 epoch_t _bind_epoch
= service
.get_bind_epoch();
8162 if (osdmap
->is_up(whoami
) &&
8163 osdmap
->get_addrs(whoami
).legacy_equals(
8164 client_messenger
->get_myaddrs()) &&
8165 _bind_epoch
< osdmap
->get_up_from(whoami
)) {
8168 dout(1) << "state: booting -> active" << dendl
;
8169 set_state(STATE_ACTIVE
);
8172 // set incarnation so that osd_reqid_t's we generate for our
8173 // objecter requests are unique across restarts.
8174 service
.objecter
->set_client_incarnation(osdmap
->get_epoch());
8175 cancel_pending_failures();
8179 if (osdmap
->get_epoch() > 0 &&
8181 if (!osdmap
->exists(whoami
)) {
8182 derr
<< "map says i do not exist. shutting down." << dendl
;
8183 do_shutdown
= true; // don't call shutdown() while we have
8184 // everything paused
8185 } else if (osdmap
->is_stop(whoami
)) {
8186 derr
<< "map says i am stopped by admin. shutting down." << dendl
;
8188 } else if (!osdmap
->is_up(whoami
) ||
8189 !osdmap
->get_addrs(whoami
).legacy_equals(
8190 client_messenger
->get_myaddrs()) ||
8191 !osdmap
->get_cluster_addrs(whoami
).legacy_equals(
8192 cluster_messenger
->get_myaddrs()) ||
8193 !osdmap
->get_hb_back_addrs(whoami
).legacy_equals(
8194 hb_back_server_messenger
->get_myaddrs()) ||
8195 !osdmap
->get_hb_front_addrs(whoami
).legacy_equals(
8196 hb_front_server_messenger
->get_myaddrs())) {
8197 if (!osdmap
->is_up(whoami
)) {
8198 if (service
.is_preparing_to_stop() || service
.is_stopping()) {
8199 service
.got_stop_ack();
8201 clog
->warn() << "Monitor daemon marked osd." << whoami
<< " down, "
8202 "but it is still running";
8203 clog
->debug() << "map e" << osdmap
->get_epoch()
8204 << " wrongly marked me down at e"
8205 << osdmap
->get_down_at(whoami
);
8207 if (monc
->monmap
.min_mon_release
>= ceph_release_t::octopus
) {
8208 // note that this is best-effort...
8209 monc
->send_mon_message(
8213 osdmap
->get_epoch()));
8215 } else if (!osdmap
->get_addrs(whoami
).legacy_equals(
8216 client_messenger
->get_myaddrs())) {
8217 clog
->error() << "map e" << osdmap
->get_epoch()
8218 << " had wrong client addr (" << osdmap
->get_addrs(whoami
)
8219 << " != my " << client_messenger
->get_myaddrs() << ")";
8220 } else if (!osdmap
->get_cluster_addrs(whoami
).legacy_equals(
8221 cluster_messenger
->get_myaddrs())) {
8222 clog
->error() << "map e" << osdmap
->get_epoch()
8223 << " had wrong cluster addr ("
8224 << osdmap
->get_cluster_addrs(whoami
)
8225 << " != my " << cluster_messenger
->get_myaddrs() << ")";
8226 } else if (!osdmap
->get_hb_back_addrs(whoami
).legacy_equals(
8227 hb_back_server_messenger
->get_myaddrs())) {
8228 clog
->error() << "map e" << osdmap
->get_epoch()
8229 << " had wrong heartbeat back addr ("
8230 << osdmap
->get_hb_back_addrs(whoami
)
8231 << " != my " << hb_back_server_messenger
->get_myaddrs()
8233 } else if (!osdmap
->get_hb_front_addrs(whoami
).legacy_equals(
8234 hb_front_server_messenger
->get_myaddrs())) {
8235 clog
->error() << "map e" << osdmap
->get_epoch()
8236 << " had wrong heartbeat front addr ("
8237 << osdmap
->get_hb_front_addrs(whoami
)
8238 << " != my " << hb_front_server_messenger
->get_myaddrs()
8242 if (!service
.is_stopping()) {
8243 epoch_t up_epoch
= 0;
8244 epoch_t bind_epoch
= osdmap
->get_epoch();
8245 service
.set_epochs(NULL
,&up_epoch
, &bind_epoch
);
8249 utime_t now
= ceph_clock_now();
8250 utime_t grace
= utime_t(cct
->_conf
->osd_max_markdown_period
, 0);
8251 osd_markdown_log
.push_back(now
);
8252 if ((int)osd_markdown_log
.size() > cct
->_conf
->osd_max_markdown_count
) {
8253 derr
<< __func__
<< " marked down "
8254 << osd_markdown_log
.size()
8255 << " > osd_max_markdown_count "
8256 << cct
->_conf
->osd_max_markdown_count
8257 << " in last " << grace
<< " seconds, shutting down"
8263 start_waiting_for_healthy();
8265 set
<int> avoid_ports
;
8266 #if defined(__FreeBSD__)
8267 // prevent FreeBSD from grabbing the client_messenger port during
8268 // rebinding. In which case a cluster_meesneger will connect also
8270 client_messenger
->get_myaddrs().get_ports(&avoid_ports
);
8272 cluster_messenger
->get_myaddrs().get_ports(&avoid_ports
);
8274 int r
= cluster_messenger
->rebind(avoid_ports
);
8276 do_shutdown
= true; // FIXME: do_restart?
8277 network_error
= true;
8278 derr
<< __func__
<< " marked down:"
8279 << " rebind cluster_messenger failed" << dendl
;
8282 hb_back_server_messenger
->mark_down_all();
8283 hb_front_server_messenger
->mark_down_all();
8284 hb_front_client_messenger
->mark_down_all();
8285 hb_back_client_messenger
->mark_down_all();
8287 reset_heartbeat_peers(true);
8290 } else if (osdmap
->get_epoch() > 0 && osdmap
->is_stop(whoami
)) {
8291 derr
<< "map says i am stopped by admin. shutting down." << dendl
;
8297 check_osdmap_features();
8302 if (is_active() || is_waiting_for_healthy())
8303 maybe_update_heartbeat_peers();
8310 if (network_error
) {
8311 cancel_pending_failures();
8313 // trigger shutdown in a different thread
8314 dout(0) << __func__
<< " shutdown OSD via async signal" << dendl
;
8315 queue_async_signal(SIGINT
);
8317 else if (m
->newest_map
&& m
->newest_map
> last
) {
8318 dout(10) << " msg say newest map is " << m
->newest_map
8319 << ", requesting more" << dendl
;
8320 osdmap_subscribe(osdmap
->get_epoch()+1, false);
8322 else if (is_preboot()) {
8323 if (m
->get_source().is_mon())
8324 _preboot(m
->oldest_map
, m
->newest_map
);
8328 else if (do_restart
)
8333 void OSD::check_osdmap_features()
8335 // adjust required feature bits?
8337 // we have to be a bit careful here, because we are accessing the
8338 // Policy structures without taking any lock. in particular, only
8339 // modify integer values that can safely be read by a racing CPU.
8340 // since we are only accessing existing Policy structures a their
8341 // current memory location, and setting or clearing bits in integer
8342 // fields, and we are the only writer, this is not a problem.
8344 const auto osdmap
= get_osdmap();
8346 Messenger::Policy p
= client_messenger
->get_default_policy();
8348 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_CLIENT
, &mask
);
8349 if ((p
.features_required
& mask
) != features
) {
8350 dout(0) << "crush map has features " << features
8351 << ", adjusting msgr requires for clients" << dendl
;
8352 p
.features_required
= (p
.features_required
& ~mask
) | features
;
8353 client_messenger
->set_default_policy(p
);
8357 Messenger::Policy p
= client_messenger
->get_policy(entity_name_t::TYPE_MON
);
8359 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_MON
, &mask
);
8360 if ((p
.features_required
& mask
) != features
) {
8361 dout(0) << "crush map has features " << features
8362 << " was " << p
.features_required
8363 << ", adjusting msgr requires for mons" << dendl
;
8364 p
.features_required
= (p
.features_required
& ~mask
) | features
;
8365 client_messenger
->set_policy(entity_name_t::TYPE_MON
, p
);
8369 Messenger::Policy p
= cluster_messenger
->get_policy(entity_name_t::TYPE_OSD
);
8371 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_OSD
, &mask
);
8373 if ((p
.features_required
& mask
) != features
) {
8374 dout(0) << "crush map has features " << features
8375 << ", adjusting msgr requires for osds" << dendl
;
8376 p
.features_required
= (p
.features_required
& ~mask
) | features
;
8377 cluster_messenger
->set_policy(entity_name_t::TYPE_OSD
, p
);
8380 if (!superblock
.compat_features
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
)) {
8381 dout(0) << __func__
<< " enabling on-disk ERASURE CODES compat feature" << dendl
;
8382 superblock
.compat_features
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
);
8383 ObjectStore::Transaction t
;
8384 write_superblock(t
);
8385 int err
= store
->queue_transaction(service
.meta_ch
, std::move(t
), NULL
);
8386 ceph_assert(err
== 0);
8390 if (osdmap
->require_osd_release
< ceph_release_t::nautilus
) {
8391 hb_front_server_messenger
->set_require_authorizer(false);
8392 hb_back_server_messenger
->set_require_authorizer(false);
8394 hb_front_server_messenger
->set_require_authorizer(true);
8395 hb_back_server_messenger
->set_require_authorizer(true);
8398 if (osdmap
->require_osd_release
!= last_require_osd_release
) {
8399 dout(1) << __func__
<< " require_osd_release " << last_require_osd_release
8400 << " -> " << to_string(osdmap
->require_osd_release
) << dendl
;
8401 store
->write_meta("require_osd_release",
8402 stringify((int)osdmap
->require_osd_release
));
8403 last_require_osd_release
= osdmap
->require_osd_release
;
8407 struct C_FinishSplits
: public Context
{
8410 C_FinishSplits(OSD
*osd
, const set
<PGRef
> &in
)
8411 : osd(osd
), pgs(in
) {}
8412 void finish(int r
) override
{
8413 osd
->_finish_splits(pgs
);
8417 void OSD::_finish_splits(set
<PGRef
>& pgs
)
8419 dout(10) << __func__
<< " " << pgs
<< dendl
;
8422 for (set
<PGRef
>::iterator i
= pgs
.begin();
8427 PeeringCtx rctx
= create_context();
8429 dout(10) << __func__
<< " " << *pg
<< dendl
;
8430 epoch_t e
= pg
->get_osdmap_epoch();
8431 pg
->handle_initialize(rctx
);
8432 pg
->queue_null(e
, e
);
8433 dispatch_context(rctx
, pg
, service
.get_osdmap());
8436 unsigned shard_index
= pg
->pg_id
.hash_to_shard(num_shards
);
8437 shards
[shard_index
]->register_and_wake_split_child(pg
);
8441 bool OSD::add_merge_waiter(OSDMapRef nextmap
, spg_t target
, PGRef src
,
8444 std::lock_guard
l(merge_lock
);
8445 auto& p
= merge_waiters
[nextmap
->get_epoch()][target
];
8446 p
[src
->pg_id
] = src
;
8447 dout(10) << __func__
<< " added merge_waiter " << src
->pg_id
8448 << " for " << target
<< ", have " << p
.size() << "/" << need
8450 return p
.size() == need
;
8453 bool OSD::advance_pg(
8456 ThreadPool::TPHandle
&handle
,
8459 if (osd_epoch
<= pg
->get_osdmap_epoch()) {
8462 ceph_assert(pg
->is_locked());
8463 OSDMapRef lastmap
= pg
->get_osdmap();
8464 ceph_assert(lastmap
->get_epoch() < osd_epoch
);
8465 set
<PGRef
> new_pgs
; // any split children
8468 unsigned old_pg_num
= lastmap
->have_pg_pool(pg
->pg_id
.pool()) ?
8469 lastmap
->get_pg_num(pg
->pg_id
.pool()) : 0;
8470 for (epoch_t next_epoch
= pg
->get_osdmap_epoch() + 1;
8471 next_epoch
<= osd_epoch
;
8473 OSDMapRef nextmap
= service
.try_get_map(next_epoch
);
8475 dout(20) << __func__
<< " missing map " << next_epoch
<< dendl
;
8479 unsigned new_pg_num
=
8480 (old_pg_num
&& nextmap
->have_pg_pool(pg
->pg_id
.pool())) ?
8481 nextmap
->get_pg_num(pg
->pg_id
.pool()) : 0;
8482 if (old_pg_num
&& new_pg_num
&& old_pg_num
!= new_pg_num
) {
8484 if (nextmap
->have_pg_pool(pg
->pg_id
.pool())) {
8486 if (pg
->pg_id
.is_merge_source(
8490 // we are merge source
8491 PGRef spg
= pg
; // carry a ref
8492 dout(1) << __func__
<< " " << pg
->pg_id
8493 << " is merge source, target is " << parent
8495 pg
->write_if_dirty(rctx
);
8496 if (!new_pgs
.empty()) {
8497 rctx
.transaction
.register_on_applied(new C_FinishSplits(this,
8501 dispatch_context(rctx
, pg
, pg
->get_osdmap(), &handle
);
8503 // release backoffs explicitly, since the on_shutdown path
8504 // aggressively tears down backoff state.
8505 if (pg
->is_primary()) {
8506 pg
->release_pg_backoffs();
8509 OSDShard
*sdata
= pg
->osd_shard
;
8511 std::lock_guard
l(sdata
->shard_lock
);
8513 sdata
->_detach_pg(pg
->pg_slot
);
8514 // update pg count now since we might not get an osdmap
8516 if (pg
->is_primary())
8517 logger
->dec(l_osd_pg_primary
);
8518 else if (pg
->is_nonprimary())
8519 logger
->dec(l_osd_pg_replica
); // misnomer
8521 logger
->dec(l_osd_pg_stray
);
8526 set
<spg_t
> children
;
8527 parent
.is_split(new_pg_num
, old_pg_num
, &children
);
8528 if (add_merge_waiter(nextmap
, parent
, pg
, children
.size())) {
8529 enqueue_peering_evt(
8532 std::make_shared
<PGPeeringEvent
>(
8533 nextmap
->get_epoch(),
8534 nextmap
->get_epoch(),
8539 } else if (pg
->pg_id
.is_merge_target(old_pg_num
, new_pg_num
)) {
8540 // we are merge target
8541 set
<spg_t
> children
;
8542 pg
->pg_id
.is_split(new_pg_num
, old_pg_num
, &children
);
8543 dout(20) << __func__
<< " " << pg
->pg_id
8544 << " is merge target, sources are " << children
8546 map
<spg_t
,PGRef
> sources
;
8548 std::lock_guard
l(merge_lock
);
8549 auto& s
= merge_waiters
[nextmap
->get_epoch()][pg
->pg_id
];
8550 unsigned need
= children
.size();
8551 dout(20) << __func__
<< " have " << s
.size() << "/"
8553 if (s
.size() == need
) {
8555 merge_waiters
[nextmap
->get_epoch()].erase(pg
->pg_id
);
8556 if (merge_waiters
[nextmap
->get_epoch()].empty()) {
8557 merge_waiters
.erase(nextmap
->get_epoch());
8561 if (!sources
.empty()) {
8562 unsigned new_pg_num
= nextmap
->get_pg_num(pg
->pg_id
.pool());
8563 unsigned split_bits
= pg
->pg_id
.get_split_bits(new_pg_num
);
8564 dout(1) << __func__
<< " merging " << pg
->pg_id
<< dendl
;
8566 sources
, rctx
, split_bits
,
8567 nextmap
->get_pg_pool(
8568 pg
->pg_id
.pool())->last_pg_merge_meta
);
8569 pg
->pg_slot
->waiting_for_merge_epoch
= 0;
8571 dout(20) << __func__
<< " not ready to merge yet" << dendl
;
8572 pg
->write_if_dirty(rctx
);
8573 if (!new_pgs
.empty()) {
8574 rctx
.transaction
.register_on_applied(new C_FinishSplits(this,
8578 dispatch_context(rctx
, pg
, pg
->get_osdmap(), &handle
);
8580 // kick source(s) to get them ready
8581 for (auto& i
: children
) {
8582 dout(20) << __func__
<< " kicking source " << i
<< dendl
;
8583 enqueue_peering_evt(
8586 std::make_shared
<PGPeeringEvent
>(
8587 nextmap
->get_epoch(),
8588 nextmap
->get_epoch(),
8598 vector
<int> newup
, newacting
;
8599 int up_primary
, acting_primary
;
8600 nextmap
->pg_to_up_acting_osds(
8602 &newup
, &up_primary
,
8603 &newacting
, &acting_primary
);
8604 pg
->handle_advance_map(
8605 nextmap
, lastmap
, newup
, up_primary
,
8606 newacting
, acting_primary
, rctx
);
8608 auto oldpool
= lastmap
->get_pools().find(pg
->pg_id
.pool());
8609 auto newpool
= nextmap
->get_pools().find(pg
->pg_id
.pool());
8610 if (oldpool
!= lastmap
->get_pools().end()
8611 && newpool
!= nextmap
->get_pools().end()) {
8612 dout(20) << __func__
8613 << " new pool opts " << newpool
->second
.opts
8614 << " old pool opts " << oldpool
->second
.opts
8617 double old_min_interval
= 0, new_min_interval
= 0;
8618 oldpool
->second
.opts
.get(pool_opts_t::SCRUB_MIN_INTERVAL
, &old_min_interval
);
8619 newpool
->second
.opts
.get(pool_opts_t::SCRUB_MIN_INTERVAL
, &new_min_interval
);
8621 double old_max_interval
= 0, new_max_interval
= 0;
8622 oldpool
->second
.opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &old_max_interval
);
8623 newpool
->second
.opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &new_max_interval
);
8625 // Assume if an interval is change from set to unset or vice versa the actual config
8626 // is different. Keep it simple even if it is possible to call resched_all_scrub()
8628 if (old_min_interval
!= new_min_interval
|| old_max_interval
!= new_max_interval
) {
8629 pg
->on_info_history_change();
8633 if (new_pg_num
&& old_pg_num
!= new_pg_num
) {
8635 set
<spg_t
> children
;
8636 if (pg
->pg_id
.is_split(
8641 pg
, children
, &new_pgs
, lastmap
, nextmap
,
8647 old_pg_num
= new_pg_num
;
8648 handle
.reset_tp_timeout();
8650 pg
->handle_activate_map(rctx
);
8654 if (!new_pgs
.empty()) {
8655 rctx
.transaction
.register_on_applied(new C_FinishSplits(this, new_pgs
));
8660 void OSD::consume_map()
8662 ceph_assert(ceph_mutex_is_locked(osd_lock
));
8663 auto osdmap
= get_osdmap();
8664 dout(7) << "consume_map version " << osdmap
->get_epoch() << dendl
;
8666 /** make sure the cluster is speaking in SORTBITWISE, because we don't
8667 * speak the older sorting version any more. Be careful not to force
8668 * a shutdown if we are merely processing old maps, though.
8670 if (!osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
) && is_active()) {
8671 derr
<< __func__
<< " SORTBITWISE flag is not set" << dendl
;
8675 service
.pre_publish_map(osdmap
);
8676 service
.await_reserved_maps();
8677 service
.publish_map(osdmap
);
8679 // prime splits and merges
8680 set
<pair
<spg_t
,epoch_t
>> newly_split
; // splits, and when
8681 set
<pair
<spg_t
,epoch_t
>> merge_pgs
; // merge participants, and when
8682 for (auto& shard
: shards
) {
8683 shard
->identify_splits_and_merges(osdmap
, &newly_split
, &merge_pgs
);
8685 if (!newly_split
.empty()) {
8686 for (auto& shard
: shards
) {
8687 shard
->prime_splits(osdmap
, &newly_split
);
8689 ceph_assert(newly_split
.empty());
8692 // prune sent_ready_to_merge
8693 service
.prune_sent_ready_to_merge(osdmap
);
8695 // FIXME, maybe: We could race against an incoming peering message
8696 // that instantiates a merge PG after identify_merges() below and
8697 // never set up its peer to complete the merge. An OSD restart
8698 // would clear it up. This is a hard race to resolve,
8699 // extraordinarily rare (we only merge PGs that are stable and
8700 // clean, so it'd have to be an imported PG to an OSD with a
8701 // slightly stale OSDMap...), so I'm ignoring it for now. We plan to
8702 // replace all of this with a seastar-based code soon anyway.
8703 if (!merge_pgs
.empty()) {
8704 // mark the pgs we already have, or create new and empty merge
8705 // participants for those we are missing. do this all under the
8706 // shard lock so we don't have to worry about racing pg creates
8708 for (auto& shard
: shards
) {
8709 shard
->prime_merges(osdmap
, &merge_pgs
);
8711 ceph_assert(merge_pgs
.empty());
8714 service
.prune_pg_created();
8716 unsigned pushes_to_free
= 0;
8717 for (auto& shard
: shards
) {
8718 shard
->consume_map(osdmap
, &pushes_to_free
);
8721 vector
<spg_t
> pgids
;
8724 // count (FIXME, probably during seastar rewrite)
8725 int num_pg_primary
= 0, num_pg_replica
= 0, num_pg_stray
= 0;
8728 for (auto& pg
: pgs
) {
8729 // FIXME (probably during seastar rewrite): this is lockless and
8730 // racy, but we don't want to take pg lock here.
8731 if (pg
->is_primary())
8733 else if (pg
->is_nonprimary())
8734 num_pg_replica
++; // misnomer
8740 // FIXME (as part of seastar rewrite): move to OSDShard
8741 std::lock_guard
l(pending_creates_lock
);
8742 for (auto pg
= pending_creates_from_osd
.begin();
8743 pg
!= pending_creates_from_osd
.end();) {
8744 if (osdmap
->get_pg_acting_role(pg
->first
, whoami
) < 0) {
8745 dout(10) << __func__
<< " pg " << pg
->first
<< " doesn't map here, "
8746 << "discarding pending_create_from_osd" << dendl
;
8747 pg
= pending_creates_from_osd
.erase(pg
);
8754 service
.maybe_inject_dispatch_delay();
8756 dispatch_sessions_waiting_on_map();
8758 service
.maybe_inject_dispatch_delay();
8760 service
.release_reserved_pushes(pushes_to_free
);
8762 // queue null events to push maps down to individual PGs
8763 for (auto pgid
: pgids
) {
8764 enqueue_peering_evt(
8767 std::make_shared
<PGPeeringEvent
>(
8768 osdmap
->get_epoch(),
8769 osdmap
->get_epoch(),
8772 logger
->set(l_osd_pg
, pgids
.size());
8773 logger
->set(l_osd_pg_primary
, num_pg_primary
);
8774 logger
->set(l_osd_pg_replica
, num_pg_replica
);
8775 logger
->set(l_osd_pg_stray
, num_pg_stray
);
8778 void OSD::activate_map()
8780 ceph_assert(ceph_mutex_is_locked(osd_lock
));
8781 auto osdmap
= get_osdmap();
8783 dout(7) << "activate_map version " << osdmap
->get_epoch() << dendl
;
8786 if (osdmap
->test_flag(CEPH_OSDMAP_NORECOVER
)) {
8787 if (!service
.recovery_is_paused()) {
8788 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl
;
8789 service
.pause_recovery();
8792 if (service
.recovery_is_paused()) {
8793 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl
;
8794 service
.unpause_recovery();
8798 service
.activate_map();
8801 take_waiters(waiting_for_osdmap
);
8804 bool OSD::require_mon_peer(const Message
*m
)
8806 if (!m
->get_connection()->peer_is_mon()) {
8807 dout(0) << "require_mon_peer received from non-mon "
8808 << m
->get_connection()->get_peer_addr()
8809 << " " << *m
<< dendl
;
8815 bool OSD::require_mon_or_mgr_peer(const Message
*m
)
8817 if (!m
->get_connection()->peer_is_mon() &&
8818 !m
->get_connection()->peer_is_mgr()) {
8819 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8820 << m
->get_connection()->get_peer_addr()
8821 << " " << *m
<< dendl
;
8827 bool OSD::require_osd_peer(const Message
*m
)
8829 if (!m
->get_connection()->peer_is_osd()) {
8830 dout(0) << "require_osd_peer received from non-osd "
8831 << m
->get_connection()->get_peer_addr()
8832 << " " << *m
<< dendl
;
8838 bool OSD::require_self_aliveness(const Message
*m
, epoch_t epoch
)
8840 epoch_t up_epoch
= service
.get_up_epoch();
8841 if (epoch
< up_epoch
) {
8842 dout(7) << "from pre-up epoch " << epoch
<< " < " << up_epoch
<< dendl
;
8847 dout(7) << "still in boot state, dropping message " << *m
<< dendl
;
8854 bool OSD::require_same_peer_instance(const Message
*m
, const OSDMapRef
& map
,
8855 bool is_fast_dispatch
)
8857 int from
= m
->get_source().num();
8859 if (map
->is_down(from
) ||
8860 (map
->get_cluster_addrs(from
) != m
->get_source_addrs())) {
8861 dout(5) << "from dead osd." << from
<< ", marking down, "
8862 << " msg was " << m
->get_source_inst().addr
8864 << (map
->is_up(from
) ?
8865 map
->get_cluster_addrs(from
) : entity_addrvec_t())
8867 ConnectionRef con
= m
->get_connection();
8869 if (auto s
= ceph::ref_cast
<Session
>(con
->get_priv()); s
) {
8870 if (!is_fast_dispatch
)
8871 s
->session_dispatch_lock
.lock();
8872 clear_session_waiting_on_map(s
);
8873 con
->set_priv(nullptr); // break ref <-> session cycle, if any
8875 if (!is_fast_dispatch
)
8876 s
->session_dispatch_lock
.unlock();
8885 * require that we have same (or newer) map, and that
8886 * the source is the pg primary.
8888 bool OSD::require_same_or_newer_map(OpRequestRef
& op
, epoch_t epoch
,
8889 bool is_fast_dispatch
)
8891 const Message
*m
= op
->get_req();
8892 const auto osdmap
= get_osdmap();
8893 dout(15) << "require_same_or_newer_map " << epoch
8894 << " (i am " << osdmap
->get_epoch() << ") " << m
<< dendl
;
8896 ceph_assert(ceph_mutex_is_locked(osd_lock
));
8898 // do they have a newer map?
8899 if (epoch
> osdmap
->get_epoch()) {
8900 dout(7) << "waiting for newer map epoch " << epoch
8901 << " > my " << osdmap
->get_epoch() << " with " << m
<< dendl
;
8902 wait_for_new_map(op
);
8906 if (!require_self_aliveness(op
->get_req(), epoch
)) {
8910 // ok, our map is same or newer.. do they still exist?
8911 if (m
->get_connection()->get_messenger() == cluster_messenger
&&
8912 !require_same_peer_instance(op
->get_req(), osdmap
, is_fast_dispatch
)) {
8923 // ----------------------------------------
8926 void OSD::split_pgs(
8928 const set
<spg_t
> &childpgids
, set
<PGRef
> *out_pgs
,
8933 unsigned pg_num
= nextmap
->get_pg_num(parent
->pg_id
.pool());
8934 parent
->update_snap_mapper_bits(parent
->get_pgid().get_split_bits(pg_num
));
8936 vector
<object_stat_sum_t
> updated_stats
;
8937 parent
->start_split_stats(childpgids
, &updated_stats
);
8939 vector
<object_stat_sum_t
>::iterator stat_iter
= updated_stats
.begin();
8940 for (set
<spg_t
>::const_iterator i
= childpgids
.begin();
8941 i
!= childpgids
.end();
8943 ceph_assert(stat_iter
!= updated_stats
.end());
8944 dout(10) << __func__
<< " splitting " << *parent
<< " into " << *i
<< dendl
;
8945 PG
* child
= _make_pg(nextmap
, *i
);
8947 out_pgs
->insert(child
);
8948 child
->ch
= store
->create_new_collection(child
->coll
);
8951 uint32_t shard_index
= i
->hash_to_shard(shards
.size());
8952 assert(NULL
!= shards
[shard_index
]);
8953 store
->set_collection_commit_queue(child
->coll
, &(shards
[shard_index
]->context_queue
));
8956 unsigned split_bits
= i
->get_split_bits(pg_num
);
8957 dout(10) << " pg_num is " << pg_num
8958 << ", m_seed " << i
->ps()
8959 << ", split_bits is " << split_bits
<< dendl
;
8960 parent
->split_colls(
8964 &child
->get_pool().info
,
8971 child
->init_collection_pool_opts();
8973 child
->finish_split_stats(*stat_iter
, rctx
.transaction
);
8976 ceph_assert(stat_iter
!= updated_stats
.end());
8977 parent
->finish_split_stats(*stat_iter
, rctx
.transaction
);
8983 void OSD::handle_pg_create(OpRequestRef op
)
8985 // NOTE: this can be removed in P release (mimic is the last version to
8986 // send MOSDPGCreate messages).
8988 auto m
= op
->get_req
<MOSDPGCreate
>();
8989 ceph_assert(m
->get_type() == MSG_OSD_PG_CREATE
);
8991 dout(10) << "handle_pg_create " << *m
<< dendl
;
8993 if (!require_mon_peer(op
->get_req())) {
8997 if (!require_same_or_newer_map(op
, m
->epoch
, false))
9002 const auto osdmap
= get_osdmap();
9003 map
<pg_t
,utime_t
>::const_iterator ci
= m
->ctimes
.begin();
9004 for (map
<pg_t
,pg_create_t
>::const_iterator p
= m
->mkpg
.begin();
9007 ceph_assert(ci
!= m
->ctimes
.end() && ci
->first
== p
->first
);
9008 epoch_t created
= p
->second
.created
;
9009 if (p
->second
.split_bits
) // Skip split pgs
9013 if (!osdmap
->have_pg_pool(on
.pool())) {
9014 dout(20) << "ignoring pg on deleted pool " << on
<< dendl
;
9018 dout(20) << "mkpg " << on
<< " e" << created
<< "@" << ci
->second
<< dendl
;
9021 bool mapped
= osdmap
->get_primary_shard(on
, &pgid
);
9022 ceph_assert(mapped
);
9024 // is it still ours?
9025 vector
<int> up
, acting
;
9026 int up_primary
= -1;
9027 int acting_primary
= -1;
9028 osdmap
->pg_to_up_acting_osds(on
, &up
, &up_primary
, &acting
, &acting_primary
);
9029 int role
= osdmap
->calc_pg_role(pg_shard_t(whoami
, pgid
.shard
), acting
);
9031 if (acting_primary
!= whoami
) {
9032 dout(10) << "mkpg " << on
<< " not acting_primary (" << acting_primary
9033 << "), my role=" << role
<< ", skipping" << dendl
;
9039 pg_history_t history
;
9040 build_initial_pg_history(pgid
, created
, ci
->second
, &history
, &pi
);
9042 // The mon won't resend unless the primary changed, so we ignore
9043 // same_interval_since. We'll pass this history with the current
9044 // epoch as the event.
9045 if (history
.same_primary_since
> m
->epoch
) {
9046 dout(10) << __func__
<< ": got obsolete pg create on pgid "
9047 << pgid
<< " from epoch " << m
->epoch
9048 << ", primary changed in " << history
.same_primary_since
9052 enqueue_peering_evt(
9055 std::make_shared
<PGPeeringEvent
>(
9056 osdmap
->get_epoch(),
9057 osdmap
->get_epoch(),
9062 osdmap
->get_epoch(),
9070 std::lock_guard
l(pending_creates_lock
);
9071 if (pending_creates_from_mon
== 0) {
9072 last_pg_create_epoch
= m
->epoch
;
9076 maybe_update_heartbeat_peers();
9080 // ----------------------------------------
9081 // peering and recovery
9083 PeeringCtx
OSD::create_context()
9085 return PeeringCtx(get_osdmap()->require_osd_release
);
9088 void OSD::dispatch_context(PeeringCtx
&ctx
, PG
*pg
, OSDMapRef curmap
,
9089 ThreadPool::TPHandle
*handle
)
9091 if (!service
.get_osdmap()->is_up(whoami
)) {
9092 dout(20) << __func__
<< " not up in osdmap" << dendl
;
9093 } else if (!is_active()) {
9094 dout(20) << __func__
<< " not active" << dendl
;
9096 for (auto& [osd
, ls
] : ctx
.message_map
) {
9097 if (!curmap
->is_up(osd
)) {
9098 dout(20) << __func__
<< " skipping down osd." << osd
<< dendl
;
9101 ConnectionRef con
= service
.get_con_osd_cluster(
9102 osd
, curmap
->get_epoch());
9104 dout(20) << __func__
<< " skipping osd." << osd
<< " (NULL con)"
9108 service
.maybe_share_map(con
.get(), curmap
);
9110 con
->send_message2(m
);
9115 if ((!ctx
.transaction
.empty() || ctx
.transaction
.has_contexts()) && pg
) {
9116 int tr
= store
->queue_transaction(
9118 std::move(ctx
.transaction
), TrackedOpRef(),
9120 ceph_assert(tr
== 0);
9124 void OSD::handle_fast_pg_create(MOSDPGCreate2
*m
)
9126 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9127 if (!require_mon_peer(m
)) {
9131 for (auto& p
: m
->pgs
) {
9132 spg_t pgid
= p
.first
;
9133 epoch_t created
= p
.second
.first
;
9134 utime_t created_stamp
= p
.second
.second
;
9135 auto q
= m
->pg_extra
.find(pgid
);
9136 if (q
== m
->pg_extra
.end()) {
9137 dout(20) << __func__
<< " " << pgid
<< " e" << created
9138 << "@" << created_stamp
9139 << " (no history or past_intervals)" << dendl
;
9140 // pre-octopus ... no pg history. this can be removed in Q release.
9141 enqueue_peering_evt(
9144 std::make_shared
<PGPeeringEvent
>(
9152 pg_history_t(created
, created_stamp
),
9157 dout(20) << __func__
<< " " << pgid
<< " e" << created
9158 << "@" << created_stamp
9159 << " history " << q
->second
.first
9160 << " pi " << q
->second
.second
<< dendl
;
9161 if (!q
->second
.second
.empty() &&
9162 m
->epoch
< q
->second
.second
.get_bounds().second
) {
9163 clog
->error() << "got pg_create on " << pgid
<< " epoch " << m
->epoch
9164 << " and unmatched past_intervals " << q
->second
.second
9165 << " (history " << q
->second
.first
<< ")";
9167 enqueue_peering_evt(
9170 std::make_shared
<PGPeeringEvent
>(
9187 std::lock_guard
l(pending_creates_lock
);
9188 if (pending_creates_from_mon
== 0) {
9189 last_pg_create_epoch
= m
->epoch
;
9196 void OSD::handle_fast_pg_query(MOSDPGQuery
*m
)
9198 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9199 if (!require_osd_peer(m
)) {
9203 int from
= m
->get_source().num();
9204 for (auto& p
: m
->pg_list
) {
9205 enqueue_peering_evt(
9208 std::make_shared
<PGPeeringEvent
>(
9209 p
.second
.epoch_sent
, p
.second
.epoch_sent
,
9212 pg_shard_t(from
, p
.second
.from
),
9214 p
.second
.epoch_sent
),
9221 void OSD::handle_fast_pg_notify(MOSDPGNotify
* m
)
9223 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9224 if (!require_osd_peer(m
)) {
9228 int from
= m
->get_source().num();
9229 for (auto& p
: m
->get_pg_list()) {
9230 spg_t
pgid(p
.info
.pgid
.pgid
, p
.to
);
9231 enqueue_peering_evt(
9234 std::make_shared
<PGPeeringEvent
>(
9238 pgid
, pg_shard_t(from
, p
.from
),
9240 m
->get_connection()->get_features()),
9253 void OSD::handle_fast_pg_info(MOSDPGInfo
* m
)
9255 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9256 if (!require_osd_peer(m
)) {
9260 int from
= m
->get_source().num();
9261 for (auto& p
: m
->pg_list
) {
9262 enqueue_peering_evt(
9263 spg_t(p
.info
.pgid
.pgid
, p
.to
),
9265 std::make_shared
<PGPeeringEvent
>(
9266 p
.epoch_sent
, p
.query_epoch
,
9268 pg_shard_t(from
, p
.from
),
9276 void OSD::handle_fast_pg_remove(MOSDPGRemove
*m
)
9278 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9279 if (!require_osd_peer(m
)) {
9283 for (auto& pgid
: m
->pg_list
) {
9284 enqueue_peering_evt(
9287 std::make_shared
<PGPeeringEvent
>(
9288 m
->get_epoch(), m
->get_epoch(),
9289 PeeringState::DeleteStart())));
9294 void OSD::handle_fast_force_recovery(MOSDForceRecovery
*m
)
9296 dout(10) << __func__
<< " " << *m
<< dendl
;
9297 if (!require_mon_or_mgr_peer(m
)) {
9301 epoch_t epoch
= get_osdmap_epoch();
9302 for (auto pgid
: m
->forced_pgs
) {
9303 if (m
->options
& OFR_BACKFILL
) {
9304 if (m
->options
& OFR_CANCEL
) {
9305 enqueue_peering_evt(
9308 std::make_shared
<PGPeeringEvent
>(
9310 PeeringState::UnsetForceBackfill())));
9312 enqueue_peering_evt(
9315 std::make_shared
<PGPeeringEvent
>(
9317 PeeringState::SetForceBackfill())));
9319 } else if (m
->options
& OFR_RECOVERY
) {
9320 if (m
->options
& OFR_CANCEL
) {
9321 enqueue_peering_evt(
9324 std::make_shared
<PGPeeringEvent
>(
9326 PeeringState::UnsetForceRecovery())));
9328 enqueue_peering_evt(
9331 std::make_shared
<PGPeeringEvent
>(
9333 PeeringState::SetForceRecovery())));
9340 void OSD::handle_pg_query_nopg(const MQuery
& q
)
9342 spg_t pgid
= q
.pgid
;
9343 dout(10) << __func__
<< " " << pgid
<< dendl
;
9345 OSDMapRef osdmap
= get_osdmap();
9346 if (!osdmap
->have_pg_pool(pgid
.pool()))
9349 dout(10) << " pg " << pgid
<< " dne" << dendl
;
9350 pg_info_t
empty(spg_t(pgid
.pgid
, q
.query
.to
));
9351 ConnectionRef con
= service
.get_con_osd_cluster(q
.from
.osd
, osdmap
->get_epoch());
9354 if (q
.query
.type
== pg_query_t::LOG
||
9355 q
.query
.type
== pg_query_t::FULLLOG
) {
9357 q
.query
.from
, q
.query
.to
,
9358 osdmap
->get_epoch(), empty
,
9359 q
.query
.epoch_sent
);
9361 vector
<pg_notify_t
> ls
;
9364 q
.query
.from
, q
.query
.to
,
9366 osdmap
->get_epoch(),
9369 m
= new MOSDPGNotify(osdmap
->get_epoch(), std::move(ls
));
9371 service
.maybe_share_map(con
.get(), osdmap
);
9372 con
->send_message(m
);
9376 void OSDService::queue_check_readable(spg_t spgid
,
9378 ceph::signedspan delay
)
9380 if (delay
== ceph::signedspan::zero()) {
9381 osd
->enqueue_peering_evt(
9384 std::make_shared
<PGPeeringEvent
>(
9386 PeeringState::CheckReadable())));
9388 mono_timer
.add_event(
9390 [this, spgid
, lpr
]() {
9391 queue_check_readable(spgid
, lpr
);
9397 // =========================================================
9400 void OSDService::_maybe_queue_recovery() {
9401 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock
));
9402 uint64_t available_pushes
;
9403 while (!awaiting_throttle
.empty() &&
9404 _recover_now(&available_pushes
)) {
9405 uint64_t to_start
= std::min(
9407 cct
->_conf
->osd_recovery_max_single_start
);
9408 _queue_for_recovery(awaiting_throttle
.front(), to_start
);
9409 awaiting_throttle
.pop_front();
9410 dout(10) << __func__
<< " starting " << to_start
9411 << ", recovery_ops_reserved " << recovery_ops_reserved
9412 << " -> " << (recovery_ops_reserved
+ to_start
) << dendl
;
9413 recovery_ops_reserved
+= to_start
;
9417 bool OSDService::_recover_now(uint64_t *available_pushes
)
9419 if (available_pushes
)
9420 *available_pushes
= 0;
9422 if (ceph_clock_now() < defer_recovery_until
) {
9423 dout(15) << __func__
<< " defer until " << defer_recovery_until
<< dendl
;
9427 if (recovery_paused
) {
9428 dout(15) << __func__
<< " paused" << dendl
;
9432 uint64_t max
= osd
->get_recovery_max_active();
9433 if (max
<= recovery_ops_active
+ recovery_ops_reserved
) {
9434 dout(15) << __func__
<< " active " << recovery_ops_active
9435 << " + reserved " << recovery_ops_reserved
9436 << " >= max " << max
<< dendl
;
9440 if (available_pushes
)
9441 *available_pushes
= max
- recovery_ops_active
- recovery_ops_reserved
;
9446 unsigned OSDService::get_target_pg_log_entries() const
9448 auto num_pgs
= osd
->get_num_pgs();
9449 auto target
= cct
->_conf
->osd_target_pg_log_entries_per_osd
;
9450 if (num_pgs
> 0 && target
> 0) {
9451 // target an even spread of our budgeted log entries across all
9452 // PGs. note that while we only get to control the entry count
9453 // for primary PGs, we'll normally be responsible for a mix of
9454 // primary and replica PGs (for the same pool(s) even), so this
9456 return std::max
<unsigned>(
9457 std::min
<unsigned>(target
/ num_pgs
,
9458 cct
->_conf
->osd_max_pg_log_entries
),
9459 cct
->_conf
->osd_min_pg_log_entries
);
9461 // fall back to a per-pg value.
9462 return cct
->_conf
->osd_min_pg_log_entries
;
9466 void OSD::do_recovery(
9467 PG
*pg
, epoch_t queued
, uint64_t reserved_pushes
,
9468 ThreadPool::TPHandle
&handle
)
9470 uint64_t started
= 0;
9473 * When the value of osd_recovery_sleep is set greater than zero, recovery
9474 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9475 * recovery event's schedule time. This is done by adding a
9476 * recovery_requeue_callback event, which re-queues the recovery op using
9477 * queue_recovery_after_sleep.
9479 float recovery_sleep
= get_osd_recovery_sleep();
9481 std::lock_guard
l(service
.sleep_lock
);
9482 if (recovery_sleep
> 0 && service
.recovery_needs_sleep
) {
9484 auto recovery_requeue_callback
= new LambdaContext([this, pgref
, queued
, reserved_pushes
](int r
) {
9485 dout(20) << "do_recovery wake up at "
9487 << ", re-queuing recovery" << dendl
;
9488 std::lock_guard
l(service
.sleep_lock
);
9489 service
.recovery_needs_sleep
= false;
9490 service
.queue_recovery_after_sleep(pgref
.get(), queued
, reserved_pushes
);
9493 // This is true for the first recovery op and when the previous recovery op
9494 // has been scheduled in the past. The next recovery op is scheduled after
9495 // completing the sleep from now.
9497 if (auto now
= ceph::real_clock::now();
9498 service
.recovery_schedule_time
< now
) {
9499 service
.recovery_schedule_time
= now
;
9501 service
.recovery_schedule_time
+= ceph::make_timespan(recovery_sleep
);
9502 service
.sleep_timer
.add_event_at(service
.recovery_schedule_time
,
9503 recovery_requeue_callback
);
9504 dout(20) << "Recovery event scheduled at "
9505 << service
.recovery_schedule_time
<< dendl
;
9512 std::lock_guard
l(service
.sleep_lock
);
9513 service
.recovery_needs_sleep
= true;
9516 if (pg
->pg_has_reset_since(queued
)) {
9520 dout(10) << "do_recovery starting " << reserved_pushes
<< " " << *pg
<< dendl
;
9521 #ifdef DEBUG_RECOVERY_OIDS
9522 dout(20) << " active was " << service
.recovery_oids
[pg
->pg_id
] << dendl
;
9525 bool do_unfound
= pg
->start_recovery_ops(reserved_pushes
, handle
, &started
);
9526 dout(10) << "do_recovery started " << started
<< "/" << reserved_pushes
9527 << " on " << *pg
<< dendl
;
9530 PeeringCtx rctx
= create_context();
9531 rctx
.handle
= &handle
;
9532 pg
->find_unfound(queued
, rctx
);
9533 dispatch_context(rctx
, pg
, pg
->get_osdmap());
9538 ceph_assert(started
<= reserved_pushes
);
9539 service
.release_reserved_pushes(reserved_pushes
);
9542 void OSDService::start_recovery_op(PG
*pg
, const hobject_t
& soid
)
9544 std::lock_guard
l(recovery_lock
);
9545 dout(10) << "start_recovery_op " << *pg
<< " " << soid
9546 << " (" << recovery_ops_active
<< "/"
9547 << osd
->get_recovery_max_active() << " rops)"
9549 recovery_ops_active
++;
9551 #ifdef DEBUG_RECOVERY_OIDS
9552 dout(20) << " active was " << recovery_oids
[pg
->pg_id
] << dendl
;
9553 ceph_assert(recovery_oids
[pg
->pg_id
].count(soid
) == 0);
9554 recovery_oids
[pg
->pg_id
].insert(soid
);
9558 void OSDService::finish_recovery_op(PG
*pg
, const hobject_t
& soid
, bool dequeue
)
9560 std::lock_guard
l(recovery_lock
);
9561 dout(10) << "finish_recovery_op " << *pg
<< " " << soid
9562 << " dequeue=" << dequeue
9563 << " (" << recovery_ops_active
<< "/"
9564 << osd
->get_recovery_max_active() << " rops)"
9568 ceph_assert(recovery_ops_active
> 0);
9569 recovery_ops_active
--;
9571 #ifdef DEBUG_RECOVERY_OIDS
9572 dout(20) << " active oids was " << recovery_oids
[pg
->pg_id
] << dendl
;
9573 ceph_assert(recovery_oids
[pg
->pg_id
].count(soid
));
9574 recovery_oids
[pg
->pg_id
].erase(soid
);
9577 _maybe_queue_recovery();
9580 bool OSDService::is_recovery_active()
9582 if (cct
->_conf
->osd_debug_pretend_recovery_active
) {
9585 return local_reserver
.has_reservation() || remote_reserver
.has_reservation();
9588 void OSDService::release_reserved_pushes(uint64_t pushes
)
9590 std::lock_guard
l(recovery_lock
);
9591 dout(10) << __func__
<< "(" << pushes
<< "), recovery_ops_reserved "
9592 << recovery_ops_reserved
<< " -> " << (recovery_ops_reserved
-pushes
)
9594 ceph_assert(recovery_ops_reserved
>= pushes
);
9595 recovery_ops_reserved
-= pushes
;
9596 _maybe_queue_recovery();
9599 // =========================================================
9602 bool OSD::op_is_discardable(const MOSDOp
*op
)
9604 // drop client request if they are not connected and can't get the
9606 if (!op
->get_connection()->is_connected()) {
9612 void OSD::enqueue_op(spg_t pg
, OpRequestRef
&& op
, epoch_t epoch
)
9614 const utime_t stamp
= op
->get_req()->get_recv_stamp();
9615 const utime_t latency
= ceph_clock_now() - stamp
;
9616 const unsigned priority
= op
->get_req()->get_priority();
9617 const int cost
= op
->get_req()->get_cost();
9618 const uint64_t owner
= op
->get_req()->get_source().num();
9620 dout(15) << "enqueue_op " << op
<< " prio " << priority
9622 << " latency " << latency
9623 << " epoch " << epoch
9624 << " " << *(op
->get_req()) << dendl
;
9625 op
->osd_trace
.event("enqueue op");
9626 op
->osd_trace
.keyval("priority", priority
);
9627 op
->osd_trace
.keyval("cost", cost
);
9628 op
->mark_queued_for_pg();
9629 logger
->tinc(l_osd_op_before_queue_op_lat
, latency
);
9632 unique_ptr
<OpSchedulerItem::OpQueueable
>(new PGOpItem(pg
, std::move(op
))),
9633 cost
, priority
, stamp
, owner
, epoch
));
9636 void OSD::enqueue_peering_evt(spg_t pgid
, PGPeeringEventRef evt
)
9638 dout(15) << __func__
<< " " << pgid
<< " " << evt
->get_desc() << dendl
;
9641 unique_ptr
<OpSchedulerItem::OpQueueable
>(new PGPeeringItem(pgid
, evt
)),
9643 cct
->_conf
->osd_peering_op_priority
,
9646 evt
->get_epoch_sent()));
9650 * NOTE: dequeue called in worker thread, with pg lock
9652 void OSD::dequeue_op(
9653 PGRef pg
, OpRequestRef op
,
9654 ThreadPool::TPHandle
&handle
)
9656 const Message
*m
= op
->get_req();
9659 OID_EVENT_TRACE_WITH_MSG(m
, "DEQUEUE_OP_BEGIN", false);
9661 utime_t now
= ceph_clock_now();
9662 op
->set_dequeued_time(now
);
9664 utime_t latency
= now
- m
->get_recv_stamp();
9665 dout(10) << "dequeue_op " << op
<< " prio " << m
->get_priority()
9666 << " cost " << m
->get_cost()
9667 << " latency " << latency
9669 << " pg " << *pg
<< dendl
;
9671 logger
->tinc(l_osd_op_before_dequeue_op_lat
, latency
);
9673 service
.maybe_share_map(m
->get_connection().get(),
9677 if (pg
->is_deleting())
9680 op
->mark_reached_pg();
9681 op
->osd_trace
.event("dequeue_op");
9683 pg
->do_request(op
, handle
);
9686 dout(10) << "dequeue_op " << op
<< " finish" << dendl
;
9687 OID_EVENT_TRACE_WITH_MSG(m
, "DEQUEUE_OP_END", false);
9691 void OSD::dequeue_peering_evt(
9694 PGPeeringEventRef evt
,
9695 ThreadPool::TPHandle
& handle
)
9697 PeeringCtx rctx
= create_context();
9698 auto curmap
= sdata
->get_osdmap();
9699 bool need_up_thru
= false;
9700 epoch_t same_interval_since
= 0;
9702 if (const MQuery
*q
= dynamic_cast<const MQuery
*>(evt
->evt
.get())) {
9703 handle_pg_query_nopg(*q
);
9705 derr
<< __func__
<< " unrecognized pg-less event " << evt
->get_desc() << dendl
;
9708 } else if (advance_pg(curmap
->get_epoch(), pg
, handle
, rctx
)) {
9709 pg
->do_peering_event(evt
, rctx
);
9710 if (pg
->is_deleted()) {
9714 dispatch_context(rctx
, pg
, curmap
, &handle
);
9715 need_up_thru
= pg
->get_need_up_thru();
9716 same_interval_since
= pg
->get_same_interval_since();
9721 queue_want_up_thru(same_interval_since
);
9724 service
.send_pg_temp();
9727 void OSD::dequeue_delete(
9731 ThreadPool::TPHandle
& handle
)
9733 dequeue_peering_evt(
9737 std::make_shared
<PGPeeringEvent
>(
9739 PeeringState::DeleteSome())),
9745 // --------------------------------
9747 const char** OSD::get_tracked_conf_keys() const
9749 static const char* KEYS
[] = {
9750 "osd_max_backfills",
9751 "osd_min_recovery_priority",
9752 "osd_max_trimming_pgs",
9753 "osd_op_complaint_time",
9754 "osd_op_log_threshold",
9755 "osd_op_history_size",
9756 "osd_op_history_duration",
9757 "osd_op_history_slow_op_size",
9758 "osd_op_history_slow_op_threshold",
9759 "osd_enable_op_tracker",
9760 "osd_map_cache_size",
9761 "osd_pg_epoch_max_lag_factor",
9762 "osd_pg_epoch_persisted_max_stale",
9763 // clog & admin clog
9766 "clog_to_syslog_facility",
9767 "clog_to_syslog_level",
9768 "osd_objectstore_fuse",
9770 "clog_to_graylog_host",
9771 "clog_to_graylog_port",
9774 "osd_recovery_delay_start",
9775 "osd_client_message_size_cap",
9776 "osd_client_message_cap",
9777 "osd_heartbeat_min_size",
9778 "osd_heartbeat_interval",
9779 "osd_object_clean_region_max_num_intervals",
9780 "osd_scrub_min_interval",
9781 "osd_scrub_max_interval",
9787 void OSD::handle_conf_change(const ConfigProxy
& conf
,
9788 const std::set
<std::string
> &changed
)
9790 std::lock_guard l
{osd_lock
};
9791 if (changed
.count("osd_max_backfills")) {
9792 service
.local_reserver
.set_max(cct
->_conf
->osd_max_backfills
);
9793 service
.remote_reserver
.set_max(cct
->_conf
->osd_max_backfills
);
9795 if (changed
.count("osd_min_recovery_priority")) {
9796 service
.local_reserver
.set_min_priority(cct
->_conf
->osd_min_recovery_priority
);
9797 service
.remote_reserver
.set_min_priority(cct
->_conf
->osd_min_recovery_priority
);
9799 if (changed
.count("osd_max_trimming_pgs")) {
9800 service
.snap_reserver
.set_max(cct
->_conf
->osd_max_trimming_pgs
);
9802 if (changed
.count("osd_op_complaint_time") ||
9803 changed
.count("osd_op_log_threshold")) {
9804 op_tracker
.set_complaint_and_threshold(cct
->_conf
->osd_op_complaint_time
,
9805 cct
->_conf
->osd_op_log_threshold
);
9807 if (changed
.count("osd_op_history_size") ||
9808 changed
.count("osd_op_history_duration")) {
9809 op_tracker
.set_history_size_and_duration(cct
->_conf
->osd_op_history_size
,
9810 cct
->_conf
->osd_op_history_duration
);
9812 if (changed
.count("osd_op_history_slow_op_size") ||
9813 changed
.count("osd_op_history_slow_op_threshold")) {
9814 op_tracker
.set_history_slow_op_size_and_threshold(cct
->_conf
->osd_op_history_slow_op_size
,
9815 cct
->_conf
->osd_op_history_slow_op_threshold
);
9817 if (changed
.count("osd_enable_op_tracker")) {
9818 op_tracker
.set_tracking(cct
->_conf
->osd_enable_op_tracker
);
9820 if (changed
.count("osd_map_cache_size")) {
9821 service
.map_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
9822 service
.map_bl_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
9823 service
.map_bl_inc_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
9825 if (changed
.count("clog_to_monitors") ||
9826 changed
.count("clog_to_syslog") ||
9827 changed
.count("clog_to_syslog_level") ||
9828 changed
.count("clog_to_syslog_facility") ||
9829 changed
.count("clog_to_graylog") ||
9830 changed
.count("clog_to_graylog_host") ||
9831 changed
.count("clog_to_graylog_port") ||
9832 changed
.count("host") ||
9833 changed
.count("fsid")) {
9834 update_log_config();
9836 if (changed
.count("osd_pg_epoch_max_lag_factor")) {
9837 m_osd_pg_epoch_max_lag_factor
= conf
.get_val
<double>(
9838 "osd_pg_epoch_max_lag_factor");
9842 if (changed
.count("osd_objectstore_fuse")) {
9844 enable_disable_fuse(false);
9849 if (changed
.count("osd_recovery_delay_start")) {
9850 service
.defer_recovery(cct
->_conf
->osd_recovery_delay_start
);
9851 service
.kick_recovery_queue();
9854 if (changed
.count("osd_client_message_cap")) {
9855 uint64_t newval
= cct
->_conf
->osd_client_message_cap
;
9856 Messenger::Policy pol
= client_messenger
->get_policy(entity_name_t::TYPE_CLIENT
);
9857 if (pol
.throttler_messages
&& newval
> 0) {
9858 pol
.throttler_messages
->reset_max(newval
);
9861 if (changed
.count("osd_client_message_size_cap")) {
9862 uint64_t newval
= cct
->_conf
->osd_client_message_size_cap
;
9863 Messenger::Policy pol
= client_messenger
->get_policy(entity_name_t::TYPE_CLIENT
);
9864 if (pol
.throttler_bytes
&& newval
> 0) {
9865 pol
.throttler_bytes
->reset_max(newval
);
9868 if (changed
.count("osd_object_clean_region_max_num_intervals")) {
9869 ObjectCleanRegions::set_max_num_intervals(cct
->_conf
->osd_object_clean_region_max_num_intervals
);
9872 if (changed
.count("osd_scrub_min_interval") ||
9873 changed
.count("osd_scrub_max_interval")) {
9874 resched_all_scrubs();
9875 dout(0) << __func__
<< ": scrub interval change" << dendl
;
9880 void OSD::update_log_config()
9882 map
<string
,string
> log_to_monitors
;
9883 map
<string
,string
> log_to_syslog
;
9884 map
<string
,string
> log_channel
;
9885 map
<string
,string
> log_prio
;
9886 map
<string
,string
> log_to_graylog
;
9887 map
<string
,string
> log_to_graylog_host
;
9888 map
<string
,string
> log_to_graylog_port
;
9892 if (parse_log_client_options(cct
, log_to_monitors
, log_to_syslog
,
9893 log_channel
, log_prio
, log_to_graylog
,
9894 log_to_graylog_host
, log_to_graylog_port
,
9896 clog
->update_config(log_to_monitors
, log_to_syslog
,
9897 log_channel
, log_prio
, log_to_graylog
,
9898 log_to_graylog_host
, log_to_graylog_port
,
9900 derr
<< "log_to_monitors " << log_to_monitors
<< dendl
;
9903 void OSD::check_config()
9905 // some sanity checks
9906 if (cct
->_conf
->osd_map_cache_size
<= (int)cct
->_conf
->osd_pg_epoch_persisted_max_stale
+ 2) {
9907 clog
->warn() << "osd_map_cache_size (" << cct
->_conf
->osd_map_cache_size
<< ")"
9908 << " is not > osd_pg_epoch_persisted_max_stale ("
9909 << cct
->_conf
->osd_pg_epoch_persisted_max_stale
<< ")";
9911 if (cct
->_conf
->osd_object_clean_region_max_num_intervals
< 0) {
9912 clog
->warn() << "osd_object_clean_region_max_num_intervals ("
9913 << cct
->_conf
->osd_object_clean_region_max_num_intervals
9918 // --------------------------------
9920 void OSD::get_latest_osdmap()
9922 dout(10) << __func__
<< " -- start" << dendl
;
9925 service
.objecter
->wait_for_latest_osdmap(&cond
);
9928 dout(10) << __func__
<< " -- finish" << dendl
;
9931 // --------------------------------
9933 void OSD::set_perf_queries(const ConfigPayload
&config_payload
) {
9934 const OSDConfigPayload
&osd_config_payload
= boost::get
<OSDConfigPayload
>(config_payload
);
9935 const std::map
<OSDPerfMetricQuery
, OSDPerfMetricLimits
> &queries
= osd_config_payload
.config
;
9936 dout(10) << "setting " << queries
.size() << " queries" << dendl
;
9938 std::list
<OSDPerfMetricQuery
> supported_queries
;
9939 for (auto &it
: queries
) {
9940 auto &query
= it
.first
;
9941 if (!query
.key_descriptor
.empty()) {
9942 supported_queries
.push_back(query
);
9945 if (supported_queries
.size() < queries
.size()) {
9946 dout(1) << queries
.size() - supported_queries
.size()
9947 << " unsupported queries" << dendl
;
9950 std::lock_guard locker
{m_perf_queries_lock
};
9951 m_perf_queries
= supported_queries
;
9952 m_perf_limits
= queries
;
9954 std::vector
<PGRef
> pgs
;
9956 for (auto& pg
: pgs
) {
9957 std::scoped_lock l
{*pg
};
9958 pg
->set_dynamic_perf_stats_queries(supported_queries
);
9962 MetricPayload
OSD::get_perf_reports() {
9963 OSDMetricPayload payload
;
9964 std::map
<OSDPerfMetricQuery
, OSDPerfMetricReport
> &reports
= payload
.report
;
9966 std::vector
<PGRef
> pgs
;
9968 DynamicPerfStats dps
;
9969 for (auto& pg
: pgs
) {
9970 // m_perf_queries can be modified only in set_perf_queries by mgr client
9971 // request, and it is protected by by mgr client's lock, which is held
9972 // when set_perf_queries/get_perf_reports are called, so we may not hold
9973 // m_perf_queries_lock here.
9974 DynamicPerfStats
pg_dps(m_perf_queries
);
9976 pg
->get_dynamic_perf_stats(&pg_dps
);
9980 dps
.add_to_reports(m_perf_limits
, &reports
);
9981 dout(20) << "reports for " << reports
.size() << " queries" << dendl
;
9986 // =============================================================
9989 #define dout_context cct
9991 #define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
9993 void OSDShard::_attach_pg(OSDShardPGSlot
*slot
, PG
*pg
)
9995 dout(10) << pg
->pg_id
<< " " << pg
<< dendl
;
9997 pg
->osd_shard
= this;
10001 slot
->epoch
= pg
->get_osdmap_epoch();
10002 pg_slots_by_epoch
.insert(*slot
);
10005 void OSDShard::_detach_pg(OSDShardPGSlot
*slot
)
10007 dout(10) << slot
->pg
->pg_id
<< " " << slot
->pg
<< dendl
;
10008 slot
->pg
->osd_shard
= nullptr;
10009 slot
->pg
->pg_slot
= nullptr;
10010 slot
->pg
= nullptr;
10011 osd
->dec_num_pgs();
10013 pg_slots_by_epoch
.erase(pg_slots_by_epoch
.iterator_to(*slot
));
10015 if (waiting_for_min_pg_epoch
) {
10016 min_pg_epoch_cond
.notify_all();
10020 void OSDShard::update_pg_epoch(OSDShardPGSlot
*slot
, epoch_t e
)
10022 std::lock_guard
l(shard_lock
);
10023 dout(30) << "min was " << pg_slots_by_epoch
.begin()->epoch
10024 << " on " << pg_slots_by_epoch
.begin()->pg
->pg_id
<< dendl
;
10025 pg_slots_by_epoch
.erase(pg_slots_by_epoch
.iterator_to(*slot
));
10026 dout(20) << slot
->pg
->pg_id
<< " " << slot
->epoch
<< " -> " << e
<< dendl
;
10028 pg_slots_by_epoch
.insert(*slot
);
10029 dout(30) << "min is now " << pg_slots_by_epoch
.begin()->epoch
10030 << " on " << pg_slots_by_epoch
.begin()->pg
->pg_id
<< dendl
;
10031 if (waiting_for_min_pg_epoch
) {
10032 min_pg_epoch_cond
.notify_all();
10036 epoch_t
OSDShard::get_min_pg_epoch()
10038 std::lock_guard
l(shard_lock
);
10039 auto p
= pg_slots_by_epoch
.begin();
10040 if (p
== pg_slots_by_epoch
.end()) {
10046 void OSDShard::wait_min_pg_epoch(epoch_t need
)
10048 std::unique_lock l
{shard_lock
};
10049 ++waiting_for_min_pg_epoch
;
10050 min_pg_epoch_cond
.wait(l
, [need
, this] {
10051 if (pg_slots_by_epoch
.empty()) {
10053 } else if (pg_slots_by_epoch
.begin()->epoch
>= need
) {
10056 dout(10) << need
<< " waiting on "
10057 << pg_slots_by_epoch
.begin()->epoch
<< dendl
;
10061 --waiting_for_min_pg_epoch
;
10064 epoch_t
OSDShard::get_max_waiting_epoch()
10066 std::lock_guard
l(shard_lock
);
10068 for (auto& i
: pg_slots
) {
10069 if (!i
.second
->waiting_peering
.empty()) {
10070 r
= std::max(r
, i
.second
->waiting_peering
.rbegin()->first
);
10076 void OSDShard::consume_map(
10077 const OSDMapRef
& new_osdmap
,
10078 unsigned *pushes_to_free
)
10080 std::lock_guard
l(shard_lock
);
10081 OSDMapRef old_osdmap
;
10083 std::lock_guard
l(osdmap_lock
);
10084 old_osdmap
= std::move(shard_osdmap
);
10085 shard_osdmap
= new_osdmap
;
10087 dout(10) << new_osdmap
->get_epoch()
10088 << " (was " << (old_osdmap
? old_osdmap
->get_epoch() : 0) << ")"
10090 bool queued
= false;
10093 auto p
= pg_slots
.begin();
10094 while (p
!= pg_slots
.end()) {
10095 OSDShardPGSlot
*slot
= p
->second
.get();
10096 const spg_t
& pgid
= p
->first
;
10097 dout(20) << __func__
<< " " << pgid
<< dendl
;
10098 if (!slot
->waiting_for_split
.empty()) {
10099 dout(20) << __func__
<< " " << pgid
10100 << " waiting for split " << slot
->waiting_for_split
<< dendl
;
10104 if (slot
->waiting_for_merge_epoch
> new_osdmap
->get_epoch()) {
10105 dout(20) << __func__
<< " " << pgid
10106 << " waiting for merge by epoch " << slot
->waiting_for_merge_epoch
10111 if (!slot
->waiting_peering
.empty()) {
10112 epoch_t first
= slot
->waiting_peering
.begin()->first
;
10113 if (first
<= new_osdmap
->get_epoch()) {
10114 dout(20) << __func__
<< " " << pgid
10115 << " pending_peering first epoch " << first
10116 << " <= " << new_osdmap
->get_epoch() << ", requeueing" << dendl
;
10117 _wake_pg_slot(pgid
, slot
);
10123 if (!slot
->waiting
.empty()) {
10124 if (new_osdmap
->is_up_acting_osd_shard(pgid
, osd
->get_nodeid())) {
10125 dout(20) << __func__
<< " " << pgid
<< " maps to us, keeping"
10130 while (!slot
->waiting
.empty() &&
10131 slot
->waiting
.front().get_map_epoch() <= new_osdmap
->get_epoch()) {
10132 auto& qi
= slot
->waiting
.front();
10133 dout(20) << __func__
<< " " << pgid
10134 << " waiting item " << qi
10135 << " epoch " << qi
.get_map_epoch()
10136 << " <= " << new_osdmap
->get_epoch()
10138 << (qi
.get_map_epoch() < new_osdmap
->get_epoch() ? "stale" :
10140 << ", dropping" << dendl
;
10141 *pushes_to_free
+= qi
.get_reserved_pushes();
10142 slot
->waiting
.pop_front();
10145 if (slot
->waiting
.empty() &&
10146 slot
->num_running
== 0 &&
10147 slot
->waiting_for_split
.empty() &&
10149 dout(20) << __func__
<< " " << pgid
<< " empty, pruning" << dendl
;
10150 p
= pg_slots
.erase(p
);
10157 std::lock_guard l
{sdata_wait_lock
};
10158 sdata_cond
.notify_one();
10162 void OSDShard::_wake_pg_slot(
10164 OSDShardPGSlot
*slot
)
10166 dout(20) << __func__
<< " " << pgid
10167 << " to_process " << slot
->to_process
10168 << " waiting " << slot
->waiting
10169 << " waiting_peering " << slot
->waiting_peering
<< dendl
;
10170 for (auto i
= slot
->to_process
.rbegin();
10171 i
!= slot
->to_process
.rend();
10173 scheduler
->enqueue_front(std::move(*i
));
10175 slot
->to_process
.clear();
10176 for (auto i
= slot
->waiting
.rbegin();
10177 i
!= slot
->waiting
.rend();
10179 scheduler
->enqueue_front(std::move(*i
));
10181 slot
->waiting
.clear();
10182 for (auto i
= slot
->waiting_peering
.rbegin();
10183 i
!= slot
->waiting_peering
.rend();
10185 // this is overkill; we requeue everything, even if some of these
10186 // items are waiting for maps we don't have yet. FIXME, maybe,
10187 // someday, if we decide this inefficiency matters
10188 for (auto j
= i
->second
.rbegin(); j
!= i
->second
.rend(); ++j
) {
10189 scheduler
->enqueue_front(std::move(*j
));
10192 slot
->waiting_peering
.clear();
10193 ++slot
->requeue_seq
;
10196 void OSDShard::identify_splits_and_merges(
10197 const OSDMapRef
& as_of_osdmap
,
10198 set
<pair
<spg_t
,epoch_t
>> *split_pgs
,
10199 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
10201 std::lock_guard
l(shard_lock
);
10202 if (shard_osdmap
) {
10203 for (auto& i
: pg_slots
) {
10204 const spg_t
& pgid
= i
.first
;
10205 auto *slot
= i
.second
.get();
10207 osd
->service
.identify_splits_and_merges(
10208 shard_osdmap
, as_of_osdmap
, pgid
,
10209 split_pgs
, merge_pgs
);
10210 } else if (!slot
->waiting_for_split
.empty()) {
10211 osd
->service
.identify_splits_and_merges(
10212 shard_osdmap
, as_of_osdmap
, pgid
,
10213 split_pgs
, nullptr);
10215 dout(20) << __func__
<< " slot " << pgid
10216 << " has no pg and waiting_for_split " << dendl
;
10222 void OSDShard::prime_splits(const OSDMapRef
& as_of_osdmap
,
10223 set
<pair
<spg_t
,epoch_t
>> *pgids
)
10225 std::lock_guard
l(shard_lock
);
10226 _prime_splits(pgids
);
10227 if (shard_osdmap
->get_epoch() > as_of_osdmap
->get_epoch()) {
10228 set
<pair
<spg_t
,epoch_t
>> newer_children
;
10229 for (auto i
: *pgids
) {
10230 osd
->service
.identify_splits_and_merges(
10231 as_of_osdmap
, shard_osdmap
, i
.first
,
10232 &newer_children
, nullptr);
10234 newer_children
.insert(pgids
->begin(), pgids
->end());
10235 dout(10) << "as_of_osdmap " << as_of_osdmap
->get_epoch() << " < shard "
10236 << shard_osdmap
->get_epoch() << ", new children " << newer_children
10238 _prime_splits(&newer_children
);
10239 // note: we don't care what is left over here for other shards.
10240 // if this shard is ahead of us and one isn't, e.g., one thread is
10241 // calling into prime_splits via _process (due to a newly created
10242 // pg) and this shard has a newer map due to a racing consume_map,
10243 // then any grandchildren left here will be identified (or were
10244 // identified) when the slower shard's osdmap is advanced.
10245 // _prime_splits() will tolerate the case where the pgid is
10250 void OSDShard::_prime_splits(set
<pair
<spg_t
,epoch_t
>> *pgids
)
10252 dout(10) << *pgids
<< dendl
;
10253 auto p
= pgids
->begin();
10254 while (p
!= pgids
->end()) {
10255 unsigned shard_index
= p
->first
.hash_to_shard(osd
->num_shards
);
10256 if (shard_index
== shard_id
) {
10257 auto r
= pg_slots
.emplace(p
->first
, nullptr);
10259 dout(10) << "priming slot " << p
->first
<< " e" << p
->second
<< dendl
;
10260 r
.first
->second
= make_unique
<OSDShardPGSlot
>();
10261 r
.first
->second
->waiting_for_split
.insert(p
->second
);
10264 ceph_assert(q
!= pg_slots
.end());
10265 dout(10) << "priming (existing) slot " << p
->first
<< " e" << p
->second
10267 q
->second
->waiting_for_split
.insert(p
->second
);
10269 p
= pgids
->erase(p
);
10276 void OSDShard::prime_merges(const OSDMapRef
& as_of_osdmap
,
10277 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
10279 std::lock_guard
l(shard_lock
);
10280 dout(20) << __func__
<< " checking shard " << shard_id
10281 << " for remaining merge pgs " << merge_pgs
<< dendl
;
10282 auto p
= merge_pgs
->begin();
10283 while (p
!= merge_pgs
->end()) {
10284 spg_t pgid
= p
->first
;
10285 epoch_t epoch
= p
->second
;
10286 unsigned shard_index
= pgid
.hash_to_shard(osd
->num_shards
);
10287 if (shard_index
!= shard_id
) {
10291 OSDShardPGSlot
*slot
;
10292 auto r
= pg_slots
.emplace(pgid
, nullptr);
10294 r
.first
->second
= make_unique
<OSDShardPGSlot
>();
10296 slot
= r
.first
->second
.get();
10299 dout(20) << __func__
<< " have merge participant pg " << pgid
10300 << " " << slot
->pg
<< dendl
;
10301 } else if (!slot
->waiting_for_split
.empty() &&
10302 *slot
->waiting_for_split
.begin() < epoch
) {
10303 dout(20) << __func__
<< " pending split on merge participant pg " << pgid
10304 << " " << slot
->waiting_for_split
<< dendl
;
10306 dout(20) << __func__
<< " creating empty merge participant " << pgid
10307 << " for merge in " << epoch
<< dendl
;
10308 // leave history zeroed; PG::merge_from() will fill it in.
10309 pg_history_t history
;
10310 PGCreateInfo
cinfo(pgid
, epoch
- 1,
10311 history
, PastIntervals(), false);
10312 PGRef pg
= osd
->handle_pg_create_info(shard_osdmap
, &cinfo
);
10313 _attach_pg(r
.first
->second
.get(), pg
.get());
10314 _wake_pg_slot(pgid
, slot
);
10317 // mark slot for merge
10318 dout(20) << __func__
<< " marking merge participant " << pgid
<< dendl
;
10319 slot
->waiting_for_merge_epoch
= epoch
;
10320 p
= merge_pgs
->erase(p
);
10324 void OSDShard::register_and_wake_split_child(PG
*pg
)
10328 std::lock_guard
l(shard_lock
);
10329 dout(10) << pg
->pg_id
<< " " << pg
<< dendl
;
10330 auto p
= pg_slots
.find(pg
->pg_id
);
10331 ceph_assert(p
!= pg_slots
.end());
10332 auto *slot
= p
->second
.get();
10333 dout(20) << pg
->pg_id
<< " waiting_for_split " << slot
->waiting_for_split
10335 ceph_assert(!slot
->pg
);
10336 ceph_assert(!slot
->waiting_for_split
.empty());
10337 _attach_pg(slot
, pg
);
10339 epoch
= pg
->get_osdmap_epoch();
10340 ceph_assert(slot
->waiting_for_split
.count(epoch
));
10341 slot
->waiting_for_split
.erase(epoch
);
10342 if (slot
->waiting_for_split
.empty()) {
10343 _wake_pg_slot(pg
->pg_id
, slot
);
10345 dout(10) << __func__
<< " still waiting for split on "
10346 << slot
->waiting_for_split
<< dendl
;
10350 // kick child to ensure it pulls up to the latest osdmap
10351 osd
->enqueue_peering_evt(
10354 std::make_shared
<PGPeeringEvent
>(
10359 std::lock_guard l
{sdata_wait_lock
};
10360 sdata_cond
.notify_one();
10363 void OSDShard::unprime_split_children(spg_t parent
, unsigned old_pg_num
)
10365 std::lock_guard
l(shard_lock
);
10366 vector
<spg_t
> to_delete
;
10367 for (auto& i
: pg_slots
) {
10368 if (i
.first
!= parent
&&
10369 i
.first
.get_ancestor(old_pg_num
) == parent
) {
10370 dout(10) << __func__
<< " parent " << parent
<< " clearing " << i
.first
10372 _wake_pg_slot(i
.first
, i
.second
.get());
10373 to_delete
.push_back(i
.first
);
10376 for (auto pgid
: to_delete
) {
10377 pg_slots
.erase(pgid
);
10381 OSDShard::OSDShard(
10388 shard_name(string("OSDShard.") + stringify(id
)),
10389 sdata_wait_lock_name(shard_name
+ "::sdata_wait_lock"),
10390 sdata_wait_lock
{make_mutex(sdata_wait_lock_name
)},
10391 osdmap_lock
{make_mutex(shard_name
+ "::osdmap_lock")},
10392 shard_lock_name(shard_name
+ "::shard_lock"),
10393 shard_lock
{make_mutex(shard_lock_name
)},
10394 scheduler(ceph::osd::scheduler::make_scheduler(cct
)),
10395 context_queue(sdata_wait_lock
, sdata_cond
)
10397 dout(0) << "using op scheduler " << *scheduler
<< dendl
;
10401 // =============================================================
10403 #undef dout_context
10404 #define dout_context osd->cct
10406 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10408 void OSD::ShardedOpWQ::_add_slot_waiter(
10410 OSDShardPGSlot
*slot
,
10411 OpSchedulerItem
&& qi
)
10413 if (qi
.is_peering()) {
10414 dout(20) << __func__
<< " " << pgid
10415 << " peering, item epoch is "
10416 << qi
.get_map_epoch()
10417 << ", will wait on " << qi
<< dendl
;
10418 slot
->waiting_peering
[qi
.get_map_epoch()].push_back(std::move(qi
));
10420 dout(20) << __func__
<< " " << pgid
10421 << " item epoch is "
10422 << qi
.get_map_epoch()
10423 << ", will wait on " << qi
<< dendl
;
10424 slot
->waiting
.push_back(std::move(qi
));
10429 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
// Worker-thread entry point: pull one OpSchedulerItem off this thread's
// shard and run it, or park it on its pg slot if it cannot run yet
// (pg splitting, future map epoch, pg not yet created, ...).
//
// Locking: shard_lock protects the shard's scheduler and pg_slots;
// the pg lock must be taken *without* holding shard_lock, so after
// re-acquiring shard_lock every slot-derived value is revalidated.
void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
{
  uint32_t shard_index = thread_index % osd->num_shards;
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);

  // If all threads of shards do oncommits, there is a out-of-order
  // problem.  So we choose the thread which has the smallest
  // thread_index(thread_index < num_shards) of shard to do oncommit.
  bool is_smallest_thread_index = thread_index < osd->num_shards;

  sdata->shard_lock.lock();
  if (sdata->scheduler->empty() &&
      (!is_smallest_thread_index || sdata->context_queue.empty())) {
    // nothing for this thread to do right now; block on the shard condvar
    std::unique_lock wait_lock{sdata->sdata_wait_lock};
    if (is_smallest_thread_index && !sdata->context_queue.empty()) {
      // we raced with a context_queue addition, don't wait
      wait_lock.unlock();
    } else if (!sdata->stop_waiting) {
      dout(20) << __func__ << " empty q, waiting" << dendl;
      // suspend the wq heartbeat timeout while idle so an empty queue is
      // not mistaken for a stuck worker
      osd->cct->get_heartbeat_map()->clear_timeout(hb);
      sdata->shard_lock.unlock();
      sdata->sdata_cond.wait(wait_lock);
      wait_lock.unlock();
      sdata->shard_lock.lock();
      // recheck after waking: if there is still nothing for this thread,
      // give up this cycle
      if (sdata->scheduler->empty() &&
          !(is_smallest_thread_index && !sdata->context_queue.empty())) {
        sdata->shard_lock.unlock();
        return;
      }
      // found a work item; reapply default wq timeouts
      osd->cct->get_heartbeat_map()->reset_timeout(hb,
        timeout_interval, suicide_interval);
    } else {
      // stop_waiting is set (drain/shutdown): do not block
      dout(20) << __func__ << " need return immediately" << dendl;
      wait_lock.unlock();
      sdata->shard_lock.unlock();
      return;
    }
  }

  // only the smallest-index thread drains the oncommit context queue, to
  // keep commit callbacks ordered (see comment above)
  list<Context *> oncommits;
  if (is_smallest_thread_index) {
    sdata->context_queue.move_to(oncommits);
  }

  if (sdata->scheduler->empty()) {
    if (osd->is_stopping()) {
      sdata->shard_lock.unlock();
      for (auto c : oncommits) {
        dout(10) << __func__ << " discarding in-flight oncommit " << c
                 << dendl;
        delete c;
      }
      return;    // OSD shutdown, discard.
    }
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
    return;
  }

  OpSchedulerItem item = sdata->scheduler->dequeue();
  if (osd->is_stopping()) {
    sdata->shard_lock.unlock();
    for (auto c : oncommits) {
      dout(10) << __func__ << " discarding in-flight oncommit " << c
               << dendl;
      delete c;
    }
    return;    // OSD shutdown, discard.
  }

  // find or create the pg slot for this item's ordering token
  const auto token = item.get_ordering_token();
  auto r = sdata->pg_slots.emplace(token, nullptr);
  if (r.second) {
    r.first->second = make_unique<OSDShardPGSlot>();
  }
  OSDShardPGSlot *slot = r.first->second.get();
  dout(20) << __func__ << " " << token
           << (r.second ? " (new)" : "")
           << " to_process " << slot->to_process
           << " waiting " << slot->waiting
           << " waiting_peering " << slot->waiting_peering
           << dendl;
  slot->to_process.push_back(std::move(item));
  dout(20) << __func__ << " " << slot->to_process.back()
           << " queued" << dendl;

 retry_pg:
  PGRef pg = slot->pg;

  // lock pg (if we have it)
  if (pg) {
    // note the requeue seq now...
    uint64_t requeue_seq = slot->requeue_seq;
    ++slot->num_running;

    // drop shard_lock before taking the pg lock (lock ordering), then
    // re-acquire and revalidate everything derived from the slot
    sdata->shard_lock.unlock();
    osd->service.maybe_inject_dispatch_delay();
    pg->lock();
    osd->service.maybe_inject_dispatch_delay();
    sdata->shard_lock.lock();

    auto q = sdata->pg_slots.find(token);
    if (q == sdata->pg_slots.end()) {
      // this can happen if we race with pg removal.
      dout(20) << __func__ << " slot " << token << " no longer there"
               << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    slot = q->second.get();
    --slot->num_running;

    if (slot->to_process.empty()) {
      // raced with _wake_pg_slot or consume_map
      dout(20) << __func__ << " " << token
               << " nothing queued" << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    if (requeue_seq != slot->requeue_seq) {
      // the slot was requeued while we were taking the pg lock; our view
      // of to_process ordering is stale
      dout(20) << __func__ << " " << token
               << " requeue_seq " << slot->requeue_seq << " > our "
               << requeue_seq << ", we raced with _wake_pg_slot"
               << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    if (slot->pg != pg) {
      // this can happen if we race with pg removal.
      dout(20) << __func__ << " slot " << token << " no longer attached to "
               << pg << dendl;
      pg->unlock();
      goto retry_pg;
    }
  }

  dout(20) << __func__ << " " << token
           << " to_process " << slot->to_process
           << " waiting " << slot->waiting
           << " waiting_peering " << slot->waiting_peering << dendl;

  ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
                                 suicide_interval);

  // take next item
  auto qi = std::move(slot->to_process.front());
  slot->to_process.pop_front();
  dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
  set<pair<spg_t,epoch_t>> new_children;
  OSDMapRef osdmap;

  // no pg attached to the slot: decide whether to run the item pg-less,
  // create the pg, wait, or drop the item.  Only 'break' (pg created)
  // continues to normal dispatch below; every other path returns.
  while (!pg) {
    // should this pg shard exist on this osd in this (or a later) epoch?
    osdmap = sdata->shard_osdmap;
    const PGCreateInfo *create_info = qi.creates_pg();
    if (!slot->waiting_for_split.empty()) {
      // a split is in progress; park the item until it completes
      dout(20) << __func__ << " " << token
               << " splitting " << slot->waiting_for_split << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
      // item is from a future map; wait until the shard map catches up
      dout(20) << __func__ << " " << token
               << " map " << qi.get_map_epoch() << " > "
               << osdmap->get_epoch() << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else if (qi.is_peering()) {
      if (!qi.peering_requires_pg()) {
        // for pg-less events, we run them under the ordering lock, since
        // we don't have the pg lock to keep them ordered.
        qi.run(osd, sdata, pg, tp_handle);
      } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
        if (create_info) {
          if (create_info->by_mon &&
              osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
            // mon-initiated create, but we are no longer the primary
            dout(20) << __func__ << " " << token
                     << " no pg, no longer primary, ignoring mon create on "
                     << qi << dendl;
          } else {
            dout(20) << __func__ << " " << token
                     << " no pg, should create on " << qi << dendl;
            pg = osd->handle_pg_create_info(osdmap, create_info);
            if (pg) {
              // we created the pg! drop out and continue "normally"!
              sdata->_attach_pg(slot, pg.get());
              sdata->_wake_pg_slot(token, slot);

              // identify split children between create epoch and shard epoch.
              osd->service.identify_splits_and_merges(
                pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
              sdata->_prime_splits(&new_children);
              // distribute remaining split children to other shards below!
              break;
            }
            dout(20) << __func__ << " ignored create on " << qi << dendl;
          }
        } else {
          dout(20) << __func__ << " " << token
                   << " no pg, peering, !create, discarding " << qi << dendl;
        }
      } else {
        dout(20) << __func__ << " " << token
                 << " no pg, peering, doesn't map here e"
                 << osdmap->get_epoch()
                 << ", discarding " << qi
                 << dendl;
      }
    } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
      // pg should exist here but hasn't been instantiated yet; wait
      dout(20) << __func__ << " " << token
               << " no pg, should exist e" << osdmap->get_epoch()
               << ", will wait on " << qi << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else {
      dout(20) << __func__ << " " << token
               << " no pg, shouldn't exist e" << osdmap->get_epoch()
               << ", dropping " << qi << dendl;
      // share map with client?
      if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
        osd->service.maybe_share_map((*_op)->get_req()->get_connection().get(),
                                     sdata->shard_osdmap,
                                     (*_op)->sent_epoch);
      }
      // give back any recovery push reservations held by the dropped item
      unsigned pushes_to_free = qi.get_reserved_pushes();
      if (pushes_to_free > 0) {
        sdata->shard_lock.unlock();
        osd->service.release_reserved_pushes(pushes_to_free);
        handle_oncommits(oncommits);
        return;
      }
    }
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
    return;
  }
  if (qi.is_peering()) {
    // peering items must not run against a map older than they require
    OSDMapRef osdmap = sdata->shard_osdmap;
    if (qi.get_map_epoch() > osdmap->get_epoch()) {
      _add_slot_waiter(token, slot, std::move(qi));
      sdata->shard_lock.unlock();
      pg->unlock();
      handle_oncommits(oncommits);
      return;
    }
  }
  sdata->shard_lock.unlock();

  // if we just created a pg, distribute any split children that belong to
  // other shards
  if (!new_children.empty()) {
    for (auto shard : osd->shards) {
      shard->prime_splits(osdmap, &new_children);
    }
    ceph_assert(new_children.empty());
  }

  // osd_opwq_process marks the point at which an operation has been dequeued
  // and will begin to be handled by a worker thread.
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    tracepoint(osd, opwq_process_start, reqid.name._type,
               reqid.name._num, reqid.tid, reqid.inc);
  }

  // at debug level 30, dump the full queue status as json
  lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
  Formatter *f = Formatter::create("json");
  f->open_object_section("q");
  dump(f);
  f->close_section();
  f->flush(*_dout);
  delete f;
  *_dout << dendl;

  // actually execute the item (with the pg lock held, if pg is set;
  // qi.run is expected to release it)
  qi.run(osd, sdata, pg, tp_handle);

  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    tracepoint(osd, opwq_process_finish, reqid.name._type,
               reqid.name._num, reqid.tid, reqid.inc);
  }

  handle_oncommits(oncommits);
}
10726 void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem
&& item
) {
10727 uint32_t shard_index
=
10728 item
.get_ordering_token().hash_to_shard(osd
->shards
.size());
10730 dout(20) << __func__
<< " " << item
<< dendl
;
10732 OSDShard
* sdata
= osd
->shards
[shard_index
];
10733 assert (NULL
!= sdata
);
10737 std::lock_guard l
{sdata
->shard_lock
};
10738 empty
= sdata
->scheduler
->empty();
10739 sdata
->scheduler
->enqueue(std::move(item
));
10743 std::lock_guard l
{sdata
->sdata_wait_lock
};
10744 sdata
->sdata_cond
.notify_all();
// Requeue an item at the *front* of its shard's scheduler, preserving the
// ordering of previously-dequeued work (used for requeues, not new items).
void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
{
  auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);
  sdata->shard_lock.lock();
  auto p = sdata->pg_slots.find(item.get_ordering_token());
  if (p != sdata->pg_slots.end() &&
      !p->second->to_process.empty()) {
    // we may be racing with _process, which has dequeued a new item
    // from scheduler, put it on to_process, and is now busy taking the
    // pg lock. ensure this old requeued item is ordered before any
    // such newer item in to_process.
    //
    // swap: the old item goes to the front of to_process, and the
    // newest to_process item is pulled off the back to be pushed onto
    // the scheduler's front instead.
    p->second->to_process.push_front(std::move(item));
    item = std::move(p->second->to_process.back());
    p->second->to_process.pop_back();
    dout(20) << __func__
             << " " << p->second->to_process.front()
             << " shuffled w/ " << item << dendl;
  } else {
    dout(20) << __func__ << " " << item << dendl;
  }
  sdata->scheduler->enqueue_front(std::move(item));
  sdata->shard_lock.unlock();
  // wake one worker to pick up the requeued item
  std::lock_guard l{sdata->sdata_wait_lock};
  sdata->sdata_cond.notify_one();
}
10777 namespace osd_cmds
{
// Admin-socket "heap <cmd>" handler: forward a heap-profiler command to
// tcmalloc.  Output (or the error text) is written to 'os'.
// Returns 0 on success, -EOPNOTSUPP when not built with tcmalloc,
// -EINVAL when the command argument is missing.
// NOTE(review): the Formatter parameter 'f' appears unused here —
// presumably kept for command-handler signature uniformity; confirm.
int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f,
         std::ostream& os)
{
  if (!ceph_using_tcmalloc()) {
    os << "could not issue heap profiler command -- not using tcmalloc!";
    return -EOPNOTSUPP;
  }

  string cmd;
  if (!cmd_getval(cmdmap, "heapcmd", cmd)) {
    // NOTE(review): 'cmd' is still empty at this point, so the quoted
    // value in this message will be blank; the "heapcmd" key name would
    // be more informative — confirm before changing the message.
    os << "unable to get value for command \"" << cmd << "\"";
    return -EINVAL;
  }

  std::vector<std::string> cmd_vec;
  get_str_vec(cmd, cmd_vec);

  // optional extra argument (e.g. a dump file name) is appended to the
  // command vector when present
  string val;
  if (cmd_getval(cmdmap, "value", val)) {
    cmd_vec.push_back(val);
  }

  ceph_heap_profiler_handle_command(cmd_vec, os);

  return 0;
}
10806 }} // namespace ceph::osd_cmds