// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2017 OVH
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
27 #include <boost/scoped_ptr.hpp>
28 #include <boost/range/adaptor/reversed.hpp>
30 #ifdef HAVE_SYS_PARAM_H
31 #include <sys/param.h>
34 #ifdef HAVE_SYS_MOUNT_H
35 #include <sys/mount.h>
40 #include "include/types.h"
41 #include "include/compat.h"
42 #include "include/random.h"
47 #include "osdc/Objecter.h"
49 #include "common/errno.h"
50 #include "common/ceph_argparse.h"
51 #include "common/ceph_releases.h"
52 #include "common/ceph_time.h"
53 #include "common/version.h"
54 #include "common/pick_address.h"
55 #include "common/blkdev.h"
56 #include "common/numa.h"
58 #include "os/ObjectStore.h"
60 #include "os/FuseStore.h"
63 #include "PrimaryLogPG.h"
65 #include "msg/Messenger.h"
66 #include "msg/Message.h"
68 #include "mon/MonClient.h"
70 #include "messages/MLog.h"
72 #include "messages/MGenericMessage.h"
73 #include "messages/MOSDPing.h"
74 #include "messages/MOSDFailure.h"
75 #include "messages/MOSDMarkMeDown.h"
76 #include "messages/MOSDMarkMeDead.h"
77 #include "messages/MOSDFull.h"
78 #include "messages/MOSDOp.h"
79 #include "messages/MOSDOpReply.h"
80 #include "messages/MOSDBackoff.h"
81 #include "messages/MOSDBeacon.h"
82 #include "messages/MOSDRepOp.h"
83 #include "messages/MOSDRepOpReply.h"
84 #include "messages/MOSDBoot.h"
85 #include "messages/MOSDPGTemp.h"
86 #include "messages/MOSDPGReadyToMerge.h"
88 #include "messages/MOSDMap.h"
89 #include "messages/MMonGetOSDMap.h"
90 #include "messages/MOSDPGNotify.h"
91 #include "messages/MOSDPGNotify2.h"
92 #include "messages/MOSDPGQuery.h"
93 #include "messages/MOSDPGQuery2.h"
94 #include "messages/MOSDPGLog.h"
95 #include "messages/MOSDPGRemove.h"
96 #include "messages/MOSDPGInfo.h"
97 #include "messages/MOSDPGInfo2.h"
98 #include "messages/MOSDPGCreate.h"
99 #include "messages/MOSDPGCreate2.h"
100 #include "messages/MOSDPGScan.h"
101 #include "messages/MBackfillReserve.h"
102 #include "messages/MRecoveryReserve.h"
103 #include "messages/MOSDForceRecovery.h"
104 #include "messages/MOSDECSubOpWrite.h"
105 #include "messages/MOSDECSubOpWriteReply.h"
106 #include "messages/MOSDECSubOpRead.h"
107 #include "messages/MOSDECSubOpReadReply.h"
108 #include "messages/MOSDPGCreated.h"
109 #include "messages/MOSDPGUpdateLogMissing.h"
110 #include "messages/MOSDPGUpdateLogMissingReply.h"
112 #include "messages/MOSDPeeringOp.h"
114 #include "messages/MOSDAlive.h"
116 #include "messages/MOSDScrub.h"
117 #include "messages/MOSDScrub2.h"
118 #include "messages/MOSDRepScrub.h"
120 #include "messages/MCommand.h"
121 #include "messages/MCommandReply.h"
123 #include "messages/MPGStats.h"
124 #include "messages/MPGStatsAck.h"
126 #include "messages/MWatchNotify.h"
127 #include "messages/MOSDPGPush.h"
128 #include "messages/MOSDPGPushReply.h"
129 #include "messages/MOSDPGPull.h"
131 #include "messages/MMonGetPurgedSnaps.h"
132 #include "messages/MMonGetPurgedSnapsReply.h"
134 #include "common/perf_counters.h"
135 #include "common/Timer.h"
136 #include "common/LogClient.h"
137 #include "common/AsyncReserver.h"
138 #include "common/HeartbeatMap.h"
139 #include "common/admin_socket.h"
140 #include "common/ceph_context.h"
142 #include "global/signal_handler.h"
143 #include "global/pidfile.h"
145 #include "include/color.h"
146 #include "perfglue/cpu_profiler.h"
147 #include "perfglue/heap_profiler.h"
149 #include "osd/OpRequest.h"
151 #include "auth/AuthAuthorizeHandler.h"
152 #include "auth/RotatingKeyRing.h"
154 #include "objclass/objclass.h"
156 #include "common/cmdparse.h"
157 #include "include/str_list.h"
158 #include "include/util.h"
160 #include "include/ceph_assert.h"
161 #include "common/config.h"
162 #include "common/EventTrace.h"
164 #include "json_spirit/json_spirit_reader.h"
165 #include "json_spirit/json_spirit_writer.h"
168 #define TRACEPOINT_DEFINE
169 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
170 #include "tracing/osd.h"
171 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
172 #undef TRACEPOINT_DEFINE
174 #define tracepoint(...)
177 #define dout_context cct
178 #define dout_subsys ceph_subsys_osd
180 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
182 using namespace ceph::osd::scheduler
;
183 using TOPNSPC::common::cmd_getval
;
185 static ostream
& _prefix(std::ostream
* _dout
, int whoami
, epoch_t epoch
) {
186 return *_dout
<< "osd." << whoami
<< " " << epoch
<< " ";
189 //Initial features in new superblock.
190 //Features here are also automatically upgraded
191 CompatSet
OSD::get_osd_initial_compat_set() {
192 CompatSet::FeatureSet ceph_osd_feature_compat
;
193 CompatSet::FeatureSet ceph_osd_feature_ro_compat
;
194 CompatSet::FeatureSet ceph_osd_feature_incompat
;
195 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE
);
196 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO
);
197 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC
);
198 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC
);
199 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES
);
200 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL
);
201 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO
);
202 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO
);
203 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG
);
204 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER
);
205 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS
);
206 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA
);
207 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING
);
208 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO
);
209 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES
);
210 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2
);
211 return CompatSet(ceph_osd_feature_compat
, ceph_osd_feature_ro_compat
,
212 ceph_osd_feature_incompat
);
215 //Features are added here that this OSD supports.
216 CompatSet
OSD::get_osd_compat_set() {
217 CompatSet compat
= get_osd_initial_compat_set();
218 //Any features here can be set in code, but not in initial superblock
219 compat
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
);
// OSDService: shared per-OSD service state handed out to PGs and helpers.
// The initializer list wires this service to the owning OSD's messengers,
// perf counters, config values and timers.
// NOTE(review): several original lines are not visible in this chunk
// (among them some member initializers, the ctor body's opening brace and
// the declaration of `str`); only visible code is reproduced below.
OSDService::OSDService(OSD *osd) :
  whoami(osd->whoami), store(osd->store),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  recoverystate_perf(osd->recoverystate_perf),
  // config-observed values, kept current by the conf framework
  osd_max_object_size(cct->_conf, "osd_max_object_size"),
  osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
  publish_lock{ceph::make_mutex("OSDService::publish_lock")},
  pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
  // cache-tiering agent state
  agent_valid_iterator(false),
  flush_mode_high_count(0),
  agent_stop_flag(false),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  // internal RADOS client used by tiering, copy-from, etc.
  objecter(make_unique<Objecter>(osd->client_messenger->cct,
				 osd->objecter_messenger,
				 osd->monc, nullptr, 0, 0)),
  m_objecter_finishers(cct->_conf->osd_objecter_finishers),
  watch_timer(osd->client_messenger->cct, watch_lock),
  recovery_request_timer(cct, recovery_request_lock, false),
  sleep_timer(cct, sleep_lock, false),
  reserver_finisher(cct),
  // local/remote backfill reservers share one finisher
  local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		 cct->_conf->osd_min_recovery_priority),
  remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		  cct->_conf->osd_min_recovery_priority),
  snap_reserver(cct, &reserver_finisher,
		cct->_conf->osd_max_trimming_pgs),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  // osdmap caches: decoded maps, encoded full maps, encoded incrementals
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  cur_ratio(0), physical_ratio(0),
  boot_epoch(0), up_epoch(0), bind_epoch(0)
  // NOTE(review): the ctor body's opening brace and the declaration of
  // `str` (presumably an ostringstream) are not visible in this chunk.
  // Spin up the configured number of objecter finisher threads.
  for (int i = 0; i < m_objecter_finishers; i++) {
    str << "objecter-finisher-" << i;
    auto fin = make_unique<Finisher>(osd->client_messenger->cct, str.str(), "finisher");
    objecter_finishers.push_back(std::move(fin));
// Track a live PG reference (debug aid for finding leaked PG refs).
// NOTE(review): the registration of `pg` into live_pgs inside the
// if-branch is not visible in this chunk.
void OSDService::add_pgid(spg_t pgid, PG *pg){
  std::lock_guard l(pgid_lock);
  if (!pgid_tracker.count(pgid)) {
  // bump the per-pgid reference count
  pgid_tracker[pgid]++;
294 void OSDService::remove_pgid(spg_t pgid
, PG
*pg
)
296 std::lock_guard
l(pgid_lock
);
297 ceph_assert(pgid_tracker
.count(pgid
));
298 ceph_assert(pgid_tracker
[pgid
] > 0);
299 pgid_tracker
[pgid
]--;
300 if (pgid_tracker
[pgid
] == 0) {
301 pgid_tracker
.erase(pgid
);
302 live_pgs
.erase(pgid
);
305 void OSDService::dump_live_pgids()
307 std::lock_guard
l(pgid_lock
);
308 derr
<< "live pgids:" << dendl
;
309 for (map
<spg_t
, int>::const_iterator i
= pgid_tracker
.cbegin();
310 i
!= pgid_tracker
.cend();
312 derr
<< "\t" << *i
<< dendl
;
313 live_pgs
[i
->first
]->dump_live_ids();
319 ceph::signedspan
OSDService::get_mnow()
321 return ceph::mono_clock::now() - osd
->startup_time
;
// Walk the recorded pg_num history of this PG's pool between two maps,
// collecting (pg, epoch) pairs for every split child and — when merge_pgs
// is non-null — every merge source/target encountered along the way.
// NOTE(review): the leading parameters of the signature (the pg id and the
// old/new OSDMap refs), several local declarations (queue, did, children,
// parent), loop-increment fragments and many closing braces are not
// visible in this chunk; only visible code is reproduced below.
void OSDService::identify_splits_and_merges(
  set<pair<spg_t,epoch_t>> *split_children,
  set<pair<spg_t,epoch_t>> *merge_pgs)
  // pool no longer exists in the old map: nothing to identify.
  if (!old_map->have_pg_pool(pgid.pool())) {
  int old_pgnum = old_map->get_pg_num(pgid.pool());
  auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
  // no recorded pg_num changes for this pool.
  if (p == osd->pg_num_history.pg_nums.end()) {
  dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
	   << " to e" << new_map->get_epoch()
	   << " pg_nums " << p->second << dendl;
  queue.push_back(pgid);
  // breadth-first over the PG and any fabricated parents we discover.
  while (!queue.empty()) {
    auto cur = queue.front();
    unsigned pgnum = old_pgnum;
    // replay each pg_num change in (old_map, new_map]
    for (auto q = p->second.lower_bound(old_map->get_epoch());
	 q != p->second.end() &&
	 q->first <= new_map->get_epoch();
      if (pgnum < q->second) {
	// pg_num increased: look for splits
	if (cur.ps() < pgnum) {
	  if (cur.is_split(pgnum, q->second, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " children " << children << dendl;
	    for (auto i : children) {
	      split_children->insert(make_pair(i, q->first));
	} else if (cur.ps() < q->second) {
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is a child" << dendl;
	  // normally we'd capture this from the parent, but it's
	  // possible the parent doesn't exist yet (it will be
	  // fabricated to allow an intervening merge). note this PG
	  // as a split child here to be sure we catch it.
	  split_children->insert(make_pair(cur, q->first));
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is post-split, skipping" << dendl;
      } else if (merge_pgs) {
	// pg_num decreased: look for merges (only when requested)
	if (cur.ps() >= q->second) {
	  if (cur.ps() < pgnum) {
	    if (cur.is_merge_source(pgnum, q->second, &parent)) {
	      // enumerate the merge's sources from the target's viewpoint
	      parent.is_split(q->second, pgnum, &children);
	      dout(20) << __func__ << " " << cur << " e" << q->first
		       << " pg_num " << pgnum << " -> " << q->second
		       << " is merge source, target " << parent
		       << ", source(s) " << children << dendl;
	      merge_pgs->insert(make_pair(parent, q->first));
	      if (!did.count(parent)) {
		// queue (and re-scan) parent in case it might not exist yet
		// and there are some future splits pending on it
		queue.push_back(parent);
	      for (auto c : children) {
		merge_pgs->insert(make_pair(c, q->first));
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is beyond old pgnum, skipping" << dendl;
	  // cur sits below the new pg_num: it is a merge target.
	  if (cur.is_split(q->second, pgnum, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " is merge target, source " << children << dendl;
	    for (auto c : children) {
	      merge_pgs->insert(make_pair(c, q->first));
	  merge_pgs->insert(make_pair(cur, q->first));
431 void OSDService::need_heartbeat_peer_update()
433 osd
->need_heartbeat_peer_update();
436 HeartbeatStampsRef
OSDService::get_hb_stamps(unsigned peer
)
438 std::lock_guard
l(hb_stamp_lock
);
439 if (peer
>= hb_stamps
.size()) {
440 hb_stamps
.resize(peer
+ 1);
442 if (!hb_stamps
[peer
]) {
443 hb_stamps
[peer
] = ceph::make_ref
<HeartbeatStamps
>(peer
);
445 return hb_stamps
[peer
];
// Queue a peering event asking the given PG to renew its lease.
// NOTE(review): the PGPeeringEvent constructor arguments are not visible
// in this chunk.
void OSDService::queue_renew_lease(epoch_t epoch, spg_t spgid)
  osd->enqueue_peering_evt(
    std::make_shared<PGPeeringEvent>(
458 void OSDService::start_shutdown()
461 std::lock_guard
l(agent_timer_lock
);
462 agent_timer
.shutdown();
466 std::lock_guard
l(sleep_lock
);
467 sleep_timer
.shutdown();
471 std::lock_guard
l(recovery_request_lock
);
472 recovery_request_timer
.shutdown();
476 void OSDService::shutdown_reserver()
478 reserver_finisher
.wait_for_empty();
479 reserver_finisher
.stop();
// Final teardown of service threads, timers and the objecter.
// NOTE(review): the body of the finisher loop and several closing braces
// are not visible in this chunk.
void OSDService::shutdown()
  mono_timer.suspend();
  std::lock_guard l(watch_lock);
  watch_timer.shutdown();
  objecter->shutdown();
  for (auto& f : objecter_finishers) {
  // Drop our map references so the OSDMap can be freed.
  publish_map(OSDMapRef());
  next_osdmap = OSDMapRef();
// Bring up the service threads: reservation finisher, objecter finishers,
// the objecter itself, and the tiering agent thread.
// NOTE(review): the body of the finisher loop is not visible in this chunk.
void OSDService::init()
  reserver_finisher.start();
  for (auto& f : objecter_finishers) {
  objecter->set_client_incarnation(0);
  // deprioritize objecter in daemonperf output
  objecter->get_logger()->set_prio_adjust(-3);
  agent_thread.create("osd_srv_agent");
  // Optionally hold off recovery for a configured delay after startup.
  if (cct->_conf->osd_recovery_delay_start)
    defer_recovery(cct->_conf->osd_recovery_delay_start);
522 void OSDService::final_init()
524 objecter
->start(osdmap
.get());
// React to a newly published osdmap.
// NOTE(review): the full wake-up condition for the tiering agent is only
// partially visible in this chunk.
void OSDService::activate_map()
  // wake/unwake the tiering agent
  std::lock_guard l{agent_lock};
  !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
  agent_cond.notify_all();
537 void OSDService::request_osdmap_update(epoch_t e
)
539 osd
->osdmap_subscribe(e
, false);
// Timer callback: after the agent delay expires, ask the PG's tiering
// agent to re-evaluate its flush/evict mode.
// NOTE(review): the `pg` member declaration and access specifiers are not
// visible in this chunk.
class AgentTimeoutCB : public Context {
  // Takes a strong PG reference so the PG outlives the timer.
  explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
  void finish(int) override {
    pg->agent_choose_mode_restart();
// Body of the cache-tiering agent thread: repeatedly pick the highest
// priority tier in agent_queue and ask one of its PGs to do flush/evict
// work, throttled by the configured op limits.
// NOTE(review): the `continue` statements after the condition waits, the
// dout opener for the "tiers" message, iterator advancement and several
// closing braces are not visible in this chunk.
void OSDService::agent_entry()
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock agent_locker{agent_lock};
  while (!agent_stop_flag) {
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      // sleep until a PG is queued or we are told to stop
      agent_cond.wait(agent_locker);
    // highest level (rbegin) wins
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
	     << " tiers " << agent_queue.size()
	     << ", top is " << level
	     << " with pgs " << top.size()
	     << ", ops " << agent_ops << "/"
	     << cct->_conf->osd_agent_max_ops
	     << (agent_active ? " active" : " NOT ACTIVE")
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    // budget: use the low-rate limit unless some pool forced high mode
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.wait(agent_locker);
    // (re)position the round-robin iterator over the top tier's PGs
    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
	     << " agent_ops " << agent_ops
	     << " flush_quota " << agent_flush_quota << dendl;
    // drop the lock while the PG does its work
    agent_locker.unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->pg_id
	       << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
	       << " seconds" << dendl;
      osd->logger->inc(l_osd_tier_delay);
      // Queue a timer to call agent_choose_mode for this pg in 5 seconds
      std::lock_guard timer_locker{agent_timer_lock};
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
  dout(10) << __func__ << " finish" << dendl;
// Stop the tiering agent thread.  All agent ops must have been cancelled
// and all PGs dequeued before this is called.
// NOTE(review): several braces and (presumably) the join of agent_thread
// are not visible in this chunk.
void OSDService::agent_stop()
  std::lock_guard l(agent_lock);
  // By this time all ops should be cancelled
  ceph_assert(agent_ops == 0);
  // By this time all PGs are shutdown and dequeued
  if (!agent_queue.empty()) {
    set<PGRef>& top = agent_queue.rbegin()->second;
    derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl;
    ceph_abort_msg("agent queue not empty");
  // Raise the stop flag and wake the agent so it can observe it.
  agent_stop_flag = true;
  agent_cond.notify_all();
628 // -------------------------------------
// Periodically rebalance promote_probability_millis so that cache-tier
// promotions track the configured objects/sec and bytes/sec targets, then
// set per-interval hard caps.
// NOTE(review): declarations of new_prob/actual/ratio, two terminating
// fragments (`<< dendl;` / `;`) and the single-target else-if bodies are
// not visible in this chunk.
void OSDService::promote_throttle_recalibrate()
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;
  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;
  unsigned min_prob = 1;
  // drain the promotion counters accumulated since the last tick
  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
	   << obj << " objects and " << byte_u_t(bytes) << "; target "
	   << target_obj_sec << " obj/sec or "
	   << byte_u_t(target_bytes_sec) << "/sec"
  // calculate what the probability *should* be, given the targets
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    avg_size = std::max<uint64_t>(bytes / obj, 1);
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
	     << avg_size << dendl;
    // when both targets are set, the stricter (smaller) one wins
    if (target_obj_sec && target_bytes_sec)
      new_prob = std::min(po, pb);
    else if (target_obj_sec)
    else if (target_bytes_sec)
  dout(20) << __func__ << " new_prob " << new_prob << dendl;
  // correct for persistent skew between target rate and actual rate, adjust
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  // clamp to [min_prob, 1000] (probability is in millis)
  new_prob = std::max(new_prob, min_prob);
  new_prob = std::min(new_prob, 1000u);
  // low-pass filter toward the newly computed probability
  prob = (prob + new_prob) / 2;
  prob = std::max(prob, min_prob);
  prob = std::min(prob, 1000u);
  dout(10) << __func__ << " actual " << actual
	   << ", actual/prob ratio " << ratio
	   << ", adjusted new_prob " << new_prob
	   << ", prob " << promote_probability_millis << " -> " << prob
  promote_probability_millis = prob;
  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
701 // -------------------------------------
703 float OSDService::get_failsafe_full_ratio()
705 float full_ratio
= cct
->_conf
->osd_failsafe_full_ratio
;
706 if (full_ratio
> 1.0) full_ratio
/= 100.0;
710 OSDService::s_names
OSDService::recalc_full_state(float ratio
, float pratio
, string
&inject
)
712 // The OSDMap ratios take precendence. So if the failsafe is .95 and
713 // the admin sets the cluster full to .96, the failsafe moves up to .96
714 // too. (Not that having failsafe == full is ideal, but it's better than
715 // dropping writes before the clusters appears full.)
716 OSDMapRef osdmap
= get_osdmap();
717 if (!osdmap
|| osdmap
->get_epoch() == 0) {
720 float nearfull_ratio
= osdmap
->get_nearfull_ratio();
721 float backfillfull_ratio
= std::max(osdmap
->get_backfillfull_ratio(), nearfull_ratio
);
722 float full_ratio
= std::max(osdmap
->get_full_ratio(), backfillfull_ratio
);
723 float failsafe_ratio
= std::max(get_failsafe_full_ratio(), full_ratio
);
725 if (osdmap
->require_osd_release
< ceph_release_t::luminous
) {
726 // use the failsafe for nearfull and full; the mon isn't using the
727 // flags anyway because we're mid-upgrade.
728 full_ratio
= failsafe_ratio
;
729 backfillfull_ratio
= failsafe_ratio
;
730 nearfull_ratio
= failsafe_ratio
;
731 } else if (full_ratio
<= 0 ||
732 backfillfull_ratio
<= 0 ||
733 nearfull_ratio
<= 0) {
734 derr
<< __func__
<< " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl
;
735 // use failsafe flag. ick. the monitor did something wrong or the user
736 // did something stupid.
737 full_ratio
= failsafe_ratio
;
738 backfillfull_ratio
= failsafe_ratio
;
739 nearfull_ratio
= failsafe_ratio
;
742 if (injectfull_state
> NONE
&& injectfull
) {
743 inject
= "(Injected)";
744 return injectfull_state
;
745 } else if (pratio
> failsafe_ratio
) {
747 } else if (ratio
> full_ratio
) {
749 } else if (ratio
> backfillfull_ratio
) {
751 } else if (pratio
> nearfull_ratio
) {
// Recompute and cache the local fullness state from the given usage
// ratios, logging state transitions and raising/clearing the failsafe
// cluster-log error.
// NOTE(review): the declarations of `inject`/`new_state` and the
// assignment of cur_ratio are not visible in this chunk.
void OSDService::check_full_status(float ratio, float pratio)
  std::lock_guard l(full_status_lock);
  physical_ratio = pratio;
  new_state = recalc_full_state(ratio, pratio, inject);
  dout(20) << __func__ << " cur ratio " << ratio
	   << ", physical ratio " << pratio
	   << ", new state " << get_full_state_name(new_state)
  if (cur_state != new_state) {
    dout(10) << __func__ << " " << get_full_state_name(cur_state)
	     << " -> " << get_full_state_name(new_state) << dendl;
    // failsafe transitions are loud: they mean we drop updates
    if (new_state == FAILSAFE) {
      clog->error() << "full status failsafe engaged, dropping updates, now "
		    << (int)roundf(ratio * 100) << "% full";
    } else if (cur_state == FAILSAFE) {
      clog->error() << "full status failsafe disengaged, no longer dropping "
		    << "updates, now " << (int)roundf(ratio * 100) << "% full";
  cur_state = new_state;
// Decide whether our local fullness state differs from what the osdmap
// currently records for this OSD, i.e. whether the monitor must be told.
// NOTE(review): the branch bodies computing the map-side and local flags,
// and the final comparison/return, are not visible in this chunk.
bool OSDService::need_fullness_update()
  OSDMapRef osdmap = get_osdmap();
  if (osdmap->exists(whoami)) {
    if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
    } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
    } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
  else if (is_backfillfull())
  else if (is_nearfull())
// Return true when an operator-injected "full" condition of at least
// `type` severity is currently active.
// NOTE(review): the decrement of the injection counter and the return
// statements are not visible in this chunk.
bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
  if (injectfull && injectfull_state >= type) {
    // injectfull is either a count of the number of times to return failsafe full
    // or if -1 then always return full
    ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD ("
		       << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"
827 bool OSDService::_check_full(DoutPrefixProvider
*dpp
, s_names type
) const
829 std::lock_guard
l(full_status_lock
);
831 if (_check_inject_full(dpp
, type
))
834 if (cur_state
>= type
)
835 ldpp_dout(dpp
, 10) << __func__
<< " current usage is " << cur_ratio
836 << " physical " << physical_ratio
<< dendl
;
838 return cur_state
>= type
;
// Would this OSD be at least `type` full if `adjust_used` additional bytes
// were consumed on top of `adjusted_stat`?  Used to veto backfill targets.
// NOTE(review): the return inside the injection branch and the
// declarations of `pratio`/`notused` are not visible in this chunk.
bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
  ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
  std::lock_guard l(full_status_lock);
  if (_check_inject_full(dpp, type)) {
  // recompute the ratio as if the extra bytes were already used
  float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);
  s_names tentative_state = recalc_full_state(ratio, pratio, notused);
  if (tentative_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;
  return tentative_state >= type;
863 bool OSDService::check_failsafe_full(DoutPrefixProvider
*dpp
) const
865 return _check_full(dpp
, FAILSAFE
);
868 bool OSDService::check_full(DoutPrefixProvider
*dpp
) const
870 return _check_full(dpp
, FULL
);
873 bool OSDService::tentative_backfill_full(DoutPrefixProvider
*dpp
, uint64_t adjust_used
, osd_stat_t stats
)
875 return _tentative_full(dpp
, BACKFILLFULL
, adjust_used
, stats
);
878 bool OSDService::check_backfill_full(DoutPrefixProvider
*dpp
) const
880 return _check_full(dpp
, BACKFILLFULL
);
883 bool OSDService::check_nearfull(DoutPrefixProvider
*dpp
) const
885 return _check_full(dpp
, NEARFULL
);
888 bool OSDService::is_failsafe_full() const
890 std::lock_guard
l(full_status_lock
);
891 return cur_state
== FAILSAFE
;
894 bool OSDService::is_full() const
896 std::lock_guard
l(full_status_lock
);
897 return cur_state
>= FULL
;
900 bool OSDService::is_backfillfull() const
902 std::lock_guard
l(full_status_lock
);
903 return cur_state
>= BACKFILLFULL
;
906 bool OSDService::is_nearfull() const
908 std::lock_guard
l(full_status_lock
);
909 return cur_state
>= NEARFULL
;
912 void OSDService::set_injectfull(s_names type
, int64_t count
)
914 std::lock_guard
l(full_status_lock
);
915 injectfull_state
= type
;
// Record fresh store statfs results (and store alerts) into osd_stat and
// the perf counters, optionally substituting fake values for testing.
// NOTE(review): the pgs loop header of the fake-statfs branch, an else
// fragment and several braces are not visible in this chunk.
void OSDService::set_statfs(const struct store_statfs_t &stbuf,
			    osd_alert_list_t& alerts)
  uint64_t bytes = stbuf.total;
  uint64_t avail = stbuf.available;
  uint64_t used = stbuf.get_used_raw();
  // For testing fake statfs values so it doesn't matter if all
  // OSDs are using the same partition.
  if (cct->_conf->fake_statfs_for_testing) {
    uint64_t total_num_bytes = 0;
    total_num_bytes += p->get_stats_num_bytes();
    bytes = cct->_conf->fake_statfs_for_testing;
    if (total_num_bytes < bytes)
      avail = bytes - total_num_bytes;
    dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
	    << " adjust available " << avail
    used = bytes - avail;
  // publish to perf counters
  osd->logger->set(l_osd_stat_bytes, bytes);
  osd->logger->set(l_osd_stat_bytes_used, used);
  osd->logger->set(l_osd_stat_bytes_avail, avail);
  std::lock_guard l(stat_lock);
  osd_stat.statfs = stbuf;
  osd_stat.os_alerts.clear();
  osd_stat.os_alerts[whoami].swap(alerts);
  if (cct->_conf->fake_statfs_for_testing) {
    osd_stat.statfs.total = bytes;
    osd_stat.statfs.available = avail;
    // For testing don't want used to go negative, so clear reserved
    osd_stat.statfs.internally_reserved = 0;
OSDService::set_osd_stat(vector
<int>& hb_peers
,
965 utime_t now
= ceph_clock_now();
966 auto stale_time
= g_conf().get_val
<int64_t>("osd_mon_heartbeat_stat_stale");
967 std::lock_guard
l(stat_lock
);
968 osd_stat
.hb_peers
.swap(hb_peers
);
969 osd
->op_tracker
.get_age_ms_histogram(&osd_stat
.op_queue_age_hist
);
970 osd_stat
.num_pgs
= num_pgs
;
971 // Clean entries that aren't updated
972 // This is called often enough that we can just remove 1 at a time
973 for (auto i
: osd_stat
.hb_pingtime
) {
974 if (i
.second
.last_update
== 0)
976 if (stale_time
&& now
.sec() - i
.second
.last_update
> stale_time
) {
977 dout(20) << __func__
<< " time out heartbeat for osd " << i
.first
978 << " last_update " << i
.second
.last_update
<< dendl
;
979 osd_stat
.hb_pingtime
.erase(i
.first
);
986 void OSDService::inc_osd_stat_repaired()
988 std::lock_guard
l(stat_lock
);
989 osd_stat
.num_shards_repaired
++;
// Compute a usage ratio that pretends `adjust_used` extra bytes are
// consumed and folds in pending backfill data from all PGs; the raw
// physical ratio is reported through *pratio.
// NOTE(review): the assignment to *pratio, the else keyword before the
// zero-clamp, the pgs vector declaration and an early-return path are not
// visible in this chunk.
float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
					 uint64_t adjust_used)
  ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);
  dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl;
  // subtract the hypothetical extra usage from available space
  if (new_stat.statfs.available > adjust_used)
    new_stat.statfs.available -= adjust_used;
    new_stat.statfs.available = 0;
  dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
  // Check all pgs and adjust kb_used to include all pending backfill data
  int backfill_adjusted = 0;
  osd->_get_pgs(&pgs);
  for (auto p : pgs) {
    backfill_adjusted += p->pg_stat_adjust(&new_stat);
  if (backfill_adjusted) {
    dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
  return ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);
// Send message `m` to OSD `peer` over the cluster messenger, but only if
// the peer is still up and has been up continuously since `from_epoch`
// according to the reserved next map.
// NOTE(review): the early-return path for a down/restarted peer (including
// dropping the message) and the else keyword are not fully visible in this
// chunk.
void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());
  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
  ConnectionRef peer_con;
  if (peer == whoami) {
    // talking to ourselves: use the loopback connection
    peer_con = osd->cluster_messenger->get_loopback_connection();
    peer_con = osd->cluster_messenger->connect_to_osd(
      next_map->get_cluster_addrs(peer), false, true);
  // opportunistically share our map with the peer, then send
  maybe_share_map(peer_con.get(), next_map);
  peer_con->send_message(m);
  release_map(next_map);
// Batched variant: send each (peer, message) pair, skipping peers that are
// down or were restarted since `from_epoch`.
// NOTE(review): the skip path inside the loop (continue / message cleanup)
// and the else keyword are not visible in this chunk.
void OSDService::send_message_osd_cluster(std::vector<std::pair<int, Message*>>& messages, epoch_t from_epoch)
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());
  for (auto& iter : messages) {
    if (next_map->is_down(iter.first) ||
	next_map->get_info(iter.first).up_from > from_epoch) {
    ConnectionRef peer_con;
    if (iter.first == whoami) {
      // talking to ourselves: use the loopback connection
      peer_con = osd->cluster_messenger->get_loopback_connection();
      peer_con = osd->cluster_messenger->connect_to_osd(
	next_map->get_cluster_addrs(iter.first), false, true);
    maybe_share_map(peer_con.get(), next_map);
    peer_con->send_message(iter.second);
  release_map(next_map);
// Return a cluster-messenger connection to `peer`, or a null ref if the
// peer is down / was restarted since `from_epoch`.
// NOTE(review): the declaration of `con`, the null-return in the down
// branch and the final return are not visible in this chunk.
ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());
  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
  if (peer == whoami) {
    // talking to ourselves: use the loopback connection
    con = osd->cluster_messenger->get_loopback_connection();
    con = osd->cluster_messenger->connect_to_osd(
      next_map->get_cluster_addrs(peer), false, true);
  release_map(next_map);
// Return the (back, front) heartbeat connections to `peer`, or an empty
// pair if the peer is down / was restarted since `from_epoch`.
// NOTE(review): the returns of `ret` (both the early empty return and the
// final one) are not visible in this chunk.
pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());
  pair<ConnectionRef,ConnectionRef> ret;
  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
  ret.first = osd->hb_back_client_messenger->connect_to_osd(
    next_map->get_hb_back_addrs(peer));
  ret.second = osd->hb_front_client_messenger->connect_to_osd(
    next_map->get_hb_front_addrs(peer));
  release_map(next_map);
1111 entity_name_t
OSDService::get_cluster_msgr_name() const
1113 return cluster_messenger
->get_myname();
// Record that we want the monitor to set a pg_temp mapping for `pgid`,
// unless an identical request is already pending.
// NOTE(review): the trailing `forced` parameter of the signature and part
// of the comparison condition are not visible in this chunk.
void OSDService::queue_want_pg_temp(pg_t pgid,
				    const vector<int>& want,
  std::lock_guard l(pg_temp_lock);
  auto p = pg_temp_pending.find(pgid);
  // only queue if not already pending with the same acting set
  if (p == pg_temp_pending.end() ||
      p->second.acting != want ||
    pg_temp_wanted[pgid] = {want, forced};
1129 void OSDService::remove_want_pg_temp(pg_t pgid
)
1131 std::lock_guard
l(pg_temp_lock
);
1132 pg_temp_wanted
.erase(pgid
);
1133 pg_temp_pending
.erase(pgid
);
// Mark the currently wanted pg_temp entries as sent by moving them from
// pg_temp_wanted into pg_temp_pending; uses map::merge when the stdlib
// supports node splicing, otherwise move-insert + clear.
// NOTE(review): the #else/#endif lines of the preprocessor branch are not
// visible in this chunk.
void OSDService::_sent_pg_temp()
#ifdef HAVE_STDLIB_MAP_SPLICING
  pg_temp_pending.merge(pg_temp_wanted);
  pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
			 make_move_iterator(end(pg_temp_wanted)));
  pg_temp_wanted.clear();
1147 void OSDService::requeue_pg_temp()
1149 std::lock_guard
l(pg_temp_lock
);
1150 // wanted overrides pending. note that remove_want_pg_temp
1151 // clears the item out of both.
1152 unsigned old_wanted
= pg_temp_wanted
.size();
1153 unsigned old_pending
= pg_temp_pending
.size();
1155 pg_temp_wanted
.swap(pg_temp_pending
);
1156 dout(10) << __func__
<< " " << old_wanted
<< " + " << old_pending
<< " -> "
1157 << pg_temp_wanted
.size() << dendl
;
1160 std::ostream
& operator<<(std::ostream
& out
,
1161 const OSDService::pg_temp_t
& pg_temp
)
1163 out
<< pg_temp
.acting
;
1164 if (pg_temp
.forced
) {
1170 void OSDService::send_pg_temp()
1172 std::lock_guard
l(pg_temp_lock
);
1173 if (pg_temp_wanted
.empty())
1175 dout(10) << "send_pg_temp " << pg_temp_wanted
<< dendl
;
1176 MOSDPGTemp
*ms
[2] = {nullptr, nullptr};
1177 for (auto& [pgid
, pg_temp
] : pg_temp_wanted
) {
1178 auto& m
= ms
[pg_temp
.forced
];
1180 m
= new MOSDPGTemp(osdmap
->get_epoch());
1181 m
->forced
= pg_temp
.forced
;
1183 m
->pg_temp
.emplace(pgid
, pg_temp
.acting
);
1187 monc
->send_mon_message(m
);
1193 void OSDService::send_pg_created(pg_t pgid
)
1195 std::lock_guard
l(pg_created_lock
);
1196 dout(20) << __func__
<< dendl
;
1197 auto o
= get_osdmap();
1198 if (o
->require_osd_release
>= ceph_release_t::luminous
) {
1199 pg_created
.insert(pgid
);
1200 monc
->send_mon_message(new MOSDPGCreated(pgid
));
1204 void OSDService::send_pg_created()
1206 std::lock_guard
l(pg_created_lock
);
1207 dout(20) << __func__
<< dendl
;
1208 auto o
= get_osdmap();
1209 if (o
->require_osd_release
>= ceph_release_t::luminous
) {
1210 for (auto pgid
: pg_created
) {
1211 monc
->send_mon_message(new MOSDPGCreated(pgid
));
1216 void OSDService::prune_pg_created()
1218 std::lock_guard
l(pg_created_lock
);
1219 dout(20) << __func__
<< dendl
;
1220 auto o
= get_osdmap();
1221 auto i
= pg_created
.begin();
1222 while (i
!= pg_created
.end()) {
1223 auto p
= o
->get_pg_pool(i
->pool());
1224 if (!p
|| !p
->has_flag(pg_pool_t::FLAG_CREATING
)) {
1225 dout(20) << __func__
<< " pruning " << *i
<< dendl
;
1226 i
= pg_created
.erase(i
);
1228 dout(20) << __func__
<< " keeping " << *i
<< dendl
;
1235 // --------------------------------------
1238 bool OSDService::can_inc_scrubs()
1240 bool can_inc
= false;
1241 std::lock_guard
l(sched_scrub_lock
);
1243 if (scrubs_local
+ scrubs_remote
< cct
->_conf
->osd_max_scrubs
) {
1244 dout(20) << __func__
<< " == true " << scrubs_local
<< " local + " << scrubs_remote
1245 << " remote < max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1248 dout(20) << __func__
<< " == false " << scrubs_local
<< " local + " << scrubs_remote
1249 << " remote >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1255 bool OSDService::inc_scrubs_local()
1257 bool result
= false;
1258 std::lock_guard l
{sched_scrub_lock
};
1259 if (scrubs_local
+ scrubs_remote
< cct
->_conf
->osd_max_scrubs
) {
1260 dout(20) << __func__
<< " " << scrubs_local
<< " -> " << (scrubs_local
+1)
1261 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", remote " << scrubs_remote
<< ")" << dendl
;
1265 dout(20) << __func__
<< " " << scrubs_local
<< " local + " << scrubs_remote
<< " remote >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1270 void OSDService::dec_scrubs_local()
1272 std::lock_guard l
{sched_scrub_lock
};
1273 dout(20) << __func__
<< " " << scrubs_local
<< " -> " << (scrubs_local
-1)
1274 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", remote " << scrubs_remote
<< ")" << dendl
;
1276 ceph_assert(scrubs_local
>= 0);
1279 bool OSDService::inc_scrubs_remote()
1281 bool result
= false;
1282 std::lock_guard l
{sched_scrub_lock
};
1283 if (scrubs_local
+ scrubs_remote
< cct
->_conf
->osd_max_scrubs
) {
1284 dout(20) << __func__
<< " " << scrubs_remote
<< " -> " << (scrubs_remote
+1)
1285 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", local " << scrubs_local
<< ")" << dendl
;
1289 dout(20) << __func__
<< " " << scrubs_local
<< " local + " << scrubs_remote
<< " remote >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1294 void OSDService::dec_scrubs_remote()
1296 std::lock_guard l
{sched_scrub_lock
};
1297 dout(20) << __func__
<< " " << scrubs_remote
<< " -> " << (scrubs_remote
-1)
1298 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", local " << scrubs_local
<< ")" << dendl
;
1300 ceph_assert(scrubs_remote
>= 0);
1303 void OSDService::dump_scrub_reservations(Formatter
*f
)
1305 std::lock_guard l
{sched_scrub_lock
};
1306 f
->dump_int("scrubs_local", scrubs_local
);
1307 f
->dump_int("scrubs_remote", scrubs_remote
);
1308 f
->dump_int("osd_max_scrubs", cct
->_conf
->osd_max_scrubs
);
1311 void OSDService::retrieve_epochs(epoch_t
*_boot_epoch
, epoch_t
*_up_epoch
,
1312 epoch_t
*_bind_epoch
) const
1314 std::lock_guard
l(epoch_lock
);
1316 *_boot_epoch
= boot_epoch
;
1318 *_up_epoch
= up_epoch
;
1320 *_bind_epoch
= bind_epoch
;
1323 void OSDService::set_epochs(const epoch_t
*_boot_epoch
, const epoch_t
*_up_epoch
,
1324 const epoch_t
*_bind_epoch
)
1326 std::lock_guard
l(epoch_lock
);
1328 ceph_assert(*_boot_epoch
== 0 || *_boot_epoch
>= boot_epoch
);
1329 boot_epoch
= *_boot_epoch
;
1332 ceph_assert(*_up_epoch
== 0 || *_up_epoch
>= up_epoch
);
1333 up_epoch
= *_up_epoch
;
1336 ceph_assert(*_bind_epoch
== 0 || *_bind_epoch
>= bind_epoch
);
1337 bind_epoch
= *_bind_epoch
;
1341 bool OSDService::prepare_to_stop()
1343 std::unique_lock
l(is_stopping_lock
);
1344 if (get_state() != NOT_STOPPING
)
1347 OSDMapRef osdmap
= get_osdmap();
1348 if (osdmap
&& osdmap
->is_up(whoami
)) {
1349 dout(0) << __func__
<< " telling mon we are shutting down" << dendl
;
1350 set_state(PREPARING_TO_STOP
);
1351 monc
->send_mon_message(
1355 osdmap
->get_addrs(whoami
),
1356 osdmap
->get_epoch(),
1359 const auto timeout
= ceph::make_timespan(cct
->_conf
->osd_mon_shutdown_timeout
);
1360 is_stopping_cond
.wait_for(l
, timeout
,
1361 [this] { return get_state() == STOPPING
; });
1363 dout(0) << __func__
<< " starting shutdown" << dendl
;
1364 set_state(STOPPING
);
1368 void OSDService::got_stop_ack()
1370 std::scoped_lock
l(is_stopping_lock
);
1371 if (get_state() == PREPARING_TO_STOP
) {
1372 dout(0) << __func__
<< " starting shutdown" << dendl
;
1373 set_state(STOPPING
);
1374 is_stopping_cond
.notify_all();
1376 dout(10) << __func__
<< " ignoring msg" << dendl
;
1380 MOSDMap
*OSDService::build_incremental_map_msg(epoch_t since
, epoch_t to
,
1381 OSDSuperblock
& sblock
)
1383 MOSDMap
*m
= new MOSDMap(monc
->get_fsid(),
1384 osdmap
->get_encoding_features());
1385 m
->oldest_map
= max_oldest_map
;
1386 m
->newest_map
= sblock
.newest_map
;
1388 int max
= cct
->_conf
->osd_map_message_max
;
1389 ssize_t max_bytes
= cct
->_conf
->osd_map_message_max_bytes
;
1391 if (since
< m
->oldest_map
) {
1392 // we don't have the next map the target wants, so start with a
1395 dout(10) << __func__
<< " oldest map " << max_oldest_map
<< " > since "
1396 << since
<< ", starting with full map" << dendl
;
1397 since
= m
->oldest_map
;
1398 if (!get_map_bl(since
, bl
)) {
1399 derr
<< __func__
<< " missing full map " << since
<< dendl
;
1403 max_bytes
-= bl
.length();
1404 m
->maps
[since
].claim(bl
);
1406 for (epoch_t e
= since
+ 1; e
<= to
; ++e
) {
1408 if (get_inc_map_bl(e
, bl
)) {
1409 m
->incremental_maps
[e
].claim(bl
);
1411 derr
<< __func__
<< " missing incremental map " << e
<< dendl
;
1412 if (!get_map_bl(e
, bl
)) {
1413 derr
<< __func__
<< " also missing full map " << e
<< dendl
;
1416 m
->maps
[e
].claim(bl
);
1419 max_bytes
-= bl
.length();
1420 if (max
<= 0 || max_bytes
<= 0) {
1427 if (!m
->maps
.empty() ||
1428 !m
->incremental_maps
.empty()) {
1429 // send what we have so far
1434 if (get_inc_map_bl(m
->newest_map
, bl
)) {
1435 m
->incremental_maps
[m
->newest_map
].claim(bl
);
1437 derr
<< __func__
<< " unable to load latest map " << m
->newest_map
<< dendl
;
1438 if (!get_map_bl(m
->newest_map
, bl
)) {
1439 derr
<< __func__
<< " unable to load latest full map " << m
->newest_map
1443 m
->maps
[m
->newest_map
].claim(bl
);
1448 void OSDService::send_map(MOSDMap
*m
, Connection
*con
)
1450 con
->send_message(m
);
1453 void OSDService::send_incremental_map(epoch_t since
, Connection
*con
,
1454 const OSDMapRef
& osdmap
)
1456 epoch_t to
= osdmap
->get_epoch();
1457 dout(10) << "send_incremental_map " << since
<< " -> " << to
1458 << " to " << con
<< " " << con
->get_peer_addr() << dendl
;
1462 OSDSuperblock
sblock(get_superblock());
1463 if (since
< sblock
.oldest_map
) {
1464 // just send latest full map
1465 MOSDMap
*m
= new MOSDMap(monc
->get_fsid(),
1466 osdmap
->get_encoding_features());
1467 m
->oldest_map
= max_oldest_map
;
1468 m
->newest_map
= sblock
.newest_map
;
1469 get_map_bl(to
, m
->maps
[to
]);
1474 if (to
> since
&& (int64_t)(to
- since
) > cct
->_conf
->osd_map_share_max_epochs
) {
1475 dout(10) << " " << (to
- since
) << " > max " << cct
->_conf
->osd_map_share_max_epochs
1476 << ", only sending most recent" << dendl
;
1477 since
= to
- cct
->_conf
->osd_map_share_max_epochs
;
1480 m
= build_incremental_map_msg(since
, to
, sblock
);
1485 bool OSDService::_get_map_bl(epoch_t e
, bufferlist
& bl
)
1487 bool found
= map_bl_cache
.lookup(e
, &bl
);
1490 logger
->inc(l_osd_map_bl_cache_hit
);
1494 logger
->inc(l_osd_map_bl_cache_miss
);
1495 found
= store
->read(meta_ch
,
1496 OSD::get_osdmap_pobject_name(e
), 0, 0, bl
,
1497 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
) >= 0;
1504 bool OSDService::get_inc_map_bl(epoch_t e
, bufferlist
& bl
)
1506 std::lock_guard
l(map_cache_lock
);
1507 bool found
= map_bl_inc_cache
.lookup(e
, &bl
);
1510 logger
->inc(l_osd_map_bl_cache_hit
);
1514 logger
->inc(l_osd_map_bl_cache_miss
);
1515 found
= store
->read(meta_ch
,
1516 OSD::get_inc_osdmap_pobject_name(e
), 0, 0, bl
,
1517 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
) >= 0;
1519 _add_map_inc_bl(e
, bl
);
1524 void OSDService::_add_map_bl(epoch_t e
, bufferlist
& bl
)
1526 dout(10) << "add_map_bl " << e
<< " " << bl
.length() << " bytes" << dendl
;
1527 // cache a contiguous buffer
1528 if (bl
.get_num_buffers() > 1) {
1531 bl
.try_assign_to_mempool(mempool::mempool_osd_mapbl
);
1532 map_bl_cache
.add(e
, bl
);
1535 void OSDService::_add_map_inc_bl(epoch_t e
, bufferlist
& bl
)
1537 dout(10) << "add_map_inc_bl " << e
<< " " << bl
.length() << " bytes" << dendl
;
1538 // cache a contiguous buffer
1539 if (bl
.get_num_buffers() > 1) {
1542 bl
.try_assign_to_mempool(mempool::mempool_osd_mapbl
);
1543 map_bl_inc_cache
.add(e
, bl
);
1546 OSDMapRef
OSDService::_add_map(OSDMap
*o
)
1548 epoch_t e
= o
->get_epoch();
1550 if (cct
->_conf
->osd_map_dedup
) {
1551 // Dedup against an existing map at a nearby epoch
1552 OSDMapRef for_dedup
= map_cache
.lower_bound(e
);
1554 OSDMap::dedup(for_dedup
.get(), o
);
1558 OSDMapRef l
= map_cache
.add(e
, o
, &existed
);
1565 OSDMapRef
OSDService::try_get_map(epoch_t epoch
)
1567 std::lock_guard
l(map_cache_lock
);
1568 OSDMapRef retval
= map_cache
.lookup(epoch
);
1570 dout(30) << "get_map " << epoch
<< " -cached" << dendl
;
1572 logger
->inc(l_osd_map_cache_hit
);
1577 logger
->inc(l_osd_map_cache_miss
);
1578 epoch_t lb
= map_cache
.cached_key_lower_bound();
1580 dout(30) << "get_map " << epoch
<< " - miss, below lower bound" << dendl
;
1581 logger
->inc(l_osd_map_cache_miss_low
);
1582 logger
->inc(l_osd_map_cache_miss_low_avg
, lb
- epoch
);
1586 OSDMap
*map
= new OSDMap
;
1588 dout(20) << "get_map " << epoch
<< " - loading and decoding " << map
<< dendl
;
1590 if (!_get_map_bl(epoch
, bl
) || bl
.length() == 0) {
1591 derr
<< "failed to load OSD map for epoch " << epoch
<< ", got " << bl
.length() << " bytes" << dendl
;
1597 dout(20) << "get_map " << epoch
<< " - return initial " << map
<< dendl
;
1599 return _add_map(map
);
1605 void OSDService::reply_op_error(OpRequestRef op
, int err
)
1607 reply_op_error(op
, err
, eversion_t(), 0, {});
1610 void OSDService::reply_op_error(OpRequestRef op
, int err
, eversion_t v
,
1612 vector
<pg_log_op_return_item_t
> op_returns
)
1614 auto m
= op
->get_req
<MOSDOp
>();
1615 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1617 flags
= m
->get_flags() & (CEPH_OSD_FLAG_ACK
|CEPH_OSD_FLAG_ONDISK
);
1619 MOSDOpReply
*reply
= new MOSDOpReply(m
, err
, osdmap
->get_epoch(), flags
,
1620 !m
->has_flag(CEPH_OSD_FLAG_RETURNVEC
));
1621 reply
->set_reply_versions(v
, uv
);
1622 reply
->set_op_returns(op_returns
);
1623 m
->get_connection()->send_message(reply
);
1626 void OSDService::handle_misdirected_op(PG
*pg
, OpRequestRef op
)
1628 if (!cct
->_conf
->osd_debug_misdirected_ops
) {
1632 auto m
= op
->get_req
<MOSDOp
>();
1633 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1635 ceph_assert(m
->get_map_epoch() >= pg
->get_history().same_primary_since
);
1637 if (pg
->is_ec_pg()) {
1639 * OSD recomputes op target based on current OSDMap. With an EC pg, we
1640 * can get this result:
1641 * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
1642 * [CRUSH_ITEM_NONE, 2, 3]/3
1643 * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
1645 * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
1647 * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
1650 * We can't compute the op target based on the sending map epoch due to
1651 * splitting. The simplest thing is to detect such cases here and drop
1652 * them without an error (the client will resend anyway).
1654 ceph_assert(m
->get_map_epoch() <= superblock
.newest_map
);
1655 OSDMapRef opmap
= try_get_map(m
->get_map_epoch());
1657 dout(7) << __func__
<< ": " << *pg
<< " no longer have map for "
1658 << m
->get_map_epoch() << ", dropping" << dendl
;
1661 pg_t _pgid
= m
->get_raw_pg();
1663 if ((m
->get_flags() & CEPH_OSD_FLAG_PGOP
) == 0)
1664 _pgid
= opmap
->raw_pg_to_pg(_pgid
);
1665 if (opmap
->get_primary_shard(_pgid
, &pgid
) &&
1666 pgid
.shard
!= pg
->pg_id
.shard
) {
1667 dout(7) << __func__
<< ": " << *pg
<< " primary changed since "
1668 << m
->get_map_epoch() << ", dropping" << dendl
;
1673 dout(7) << *pg
<< " misdirected op in " << m
->get_map_epoch() << dendl
;
1674 clog
->warn() << m
->get_source_inst() << " misdirected " << m
->get_reqid()
1675 << " pg " << m
->get_raw_pg()
1676 << " to osd." << whoami
1677 << " not " << pg
->get_acting()
1678 << " in e" << m
->get_map_epoch() << "/" << osdmap
->get_epoch();
1681 void OSDService::enqueue_back(OpSchedulerItem
&& qi
)
1683 osd
->op_shardedwq
.queue(std::move(qi
));
1686 void OSDService::enqueue_front(OpSchedulerItem
&& qi
)
1688 osd
->op_shardedwq
.queue_front(std::move(qi
));
1691 void OSDService::queue_recovery_context(
1693 GenContext
<ThreadPool::TPHandle
&> *c
)
1695 epoch_t e
= get_osdmap_epoch();
1698 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1699 new PGRecoveryContext(pg
->get_pgid(), c
, e
)),
1700 cct
->_conf
->osd_recovery_cost
,
1701 cct
->_conf
->osd_recovery_priority
,
1707 void OSDService::queue_for_snap_trim(PG
*pg
)
1709 dout(10) << "queueing " << *pg
<< " for snaptrim" << dendl
;
1712 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1713 new PGSnapTrim(pg
->get_pgid(), pg
->get_osdmap_epoch())),
1714 cct
->_conf
->osd_snap_trim_cost
,
1715 cct
->_conf
->osd_snap_trim_priority
,
1718 pg
->get_osdmap_epoch()));
1721 void OSDService::queue_for_scrub(PG
*pg
, bool with_high_priority
)
1723 unsigned scrub_queue_priority
= pg
->scrubber
.priority
;
1724 if (with_high_priority
&& scrub_queue_priority
< cct
->_conf
->osd_client_op_priority
) {
1725 scrub_queue_priority
= cct
->_conf
->osd_client_op_priority
;
1727 const auto epoch
= pg
->get_osdmap_epoch();
1730 unique_ptr
<OpSchedulerItem::OpQueueable
>(new PGScrub(pg
->get_pgid(), epoch
)),
1731 cct
->_conf
->osd_scrub_cost
,
1732 scrub_queue_priority
,
1738 void OSDService::queue_for_pg_delete(spg_t pgid
, epoch_t e
)
1740 dout(10) << __func__
<< " on " << pgid
<< " e " << e
<< dendl
;
1743 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1744 new PGDelete(pgid
, e
)),
1745 cct
->_conf
->osd_pg_delete_cost
,
1746 cct
->_conf
->osd_pg_delete_priority
,
1752 bool OSDService::try_finish_pg_delete(PG
*pg
, unsigned old_pg_num
)
1754 return osd
->try_finish_pg_delete(pg
, old_pg_num
);
1759 void OSDService::set_ready_to_merge_source(PG
*pg
, eversion_t version
)
1761 std::lock_guard
l(merge_lock
);
1762 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1763 ready_to_merge_source
[pg
->pg_id
.pgid
] = version
;
1764 assert(not_ready_to_merge_source
.count(pg
->pg_id
.pgid
) == 0);
1765 _send_ready_to_merge();
1768 void OSDService::set_ready_to_merge_target(PG
*pg
,
1770 epoch_t last_epoch_started
,
1771 epoch_t last_epoch_clean
)
1773 std::lock_guard
l(merge_lock
);
1774 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1775 ready_to_merge_target
.insert(make_pair(pg
->pg_id
.pgid
,
1778 last_epoch_clean
)));
1779 assert(not_ready_to_merge_target
.count(pg
->pg_id
.pgid
) == 0);
1780 _send_ready_to_merge();
1783 void OSDService::set_not_ready_to_merge_source(pg_t source
)
1785 std::lock_guard
l(merge_lock
);
1786 dout(10) << __func__
<< " " << source
<< dendl
;
1787 not_ready_to_merge_source
.insert(source
);
1788 assert(ready_to_merge_source
.count(source
) == 0);
1789 _send_ready_to_merge();
1792 void OSDService::set_not_ready_to_merge_target(pg_t target
, pg_t source
)
1794 std::lock_guard
l(merge_lock
);
1795 dout(10) << __func__
<< " " << target
<< " source " << source
<< dendl
;
1796 not_ready_to_merge_target
[target
] = source
;
1797 assert(ready_to_merge_target
.count(target
) == 0);
1798 _send_ready_to_merge();
1801 void OSDService::send_ready_to_merge()
1803 std::lock_guard
l(merge_lock
);
1804 _send_ready_to_merge();
1807 void OSDService::_send_ready_to_merge()
1809 dout(20) << __func__
1810 << " ready_to_merge_source " << ready_to_merge_source
1811 << " not_ready_to_merge_source " << not_ready_to_merge_source
1812 << " ready_to_merge_target " << ready_to_merge_target
1813 << " not_ready_to_merge_target " << not_ready_to_merge_target
1814 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1816 for (auto src
: not_ready_to_merge_source
) {
1817 if (sent_ready_to_merge_source
.count(src
) == 0) {
1818 monc
->send_mon_message(new MOSDPGReadyToMerge(
1822 osdmap
->get_epoch()));
1823 sent_ready_to_merge_source
.insert(src
);
1826 for (auto p
: not_ready_to_merge_target
) {
1827 if (sent_ready_to_merge_source
.count(p
.second
) == 0) {
1828 monc
->send_mon_message(new MOSDPGReadyToMerge(
1832 osdmap
->get_epoch()));
1833 sent_ready_to_merge_source
.insert(p
.second
);
1836 for (auto src
: ready_to_merge_source
) {
1837 if (not_ready_to_merge_source
.count(src
.first
) ||
1838 not_ready_to_merge_target
.count(src
.first
.get_parent())) {
1841 auto p
= ready_to_merge_target
.find(src
.first
.get_parent());
1842 if (p
!= ready_to_merge_target
.end() &&
1843 sent_ready_to_merge_source
.count(src
.first
) == 0) {
1844 monc
->send_mon_message(new MOSDPGReadyToMerge(
1845 src
.first
, // source pgid
1846 src
.second
, // src version
1847 std::get
<0>(p
->second
), // target version
1848 std::get
<1>(p
->second
), // PG's last_epoch_started
1849 std::get
<2>(p
->second
), // PG's last_epoch_clean
1851 osdmap
->get_epoch()));
1852 sent_ready_to_merge_source
.insert(src
.first
);
1857 void OSDService::clear_ready_to_merge(PG
*pg
)
1859 std::lock_guard
l(merge_lock
);
1860 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1861 ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
1862 ready_to_merge_target
.erase(pg
->pg_id
.pgid
);
1863 not_ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
1864 not_ready_to_merge_target
.erase(pg
->pg_id
.pgid
);
1865 sent_ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
1868 void OSDService::clear_sent_ready_to_merge()
1870 std::lock_guard
l(merge_lock
);
1871 sent_ready_to_merge_source
.clear();
1874 void OSDService::prune_sent_ready_to_merge(const OSDMapRef
& osdmap
)
1876 std::lock_guard
l(merge_lock
);
1877 auto i
= sent_ready_to_merge_source
.begin();
1878 while (i
!= sent_ready_to_merge_source
.end()) {
1879 if (!osdmap
->pg_exists(*i
)) {
1880 dout(10) << __func__
<< " " << *i
<< dendl
;
1881 i
= sent_ready_to_merge_source
.erase(i
);
1890 void OSDService::_queue_for_recovery(
1891 std::pair
<epoch_t
, PGRef
> p
,
1892 uint64_t reserved_pushes
)
1894 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock
));
1897 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1899 p
.second
->get_pgid(), p
.first
, reserved_pushes
)),
1900 cct
->_conf
->osd_recovery_cost
,
1901 cct
->_conf
->osd_recovery_priority
,
1907 // ====================================================================
1911 #define dout_prefix *_dout
1913 // Commands shared between OSD's console and admin console:
1915 namespace osd_cmds
{
1917 int heap(CephContext
& cct
, const cmdmap_t
& cmdmap
, Formatter
& f
, std::ostream
& os
);
1919 }} // namespace ceph::osd_cmds
1921 int OSD::mkfs(CephContext
*cct
, ObjectStore
*store
, uuid_d fsid
, int whoami
)
1927 ObjectStore::CollectionHandle ch
;
1929 // if we are fed a uuid for this osd, use it.
1930 store
->set_fsid(cct
->_conf
->osd_uuid
);
1932 ret
= store
->mkfs();
1934 derr
<< "OSD::mkfs: ObjectStore::mkfs failed with error "
1935 << cpp_strerror(ret
) << dendl
;
1939 store
->set_cache_shards(1); // doesn't matter for mkfs!
1941 ret
= store
->mount();
1943 derr
<< "OSD::mkfs: couldn't mount ObjectStore: error "
1944 << cpp_strerror(ret
) << dendl
;
1948 ch
= store
->open_collection(coll_t::meta());
1950 ret
= store
->read(ch
, OSD_SUPERBLOCK_GOBJECT
, 0, 0, sbbl
);
1952 derr
<< "OSD::mkfs: have meta collection but no superblock" << dendl
;
1955 /* if we already have superblock, check content of superblock */
1956 dout(0) << " have superblock" << dendl
;
1957 auto p
= sbbl
.cbegin();
1959 if (whoami
!= sb
.whoami
) {
1960 derr
<< "provided osd id " << whoami
<< " != superblock's " << sb
.whoami
1965 if (fsid
!= sb
.cluster_fsid
) {
1966 derr
<< "provided cluster fsid " << fsid
1967 << " != superblock's " << sb
.cluster_fsid
<< dendl
;
1972 // create superblock
1973 sb
.cluster_fsid
= fsid
;
1974 sb
.osd_fsid
= store
->get_fsid();
1976 sb
.compat_features
= get_osd_initial_compat_set();
1981 ObjectStore::CollectionHandle ch
= store
->create_new_collection(
1983 ObjectStore::Transaction t
;
1984 t
.create_collection(coll_t::meta(), 0);
1985 t
.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT
, 0, bl
.length(), bl
);
1986 ret
= store
->queue_transaction(ch
, std::move(t
));
1988 derr
<< "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
1989 << "queue_transaction returned " << cpp_strerror(ret
) << dendl
;
1994 ret
= write_meta(cct
, store
, sb
.cluster_fsid
, sb
.osd_fsid
, whoami
);
1996 derr
<< "OSD::mkfs: failed to write fsid file: error "
1997 << cpp_strerror(ret
) << dendl
;
2011 int OSD::write_meta(CephContext
*cct
, ObjectStore
*store
, uuid_d
& cluster_fsid
, uuid_d
& osd_fsid
, int whoami
)
2016 snprintf(val
, sizeof(val
), "%s", CEPH_OSD_ONDISK_MAGIC
);
2017 r
= store
->write_meta("magic", val
);
2021 snprintf(val
, sizeof(val
), "%d", whoami
);
2022 r
= store
->write_meta("whoami", val
);
2026 cluster_fsid
.print(val
);
2027 r
= store
->write_meta("ceph_fsid", val
);
2031 string key
= cct
->_conf
.get_val
<string
>("key");
2033 r
= store
->write_meta("osd_key", key
);
2037 string keyfile
= cct
->_conf
.get_val
<string
>("keyfile");
2038 if (!keyfile
.empty()) {
2041 r
= keybl
.read_file(keyfile
.c_str(), &err
);
2043 derr
<< __func__
<< " failed to read keyfile " << keyfile
<< ": "
2044 << err
<< ": " << cpp_strerror(r
) << dendl
;
2047 r
= store
->write_meta("osd_key", keybl
.to_str());
2053 r
= store
->write_meta("ready", "ready");
2060 int OSD::peek_meta(ObjectStore
*store
,
2062 uuid_d
*cluster_fsid
,
2065 ceph_release_t
*require_osd_release
)
2069 int r
= store
->read_meta("magic", &val
);
2074 r
= store
->read_meta("whoami", &val
);
2077 *whoami
= atoi(val
.c_str());
2079 r
= store
->read_meta("ceph_fsid", &val
);
2082 r
= cluster_fsid
->parse(val
.c_str());
2086 r
= store
->read_meta("fsid", &val
);
2088 *osd_fsid
= uuid_d();
2090 r
= osd_fsid
->parse(val
.c_str());
2095 r
= store
->read_meta("require_osd_release", &val
);
2097 *require_osd_release
= ceph_release_from_name(val
);
2105 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2109 OSD::OSD(CephContext
*cct_
, ObjectStore
*store_
,
2111 Messenger
*internal_messenger
,
2112 Messenger
*external_messenger
,
2113 Messenger
*hb_client_front
,
2114 Messenger
*hb_client_back
,
2115 Messenger
*hb_front_serverm
,
2116 Messenger
*hb_back_serverm
,
2117 Messenger
*osdc_messenger
,
2119 const std::string
&dev
, const std::string
&jdev
) :
2121 tick_timer(cct
, osd_lock
),
2122 tick_timer_without_osd_lock(cct
, tick_timer_lock
),
2123 gss_ktfile_client(cct
->_conf
.get_val
<std::string
>("gss_ktab_client_file")),
2124 cluster_messenger(internal_messenger
),
2125 client_messenger(external_messenger
),
2126 objecter_messenger(osdc_messenger
),
2128 mgrc(cct_
, client_messenger
, &mc
->monmap
),
2130 recoverystate_perf(NULL
),
2132 log_client(cct
, client_messenger
, &mc
->monmap
, LogClient::NO_FLAGS
),
2133 clog(log_client
.create_channel()),
2135 dev_path(dev
), journal_path(jdev
),
2136 store_is_rotational(store
->is_rotational()),
2137 trace_endpoint("0.0.0.0", 0, "osd"),
2139 m_osd_pg_epoch_max_lag_factor(cct
->_conf
.get_val
<double>(
2140 "osd_pg_epoch_max_lag_factor")),
2141 osd_compat(get_osd_compat_set()),
2142 osd_op_tp(cct
, "OSD::osd_op_tp", "tp_osd_tp",
2143 get_num_op_threads()),
2144 heartbeat_stop(false),
2145 heartbeat_need_update(true),
2146 hb_front_client_messenger(hb_client_front
),
2147 hb_back_client_messenger(hb_client_back
),
2148 hb_front_server_messenger(hb_front_serverm
),
2149 hb_back_server_messenger(hb_back_serverm
),
2151 heartbeat_thread(this),
2152 heartbeat_dispatcher(this),
2153 op_tracker(cct
, cct
->_conf
->osd_enable_op_tracker
,
2154 cct
->_conf
->osd_num_op_tracker_shard
),
2155 test_ops_hook(NULL
),
2158 cct
->_conf
->osd_op_thread_timeout
,
2159 cct
->_conf
->osd_op_thread_suicide_timeout
,
2161 last_pg_create_epoch(0),
2164 requested_full_first(0),
2165 requested_full_last(0),
2169 if (!gss_ktfile_client
.empty()) {
2170 // Assert we can export environment variable
2172 The default client keytab is used, if it is present and readable,
2173 to automatically obtain initial credentials for GSSAPI client
2174 applications. The principal name of the first entry in the client
2175 keytab is used by default when obtaining initial credentials.
2176 1. The KRB5_CLIENT_KTNAME environment variable.
2177 2. The default_client_keytab_name profile variable in [libdefaults].
2178 3. The hardcoded default, DEFCKTNAME.
2180 const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
2181 gss_ktfile_client
.c_str(), 1));
2182 ceph_assert(set_result
== 0);
2185 monc
->set_messenger(client_messenger
);
2186 op_tracker
.set_complaint_and_threshold(cct
->_conf
->osd_op_complaint_time
,
2187 cct
->_conf
->osd_op_log_threshold
);
2188 op_tracker
.set_history_size_and_duration(cct
->_conf
->osd_op_history_size
,
2189 cct
->_conf
->osd_op_history_duration
);
2190 op_tracker
.set_history_slow_op_size_and_threshold(cct
->_conf
->osd_op_history_slow_op_size
,
2191 cct
->_conf
->osd_op_history_slow_op_threshold
);
2192 ObjectCleanRegions::set_max_num_intervals(cct
->_conf
->osd_object_clean_region_max_num_intervals
);
2194 std::stringstream ss
;
2195 ss
<< "osd." << whoami
;
2196 trace_endpoint
.copy_name(ss
.str());
2199 // initialize shards
2200 num_shards
= get_num_op_shards();
2201 for (uint32_t i
= 0; i
< num_shards
; i
++) {
2202 OSDShard
*one_shard
= new OSDShard(
2206 shards
.push_back(one_shard
);
2212 while (!shards
.empty()) {
2213 delete shards
.back();
2216 cct
->get_perfcounters_collection()->remove(recoverystate_perf
);
2217 cct
->get_perfcounters_collection()->remove(logger
);
2218 delete recoverystate_perf
;
2223 double OSD::get_tick_interval() const
2225 // vary +/- 5% to avoid scrub scheduling livelocks
2226 constexpr auto delta
= 0.05;
2227 return (OSD_TICK_INTERVAL
*
2228 ceph::util::generate_random_number(1.0 - delta
, 1.0 + delta
));
2231 void OSD::handle_signal(int signum
)
2233 ceph_assert(signum
== SIGINT
|| signum
== SIGTERM
);
2234 derr
<< "*** Got signal " << sig_str(signum
) << " ***" << dendl
;
2240 std::lock_guard
lock(osd_lock
);
2244 if (store
->test_mount_in_use()) {
2245 derr
<< "OSD::pre_init: object store '" << dev_path
<< "' is "
2246 << "currently in use. (Is ceph-osd already running?)" << dendl
;
2250 cct
->_conf
.add_observer(this);
2254 int OSD::set_numa_affinity()
2256 // storage numa node
2257 int store_node
= -1;
2258 store
->get_numa_node(&store_node
, nullptr, nullptr);
2259 if (store_node
>= 0) {
2260 dout(1) << __func__
<< " storage numa node " << store_node
<< dendl
;
2263 // check network numa node(s)
2264 int front_node
= -1, back_node
= -1;
2265 string front_iface
= pick_iface(
2267 client_messenger
->get_myaddrs().front().get_sockaddr_storage());
2268 string back_iface
= pick_iface(
2270 cluster_messenger
->get_myaddrs().front().get_sockaddr_storage());
2271 int r
= get_iface_numa_node(front_iface
, &front_node
);
2272 if (r
>= 0 && front_node
>= 0) {
2273 dout(1) << __func__
<< " public network " << front_iface
<< " numa node "
2274 << front_node
<< dendl
;
2275 r
= get_iface_numa_node(back_iface
, &back_node
);
2276 if (r
>= 0 && back_node
>= 0) {
2277 dout(1) << __func__
<< " cluster network " << back_iface
<< " numa node "
2278 << back_node
<< dendl
;
2279 if (front_node
== back_node
&&
2280 front_node
== store_node
) {
2281 dout(1) << " objectstore and network numa nodes all match" << dendl
;
2282 if (g_conf().get_val
<bool>("osd_numa_auto_affinity")) {
2283 numa_node
= front_node
;
2285 } else if (front_node
!= back_node
) {
2286 dout(1) << __func__
<< " public and cluster network numa nodes do not match"
2289 dout(1) << __func__
<< " objectstore and network numa nodes do not match"
2292 } else if (back_node
== -2) {
2293 dout(1) << __func__
<< " cluster network " << back_iface
2294 << " ports numa nodes do not match" << dendl
;
2296 derr
<< __func__
<< " unable to identify cluster interface '" << back_iface
2297 << "' numa node: " << cpp_strerror(r
) << dendl
;
2299 } else if (front_node
== -2) {
2300 dout(1) << __func__
<< " public network " << front_iface
2301 << " ports numa nodes do not match" << dendl
;
2303 derr
<< __func__
<< " unable to identify public interface '" << front_iface
2304 << "' numa node: " << cpp_strerror(r
) << dendl
;
2306 if (int node
= g_conf().get_val
<int64_t>("osd_numa_node"); node
>= 0) {
2307 // this takes precedence over the automagic logic above
2310 if (numa_node
>= 0) {
2311 int r
= get_numa_node_cpu_set(numa_node
, &numa_cpu_set_size
, &numa_cpu_set
);
2313 dout(1) << __func__
<< " unable to determine numa node " << numa_node
2314 << " CPUs" << dendl
;
2317 dout(1) << __func__
<< " setting numa affinity to node " << numa_node
2319 << cpu_set_to_str_list(numa_cpu_set_size
, &numa_cpu_set
)
2321 r
= set_cpu_affinity_all_threads(numa_cpu_set_size
, &numa_cpu_set
);
2324 derr
<< __func__
<< " failed to set numa affinity: " << cpp_strerror(r
)
2330 dout(1) << __func__
<< " not setting numa affinity" << dendl
;
2337 class OSDSocketHook
: public AdminSocketHook
{
2340 explicit OSDSocketHook(OSD
*o
) : osd(o
) {}
2341 int call(std::string_view prefix
, const cmdmap_t
& cmdmap
,
2344 bufferlist
& out
) override
{
2345 ceph_abort("should use async hook");
2348 std::string_view prefix
,
2349 const cmdmap_t
& cmdmap
,
2351 const bufferlist
& inbl
,
2352 std::function
<void(int,const std::string
&,bufferlist
&)> on_finish
) override
{
2354 osd
->asok_command(prefix
, cmdmap
, f
, inbl
, on_finish
);
2355 } catch (const TOPNSPC::common::bad_cmd_get
& e
) {
2357 on_finish(-EINVAL
, e
.what(), empty
);
2362 std::set
<int64_t> OSD::get_mapped_pools()
2364 std::set
<int64_t> pools
;
2365 std::vector
<spg_t
> pgids
;
2367 for (const auto &pgid
: pgids
) {
2368 pools
.insert(pgid
.pool());
2373 void OSD::asok_command(
2374 std::string_view prefix
, const cmdmap_t
& cmdmap
,
2376 const bufferlist
& inbl
,
2377 std::function
<void(int,const std::string
&,bufferlist
&)> on_finish
)
2380 stringstream ss
; // stderr error message stream
2381 bufferlist outbl
; // if empty at end, we'll dump formatter as output
2383 // --- PG commands are routed here to PG::do_command ---
2384 if (prefix
== "pg" ||
2385 prefix
== "query" ||
2386 prefix
== "mark_unfound_lost" ||
2387 prefix
== "list_unfound" ||
2388 prefix
== "scrub" ||
2389 prefix
== "deep_scrub"
2393 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
2394 ss
<< "no pgid specified";
2398 if (!pgid
.parse(pgidstr
.c_str())) {
2399 ss
<< "couldn't parse pgid '" << pgidstr
<< "'";
2405 if (get_osdmap()->get_primary_shard(pgid
, &pcand
) &&
2406 (pg
= _lookup_lock_pg(pcand
))) {
2407 if (pg
->is_primary()) {
2408 cmdmap_t new_cmdmap
= cmdmap
;
2410 pg
->do_command(prefix
, new_cmdmap
, inbl
, on_finish
);
2412 return; // the pg handler calls on_finish directly
2413 } catch (const TOPNSPC::common::bad_cmd_get
& e
) {
2420 ss
<< "not primary for pgid " << pgid
;
2421 // do not reply; they will get newer maps and realize they
2428 ss
<< "i don't have pgid " << pgid
;
2433 // --- OSD commands follow ---
2435 else if (prefix
== "status") {
2436 lock_guard
l(osd_lock
);
2437 f
->open_object_section("status");
2438 f
->dump_stream("cluster_fsid") << superblock
.cluster_fsid
;
2439 f
->dump_stream("osd_fsid") << superblock
.osd_fsid
;
2440 f
->dump_unsigned("whoami", superblock
.whoami
);
2441 f
->dump_string("state", get_state_name(get_state()));
2442 f
->dump_unsigned("oldest_map", superblock
.oldest_map
);
2443 f
->dump_unsigned("newest_map", superblock
.newest_map
);
2444 f
->dump_unsigned("num_pgs", num_pgs
);
2446 } else if (prefix
== "flush_journal") {
2447 store
->flush_journal();
2448 } else if (prefix
== "dump_ops_in_flight" ||
2450 prefix
== "dump_blocked_ops" ||
2451 prefix
== "dump_historic_ops" ||
2452 prefix
== "dump_historic_ops_by_duration" ||
2453 prefix
== "dump_historic_slow_ops") {
2455 const string error_str
= "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2456 even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2457 will start to track new ops received afterwards.";
2459 set
<string
> filters
;
2460 vector
<string
> filter_str
;
2461 if (cmd_getval(cmdmap
, "filterstr", filter_str
)) {
2462 copy(filter_str
.begin(), filter_str
.end(),
2463 inserter(filters
, filters
.end()));
2466 if (prefix
== "dump_ops_in_flight" ||
2468 if (!op_tracker
.dump_ops_in_flight(f
, false, filters
)) {
2474 if (prefix
== "dump_blocked_ops") {
2475 if (!op_tracker
.dump_ops_in_flight(f
, true, filters
)) {
2481 if (prefix
== "dump_historic_ops") {
2482 if (!op_tracker
.dump_historic_ops(f
, false, filters
)) {
2488 if (prefix
== "dump_historic_ops_by_duration") {
2489 if (!op_tracker
.dump_historic_ops(f
, true, filters
)) {
2495 if (prefix
== "dump_historic_slow_ops") {
2496 if (!op_tracker
.dump_historic_slow_ops(f
, filters
)) {
2502 } else if (prefix
== "dump_op_pq_state") {
2503 f
->open_object_section("pq");
2504 op_shardedwq
.dump(f
);
2506 } else if (prefix
== "dump_blacklist") {
2507 list
<pair
<entity_addr_t
,utime_t
> > bl
;
2508 OSDMapRef curmap
= service
.get_osdmap();
2510 f
->open_array_section("blacklist");
2511 curmap
->get_blacklist(&bl
);
2512 for (list
<pair
<entity_addr_t
,utime_t
> >::iterator it
= bl
.begin();
2513 it
!= bl
.end(); ++it
) {
2514 f
->open_object_section("entry");
2515 f
->open_object_section("entity_addr_t");
2517 f
->close_section(); //entity_addr_t
2518 it
->second
.localtime(f
->dump_stream("expire_time"));
2519 f
->close_section(); //entry
2521 f
->close_section(); //blacklist
2522 } else if (prefix
== "dump_watchers") {
2523 list
<obj_watch_item_t
> watchers
;
2527 for (auto& pg
: pgs
) {
2528 list
<obj_watch_item_t
> pg_watchers
;
2529 pg
->get_watchers(&pg_watchers
);
2530 watchers
.splice(watchers
.end(), pg_watchers
);
2533 f
->open_array_section("watchers");
2534 for (list
<obj_watch_item_t
>::iterator it
= watchers
.begin();
2535 it
!= watchers
.end(); ++it
) {
2537 f
->open_object_section("watch");
2539 f
->dump_string("namespace", it
->obj
.nspace
);
2540 f
->dump_string("object", it
->obj
.oid
.name
);
2542 f
->open_object_section("entity_name");
2543 it
->wi
.name
.dump(f
);
2544 f
->close_section(); //entity_name_t
2546 f
->dump_unsigned("cookie", it
->wi
.cookie
);
2547 f
->dump_unsigned("timeout", it
->wi
.timeout_seconds
);
2549 f
->open_object_section("entity_addr_t");
2550 it
->wi
.addr
.dump(f
);
2551 f
->close_section(); //entity_addr_t
2553 f
->close_section(); //watch
2556 f
->close_section(); //watchers
2557 } else if (prefix
== "dump_recovery_reservations") {
2558 f
->open_object_section("reservations");
2559 f
->open_object_section("local_reservations");
2560 service
.local_reserver
.dump(f
);
2562 f
->open_object_section("remote_reservations");
2563 service
.remote_reserver
.dump(f
);
2566 } else if (prefix
== "dump_scrub_reservations") {
2567 f
->open_object_section("scrub_reservations");
2568 service
.dump_scrub_reservations(f
);
2570 } else if (prefix
== "get_latest_osdmap") {
2571 get_latest_osdmap();
2572 } else if (prefix
== "set_heap_property") {
2576 bool success
= false;
2577 if (!cmd_getval(cmdmap
, "property", property
)) {
2578 error
= "unable to get property";
2580 } else if (!cmd_getval(cmdmap
, "value", value
)) {
2581 error
= "unable to get value";
2583 } else if (value
< 0) {
2584 error
= "negative value not allowed";
2586 } else if (!ceph_heap_set_numeric_property(property
.c_str(), (size_t)value
)) {
2587 error
= "invalid property";
2592 f
->open_object_section("result");
2593 f
->dump_string("error", error
);
2594 f
->dump_bool("success", success
);
2596 } else if (prefix
== "get_heap_property") {
2600 bool success
= false;
2601 if (!cmd_getval(cmdmap
, "property", property
)) {
2602 error
= "unable to get property";
2604 } else if (!ceph_heap_get_numeric_property(property
.c_str(), &value
)) {
2605 error
= "invalid property";
2610 f
->open_object_section("result");
2611 f
->dump_string("error", error
);
2612 f
->dump_bool("success", success
);
2613 f
->dump_int("value", value
);
2615 } else if (prefix
== "dump_objectstore_kv_stats") {
2616 store
->get_db_statistics(f
);
2617 } else if (prefix
== "dump_scrubs") {
2618 service
.dumps_scrub(f
);
2619 } else if (prefix
== "calc_objectstore_db_histogram") {
2620 store
->generate_db_histogram(f
);
2621 } else if (prefix
== "flush_store_cache") {
2622 store
->flush_cache(&ss
);
2623 } else if (prefix
== "dump_pgstate_history") {
2624 f
->open_object_section("pgstate_history");
2625 f
->open_array_section("pgs");
2628 for (auto& pg
: pgs
) {
2629 f
->open_object_section("pg");
2630 f
->dump_stream("pg") << pg
->pg_id
;
2631 f
->dump_string("currently", pg
->get_current_state());
2632 pg
->dump_pgstate_history(f
);
2637 } else if (prefix
== "compact") {
2638 dout(1) << "triggering manual compaction" << dendl
;
2639 auto start
= ceph::coarse_mono_clock::now();
2641 auto end
= ceph::coarse_mono_clock::now();
2642 double duration
= std::chrono::duration
<double>(end
-start
).count();
2643 dout(1) << "finished manual compaction in "
2645 << " seconds" << dendl
;
2646 f
->open_object_section("compact_result");
2647 f
->dump_float("elapsed_time", duration
);
2649 } else if (prefix
== "get_mapped_pools") {
2650 f
->open_array_section("mapped_pools");
2651 set
<int64_t> poollist
= get_mapped_pools();
2652 for (auto pool
: poollist
) {
2653 f
->dump_int("pool_id", pool
);
2656 } else if (prefix
== "smart") {
2658 cmd_getval(cmdmap
, "devid", devid
);
2660 probe_smart(devid
, out
);
2661 outbl
.append(out
.str());
2662 } else if (prefix
== "list_devices") {
2663 set
<string
> devnames
;
2664 store
->get_devices(&devnames
);
2665 f
->open_array_section("list_devices");
2666 for (auto dev
: devnames
) {
2667 if (dev
.find("dm-") == 0) {
2671 f
->open_object_section("device");
2672 f
->dump_string("device", "/dev/" + dev
);
2673 f
->dump_string("device_id", get_device_id(dev
, &err
));
2677 } else if (prefix
== "send_beacon") {
2678 lock_guard
l(osd_lock
);
2680 send_beacon(ceph::coarse_mono_clock::now());
2684 else if (prefix
== "cluster_log") {
2686 cmd_getval(cmdmap
, "message", msg
);
2689 ss
<< "ignoring empty log message";
2692 string message
= msg
.front();
2693 for (vector
<string
>::iterator a
= ++msg
.begin(); a
!= msg
.end(); ++a
)
2694 message
+= " " + *a
;
2696 cmd_getval(cmdmap
, "level", lvl
);
2697 clog_type level
= string_to_clog_type(lvl
);
2700 ss
<< "unknown level '" << lvl
<< "'";
2703 clog
->do_log(level
, message
);
2706 else if (prefix
== "bench") {
2707 lock_guard
l(osd_lock
);
2710 int64_t osize
, onum
;
2711 // default count 1G, size 4MB
2712 cmd_getval(cmdmap
, "count", count
, (int64_t)1 << 30);
2713 cmd_getval(cmdmap
, "size", bsize
, (int64_t)4 << 20);
2714 cmd_getval(cmdmap
, "object_size", osize
, (int64_t)0);
2715 cmd_getval(cmdmap
, "object_num", onum
, (int64_t)0);
2717 uint32_t duration
= cct
->_conf
->osd_bench_duration
;
2719 if (bsize
> (int64_t) cct
->_conf
->osd_bench_max_block_size
) {
2720 // let us limit the block size because the next checks rely on it
2721 // having a sane value. If we allow any block size to be set things
2722 // can still go sideways.
2723 ss
<< "block 'size' values are capped at "
2724 << byte_u_t(cct
->_conf
->osd_bench_max_block_size
) << ". If you wish to use"
2725 << " a higher value, please adjust 'osd_bench_max_block_size'";
2728 } else if (bsize
< (int64_t) (1 << 20)) {
2729 // entering the realm of small block sizes.
2730 // limit the count to a sane value, assuming a configurable amount of
2731 // IOPS and duration, so that the OSD doesn't get hung up on this,
2732 // preventing timeouts from going off
2734 bsize
* duration
* cct
->_conf
->osd_bench_small_size_max_iops
;
2735 if (count
> max_count
) {
2736 ss
<< "'count' values greater than " << max_count
2737 << " for a block size of " << byte_u_t(bsize
) << ", assuming "
2738 << cct
->_conf
->osd_bench_small_size_max_iops
<< " IOPS,"
2739 << " for " << duration
<< " seconds,"
2740 << " can cause ill effects on osd. "
2741 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
2742 << " value if you wish to use a higher 'count'.";
2747 // 1MB block sizes are big enough so that we get more stuff done.
2748 // However, to avoid the osd from getting hung on this and having
2749 // timers being triggered, we are going to limit the count assuming
2750 // a configurable throughput and duration.
2751 // NOTE: max_count is the total amount of bytes that we believe we
2752 // will be able to write during 'duration' for the given
2753 // throughput. The block size hardly impacts this unless it's
2754 // way too big. Given we already check how big the block size
2755 // is, it's safe to assume everything will check out.
2757 cct
->_conf
->osd_bench_large_size_max_throughput
* duration
;
2758 if (count
> max_count
) {
2759 ss
<< "'count' values greater than " << max_count
2760 << " for a block size of " << byte_u_t(bsize
) << ", assuming "
2761 << byte_u_t(cct
->_conf
->osd_bench_large_size_max_throughput
) << "/s,"
2762 << " for " << duration
<< " seconds,"
2763 << " can cause ill effects on osd. "
2764 << " Please adjust 'osd_bench_large_size_max_throughput'"
2765 << " with a higher value if you wish to use a higher 'count'.";
2771 if (osize
&& bsize
> osize
)
2774 dout(1) << " bench count " << count
2775 << " bsize " << byte_u_t(bsize
) << dendl
;
2777 ObjectStore::Transaction cleanupt
;
2779 if (osize
&& onum
) {
2781 bufferptr
bp(osize
);
2783 bl
.push_back(std::move(bp
));
2784 bl
.rebuild_page_aligned();
2785 for (int i
=0; i
<onum
; ++i
) {
2787 snprintf(nm
, sizeof(nm
), "disk_bw_test_%d", i
);
2789 hobject_t
soid(sobject_t(oid
, 0));
2790 ObjectStore::Transaction t
;
2791 t
.write(coll_t(), ghobject_t(soid
), 0, osize
, bl
);
2792 store
->queue_transaction(service
.meta_ch
, std::move(t
), NULL
);
2793 cleanupt
.remove(coll_t(), ghobject_t(soid
));
2798 bufferptr
bp(bsize
);
2800 bl
.push_back(std::move(bp
));
2801 bl
.rebuild_page_aligned();
2805 if (!service
.meta_ch
->flush_commit(&waiter
)) {
2810 utime_t start
= ceph_clock_now();
2811 for (int64_t pos
= 0; pos
< count
; pos
+= bsize
) {
2813 unsigned offset
= 0;
2814 if (onum
&& osize
) {
2815 snprintf(nm
, sizeof(nm
), "disk_bw_test_%d", (int)(rand() % onum
));
2816 offset
= rand() % (osize
/ bsize
) * bsize
;
2818 snprintf(nm
, sizeof(nm
), "disk_bw_test_%lld", (long long)pos
);
2821 hobject_t
soid(sobject_t(oid
, 0));
2822 ObjectStore::Transaction t
;
2823 t
.write(coll_t::meta(), ghobject_t(soid
), offset
, bsize
, bl
);
2824 store
->queue_transaction(service
.meta_ch
, std::move(t
), NULL
);
2825 if (!onum
|| !osize
)
2826 cleanupt
.remove(coll_t::meta(), ghobject_t(soid
));
2831 if (!service
.meta_ch
->flush_commit(&waiter
)) {
2835 utime_t end
= ceph_clock_now();
2838 store
->queue_transaction(service
.meta_ch
, std::move(cleanupt
), NULL
);
2841 if (!service
.meta_ch
->flush_commit(&waiter
)) {
2846 double elapsed
= end
- start
;
2847 double rate
= count
/ elapsed
;
2848 double iops
= rate
/ bsize
;
2849 f
->open_object_section("osd_bench_results");
2850 f
->dump_int("bytes_written", count
);
2851 f
->dump_int("blocksize", bsize
);
2852 f
->dump_float("elapsed_sec", elapsed
);
2853 f
->dump_float("bytes_per_sec", rate
);
2854 f
->dump_float("iops", iops
);
2858 else if (prefix
== "flush_pg_stats") {
2859 mgrc
.send_pgstats();
2860 f
->dump_unsigned("stat_seq", service
.get_osd_stat_seq());
2863 else if (prefix
== "heap") {
2864 ret
= ceph::osd_cmds::heap(*cct
, cmdmap
, *f
, ss
);
2867 else if (prefix
== "debug dump_missing") {
2868 f
->open_array_section("pgs");
2871 for (auto& pg
: pgs
) {
2872 string s
= stringify(pg
->pg_id
);
2873 f
->open_array_section(s
.c_str());
2875 pg
->dump_missing(f
);
2882 else if (prefix
== "debug kick_recovery_wq") {
2884 cmd_getval(cmdmap
, "delay", delay
);
2887 ret
= cct
->_conf
.set_val("osd_recovery_delay_start", oss
.str().c_str());
2889 ss
<< "kick_recovery_wq: error setting "
2890 << "osd_recovery_delay_start to '" << delay
<< "': error "
2894 cct
->_conf
.apply_changes(nullptr);
2895 ss
<< "kicking recovery queue. set osd_recovery_delay_start "
2896 << "to " << cct
->_conf
->osd_recovery_delay_start
;
2899 else if (prefix
== "cpu_profiler") {
2902 cmd_getval(cmdmap
, "arg", arg
);
2903 vector
<string
> argvec
;
2904 get_str_vec(arg
, argvec
);
2905 cpu_profiler_handle_command(argvec
, ds
);
2906 outbl
.append(ds
.str());
2909 else if (prefix
== "dump_pg_recovery_stats") {
2910 lock_guard
l(osd_lock
);
2911 pg_recovery_stats
.dump_formatted(f
);
2914 else if (prefix
== "reset_pg_recovery_stats") {
2915 lock_guard
l(osd_lock
);
2916 pg_recovery_stats
.reset();
2919 else if (prefix
== "perf histogram dump") {
2921 std::string counter
;
2922 cmd_getval(cmdmap
, "logger", logger
);
2923 cmd_getval(cmdmap
, "counter", counter
);
2924 cct
->get_perfcounters_collection()->dump_formatted_histograms(
2925 f
, false, logger
, counter
);
2928 else if (prefix
== "cache drop") {
2929 lock_guard
l(osd_lock
);
2930 dout(20) << "clearing all caches" << dendl
;
2931 // Clear the objectstore's cache - onode and buffer for Bluestore,
2932 // system's pagecache for Filestore
2933 ret
= store
->flush_cache(&ss
);
2935 ss
<< "Error flushing objectstore cache: " << cpp_strerror(ret
);
2938 // Clear the objectcontext cache (per PG)
2941 for (auto& pg
: pgs
) {
2946 else if (prefix
== "cache status") {
2947 lock_guard
l(osd_lock
);
2948 int obj_ctx_count
= 0;
2951 for (auto& pg
: pgs
) {
2952 obj_ctx_count
+= pg
->get_cache_obj_count();
2954 f
->open_object_section("cache_status");
2955 f
->dump_int("object_ctx", obj_ctx_count
);
2956 store
->dump_cache_stats(f
);
2960 else if (prefix
== "scrub_purged_snaps") {
2961 lock_guard
l(osd_lock
);
2962 scrub_purged_snaps();
2965 else if (prefix
== "dump_osd_network") {
2966 lock_guard
l(osd_lock
);
2968 if (!(cmd_getval(cmdmap
, "value", value
))) {
2969 // Convert milliseconds to microseconds
2970 value
= static_cast<double>(g_conf().get_val
<double>(
2971 "mon_warn_on_slow_ping_time")) * 1000;
2973 double ratio
= g_conf().get_val
<double>("mon_warn_on_slow_ping_ratio");
2974 value
= g_conf().get_val
<int64_t>("osd_heartbeat_grace");
2975 value
*= 1000000 * ratio
; // Seconds of grace to microseconds at ratio
2978 // Convert user input to microseconds
2981 if (value
< 0) value
= 0;
2983 struct osd_ping_time_t
{
2987 std::array
<uint32_t,3> times
;
2988 std::array
<uint32_t,3> min
;
2989 std::array
<uint32_t,3> max
;
2991 uint32_t last_update
;
2993 bool operator<(const osd_ping_time_t
& rhs
) const {
2994 if (pingtime
< rhs
.pingtime
)
2996 if (pingtime
> rhs
.pingtime
)
3006 set
<osd_ping_time_t
> sorted
;
3007 // Get pingtimes under lock and not on the stack
3008 map
<int, osd_stat_t::Interfaces
> *pingtimes
= new map
<int, osd_stat_t::Interfaces
>;
3009 service
.get_hb_pingtime(pingtimes
);
3010 for (auto j
: *pingtimes
) {
3011 if (j
.second
.last_update
== 0)
3013 osd_ping_time_t item
;
3014 item
.pingtime
= std::max(j
.second
.back_pingtime
[0], j
.second
.back_pingtime
[1]);
3015 item
.pingtime
= std::max(item
.pingtime
, j
.second
.back_pingtime
[2]);
3016 if (item
.pingtime
>= value
) {
3018 item
.times
[0] = j
.second
.back_pingtime
[0];
3019 item
.times
[1] = j
.second
.back_pingtime
[1];
3020 item
.times
[2] = j
.second
.back_pingtime
[2];
3021 item
.min
[0] = j
.second
.back_min
[0];
3022 item
.min
[1] = j
.second
.back_min
[1];
3023 item
.min
[2] = j
.second
.back_min
[2];
3024 item
.max
[0] = j
.second
.back_max
[0];
3025 item
.max
[1] = j
.second
.back_max
[1];
3026 item
.max
[2] = j
.second
.back_max
[2];
3027 item
.last
= j
.second
.back_last
;
3029 item
.last_update
= j
.second
.last_update
;
3030 sorted
.emplace(item
);
3032 if (j
.second
.front_last
== 0)
3034 item
.pingtime
= std::max(j
.second
.front_pingtime
[0], j
.second
.front_pingtime
[1]);
3035 item
.pingtime
= std::max(item
.pingtime
, j
.second
.front_pingtime
[2]);
3036 if (item
.pingtime
>= value
) {
3038 item
.times
[0] = j
.second
.front_pingtime
[0];
3039 item
.times
[1] = j
.second
.front_pingtime
[1];
3040 item
.times
[2] = j
.second
.front_pingtime
[2];
3041 item
.min
[0] = j
.second
.front_min
[0];
3042 item
.min
[1] = j
.second
.front_min
[1];
3043 item
.min
[2] = j
.second
.front_min
[2];
3044 item
.max
[0] = j
.second
.front_max
[0];
3045 item
.max
[1] = j
.second
.front_max
[1];
3046 item
.max
[2] = j
.second
.front_max
[2];
3047 item
.last
= j
.second
.front_last
;
3048 item
.last_update
= j
.second
.last_update
;
3050 sorted
.emplace(item
);
3055 // Network ping times (1min 5min 15min)
3056 f
->open_object_section("network_ping_times");
3057 f
->dump_int("threshold", value
/ 1000);
3058 f
->open_array_section("entries");
3059 for (auto &sitem
: boost::adaptors::reverse(sorted
)) {
3060 ceph_assert(sitem
.pingtime
>= value
);
3061 f
->open_object_section("entry");
3063 const time_t lu(sitem
.last_update
);
3065 string
lustr(ctime_r(&lu
, buffer
));
3066 lustr
.pop_back(); // Remove trailing \n
3067 auto stale
= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_stale");
3068 f
->dump_string("last update", lustr
);
3069 f
->dump_bool("stale", ceph_clock_now().sec() - sitem
.last_update
> stale
);
3070 f
->dump_int("from osd", whoami
);
3071 f
->dump_int("to osd", sitem
.to
);
3072 f
->dump_string("interface", (sitem
.back
? "back" : "front"));
3073 f
->open_object_section("average");
3074 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.times
[0],3).c_str());
3075 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.times
[1],3).c_str());
3076 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.times
[2],3).c_str());
3077 f
->close_section(); // average
3078 f
->open_object_section("min");
3079 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.max
[0],3).c_str());
3080 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.max
[1],3).c_str());
3081 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.max
[2],3).c_str());
3082 f
->close_section(); // min
3083 f
->open_object_section("max");
3084 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.max
[0],3).c_str());
3085 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.max
[1],3).c_str());
3086 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.max
[2],3).c_str());
3087 f
->close_section(); // max
3088 f
->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem
.last
,3).c_str());
3089 f
->close_section(); // entry
3091 f
->close_section(); // entries
3092 f
->close_section(); // network_ping_times
3094 ceph_abort_msg("broken asok registration");
3098 on_finish(ret
, ss
.str(), outbl
);
3101 class TestOpsSocketHook
: public AdminSocketHook
{
3102 OSDService
*service
;
3105 TestOpsSocketHook(OSDService
*s
, ObjectStore
*st
) : service(s
), store(st
) {}
3106 int call(std::string_view command
, const cmdmap_t
& cmdmap
,
3108 std::ostream
& errss
,
3109 bufferlist
& out
) override
{
3113 test_ops(service
, store
, command
, cmdmap
, outss
);
3115 } catch (const TOPNSPC::common::bad_cmd_get
& e
) {
3121 void test_ops(OSDService
*service
, ObjectStore
*store
,
3122 std::string_view command
, const cmdmap_t
& cmdmap
, ostream
&ss
);
3126 class OSD::C_Tick
: public Context
{
3129 explicit C_Tick(OSD
*o
) : osd(o
) {}
3130 void finish(int r
) override
{
3135 class OSD::C_Tick_WithoutOSDLock
: public Context
{
3138 explicit C_Tick_WithoutOSDLock(OSD
*o
) : osd(o
) {}
3139 void finish(int r
) override
{
3140 osd
->tick_without_osd_lock();
3144 int OSD::enable_disable_fuse(bool stop
)
3148 string mntpath
= cct
->_conf
->osd_data
+ "/fuse";
3149 if (fuse_store
&& (stop
|| !cct
->_conf
->osd_objectstore_fuse
)) {
3150 dout(1) << __func__
<< " disabling" << dendl
;
3154 r
= ::rmdir(mntpath
.c_str());
3157 derr
<< __func__
<< " failed to rmdir " << mntpath
<< ": "
3158 << cpp_strerror(r
) << dendl
;
3163 if (!fuse_store
&& cct
->_conf
->osd_objectstore_fuse
) {
3164 dout(1) << __func__
<< " enabling" << dendl
;
3165 r
= ::mkdir(mntpath
.c_str(), 0700);
3168 if (r
< 0 && r
!= -EEXIST
) {
3169 derr
<< __func__
<< " unable to create " << mntpath
<< ": "
3170 << cpp_strerror(r
) << dendl
;
3173 fuse_store
= new FuseStore(store
, mntpath
);
3174 r
= fuse_store
->start();
3176 derr
<< __func__
<< " unable to start fuse: " << cpp_strerror(r
) << dendl
;
3182 #endif // HAVE_LIBFUSE
3186 size_t OSD::get_num_cache_shards()
3188 return cct
->_conf
.get_val
<Option::size_t>("osd_num_cache_shards");
3191 int OSD::get_num_op_shards()
3193 if (cct
->_conf
->osd_op_num_shards
)
3194 return cct
->_conf
->osd_op_num_shards
;
3195 if (store_is_rotational
)
3196 return cct
->_conf
->osd_op_num_shards_hdd
;
3198 return cct
->_conf
->osd_op_num_shards_ssd
;
3201 int OSD::get_num_op_threads()
3203 if (cct
->_conf
->osd_op_num_threads_per_shard
)
3204 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard
;
3205 if (store_is_rotational
)
3206 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard_hdd
;
3208 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard_ssd
;
3211 float OSD::get_osd_recovery_sleep()
3213 if (cct
->_conf
->osd_recovery_sleep
)
3214 return cct
->_conf
->osd_recovery_sleep
;
3215 if (!store_is_rotational
&& !journal_is_rotational
)
3216 return cct
->_conf
->osd_recovery_sleep_ssd
;
3217 else if (store_is_rotational
&& !journal_is_rotational
)
3218 return cct
->_conf
.get_val
<double>("osd_recovery_sleep_hybrid");
3220 return cct
->_conf
->osd_recovery_sleep_hdd
;
3223 float OSD::get_osd_delete_sleep()
3225 float osd_delete_sleep
= cct
->_conf
.get_val
<double>("osd_delete_sleep");
3226 if (osd_delete_sleep
> 0)
3227 return osd_delete_sleep
;
3228 if (!store_is_rotational
&& !journal_is_rotational
)
3229 return cct
->_conf
.get_val
<double>("osd_delete_sleep_ssd");
3230 if (store_is_rotational
&& !journal_is_rotational
)
3231 return cct
->_conf
.get_val
<double>("osd_delete_sleep_hybrid");
3232 return cct
->_conf
.get_val
<double>("osd_delete_sleep_hdd");
3235 int OSD::get_recovery_max_active()
3237 if (cct
->_conf
->osd_recovery_max_active
)
3238 return cct
->_conf
->osd_recovery_max_active
;
3239 if (store_is_rotational
)
3240 return cct
->_conf
->osd_recovery_max_active_hdd
;
3242 return cct
->_conf
->osd_recovery_max_active_ssd
;
3245 float OSD::get_osd_snap_trim_sleep()
3247 float osd_snap_trim_sleep
= cct
->_conf
.get_val
<double>("osd_snap_trim_sleep");
3248 if (osd_snap_trim_sleep
> 0)
3249 return osd_snap_trim_sleep
;
3250 if (!store_is_rotational
&& !journal_is_rotational
)
3251 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_ssd");
3252 if (store_is_rotational
&& !journal_is_rotational
)
3253 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_hybrid");
3254 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_hdd");
3260 CompatSet initial
, diff
;
3261 std::lock_guard
lock(osd_lock
);
3266 tick_timer_without_osd_lock
.init();
3267 service
.recovery_request_timer
.init();
3268 service
.sleep_timer
.init();
3270 boot_finisher
.start();
3274 store
->read_meta("require_osd_release", &val
);
3275 last_require_osd_release
= ceph_release_from_name(val
);
3279 dout(2) << "init " << dev_path
3280 << " (looks like " << (store_is_rotational
? "hdd" : "ssd") << ")"
3282 dout(2) << "journal " << journal_path
<< dendl
;
3283 ceph_assert(store
); // call pre_init() first!
3285 store
->set_cache_shards(get_num_cache_shards());
3287 int r
= store
->mount();
3289 derr
<< "OSD:init: unable to mount object store" << dendl
;
3292 journal_is_rotational
= store
->is_journal_rotational();
3293 dout(2) << "journal looks like " << (journal_is_rotational
? "hdd" : "ssd")
3296 enable_disable_fuse(false);
3298 dout(2) << "boot" << dendl
;
3300 service
.meta_ch
= store
->open_collection(coll_t::meta());
3302 // initialize the daily loadavg with current 15min loadavg
3304 if (getloadavg(loadavgs
, 3) == 3) {
3305 daily_loadavg
= loadavgs
[2];
3307 derr
<< "OSD::init() : couldn't read loadavgs\n" << dendl
;
3308 daily_loadavg
= 1.0;
3311 int rotating_auth_attempts
= 0;
3312 auto rotating_auth_timeout
=
3313 g_conf().get_val
<int64_t>("rotating_keys_bootstrap_timeout");
3315 // sanity check long object name handling
3318 l
.oid
.name
= string(cct
->_conf
->osd_max_object_name_len
, 'n');
3319 l
.set_key(string(cct
->_conf
->osd_max_object_name_len
, 'k'));
3320 l
.nspace
= string(cct
->_conf
->osd_max_object_namespace_len
, 's');
3321 r
= store
->validate_hobject_key(l
);
3323 derr
<< "backend (" << store
->get_type() << ") is unable to support max "
3324 << "object name[space] len" << dendl
;
3325 derr
<< " osd max object name len = "
3326 << cct
->_conf
->osd_max_object_name_len
<< dendl
;
3327 derr
<< " osd max object namespace len = "
3328 << cct
->_conf
->osd_max_object_namespace_len
<< dendl
;
3329 derr
<< cpp_strerror(r
) << dendl
;
3330 if (cct
->_conf
->osd_check_max_object_name_len_on_startup
) {
3333 derr
<< "osd_check_max_object_name_len_on_startup = false, starting anyway"
3336 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl
;
3341 r
= read_superblock();
3343 derr
<< "OSD::init() : unable to read osd superblock" << dendl
;
3348 if (osd_compat
.compare(superblock
.compat_features
) < 0) {
3349 derr
<< "The disk uses features unsupported by the executable." << dendl
;
3350 derr
<< " ondisk features " << superblock
.compat_features
<< dendl
;
3351 derr
<< " daemon features " << osd_compat
<< dendl
;
3353 if (osd_compat
.writeable(superblock
.compat_features
)) {
3354 CompatSet diff
= osd_compat
.unsupported(superblock
.compat_features
);
3355 derr
<< "it is still writeable, though. Missing features: " << diff
<< dendl
;
3360 CompatSet diff
= osd_compat
.unsupported(superblock
.compat_features
);
3361 derr
<< "Cannot write to disk! Missing features: " << diff
<< dendl
;
3367 assert_warn(whoami
== superblock
.whoami
);
3368 if (whoami
!= superblock
.whoami
) {
3369 derr
<< "OSD::init: superblock says osd"
3370 << superblock
.whoami
<< " but I am osd." << whoami
<< dendl
;
3375 startup_time
= ceph::mono_clock::now();
3377 // load up "current" osdmap
3378 assert_warn(!get_osdmap());
3380 derr
<< "OSD::init: unable to read current osdmap" << dendl
;
3384 osdmap
= get_map(superblock
.current_epoch
);
3387 // make sure we don't have legacy pgs deleting
3390 int r
= store
->list_collections(ls
);
3391 ceph_assert(r
>= 0);
3394 if (c
.is_pg(&pgid
) &&
3395 !osdmap
->have_pg_pool(pgid
.pool())) {
3396 ghobject_t oid
= make_final_pool_info_oid(pgid
.pool());
3397 if (!store
->exists(service
.meta_ch
, oid
)) {
3398 derr
<< __func__
<< " missing pg_pool_t for deleted pool "
3399 << pgid
.pool() << " for pg " << pgid
3400 << "; please downgrade to luminous and allow "
3401 << "pg deletion to complete before upgrading" << dendl
;
3408 initial
= get_osd_initial_compat_set();
3409 diff
= superblock
.compat_features
.unsupported(initial
);
3410 if (superblock
.compat_features
.merge(initial
)) {
3411 // Are we adding SNAPMAPPER2?
3412 if (diff
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2
)) {
3413 dout(1) << __func__
<< " upgrade snap_mapper (first start as octopus)"
3415 auto ch
= service
.meta_ch
;
3416 auto hoid
= make_snapmapper_oid();
3417 unsigned max
= cct
->_conf
->osd_target_transaction_size
;
3418 r
= SnapMapper::convert_legacy(cct
, store
, ch
, hoid
, max
);
3422 // We need to persist the new compat_set before we
3424 dout(5) << "Upgrading superblock adding: " << diff
<< dendl
;
3425 ObjectStore::Transaction t
;
3426 write_superblock(t
);
3427 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3432 // make sure snap mapper object exists
3433 if (!store
->exists(service
.meta_ch
, OSD::make_snapmapper_oid())) {
3434 dout(10) << "init creating/touching snapmapper object" << dendl
;
3435 ObjectStore::Transaction t
;
3436 t
.touch(coll_t::meta(), OSD::make_snapmapper_oid());
3437 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3441 if (!store
->exists(service
.meta_ch
, OSD::make_purged_snaps_oid())) {
3442 dout(10) << "init creating/touching purged_snaps object" << dendl
;
3443 ObjectStore::Transaction t
;
3444 t
.touch(coll_t::meta(), OSD::make_purged_snaps_oid());
3445 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3450 if (cct
->_conf
->osd_open_classes_on_start
) {
3451 int r
= ClassHandler::get_instance().open_all_classes();
3453 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r
) << dendl
;
3456 check_osdmap_features();
3458 create_recoverystate_perf();
3461 epoch_t bind_epoch
= osdmap
->get_epoch();
3462 service
.set_epochs(NULL
, NULL
, &bind_epoch
);
3465 clear_temp_objects();
3467 // initialize osdmap references in sharded wq
3468 for (auto& shard
: shards
) {
3469 std::lock_guard
l(shard
->osdmap_lock
);
3470 shard
->shard_osdmap
= osdmap
;
3473 // load up pgs (as they previously existed)
3476 dout(2) << "superblock: I am osd." << superblock
.whoami
<< dendl
;
3482 struct store_statfs_t stbuf
;
3483 osd_alert_list_t alerts
;
3484 int r
= store
->statfs(&stbuf
, &alerts
);
3485 ceph_assert(r
== 0);
3486 service
.set_statfs(stbuf
, alerts
);
3489 // client_messenger auth_client is already set up by monc.
3490 for (auto m
: { cluster_messenger
,
3492 hb_front_client_messenger
,
3493 hb_back_client_messenger
,
3494 hb_front_server_messenger
,
3495 hb_back_server_messenger
} ) {
3496 m
->set_auth_client(monc
);
3498 for (auto m
: { client_messenger
,
3500 hb_front_server_messenger
,
3501 hb_back_server_messenger
}) {
3502 m
->set_auth_server(monc
);
3504 monc
->set_handle_authentication_dispatcher(this);
3506 monc
->set_want_keys(CEPH_ENTITY_TYPE_MON
| CEPH_ENTITY_TYPE_OSD
3507 | CEPH_ENTITY_TYPE_MGR
);
3512 mgrc
.set_pgstats_cb([this](){ return collect_pg_stats(); });
3513 mgrc
.set_perf_metric_query_cb(
3514 [this](const ConfigPayload
&config_payload
) {
3515 set_perf_queries(config_payload
);
3518 return get_perf_reports();
3522 // tell monc about log_client so it will know about mon session resets
3523 monc
->set_log_client(&log_client
);
3524 update_log_config();
3527 client_messenger
->add_dispatcher_tail(&mgrc
);
3528 client_messenger
->add_dispatcher_tail(this);
3529 cluster_messenger
->add_dispatcher_head(this);
3531 hb_front_client_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3532 hb_back_client_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3533 hb_front_server_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3534 hb_back_server_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3536 objecter_messenger
->add_dispatcher_head(service
.objecter
.get());
3539 service
.publish_map(osdmap
);
3540 service
.publish_superblock(superblock
);
3541 service
.max_oldest_map
= superblock
.oldest_map
;
3543 for (auto& shard
: shards
) {
3544 // put PGs in a temporary set because we may modify pg_slots
3545 // unordered_map below.
3547 for (auto& i
: shard
->pg_slots
) {
3548 PGRef pg
= i
.second
->pg
;
3554 for (auto pg
: pgs
) {
3555 std::scoped_lock l
{*pg
};
3556 set
<pair
<spg_t
,epoch_t
>> new_children
;
3557 set
<pair
<spg_t
,epoch_t
>> merge_pgs
;
3558 service
.identify_splits_and_merges(pg
->get_osdmap(), osdmap
, pg
->pg_id
,
3559 &new_children
, &merge_pgs
);
3560 if (!new_children
.empty()) {
3561 for (auto shard
: shards
) {
3562 shard
->prime_splits(osdmap
, &new_children
);
3564 assert(new_children
.empty());
3566 if (!merge_pgs
.empty()) {
3567 for (auto shard
: shards
) {
3568 shard
->prime_merges(osdmap
, &merge_pgs
);
3570 assert(merge_pgs
.empty());
3577 // start the heartbeat
3578 heartbeat_thread
.create("osd_srv_heartbt");
3581 tick_timer
.add_event_after(get_tick_interval(),
3584 std::lock_guard
l(tick_timer_lock
);
3585 tick_timer_without_osd_lock
.add_event_after(get_tick_interval(),
3586 new C_Tick_WithoutOSDLock(this));
3591 r
= monc
->authenticate();
3593 derr
<< __func__
<< " authentication failed: " << cpp_strerror(r
)
3598 while (monc
->wait_auth_rotating(rotating_auth_timeout
) < 0) {
3599 derr
<< "unable to obtain rotating service keys; retrying" << dendl
;
3600 ++rotating_auth_attempts
;
3601 if (rotating_auth_attempts
> g_conf()->max_rotating_auth_attempts
) {
3602 derr
<< __func__
<< " wait_auth_rotating timed out" << dendl
;
3607 r
= update_crush_device_class();
3609 derr
<< __func__
<< " unable to update_crush_device_class: "
3610 << cpp_strerror(r
) << dendl
;
3614 r
= update_crush_location();
3616 derr
<< __func__
<< " unable to update_crush_location: "
3617 << cpp_strerror(r
) << dendl
;
3625 // start objecter *after* we have authenticated, so that we don't ignore
3626 // the OSDMaps it requests.
3627 service
.final_init();
3631 dout(10) << "ensuring pgs have consumed prior maps" << dendl
;
3634 dout(0) << "done with init, starting boot process" << dendl
;
3636 // subscribe to any pg creations
3637 monc
->sub_want("osd_pg_creates", last_pg_create_epoch
, 0);
3639 // MgrClient needs this (it doesn't have MonClient reference itself)
3640 monc
->sub_want("mgrmap", 0, 0);
3642 // we don't need to ask for an osdmap here; objecter will
3643 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
3652 enable_disable_fuse(true);
3659 void OSD::final_init()
3661 AdminSocket
*admin_socket
= cct
->get_admin_socket();
3662 asok_hook
= new OSDSocketHook(this);
3663 int r
= admin_socket
->register_command("status", asok_hook
,
3664 "high-level status of OSD");
3665 ceph_assert(r
== 0);
3666 r
= admin_socket
->register_command("flush_journal",
3668 "flush the journal to permanent store");
3669 ceph_assert(r
== 0);
3670 r
= admin_socket
->register_command("dump_ops_in_flight " \
3671 "name=filterstr,type=CephString,n=N,req=false",
3673 "show the ops currently in flight");
3674 ceph_assert(r
== 0);
3675 r
= admin_socket
->register_command("ops " \
3676 "name=filterstr,type=CephString,n=N,req=false",
3678 "show the ops currently in flight");
3679 ceph_assert(r
== 0);
3680 r
= admin_socket
->register_command("dump_blocked_ops " \
3681 "name=filterstr,type=CephString,n=N,req=false",
3683 "show the blocked ops currently in flight");
3684 ceph_assert(r
== 0);
3685 r
= admin_socket
->register_command("dump_historic_ops " \
3686 "name=filterstr,type=CephString,n=N,req=false",
3689 ceph_assert(r
== 0);
3690 r
= admin_socket
->register_command("dump_historic_slow_ops " \
3691 "name=filterstr,type=CephString,n=N,req=false",
3693 "show slowest recent ops");
3694 ceph_assert(r
== 0);
3695 r
= admin_socket
->register_command("dump_historic_ops_by_duration " \
3696 "name=filterstr,type=CephString,n=N,req=false",
3698 "show slowest recent ops, sorted by duration");
3699 ceph_assert(r
== 0);
3700 r
= admin_socket
->register_command("dump_op_pq_state",
3702 "dump op priority queue state");
3703 ceph_assert(r
== 0);
3704 r
= admin_socket
->register_command("dump_blacklist",
3706 "dump blacklisted clients and times");
3707 ceph_assert(r
== 0);
3708 r
= admin_socket
->register_command("dump_watchers",
3710 "show clients which have active watches,"
3711 " and on which objects");
3712 ceph_assert(r
== 0);
3713 r
= admin_socket
->register_command("dump_recovery_reservations",
3715 "show recovery reservations");
3716 ceph_assert(r
== 0);
3717 r
= admin_socket
->register_command("dump_scrub_reservations",
3719 "show recovery reservations");
3720 ceph_assert(r
== 0);
3721 r
= admin_socket
->register_command("get_latest_osdmap",
3723 "force osd to update the latest map from "
3725 ceph_assert(r
== 0);
3727 r
= admin_socket
->register_command("set_heap_property " \
3728 "name=property,type=CephString " \
3729 "name=value,type=CephInt",
3731 "update malloc extension heap property");
3732 ceph_assert(r
== 0);
3734 r
= admin_socket
->register_command("get_heap_property " \
3735 "name=property,type=CephString",
3737 "get malloc extension heap property");
3738 ceph_assert(r
== 0);
3740 r
= admin_socket
->register_command("dump_objectstore_kv_stats",
3742 "print statistics of kvdb which used by bluestore");
3743 ceph_assert(r
== 0);
3745 r
= admin_socket
->register_command("dump_scrubs",
3747 "print scheduled scrubs");
3748 ceph_assert(r
== 0);
3750 r
= admin_socket
->register_command("calc_objectstore_db_histogram",
3752 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
3753 ceph_assert(r
== 0);
3755 r
= admin_socket
->register_command("flush_store_cache",
3757 "Flush bluestore internal cache");
3758 ceph_assert(r
== 0);
3759 r
= admin_socket
->register_command("dump_pgstate_history",
3761 "show recent state history");
3762 ceph_assert(r
== 0);
3764 r
= admin_socket
->register_command("compact",
3766 "Commpact object store's omap."
3767 " WARNING: Compaction probably slows your requests");
3768 ceph_assert(r
== 0);
3770 r
= admin_socket
->register_command("get_mapped_pools",
3772 "dump pools whose PG(s) are mapped to this OSD.");
3774 ceph_assert(r
== 0);
3776 r
= admin_socket
->register_command("smart name=devid,type=CephString,req=false",
3778 "probe OSD devices for SMART data.");
3780 ceph_assert(r
== 0);
3782 r
= admin_socket
->register_command("list_devices",
3784 "list OSD devices.");
3785 r
= admin_socket
->register_command("send_beacon",
3787 "send OSD beacon to mon immediately");
3789 r
= admin_socket
->register_command(
3790 "dump_osd_network name=value,type=CephInt,req=false", asok_hook
,
3791 "Dump osd heartbeat network ping times");
3792 ceph_assert(r
== 0);
3794 test_ops_hook
= new TestOpsSocketHook(&(this->service
), this->store
);
3795 // Note: pools are CephString instead of CephPoolname because
3796 // these commands traditionally support both pool names and numbers
3797 r
= admin_socket
->register_command(
3799 "name=pool,type=CephString " \
3800 "name=objname,type=CephObjectname " \
3801 "name=key,type=CephString "\
3802 "name=val,type=CephString",
3805 ceph_assert(r
== 0);
3806 r
= admin_socket
->register_command(
3808 "name=pool,type=CephString " \
3809 "name=objname,type=CephObjectname " \
3810 "name=key,type=CephString",
3813 ceph_assert(r
== 0);
3814 r
= admin_socket
->register_command(
3816 "name=pool,type=CephString " \
3817 "name=objname,type=CephObjectname " \
3818 "name=header,type=CephString",
3821 ceph_assert(r
== 0);
3823 r
= admin_socket
->register_command(
3825 "name=pool,type=CephString " \
3826 "name=objname,type=CephObjectname",
3828 "output entire object map");
3829 ceph_assert(r
== 0);
3831 r
= admin_socket
->register_command(
3833 "name=pool,type=CephString " \
3834 "name=objname,type=CephObjectname " \
3835 "name=len,type=CephInt",
3837 "truncate object to length");
3838 ceph_assert(r
== 0);
3840 r
= admin_socket
->register_command(
3842 "name=pool,type=CephString " \
3843 "name=objname,type=CephObjectname " \
3844 "name=shardid,type=CephInt,req=false,range=0|255",
3846 "inject data error to an object");
3847 ceph_assert(r
== 0);
3849 r
= admin_socket
->register_command(
3851 "name=pool,type=CephString " \
3852 "name=objname,type=CephObjectname " \
3853 "name=shardid,type=CephInt,req=false,range=0|255",
3855 "inject metadata error to an object");
3856 ceph_assert(r
== 0);
3857 r
= admin_socket
->register_command(
3858 "set_recovery_delay " \
3859 "name=utime,type=CephInt,req=false",
3861 "Delay osd recovery by specified seconds");
3862 ceph_assert(r
== 0);
3863 r
= admin_socket
->register_command(
3865 "name=type,type=CephString,req=false " \
3866 "name=count,type=CephInt,req=false ",
3868 "Inject a full disk (optional count times)");
3869 ceph_assert(r
== 0);
3870 r
= admin_socket
->register_command(
3872 "name=count,type=CephInt,req=false " \
3873 "name=size,type=CephInt,req=false " \
3874 "name=object_size,type=CephInt,req=false " \
3875 "name=object_num,type=CephInt,req=false ",
3877 "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
3878 "(default count=1G default size=4MB). Results in log.");
3879 ceph_assert(r
== 0);
3880 r
= admin_socket
->register_command(
3882 "name=level,type=CephChoices,strings=error,warning,info,debug " \
3883 "name=message,type=CephString,n=N",
3885 "log a message to the cluster log");
3886 ceph_assert(r
== 0);
3887 r
= admin_socket
->register_command(
3891 ceph_assert(r
== 0);
3892 r
= admin_socket
->register_command(
3894 "name=heapcmd,type=CephChoices,strings=" \
3895 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
3896 "name=value,type=CephString,req=false",
3898 "show heap usage info (available only if compiled with tcmalloc)");
3899 ceph_assert(r
== 0);
3900 r
= admin_socket
->register_command(
3901 "debug dump_missing " \
3902 "name=filename,type=CephFilepath",
3904 "dump missing objects to a named file");
3905 ceph_assert(r
== 0);
3906 r
= admin_socket
->register_command(
3907 "debug kick_recovery_wq " \
3908 "name=delay,type=CephInt,range=0",
3910 "set osd_recovery_delay_start to <val>");
3911 ceph_assert(r
== 0);
3912 r
= admin_socket
->register_command(
3914 "name=arg,type=CephChoices,strings=status|flush",
3916 "run cpu profiling on daemon");
3917 ceph_assert(r
== 0);
3918 r
= admin_socket
->register_command(
3919 "dump_pg_recovery_stats",
3921 "dump pg recovery statistics");
3922 ceph_assert(r
== 0);
3923 r
= admin_socket
->register_command(
3924 "reset_pg_recovery_stats",
3926 "reset pg recovery statistics");
3927 ceph_assert(r
== 0);
3928 r
= admin_socket
->register_command(
3931 "Drop all OSD caches");
3932 ceph_assert(r
== 0);
3933 r
= admin_socket
->register_command(
3936 "Get OSD caches statistics");
3937 ceph_assert(r
== 0);
3938 r
= admin_socket
->register_command(
3939 "scrub_purged_snaps",
3941 "Scrub purged_snaps vs snapmapper index");
3942 ceph_assert(r
== 0);
3944 // -- pg commands --
3945 // old form: ceph pg <pgid> command ...
3946 r
= admin_socket
->register_command(
3948 "name=pgid,type=CephPgid " \
3949 "name=cmd,type=CephChoices,strings=query",
3952 ceph_assert(r
== 0);
3953 r
= admin_socket
->register_command(
3955 "name=pgid,type=CephPgid " \
3956 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
3957 "name=mulcmd,type=CephChoices,strings=revert|delete",
3960 ceph_assert(r
== 0);
3961 r
= admin_socket
->register_command(
3963 "name=pgid,type=CephPgid " \
3964 "name=cmd,type=CephChoices,strings=list_unfound " \
3965 "name=offset,type=CephString,req=false",
3968 ceph_assert(r
== 0);
3969 r
= admin_socket
->register_command(
3971 "name=pgid,type=CephPgid " \
3972 "name=cmd,type=CephChoices,strings=scrub " \
3973 "name=time,type=CephInt,req=false",
3976 ceph_assert(r
== 0);
3977 r
= admin_socket
->register_command(
3979 "name=pgid,type=CephPgid " \
3980 "name=cmd,type=CephChoices,strings=deep_scrub " \
3981 "name=time,type=CephInt,req=false",
3984 ceph_assert(r
== 0);
3985 // new form: tell <pgid> <cmd> for both cli and rest
3986 r
= admin_socket
->register_command(
3989 "show details of a specific pg");
3990 ceph_assert(r
== 0);
3991 r
= admin_socket
->register_command(
3992 "mark_unfound_lost " \
3993 "name=pgid,type=CephPgid,req=false " \
3994 "name=mulcmd,type=CephChoices,strings=revert|delete",
3996 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available");
3997 ceph_assert(r
== 0);
3998 r
= admin_socket
->register_command(
4000 "name=pgid,type=CephPgid,req=false " \
4001 "name=offset,type=CephString,req=false",
4003 "list unfound objects on this pg, perhaps starting at an offset given in JSON");
4004 ceph_assert(r
== 0);
4005 r
= admin_socket
->register_command(
4007 "name=pgid,type=CephPgid,req=false " \
4008 "name=time,type=CephInt,req=false",
4010 "Trigger a scheduled scrub ");
4011 ceph_assert(r
== 0);
4012 r
= admin_socket
->register_command(
4014 "name=pgid,type=CephPgid,req=false " \
4015 "name=time,type=CephInt,req=false",
4017 "Trigger a scheduled deep scrub ");
4018 ceph_assert(r
== 0);
4021 void OSD::create_logger()
4023 dout(10) << "create_logger" << dendl
;
4025 logger
= build_osd_logger(cct
);
4026 cct
->get_perfcounters_collection()->add(logger
);
4029 void OSD::create_recoverystate_perf()
4031 dout(10) << "create_recoverystate_perf" << dendl
;
4033 recoverystate_perf
= build_recoverystate_perf(cct
);
4034 cct
->get_perfcounters_collection()->add(recoverystate_perf
);
4039 if (cct
->_conf
->osd_fast_shutdown
) {
4040 derr
<< "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl
;
4045 if (!service
.prepare_to_stop())
4046 return 0; // already shutting down
4048 if (is_stopping()) {
4052 dout(0) << "shutdown" << dendl
;
4054 set_state(STATE_STOPPING
);
4057 if (cct
->_conf
.get_val
<bool>("osd_debug_shutdown")) {
4058 cct
->_conf
.set_val("debug_osd", "100");
4059 cct
->_conf
.set_val("debug_journal", "100");
4060 cct
->_conf
.set_val("debug_filestore", "100");
4061 cct
->_conf
.set_val("debug_bluestore", "100");
4062 cct
->_conf
.set_val("debug_ms", "100");
4063 cct
->_conf
.apply_changes(nullptr);
4066 // stop MgrClient earlier as it's more like an internal consumer of OSD
4069 service
.start_shutdown();
4071 // stop sending work to pgs. this just prevents any new work in _process
4072 // from racing with on_shutdown and potentially entering the pg after.
4073 op_shardedwq
.drain();
4079 for (auto pg
: pgs
) {
4084 // drain op queue again (in case PGs requeued something)
4085 op_shardedwq
.drain();
4087 finished
.clear(); // zap waiters (bleh, this is messy)
4088 waiting_for_osdmap
.clear();
4091 // unregister commands
4092 cct
->get_admin_socket()->unregister_commands(asok_hook
);
4096 cct
->get_admin_socket()->unregister_commands(test_ops_hook
);
4097 delete test_ops_hook
;
4098 test_ops_hook
= NULL
;
4103 std::lock_guard l
{heartbeat_lock
};
4104 heartbeat_stop
= true;
4105 heartbeat_cond
.notify_all();
4106 heartbeat_peers
.clear();
4108 heartbeat_thread
.join();
4110 hb_back_server_messenger
->mark_down_all();
4111 hb_front_server_messenger
->mark_down_all();
4112 hb_front_client_messenger
->mark_down_all();
4113 hb_back_client_messenger
->mark_down_all();
4117 dout(10) << "op sharded tp stopped" << dendl
;
4119 dout(10) << "stopping agent" << dendl
;
4120 service
.agent_stop();
4122 boot_finisher
.wait_for_empty();
4126 boot_finisher
.stop();
4127 reset_heartbeat_peers(true);
4129 tick_timer
.shutdown();
4132 std::lock_guard
l(tick_timer_lock
);
4133 tick_timer_without_osd_lock
.shutdown();
4136 // note unmount epoch
4137 dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl
;
4138 superblock
.mounted
= service
.get_boot_epoch();
4139 superblock
.clean_thru
= get_osdmap_epoch();
4140 ObjectStore::Transaction t
;
4141 write_superblock(t
);
4142 int r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
4144 derr
<< "OSD::shutdown: error writing superblock: "
4145 << cpp_strerror(r
) << dendl
;
4149 service
.shutdown_reserver();
4152 #ifdef PG_DEBUG_REFS
4153 service
.dump_live_pgids();
4157 _get_pgs(&pgs
, true);
4161 for (auto& pg
: pgs
) {
4162 if (pg
->is_deleted()) {
4165 dout(20) << " kicking pg " << pg
<< dendl
;
4167 if (pg
->get_num_ref() != 1) {
4168 derr
<< "pgid " << pg
->get_pgid() << " has ref count of "
4169 << pg
->get_num_ref() << dendl
;
4170 #ifdef PG_DEBUG_REFS
4171 pg
->dump_live_ids();
4173 if (cct
->_conf
->osd_shutdown_pgref_assert
) {
4181 #ifdef PG_DEBUG_REFS
4182 service
.dump_live_pgids();
4186 cct
->_conf
.remove_observer(this);
4189 service
.meta_ch
.reset();
4191 dout(10) << "syncing store" << dendl
;
4192 enable_disable_fuse(true);
4194 if (cct
->_conf
->osd_journal_flush_on_shutdown
) {
4195 dout(10) << "flushing journal" << dendl
;
4196 store
->flush_journal();
4202 std::unique_lock l
{map_lock
};
4203 set_osdmap(OSDMapRef());
4205 for (auto s
: shards
) {
4206 std::lock_guard
l(s
->osdmap_lock
);
4207 s
->shard_osdmap
= OSDMapRef();
4211 std::lock_guard
lock(osd_lock
);
4215 dout(10) << "Store synced" << dendl
;
4217 op_tracker
.on_shutdown();
4219 ClassHandler::get_instance().shutdown();
4220 client_messenger
->shutdown();
4221 cluster_messenger
->shutdown();
4222 hb_front_client_messenger
->shutdown();
4223 hb_back_client_messenger
->shutdown();
4224 objecter_messenger
->shutdown();
4225 hb_front_server_messenger
->shutdown();
4226 hb_back_server_messenger
->shutdown();
4231 int OSD::mon_cmd_maybe_osd_create(string
&cmd
)
4233 bool created
= false;
4235 dout(10) << __func__
<< " cmd: " << cmd
<< dendl
;
4236 vector
<string
> vcmd
{cmd
};
4240 monc
->start_mon_command(vcmd
, inbl
, NULL
, &outs
, &w
);
4243 if (r
== -ENOENT
&& !created
) {
4244 string newcmd
= "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami
)
4245 + ", \"uuid\": \"" + stringify(superblock
.osd_fsid
) + "\"}";
4246 vector
<string
> vnewcmd
{newcmd
};
4250 monc
->start_mon_command(vnewcmd
, inbl
, NULL
, &outs
, &w
);
4253 derr
<< __func__
<< " fail: osd does not exist and created failed: "
4254 << cpp_strerror(r
) << dendl
;
4260 derr
<< __func__
<< " fail: '" << outs
<< "': " << cpp_strerror(r
) << dendl
;
4269 int OSD::update_crush_location()
4271 if (!cct
->_conf
->osd_crush_update_on_start
) {
4272 dout(10) << __func__
<< " osd_crush_update_on_start = false" << dendl
;
4277 if (cct
->_conf
->osd_crush_initial_weight
>= 0) {
4278 snprintf(weight
, sizeof(weight
), "%.4lf", cct
->_conf
->osd_crush_initial_weight
);
4280 struct store_statfs_t st
;
4281 osd_alert_list_t alerts
;
4282 int r
= store
->statfs(&st
, &alerts
);
4284 derr
<< "statfs: " << cpp_strerror(r
) << dendl
;
4287 snprintf(weight
, sizeof(weight
), "%.4lf",
4290 double(1ull << 40 /* TB */)));
4293 dout(10) << __func__
<< " crush location is " << cct
->crush_location
<< dendl
;
4296 string("{\"prefix\": \"osd crush create-or-move\", ") +
4297 string("\"id\": ") + stringify(whoami
) + ", " +
4298 string("\"weight\":") + weight
+ ", " +
4299 string("\"args\": [") + stringify(cct
->crush_location
) + "]}";
4300 return mon_cmd_maybe_osd_create(cmd
);
4303 int OSD::update_crush_device_class()
4305 if (!cct
->_conf
->osd_class_update_on_start
) {
4306 dout(10) << __func__
<< " osd_class_update_on_start = false" << dendl
;
4310 string device_class
;
4311 int r
= store
->read_meta("crush_device_class", &device_class
);
4312 if (r
< 0 || device_class
.empty()) {
4313 device_class
= store
->get_default_device_class();
4316 if (device_class
.empty()) {
4317 dout(20) << __func__
<< " no device class stored locally" << dendl
;
4322 string("{\"prefix\": \"osd crush set-device-class\", ") +
4323 string("\"class\": \"") + device_class
+ string("\", ") +
4324 string("\"ids\": [\"") + stringify(whoami
) + string("\"]}");
4326 r
= mon_cmd_maybe_osd_create(cmd
);
4328 // good, already bound to a device-class
4335 void OSD::write_superblock(ObjectStore::Transaction
& t
)
4337 dout(10) << "write_superblock " << superblock
<< dendl
;
4339 //hack: at minimum it's using the baseline feature set
4340 if (!superblock
.compat_features
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE
))
4341 superblock
.compat_features
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE
);
4344 encode(superblock
, bl
);
4345 t
.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT
, 0, bl
.length(), bl
);
4348 int OSD::read_superblock()
4351 int r
= store
->read(service
.meta_ch
, OSD_SUPERBLOCK_GOBJECT
, 0, 0, bl
);
4355 auto p
= bl
.cbegin();
4356 decode(superblock
, p
);
4358 dout(10) << "read_superblock " << superblock
<< dendl
;
4363 void OSD::clear_temp_objects()
4365 dout(10) << __func__
<< dendl
;
4367 store
->list_collections(ls
);
4368 for (vector
<coll_t
>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
4370 if (!p
->is_pg(&pgid
))
4373 // list temp objects
4374 dout(20) << " clearing temps in " << *p
<< " pgid " << pgid
<< dendl
;
4376 vector
<ghobject_t
> temps
;
4379 vector
<ghobject_t
> objects
;
4380 auto ch
= store
->open_collection(*p
);
4382 store
->collection_list(ch
, next
, ghobject_t::get_max(),
4383 store
->get_ideal_list_max(),
4385 if (objects
.empty())
4387 vector
<ghobject_t
>::iterator q
;
4388 for (q
= objects
.begin(); q
!= objects
.end(); ++q
) {
4389 // Hammer set pool for temps to -1, so check for clean-up
4390 if (q
->hobj
.is_temp() || (q
->hobj
.pool
== -1)) {
4391 temps
.push_back(*q
);
4396 // If we saw a non-temp object and hit the break above we can
4397 // break out of the while loop too.
4398 if (q
!= objects
.end())
4401 if (!temps
.empty()) {
4402 ObjectStore::Transaction t
;
4404 for (vector
<ghobject_t
>::iterator q
= temps
.begin(); q
!= temps
.end(); ++q
) {
4405 dout(20) << " removing " << *p
<< " object " << *q
<< dendl
;
4407 if (++removed
> cct
->_conf
->osd_target_transaction_size
) {
4408 store
->queue_transaction(service
.meta_ch
, std::move(t
));
4409 t
= ObjectStore::Transaction();
4414 store
->queue_transaction(service
.meta_ch
, std::move(t
));
4420 void OSD::recursive_remove_collection(CephContext
* cct
,
4421 ObjectStore
*store
, spg_t pgid
,
4427 make_snapmapper_oid());
4429 ObjectStore::CollectionHandle ch
= store
->open_collection(tmp
);
4430 ObjectStore::Transaction t
;
4431 SnapMapper
mapper(cct
, &driver
, 0, 0, 0, pgid
.shard
);
4434 int max
= cct
->_conf
->osd_target_transaction_size
;
4435 vector
<ghobject_t
> objects
;
4436 objects
.reserve(max
);
4439 store
->collection_list(ch
, next
, ghobject_t::get_max(),
4440 max
, &objects
, &next
);
4441 generic_dout(10) << __func__
<< " " << objects
<< dendl
;
4442 if (objects
.empty())
4444 for (auto& p
: objects
) {
4445 OSDriver::OSTransaction
_t(driver
.get_transaction(&t
));
4446 int r
= mapper
.remove_oid(p
.hobj
, &_t
);
4447 if (r
!= 0 && r
!= -ENOENT
)
4451 int r
= store
->queue_transaction(ch
, std::move(t
));
4452 ceph_assert(r
== 0);
4453 t
= ObjectStore::Transaction();
4455 t
.remove_collection(tmp
);
4456 int r
= store
->queue_transaction(ch
, std::move(t
));
4457 ceph_assert(r
== 0);
4460 if (!ch
->flush_commit(&waiter
)) {
4466 // ======================================================
4470 OSDMapRef createmap
,
4473 dout(10) << __func__
<< " " << pgid
<< dendl
;
4475 map
<string
,string
> ec_profile
;
4477 if (createmap
->have_pg_pool(pgid
.pool())) {
4478 pi
= *createmap
->get_pg_pool(pgid
.pool());
4479 name
= createmap
->get_pool_name(pgid
.pool());
4480 if (pi
.is_erasure()) {
4481 ec_profile
= createmap
->get_erasure_code_profile(pi
.erasure_code_profile
);
4484 // pool was deleted; grab final pg_pool_t off disk.
4485 ghobject_t oid
= make_final_pool_info_oid(pgid
.pool());
4487 int r
= store
->read(service
.meta_ch
, oid
, 0, 0, bl
);
4489 derr
<< __func__
<< " missing pool " << pgid
.pool() << " tombstone"
4493 ceph_assert(r
>= 0);
4494 auto p
= bl
.cbegin();
4497 if (p
.end()) { // dev release v13.0.2 did not include ec_profile
4498 derr
<< __func__
<< " missing ec_profile from pool " << pgid
.pool()
4499 << " tombstone" << dendl
;
4502 decode(ec_profile
, p
);
4504 PGPool
pool(cct
, createmap
, pgid
.pool(), pi
, name
);
4506 if (pi
.type
== pg_pool_t::TYPE_REPLICATED
||
4507 pi
.type
== pg_pool_t::TYPE_ERASURE
)
4508 pg
= new PrimaryLogPG(&service
, createmap
, pool
, ec_profile
, pgid
);
4514 void OSD::_get_pgs(vector
<PGRef
> *v
, bool clear_too
)
4517 v
->reserve(get_num_pgs());
4518 for (auto& s
: shards
) {
4519 std::lock_guard
l(s
->shard_lock
);
4520 for (auto& j
: s
->pg_slots
) {
4522 !j
.second
->pg
->is_deleted()) {
4523 v
->push_back(j
.second
->pg
);
4525 s
->_detach_pg(j
.second
.get());
4532 void OSD::_get_pgids(vector
<spg_t
> *v
)
4535 v
->reserve(get_num_pgs());
4536 for (auto& s
: shards
) {
4537 std::lock_guard
l(s
->shard_lock
);
4538 for (auto& j
: s
->pg_slots
) {
4540 !j
.second
->pg
->is_deleted()) {
4541 v
->push_back(j
.first
);
4547 void OSD::register_pg(PGRef pg
)
4549 spg_t pgid
= pg
->get_pgid();
4550 uint32_t shard_index
= pgid
.hash_to_shard(num_shards
);
4551 auto sdata
= shards
[shard_index
];
4552 std::lock_guard
l(sdata
->shard_lock
);
4553 auto r
= sdata
->pg_slots
.emplace(pgid
, make_unique
<OSDShardPGSlot
>());
4554 ceph_assert(r
.second
);
4555 auto *slot
= r
.first
->second
.get();
4556 dout(20) << __func__
<< " " << pgid
<< " " << pg
<< dendl
;
4557 sdata
->_attach_pg(slot
, pg
.get());
4560 bool OSD::try_finish_pg_delete(PG
*pg
, unsigned old_pg_num
)
4562 auto sdata
= pg
->osd_shard
;
4565 std::lock_guard
l(sdata
->shard_lock
);
4566 auto p
= sdata
->pg_slots
.find(pg
->pg_id
);
4567 if (p
== sdata
->pg_slots
.end() ||
4569 dout(20) << __func__
<< " " << pg
->pg_id
<< " not found" << dendl
;
4572 if (p
->second
->waiting_for_merge_epoch
) {
4573 dout(20) << __func__
<< " " << pg
->pg_id
<< " waiting for merge" << dendl
;
4576 dout(20) << __func__
<< " " << pg
->pg_id
<< " " << pg
<< dendl
;
4577 sdata
->_detach_pg(p
->second
.get());
4580 for (auto shard
: shards
) {
4581 shard
->unprime_split_children(pg
->pg_id
, old_pg_num
);
4584 // update pg count now since we might not get an osdmap any time soon.
4585 if (pg
->is_primary())
4586 service
.logger
->dec(l_osd_pg_primary
);
4587 else if (pg
->is_nonprimary())
4588 service
.logger
->dec(l_osd_pg_replica
); // misnomver
4590 service
.logger
->dec(l_osd_pg_stray
);
4595 PGRef
OSD::_lookup_pg(spg_t pgid
)
4597 uint32_t shard_index
= pgid
.hash_to_shard(num_shards
);
4598 auto sdata
= shards
[shard_index
];
4599 std::lock_guard
l(sdata
->shard_lock
);
4600 auto p
= sdata
->pg_slots
.find(pgid
);
4601 if (p
== sdata
->pg_slots
.end()) {
4604 return p
->second
->pg
;
4607 PGRef
OSD::_lookup_lock_pg(spg_t pgid
)
4609 PGRef pg
= _lookup_pg(pgid
);
4614 if (!pg
->is_deleted()) {
4621 PGRef
OSD::lookup_lock_pg(spg_t pgid
)
4623 return _lookup_lock_pg(pgid
);
4626 void OSD::load_pgs()
4628 ceph_assert(ceph_mutex_is_locked(osd_lock
));
4629 dout(0) << "load_pgs" << dendl
;
4632 auto pghist
= make_pg_num_history_oid();
4634 int r
= store
->read(service
.meta_ch
, pghist
, 0, 0, bl
, 0);
4635 if (r
>= 0 && bl
.length() > 0) {
4636 auto p
= bl
.cbegin();
4637 decode(pg_num_history
, p
);
4639 dout(20) << __func__
<< " pg_num_history " << pg_num_history
<< dendl
;
4643 int r
= store
->list_collections(ls
);
4645 derr
<< "failed to list pgs: " << cpp_strerror(-r
) << dendl
;
4649 for (vector
<coll_t
>::iterator it
= ls
.begin();
4653 if (it
->is_temp(&pgid
) ||
4654 (it
->is_pg(&pgid
) && PG::_has_removal_flag(store
, pgid
))) {
4655 dout(10) << "load_pgs " << *it
4656 << " removing, legacy or flagged for removal pg" << dendl
;
4657 recursive_remove_collection(cct
, store
, pgid
, *it
);
4661 if (!it
->is_pg(&pgid
)) {
4662 dout(10) << "load_pgs ignoring unrecognized " << *it
<< dendl
;
4666 dout(10) << "pgid " << pgid
<< " coll " << coll_t(pgid
) << dendl
;
4667 epoch_t map_epoch
= 0;
4668 int r
= PG::peek_map_epoch(store
, pgid
, &map_epoch
);
4670 derr
<< __func__
<< " unable to peek at " << pgid
<< " metadata, skipping"
4676 if (map_epoch
> 0) {
4677 OSDMapRef pgosdmap
= service
.try_get_map(map_epoch
);
4679 if (!get_osdmap()->have_pg_pool(pgid
.pool())) {
4680 derr
<< __func__
<< ": could not find map for epoch " << map_epoch
4681 << " on pg " << pgid
<< ", but the pool is not present in the "
4682 << "current map, so this is probably a result of bug 10617. "
4683 << "Skipping the pg for now, you can use ceph-objectstore-tool "
4684 << "to clean it up later." << dendl
;
4687 derr
<< __func__
<< ": have pgid " << pgid
<< " at epoch "
4688 << map_epoch
<< ", but missing map. Crashing."
4690 ceph_abort_msg("Missing map in load_pgs");
4693 pg
= _make_pg(pgosdmap
, pgid
);
4695 pg
= _make_pg(get_osdmap(), pgid
);
4698 recursive_remove_collection(cct
, store
, pgid
, *it
);
4702 // there can be no waiters here, so we don't call _wake_pg_slot
4705 pg
->ch
= store
->open_collection(pg
->coll
);
4707 // read pg state, log
4708 pg
->read_state(store
);
4711 dout(10) << "load_pgs " << *it
<< " deleting dne" << dendl
;
4714 recursive_remove_collection(cct
, store
, pgid
, *it
);
4718 uint32_t shard_index
= pgid
.hash_to_shard(shards
.size());
4719 assert(NULL
!= shards
[shard_index
]);
4720 store
->set_collection_commit_queue(pg
->coll
, &(shards
[shard_index
]->context_queue
));
4723 pg
->reg_next_scrub();
4725 dout(10) << __func__
<< " loaded " << *pg
<< dendl
;
4731 dout(0) << __func__
<< " opened " << num
<< " pgs" << dendl
;
4735 PGRef
OSD::handle_pg_create_info(const OSDMapRef
& osdmap
,
4736 const PGCreateInfo
*info
)
4738 spg_t pgid
= info
->pgid
;
4740 if (maybe_wait_for_max_pg(osdmap
, pgid
, info
->by_mon
)) {
4741 dout(10) << __func__
<< " hit max pg, dropping" << dendl
;
4745 PeeringCtx rctx
= create_context();
4747 OSDMapRef startmap
= get_map(info
->epoch
);
4750 int64_t pool_id
= pgid
.pgid
.pool();
4751 const pg_pool_t
*pool
= osdmap
->get_pg_pool(pool_id
);
4753 dout(10) << __func__
<< " ignoring " << pgid
<< ", pool dne" << dendl
;
4756 if (osdmap
->require_osd_release
>= ceph_release_t::nautilus
&&
4757 !pool
->has_flag(pg_pool_t::FLAG_CREATING
)) {
4758 // this ensures we do not process old creating messages after the
4759 // pool's initial pgs have been created (and pg are subsequently
4760 // allowed to split or merge).
4761 dout(20) << __func__
<< " dropping " << pgid
4762 << "create, pool does not have CREATING flag set" << dendl
;
4767 int up_primary
, acting_primary
;
4768 vector
<int> up
, acting
;
4769 startmap
->pg_to_up_acting_osds(
4770 pgid
.pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
4772 const pg_pool_t
* pp
= startmap
->get_pg_pool(pgid
.pool());
4773 if (pp
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
) &&
4774 store
->get_type() != "bluestore") {
4775 clog
->warn() << "pg " << pgid
4776 << " is at risk of silent data corruption: "
4777 << "the pool allows ec overwrites but is not stored in "
4778 << "bluestore, so deep scrubbing will not detect bitrot";
4780 create_pg_collection(
4781 rctx
.transaction
, pgid
, pgid
.get_split_bits(pp
->get_pg_num()));
4782 init_pg_ondisk(rctx
.transaction
, pgid
, pp
);
4784 int role
= startmap
->calc_pg_role(pg_shard_t(whoami
, pgid
.shard
), acting
);
4786 PGRef pg
= _make_pg(startmap
, pgid
);
4787 pg
->ch
= store
->create_new_collection(pg
->coll
);
4790 uint32_t shard_index
= pgid
.hash_to_shard(shards
.size());
4791 assert(NULL
!= shards
[shard_index
]);
4792 store
->set_collection_commit_queue(pg
->coll
, &(shards
[shard_index
]->context_queue
));
4797 // we are holding the shard lock
4798 ceph_assert(!pg
->is_deleted());
4807 info
->past_intervals
,
4811 pg
->init_collection_pool_opts();
4813 if (pg
->is_primary()) {
4814 std::lock_guard locker
{m_perf_queries_lock
};
4815 pg
->set_dynamic_perf_stats_queries(m_perf_queries
);
4818 pg
->handle_initialize(rctx
);
4819 pg
->handle_activate_map(rctx
);
4821 dispatch_context(rctx
, pg
.get(), osdmap
, nullptr);
4823 dout(10) << __func__
<< " new pg " << *pg
<< dendl
;
4827 bool OSD::maybe_wait_for_max_pg(const OSDMapRef
& osdmap
,
4831 const auto max_pgs_per_osd
=
4832 (cct
->_conf
.get_val
<uint64_t>("mon_max_pg_per_osd") *
4833 cct
->_conf
.get_val
<double>("osd_max_pg_per_osd_hard_ratio"));
4835 if (num_pgs
< max_pgs_per_osd
) {
4839 std::lock_guard
l(pending_creates_lock
);
4840 if (is_mon_create
) {
4841 pending_creates_from_mon
++;
4843 bool is_primary
= osdmap
->get_pg_acting_role(pgid
, whoami
) == 0;
4844 pending_creates_from_osd
.emplace(pgid
, is_primary
);
4846 dout(1) << __func__
<< " withhold creation of pg " << pgid
4847 << ": " << num_pgs
<< " >= "<< max_pgs_per_osd
<< dendl
;
4851 // to re-trigger a peering, we have to twiddle the pg mapping a little bit,
4852 // see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
4853 // to up set if pg_temp is empty. so an empty pg_temp won't work.
4854 static vector
<int32_t> twiddle(const vector
<int>& acting
) {
4855 if (acting
.size() > 1) {
4858 vector
<int32_t> twiddled(acting
.begin(), acting
.end());
4859 twiddled
.push_back(-1);
4864 void OSD::resume_creating_pg()
4866 bool do_sub_pg_creates
= false;
4867 bool have_pending_creates
= false;
4869 const auto max_pgs_per_osd
=
4870 (cct
->_conf
.get_val
<uint64_t>("mon_max_pg_per_osd") *
4871 cct
->_conf
.get_val
<double>("osd_max_pg_per_osd_hard_ratio"));
4872 if (max_pgs_per_osd
<= num_pgs
) {
4873 // this could happen if admin decreases this setting before a PG is removed
4876 unsigned spare_pgs
= max_pgs_per_osd
- num_pgs
;
4877 std::lock_guard
l(pending_creates_lock
);
4878 if (pending_creates_from_mon
> 0) {
4879 dout(20) << __func__
<< " pending_creates_from_mon "
4880 << pending_creates_from_mon
<< dendl
;
4881 do_sub_pg_creates
= true;
4882 if (pending_creates_from_mon
>= spare_pgs
) {
4883 spare_pgs
= pending_creates_from_mon
= 0;
4885 spare_pgs
-= pending_creates_from_mon
;
4886 pending_creates_from_mon
= 0;
4889 auto pg
= pending_creates_from_osd
.cbegin();
4890 while (spare_pgs
> 0 && pg
!= pending_creates_from_osd
.cend()) {
4891 dout(20) << __func__
<< " pg " << pg
->first
<< dendl
;
4893 get_osdmap()->pg_to_up_acting_osds(pg
->first
.pgid
, nullptr, nullptr, &acting
, nullptr);
4894 service
.queue_want_pg_temp(pg
->first
.pgid
, twiddle(acting
), true);
4895 pg
= pending_creates_from_osd
.erase(pg
);
4896 do_sub_pg_creates
= true;
4899 have_pending_creates
= (pending_creates_from_mon
> 0 ||
4900 !pending_creates_from_osd
.empty());
4903 bool do_renew_subs
= false;
4904 if (do_sub_pg_creates
) {
4905 if (monc
->sub_want("osd_pg_creates", last_pg_create_epoch
, 0)) {
4906 dout(4) << __func__
<< ": resolicit pg creates from mon since "
4907 << last_pg_create_epoch
<< dendl
;
4908 do_renew_subs
= true;
4911 version_t start
= get_osdmap_epoch() + 1;
4912 if (have_pending_creates
) {
4913 // don't miss any new osdmap deleting PGs
4914 if (monc
->sub_want("osdmap", start
, 0)) {
4915 dout(4) << __func__
<< ": resolicit osdmap from mon since "
4917 do_renew_subs
= true;
4919 } else if (do_sub_pg_creates
) {
4920 // no need to subscribe the osdmap continuously anymore
4921 // once the pgtemp and/or mon_subscribe(pg_creates) is sent
4922 if (monc
->sub_want_increment("osdmap", start
, CEPH_SUBSCRIBE_ONETIME
)) {
4923 dout(4) << __func__
<< ": re-subscribe osdmap(onetime) since "
4925 do_renew_subs
= true;
4929 if (do_renew_subs
) {
4933 service
.send_pg_temp();
4936 void OSD::build_initial_pg_history(
4939 utime_t created_stamp
,
4943 dout(10) << __func__
<< " " << pgid
<< " created " << created
<< dendl
;
4944 *h
= pg_history_t(created
, created_stamp
);
4946 OSDMapRef lastmap
= service
.get_map(created
);
4947 int up_primary
, acting_primary
;
4948 vector
<int> up
, acting
;
4949 lastmap
->pg_to_up_acting_osds(
4950 pgid
.pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
4952 ostringstream debug
;
4953 for (epoch_t e
= created
+ 1; e
<= get_osdmap_epoch(); ++e
) {
4954 OSDMapRef osdmap
= service
.get_map(e
);
4955 int new_up_primary
, new_acting_primary
;
4956 vector
<int> new_up
, new_acting
;
4957 osdmap
->pg_to_up_acting_osds(
4958 pgid
.pgid
, &new_up
, &new_up_primary
, &new_acting
, &new_acting_primary
);
4960 // this is a bit imprecise, but sufficient?
4961 struct min_size_predicate_t
: public IsPGRecoverablePredicate
{
4962 const pg_pool_t
*pi
;
4963 bool operator()(const set
<pg_shard_t
> &have
) const {
4964 return have
.size() >= pi
->min_size
;
4966 explicit min_size_predicate_t(const pg_pool_t
*i
) : pi(i
) {}
4967 } min_size_predicate(osdmap
->get_pg_pool(pgid
.pgid
.pool()));
4969 bool new_interval
= PastIntervals::check_new_interval(
4976 h
->same_interval_since
,
4977 h
->last_epoch_clean
,
4985 h
->same_interval_since
= e
;
4987 h
->same_up_since
= e
;
4989 if (acting_primary
!= new_acting_primary
) {
4990 h
->same_primary_since
= e
;
4992 if (pgid
.pgid
.is_split(lastmap
->get_pg_num(pgid
.pgid
.pool()),
4993 osdmap
->get_pg_num(pgid
.pgid
.pool()),
4995 h
->last_epoch_split
= e
;
4998 acting
= new_acting
;
4999 up_primary
= new_up_primary
;
5000 acting_primary
= new_acting_primary
;
5004 dout(20) << __func__
<< " " << debug
.str() << dendl
;
5005 dout(10) << __func__
<< " " << *h
<< " " << *pi
5006 << " [" << (pi
->empty() ? pair
<epoch_t
,epoch_t
>(0,0) :
5007 pi
->get_bounds()) << ")"
5011 void OSD::_add_heartbeat_peer(int p
)
5017 map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.find(p
);
5018 if (i
== heartbeat_peers
.end()) {
5019 pair
<ConnectionRef
,ConnectionRef
> cons
= service
.get_con_osd_hb(p
, get_osdmap_epoch());
5022 assert(cons
.second
);
5024 hi
= &heartbeat_peers
[p
];
5027 auto stamps
= service
.get_hb_stamps(p
);
5029 auto sb
= ceph::make_ref
<Session
>(cct
, cons
.first
.get());
5031 sb
->stamps
= stamps
;
5032 hi
->hb_interval_start
= ceph_clock_now();
5033 hi
->con_back
= cons
.first
.get();
5034 hi
->con_back
->set_priv(sb
);
5036 auto sf
= ceph::make_ref
<Session
>(cct
, cons
.second
.get());
5038 sf
->stamps
= stamps
;
5039 hi
->con_front
= cons
.second
.get();
5040 hi
->con_front
->set_priv(sf
);
5042 dout(10) << "_add_heartbeat_peer: new peer osd." << p
5043 << " " << hi
->con_back
->get_peer_addr()
5044 << " " << hi
->con_front
->get_peer_addr()
5049 hi
->epoch
= get_osdmap_epoch();
5052 void OSD::_remove_heartbeat_peer(int n
)
5054 map
<int,HeartbeatInfo
>::iterator q
= heartbeat_peers
.find(n
);
5055 ceph_assert(q
!= heartbeat_peers
.end());
5056 dout(20) << " removing heartbeat peer osd." << n
5057 << " " << q
->second
.con_back
->get_peer_addr()
5058 << " " << (q
->second
.con_front
? q
->second
.con_front
->get_peer_addr() : entity_addr_t())
5060 q
->second
.clear_mark_down();
5061 heartbeat_peers
.erase(q
);
5064 void OSD::need_heartbeat_peer_update()
5068 dout(20) << "need_heartbeat_peer_update" << dendl
;
5069 heartbeat_set_peers_need_update();
5072 void OSD::maybe_update_heartbeat_peers()
5074 ceph_assert(ceph_mutex_is_locked(osd_lock
));
5076 if (is_waiting_for_healthy() || is_active()) {
5077 utime_t now
= ceph_clock_now();
5078 if (last_heartbeat_resample
== utime_t()) {
5079 last_heartbeat_resample
= now
;
5080 heartbeat_set_peers_need_update();
5081 } else if (!heartbeat_peers_need_update()) {
5082 utime_t dur
= now
- last_heartbeat_resample
;
5083 if (dur
> cct
->_conf
->osd_heartbeat_grace
) {
5084 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur
<< " seconds" << dendl
;
5085 heartbeat_set_peers_need_update();
5086 last_heartbeat_resample
= now
;
5087 // automatically clean up any stale heartbeat peers
5088 // if we are unhealthy, then clean all
5089 reset_heartbeat_peers(is_waiting_for_healthy());
5094 if (!heartbeat_peers_need_update())
5096 heartbeat_clear_peers_need_update();
5098 std::lock_guard
l(heartbeat_lock
);
5100 dout(10) << "maybe_update_heartbeat_peers updating" << dendl
;
5103 // build heartbeat from set
5107 for (auto& pg
: pgs
) {
5108 pg
->with_heartbeat_peers([&](int peer
) {
5109 if (get_osdmap()->is_up(peer
)) {
5110 _add_heartbeat_peer(peer
);
5116 // include next and previous up osds to ensure we have a fully-connected set
5117 set
<int> want
, extras
;
5118 const int next
= get_osdmap()->get_next_up_osd_after(whoami
);
5121 int prev
= get_osdmap()->get_previous_up_osd_before(whoami
);
5122 if (prev
>= 0 && prev
!= next
)
5125 // make sure we have at least **min_down** osds coming from different
5126 // subtree level (e.g., hosts) for fast failure detection.
5127 auto min_down
= cct
->_conf
.get_val
<uint64_t>("mon_osd_min_down_reporters");
5128 auto subtree
= cct
->_conf
.get_val
<string
>("mon_osd_reporter_subtree_level");
5129 auto limit
= std::max(min_down
, (uint64_t)cct
->_conf
->osd_heartbeat_min_peers
);
5130 get_osdmap()->get_random_up_osds_by_subtree(
5131 whoami
, subtree
, limit
, want
, &want
);
5133 for (set
<int>::iterator p
= want
.begin(); p
!= want
.end(); ++p
) {
5134 dout(10) << " adding neighbor peer osd." << *p
<< dendl
;
5136 _add_heartbeat_peer(*p
);
5139 // remove down peers; enumerate extras
5140 map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5141 while (p
!= heartbeat_peers
.end()) {
5142 if (!get_osdmap()->is_up(p
->first
)) {
5145 _remove_heartbeat_peer(o
);
5148 if (p
->second
.epoch
< get_osdmap_epoch()) {
5149 extras
.insert(p
->first
);
5155 for (int n
= next
; n
>= 0; ) {
5156 if ((int)heartbeat_peers
.size() >= cct
->_conf
->osd_heartbeat_min_peers
)
5158 if (!extras
.count(n
) && !want
.count(n
) && n
!= whoami
) {
5159 dout(10) << " adding random peer osd." << n
<< dendl
;
5161 _add_heartbeat_peer(n
);
5163 n
= get_osdmap()->get_next_up_osd_after(n
);
5165 break; // came full circle; stop
5169 for (set
<int>::iterator p
= extras
.begin();
5170 (int)heartbeat_peers
.size() > cct
->_conf
->osd_heartbeat_min_peers
&& p
!= extras
.end();
5174 _remove_heartbeat_peer(*p
);
5177 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers
.size() << " peers, extras " << extras
<< dendl
;
5179 // clean up stale failure pending
5180 for (auto it
= failure_pending
.begin(); it
!= failure_pending
.end();) {
5181 if (heartbeat_peers
.count(it
->first
) == 0) {
5182 send_still_alive(get_osdmap_epoch(), it
->first
, it
->second
.second
);
5183 failure_pending
.erase(it
++);
5190 void OSD::reset_heartbeat_peers(bool all
)
5192 ceph_assert(ceph_mutex_is_locked(osd_lock
));
5193 dout(10) << "reset_heartbeat_peers" << dendl
;
5194 utime_t stale
= ceph_clock_now();
5195 stale
-= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_stale");
5196 std::lock_guard
l(heartbeat_lock
);
5197 for (auto it
= heartbeat_peers
.begin(); it
!= heartbeat_peers
.end();) {
5198 HeartbeatInfo
& hi
= it
->second
;
5199 if (all
|| hi
.is_stale(stale
)) {
5200 hi
.clear_mark_down();
5201 // stop sending failure_report to mon too
5202 failure_queue
.erase(it
->first
);
5203 heartbeat_peers
.erase(it
++);
5210 void OSD::handle_osd_ping(MOSDPing
*m
)
5212 if (superblock
.cluster_fsid
!= m
->fsid
) {
5213 dout(20) << "handle_osd_ping from " << m
->get_source_inst()
5214 << " bad fsid " << m
->fsid
<< " != " << superblock
.cluster_fsid
5220 int from
= m
->get_source().num();
5222 heartbeat_lock
.lock();
5223 if (is_stopping()) {
5224 heartbeat_lock
.unlock();
5229 utime_t now
= ceph_clock_now();
5230 auto mnow
= service
.get_mnow();
5231 ConnectionRef
con(m
->get_connection());
5232 OSDMapRef curmap
= service
.get_osdmap();
5234 heartbeat_lock
.unlock();
5239 auto sref
= con
->get_priv();
5240 Session
*s
= static_cast<Session
*>(sref
.get());
5242 heartbeat_lock
.unlock();
5248 s
->stamps
= service
.get_hb_stamps(from
);
5253 case MOSDPing::PING
:
5255 if (cct
->_conf
->osd_debug_drop_ping_probability
> 0) {
5256 auto heartbeat_drop
= debug_heartbeat_drops_remaining
.find(from
);
5257 if (heartbeat_drop
!= debug_heartbeat_drops_remaining
.end()) {
5258 if (heartbeat_drop
->second
== 0) {
5259 debug_heartbeat_drops_remaining
.erase(heartbeat_drop
);
5261 --heartbeat_drop
->second
;
5262 dout(5) << "Dropping heartbeat from " << from
5263 << ", " << heartbeat_drop
->second
5264 << " remaining to drop" << dendl
;
5267 } else if (cct
->_conf
->osd_debug_drop_ping_probability
>
5268 ((((double)(rand()%100))/100.0))) {
5270 debug_heartbeat_drops_remaining
.insert(std::make_pair(from
,
5271 cct
->_conf
->osd_debug_drop_ping_duration
)).first
;
5272 dout(5) << "Dropping heartbeat from " << from
5273 << ", " << heartbeat_drop
->second
5274 << " remaining to drop" << dendl
;
5279 ceph::signedspan sender_delta_ub
{};
5280 s
->stamps
->got_ping(
5286 dout(20) << __func__
<< " new stamps " << *s
->stamps
<< dendl
;
5288 if (!cct
->get_heartbeat_map()->is_healthy()) {
5289 dout(10) << "internal heartbeat not healthy, dropping ping request"
5294 Message
*r
= new MOSDPing(monc
->get_fsid(),
5295 curmap
->get_epoch(),
5296 MOSDPing::PING_REPLY
,
5300 service
.get_up_epoch(),
5301 cct
->_conf
->osd_heartbeat_min_size
,
5303 con
->send_message(r
);
5305 if (curmap
->is_up(from
)) {
5307 ConnectionRef cluster_con
= service
.get_con_osd_cluster(
5308 from
, curmap
->get_epoch());
5310 service
.maybe_share_map(cluster_con
.get(), curmap
, m
->map_epoch
);
5313 } else if (!curmap
->exists(from
) ||
5314 curmap
->get_down_at(from
) > m
->map_epoch
) {
5315 // tell them they have died
5316 Message
*r
= new MOSDPing(monc
->get_fsid(),
5317 curmap
->get_epoch(),
5322 service
.get_up_epoch(),
5323 cct
->_conf
->osd_heartbeat_min_size
);
5324 con
->send_message(r
);
5329 case MOSDPing::PING_REPLY
:
5331 map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.find(from
);
5332 if (i
!= heartbeat_peers
.end()) {
5333 auto acked
= i
->second
.ping_history
.find(m
->ping_stamp
);
5334 if (acked
!= i
->second
.ping_history
.end()) {
5335 int &unacknowledged
= acked
->second
.second
;
5336 if (con
== i
->second
.con_back
) {
5337 dout(25) << "handle_osd_ping got reply from osd." << from
5338 << " first_tx " << i
->second
.first_tx
5339 << " last_tx " << i
->second
.last_tx
5340 << " last_rx_back " << i
->second
.last_rx_back
5342 << " last_rx_front " << i
->second
.last_rx_front
5344 i
->second
.last_rx_back
= now
;
5345 ceph_assert(unacknowledged
> 0);
5347 // if there is no front con, set both stamps.
5348 if (i
->second
.con_front
== NULL
) {
5349 i
->second
.last_rx_front
= now
;
5350 ceph_assert(unacknowledged
> 0);
5353 } else if (con
== i
->second
.con_front
) {
5354 dout(25) << "handle_osd_ping got reply from osd." << from
5355 << " first_tx " << i
->second
.first_tx
5356 << " last_tx " << i
->second
.last_tx
5357 << " last_rx_back " << i
->second
.last_rx_back
5358 << " last_rx_front " << i
->second
.last_rx_front
5361 i
->second
.last_rx_front
= now
;
5362 ceph_assert(unacknowledged
> 0);
5366 if (unacknowledged
== 0) {
5367 // succeeded in getting all replies
5368 dout(25) << "handle_osd_ping got all replies from osd." << from
5369 << " , erase pending ping(sent at " << m
->ping_stamp
<< ")"
5370 << " and older pending ping(s)"
5373 #define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
5374 ++i
->second
.hb_average_count
;
5375 uint32_t back_pingtime
= ROUND_S_TO_USEC(i
->second
.last_rx_back
- m
->ping_stamp
);
5376 i
->second
.hb_total_back
+= back_pingtime
;
5377 if (back_pingtime
< i
->second
.hb_min_back
)
5378 i
->second
.hb_min_back
= back_pingtime
;
5379 if (back_pingtime
> i
->second
.hb_max_back
)
5380 i
->second
.hb_max_back
= back_pingtime
;
5381 uint32_t front_pingtime
= ROUND_S_TO_USEC(i
->second
.last_rx_front
- m
->ping_stamp
);
5382 i
->second
.hb_total_front
+= front_pingtime
;
5383 if (front_pingtime
< i
->second
.hb_min_front
)
5384 i
->second
.hb_min_front
= front_pingtime
;
5385 if (front_pingtime
> i
->second
.hb_max_front
)
5386 i
->second
.hb_max_front
= front_pingtime
;
5388 ceph_assert(i
->second
.hb_interval_start
!= utime_t());
5389 if (i
->second
.hb_interval_start
== utime_t())
5390 i
->second
.hb_interval_start
= now
;
5391 int64_t hb_avg_time_period
= 60;
5392 if (cct
->_conf
.get_val
<int64_t>("debug_heartbeat_testing_span")) {
5393 hb_avg_time_period
= cct
->_conf
.get_val
<int64_t>("debug_heartbeat_testing_span");
5395 if (now
- i
->second
.hb_interval_start
>= utime_t(hb_avg_time_period
, 0)) {
5396 uint32_t back_avg
= i
->second
.hb_total_back
/ i
->second
.hb_average_count
;
5397 uint32_t back_min
= i
->second
.hb_min_back
;
5398 uint32_t back_max
= i
->second
.hb_max_back
;
5399 uint32_t front_avg
= i
->second
.hb_total_front
/ i
->second
.hb_average_count
;
5400 uint32_t front_min
= i
->second
.hb_min_front
;
5401 uint32_t front_max
= i
->second
.hb_max_front
;
5403 // Reset for new interval
5404 i
->second
.hb_average_count
= 0;
5405 i
->second
.hb_interval_start
= now
;
5406 i
->second
.hb_total_back
= i
->second
.hb_max_back
= 0;
5407 i
->second
.hb_min_back
= UINT_MAX
;
5408 i
->second
.hb_total_front
= i
->second
.hb_max_front
= 0;
5409 i
->second
.hb_min_front
= UINT_MAX
;
5411 // Record per osd interace ping times
5412 // Based on osd_heartbeat_interval ignoring that it is randomly short than this interval
5413 if (i
->second
.hb_back_pingtime
.size() == 0) {
5414 ceph_assert(i
->second
.hb_front_pingtime
.size() == 0);
5415 for (unsigned k
= 0 ; k
< hb_vector_size
; ++k
) {
5416 i
->second
.hb_back_pingtime
.push_back(back_avg
);
5417 i
->second
.hb_back_min
.push_back(back_min
);
5418 i
->second
.hb_back_max
.push_back(back_max
);
5419 i
->second
.hb_front_pingtime
.push_back(front_avg
);
5420 i
->second
.hb_front_min
.push_back(front_min
);
5421 i
->second
.hb_front_max
.push_back(front_max
);
5422 ++i
->second
.hb_index
;
5425 int index
= i
->second
.hb_index
& (hb_vector_size
- 1);
5426 i
->second
.hb_back_pingtime
[index
] = back_avg
;
5427 i
->second
.hb_back_min
[index
] = back_min
;
5428 i
->second
.hb_back_max
[index
] = back_max
;
5429 i
->second
.hb_front_pingtime
[index
] = front_avg
;
5430 i
->second
.hb_front_min
[index
] = front_min
;
5431 i
->second
.hb_front_max
[index
] = front_max
;
5432 ++i
->second
.hb_index
;
5436 std::lock_guard
l(service
.stat_lock
);
5437 service
.osd_stat
.hb_pingtime
[from
].last_update
= now
.sec();
5438 service
.osd_stat
.hb_pingtime
[from
].back_last
= back_pingtime
;
5441 uint32_t min
= UINT_MAX
;
5445 uint32_t size
= (uint32_t)i
->second
.hb_back_pingtime
.size();
5446 for (int32_t k
= size
- 1 ; k
>= 0; --k
) {
5448 int index
= (i
->second
.hb_index
+ k
) % size
;
5449 total
+= i
->second
.hb_back_pingtime
[index
];
5450 if (i
->second
.hb_back_min
[index
] < min
)
5451 min
= i
->second
.hb_back_min
[index
];
5452 if (i
->second
.hb_back_max
[index
] > max
)
5453 max
= i
->second
.hb_back_max
[index
];
5454 if (count
== 1 || count
== 5 || count
== 15) {
5455 service
.osd_stat
.hb_pingtime
[from
].back_pingtime
[which
] = total
/ count
;
5456 service
.osd_stat
.hb_pingtime
[from
].back_min
[which
] = min
;
5457 service
.osd_stat
.hb_pingtime
[from
].back_max
[which
] = max
;
5464 if (i
->second
.con_front
!= NULL
) {
5465 service
.osd_stat
.hb_pingtime
[from
].front_last
= front_pingtime
;
5472 for (int32_t k
= size
- 1 ; k
>= 0; --k
) {
5474 int index
= (i
->second
.hb_index
+ k
) % size
;
5475 total
+= i
->second
.hb_front_pingtime
[index
];
5476 if (i
->second
.hb_front_min
[index
] < min
)
5477 min
= i
->second
.hb_front_min
[index
];
5478 if (i
->second
.hb_front_max
[index
] > max
)
5479 max
= i
->second
.hb_front_max
[index
];
5480 if (count
== 1 || count
== 5 || count
== 15) {
5481 service
.osd_stat
.hb_pingtime
[from
].front_pingtime
[which
] = total
/ count
;
5482 service
.osd_stat
.hb_pingtime
[from
].front_min
[which
] = min
;
5483 service
.osd_stat
.hb_pingtime
[from
].front_max
[which
] = max
;
5492 std::lock_guard
l(service
.stat_lock
);
5493 service
.osd_stat
.hb_pingtime
[from
].back_last
= back_pingtime
;
5494 if (i
->second
.con_front
!= NULL
)
5495 service
.osd_stat
.hb_pingtime
[from
].front_last
= front_pingtime
;
5497 i
->second
.ping_history
.erase(i
->second
.ping_history
.begin(), ++acked
);
5500 if (i
->second
.is_healthy(now
)) {
5501 // Cancel false reports
5502 auto failure_queue_entry
= failure_queue
.find(from
);
5503 if (failure_queue_entry
!= failure_queue
.end()) {
5504 dout(10) << "handle_osd_ping canceling queued "
5505 << "failure report for osd." << from
<< dendl
;
5506 failure_queue
.erase(failure_queue_entry
);
5509 auto failure_pending_entry
= failure_pending
.find(from
);
5510 if (failure_pending_entry
!= failure_pending
.end()) {
5511 dout(10) << "handle_osd_ping canceling in-flight "
5512 << "failure report for osd." << from
<< dendl
;
5513 send_still_alive(curmap
->get_epoch(),
5515 failure_pending_entry
->second
.second
);
5516 failure_pending
.erase(failure_pending_entry
);
5520 // old replies, deprecated by newly sent pings.
5521 dout(10) << "handle_osd_ping no pending ping(sent at " << m
->ping_stamp
5522 << ") is found, treat as covered by newly sent pings "
5529 curmap
->is_up(from
)) {
5531 ConnectionRef cluster_con
= service
.get_con_osd_cluster(
5532 from
, curmap
->get_epoch());
5534 service
.maybe_share_map(cluster_con
.get(), curmap
, m
->map_epoch
);
5539 s
->stamps
->got_ping_reply(
5543 dout(20) << __func__
<< " new stamps " << *s
->stamps
<< dendl
;
5547 case MOSDPing::YOU_DIED
:
5548 dout(10) << "handle_osd_ping " << m
->get_source_inst()
5549 << " says i am down in " << m
->map_epoch
<< dendl
;
5550 osdmap_subscribe(curmap
->get_epoch()+1, false);
5554 heartbeat_lock
.unlock();
5558 void OSD::heartbeat_entry()
5560 std::unique_lock
l(heartbeat_lock
);
5563 while (!heartbeat_stop
) {
5567 if (cct
->_conf
.get_val
<bool>("debug_disable_randomized_ping")) {
5568 wait
= (float)cct
->_conf
->osd_heartbeat_interval
;
5570 wait
= .5 + ((float)(rand() % 10)/10.0) * (float)cct
->_conf
->osd_heartbeat_interval
;
5572 auto w
= ceph::make_timespan(wait
);
5573 dout(30) << "heartbeat_entry sleeping for " << wait
<< dendl
;
5574 heartbeat_cond
.wait_for(l
, w
);
5577 dout(30) << "heartbeat_entry woke up" << dendl
;
5581 void OSD::heartbeat_check()
5583 ceph_assert(ceph_mutex_is_locked(heartbeat_lock
));
5584 utime_t now
= ceph_clock_now();
5586 // check for incoming heartbeats (move me elsewhere?)
5587 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5588 p
!= heartbeat_peers
.end();
5591 if (p
->second
.first_tx
== utime_t()) {
5592 dout(25) << "heartbeat_check we haven't sent ping to osd." << p
->first
5593 << " yet, skipping" << dendl
;
5597 dout(25) << "heartbeat_check osd." << p
->first
5598 << " first_tx " << p
->second
.first_tx
5599 << " last_tx " << p
->second
.last_tx
5600 << " last_rx_back " << p
->second
.last_rx_back
5601 << " last_rx_front " << p
->second
.last_rx_front
5603 if (p
->second
.is_unhealthy(now
)) {
5604 utime_t oldest_deadline
= p
->second
.ping_history
.begin()->second
.first
;
5605 if (p
->second
.last_rx_back
== utime_t() ||
5606 p
->second
.last_rx_front
== utime_t()) {
5607 derr
<< "heartbeat_check: no reply from "
5608 << p
->second
.con_front
->get_peer_addr().get_sockaddr()
5609 << " osd." << p
->first
5610 << " ever on either front or back, first ping sent "
5611 << p
->second
.first_tx
5612 << " (oldest deadline " << oldest_deadline
<< ")"
5615 failure_queue
[p
->first
] = p
->second
.first_tx
;
5617 derr
<< "heartbeat_check: no reply from "
5618 << p
->second
.con_front
->get_peer_addr().get_sockaddr()
5619 << " osd." << p
->first
<< " since back " << p
->second
.last_rx_back
5620 << " front " << p
->second
.last_rx_front
5621 << " (oldest deadline " << oldest_deadline
<< ")"
5624 failure_queue
[p
->first
] = std::min(p
->second
.last_rx_back
, p
->second
.last_rx_front
);
5630 void OSD::heartbeat()
5632 ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock
));
5633 dout(30) << "heartbeat" << dendl
;
5637 int hb_interval
= cct
->_conf
->osd_heartbeat_interval
;
5638 int n_samples
= 86400;
5639 if (hb_interval
> 1) {
5640 n_samples
/= hb_interval
;
5645 if (getloadavg(loadavgs
, 1) == 1) {
5646 logger
->set(l_osd_loadavg
, 100 * loadavgs
[0]);
5647 daily_loadavg
= (daily_loadavg
* (n_samples
- 1) + loadavgs
[0]) / n_samples
;
5648 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg
<< dendl
;
5651 dout(30) << "heartbeat checking stats" << dendl
;
5653 // refresh peer list and osd stats
5654 vector
<int> hb_peers
;
5655 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5656 p
!= heartbeat_peers
.end();
5658 hb_peers
.push_back(p
->first
);
5660 auto new_stat
= service
.set_osd_stat(hb_peers
, get_num_pgs());
5661 dout(5) << __func__
<< " " << new_stat
<< dendl
;
5662 ceph_assert(new_stat
.statfs
.total
);
5665 float ratio
= service
.compute_adjusted_ratio(new_stat
, &pratio
);
5667 service
.check_full_status(ratio
, pratio
);
5669 utime_t now
= ceph_clock_now();
5670 auto mnow
= service
.get_mnow();
5671 utime_t deadline
= now
;
5672 deadline
+= cct
->_conf
->osd_heartbeat_grace
;
5675 for (map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.begin();
5676 i
!= heartbeat_peers
.end();
5678 int peer
= i
->first
;
5679 dout(30) << "heartbeat sending ping to osd." << peer
<< dendl
;
5681 i
->second
.last_tx
= now
;
5682 if (i
->second
.first_tx
== utime_t())
5683 i
->second
.first_tx
= now
;
5684 i
->second
.ping_history
[now
] = make_pair(deadline
,
5685 HeartbeatInfo::HEARTBEAT_MAX_CONN
);
5686 if (i
->second
.hb_interval_start
== utime_t())
5687 i
->second
.hb_interval_start
= now
;
5689 Session
*s
= static_cast<Session
*>(i
->second
.con_back
->get_priv().get());
5690 std::optional
<ceph::signedspan
> delta_ub
;
5691 s
->stamps
->sent_ping(&delta_ub
);
5693 i
->second
.con_back
->send_message(
5694 new MOSDPing(monc
->get_fsid(),
5695 service
.get_osdmap_epoch(),
5700 service
.get_up_epoch(),
5701 cct
->_conf
->osd_heartbeat_min_size
,
5704 if (i
->second
.con_front
)
5705 i
->second
.con_front
->send_message(
5706 new MOSDPing(monc
->get_fsid(),
5707 service
.get_osdmap_epoch(),
5712 service
.get_up_epoch(),
5713 cct
->_conf
->osd_heartbeat_min_size
,
5717 logger
->set(l_osd_hb_to
, heartbeat_peers
.size());
5719 // hmm.. am i all alone?
5720 dout(30) << "heartbeat lonely?" << dendl
;
5721 if (heartbeat_peers
.empty()) {
5722 if (now
- last_mon_heartbeat
> cct
->_conf
->osd_mon_heartbeat_interval
&& is_active()) {
5723 last_mon_heartbeat
= now
;
5724 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl
;
5725 osdmap_subscribe(get_osdmap_epoch() + 1, false);
5729 dout(30) << "heartbeat done" << dendl
;
5732 bool OSD::heartbeat_reset(Connection
*con
)
5734 std::lock_guard
l(heartbeat_lock
);
5735 auto s
= con
->get_priv();
5736 dout(20) << __func__
<< " con " << con
<< " s " << s
.get() << dendl
;
5737 con
->set_priv(nullptr);
5739 if (is_stopping()) {
5742 auto session
= static_cast<Session
*>(s
.get());
5743 auto p
= heartbeat_peers
.find(session
->peer
);
5744 if (p
!= heartbeat_peers
.end() &&
5745 (p
->second
.con_back
== con
||
5746 p
->second
.con_front
== con
)) {
5747 dout(10) << "heartbeat_reset failed hb con " << con
<< " for osd." << p
->second
.peer
5748 << ", reopening" << dendl
;
5749 p
->second
.clear_mark_down(con
);
5750 pair
<ConnectionRef
,ConnectionRef
> newcon
= service
.get_con_osd_hb(p
->second
.peer
, p
->second
.epoch
);
5752 p
->second
.con_back
= newcon
.first
.get();
5753 p
->second
.con_back
->set_priv(s
);
5754 if (newcon
.second
) {
5755 p
->second
.con_front
= newcon
.second
.get();
5756 p
->second
.con_front
->set_priv(s
);
5758 p
->second
.ping_history
.clear();
5760 dout(10) << "heartbeat_reset failed hb con " << con
<< " for osd." << p
->second
.peer
5761 << ", raced with osdmap update, closing out peer" << dendl
;
5762 heartbeat_peers
.erase(p
);
5765 dout(10) << "heartbeat_reset closing (old) failed hb con " << con
<< dendl
;
5773 // =========================================
5777 ceph_assert(ceph_mutex_is_locked(osd_lock
));
5778 dout(10) << "tick" << dendl
;
5780 utime_t now
= ceph_clock_now();
5781 // throw out any obsolete markdown log
5782 utime_t grace
= utime_t(cct
->_conf
->osd_max_markdown_period
, 0);
5783 while (!osd_markdown_log
.empty() &&
5784 osd_markdown_log
.front() + grace
< now
)
5785 osd_markdown_log
.pop_front();
5787 if (is_active() || is_waiting_for_healthy()) {
5788 maybe_update_heartbeat_peers();
5791 if (is_waiting_for_healthy()) {
5795 if (is_waiting_for_healthy() || is_booting()) {
5796 std::lock_guard
l(heartbeat_lock
);
5797 if (now
- last_mon_heartbeat
> cct
->_conf
->osd_mon_heartbeat_interval
) {
5798 last_mon_heartbeat
= now
;
5799 dout(1) << __func__
<< " checking mon for new map" << dendl
;
5800 osdmap_subscribe(get_osdmap_epoch() + 1, false);
5806 // scrub purged_snaps every deep scrub interval
5808 const utime_t last
= superblock
.last_purged_snaps_scrub
;
5809 utime_t next
= last
;
5810 next
+= cct
->_conf
->osd_scrub_min_interval
;
5812 // use a seed that is stable for each scrub interval, but varies
5813 // by OSD to avoid any herds.
5814 rng
.seed(whoami
+ superblock
.last_purged_snaps_scrub
.sec());
5815 double r
= (rng() % 1024) / 1024;
5817 cct
->_conf
->osd_scrub_min_interval
*
5818 cct
->_conf
->osd_scrub_interval_randomize_ratio
* r
;
5819 if (next
< ceph_clock_now()) {
5820 dout(20) << __func__
<< " last_purged_snaps_scrub " << last
5821 << " next " << next
<< " ... now" << dendl
;
5822 scrub_purged_snaps();
5824 dout(20) << __func__
<< " last_purged_snaps_scrub " << last
5825 << " next " << next
<< dendl
;
5829 tick_timer
.add_event_after(get_tick_interval(), new C_Tick(this));
5832 void OSD::tick_without_osd_lock()
5834 ceph_assert(ceph_mutex_is_locked(tick_timer_lock
));
5835 dout(10) << "tick_without_osd_lock" << dendl
;
5837 logger
->set(l_osd_cached_crc
, buffer::get_cached_crc());
5838 logger
->set(l_osd_cached_crc_adjusted
, buffer::get_cached_crc_adjusted());
5839 logger
->set(l_osd_missed_crc
, buffer::get_missed_crc());
5841 // refresh osd stats
5842 struct store_statfs_t stbuf
;
5843 osd_alert_list_t alerts
;
5844 int r
= store
->statfs(&stbuf
, &alerts
);
5845 ceph_assert(r
== 0);
5846 service
.set_statfs(stbuf
, alerts
);
5848 // osd_lock is not being held, which means the OSD state
5849 // might change when doing the monitor report
5850 if (is_active() || is_waiting_for_healthy()) {
5852 std::lock_guard l
{heartbeat_lock
};
5855 map_lock
.lock_shared();
5856 std::lock_guard
l(mon_report_lock
);
5859 utime_t now
= ceph_clock_now();
5860 if (service
.need_fullness_update() ||
5861 now
- last_mon_report
> cct
->_conf
->osd_mon_report_interval
) {
5862 last_mon_report
= now
;
5866 map_lock
.unlock_shared();
5868 epoch_t max_waiting_epoch
= 0;
5869 for (auto s
: shards
) {
5870 max_waiting_epoch
= std::max(max_waiting_epoch
,
5871 s
->get_max_waiting_epoch());
5873 if (max_waiting_epoch
> get_osdmap()->get_epoch()) {
5874 dout(20) << __func__
<< " max_waiting_epoch " << max_waiting_epoch
5875 << ", requesting new map" << dendl
;
5876 osdmap_subscribe(superblock
.newest_map
+ 1, false);
5881 if (!scrub_random_backoff()) {
5884 service
.promote_throttle_recalibrate();
5885 resume_creating_pg();
5886 bool need_send_beacon
= false;
5887 const auto now
= ceph::coarse_mono_clock::now();
5889 // borrow lec lock to pretect last_sent_beacon from changing
5890 std::lock_guard l
{min_last_epoch_clean_lock
};
5891 const auto elapsed
= now
- last_sent_beacon
;
5892 if (chrono::duration_cast
<chrono::seconds
>(elapsed
).count() >
5893 cct
->_conf
->osd_beacon_report_interval
) {
5894 need_send_beacon
= true;
5897 if (need_send_beacon
) {
5902 mgrc
.update_daemon_health(get_health_metrics());
5903 service
.kick_recovery_queue();
5904 tick_timer_without_osd_lock
.add_event_after(get_tick_interval(),
5905 new C_Tick_WithoutOSDLock(this));
5909 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5910 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
5911 // setomapheader <pool-id> [namespace/]<obj-name> <header>
5912 // getomap <pool> [namespace/]<obj-name>
5913 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
5914 // injectmdataerr [namespace/]<obj-name> [shardid]
5915 // injectdataerr [namespace/]<obj-name> [shardid]
5917 // set_recovery_delay [utime]
5918 void TestOpsSocketHook::test_ops(OSDService
*service
, ObjectStore
*store
,
5919 std::string_view command
,
5920 const cmdmap_t
& cmdmap
, ostream
&ss
)
5923 //Support changing the omap on a single osd by using the Admin Socket to
5924 //directly request the osd make a change.
5925 if (command
== "setomapval" || command
== "rmomapkey" ||
5926 command
== "setomapheader" || command
== "getomap" ||
5927 command
== "truncobj" || command
== "injectmdataerr" ||
5928 command
== "injectdataerr"
5932 OSDMapRef curmap
= service
->get_osdmap();
5937 cmd_getval(cmdmap
, "pool", poolstr
);
5938 pool
= curmap
->lookup_pg_pool_name(poolstr
);
5939 //If we can't find it by name then maybe id specified
5940 if (pool
< 0 && isdigit(poolstr
[0]))
5941 pool
= atoll(poolstr
.c_str());
5943 ss
<< "Invalid pool '" << poolstr
<< "''";
5947 string objname
, nspace
;
5948 cmd_getval(cmdmap
, "objname", objname
);
5949 std::size_t found
= objname
.find_first_of('/');
5950 if (found
!= string::npos
) {
5951 nspace
= objname
.substr(0, found
);
5952 objname
= objname
.substr(found
+1);
5954 object_locator_t
oloc(pool
, nspace
);
5955 r
= curmap
->object_locator_to_pg(object_t(objname
), oloc
, rawpg
);
5958 ss
<< "Invalid namespace/objname";
5963 cmd_getval(cmdmap
, "shardid", shardid
, int64_t(shard_id_t::NO_SHARD
));
5964 hobject_t
obj(object_t(objname
), string(""), CEPH_NOSNAP
, rawpg
.ps(), pool
, nspace
);
5965 ghobject_t
gobj(obj
, ghobject_t::NO_GEN
, shard_id_t(uint8_t(shardid
)));
5966 spg_t
pgid(curmap
->raw_pg_to_pg(rawpg
), shard_id_t(shardid
));
5967 if (curmap
->pg_is_ec(rawpg
)) {
5968 if ((command
!= "injectdataerr") && (command
!= "injectmdataerr")) {
5969 ss
<< "Must not call on ec pool, except injectdataerr or injectmdataerr";
5974 ObjectStore::Transaction t
;
5976 if (command
== "setomapval") {
5977 map
<string
, bufferlist
> newattrs
;
5980 cmd_getval(cmdmap
, "key", key
);
5981 cmd_getval(cmdmap
, "val", valstr
);
5984 newattrs
[key
] = val
;
5985 t
.omap_setkeys(coll_t(pgid
), ghobject_t(obj
), newattrs
);
5986 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
5988 ss
<< "error=" << r
;
5991 } else if (command
== "rmomapkey") {
5993 cmd_getval(cmdmap
, "key", key
);
5995 t
.omap_rmkey(coll_t(pgid
), ghobject_t(obj
), key
);
5996 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
5998 ss
<< "error=" << r
;
6001 } else if (command
== "setomapheader") {
6002 bufferlist newheader
;
6005 cmd_getval(cmdmap
, "header", headerstr
);
6006 newheader
.append(headerstr
);
6007 t
.omap_setheader(coll_t(pgid
), ghobject_t(obj
), newheader
);
6008 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6010 ss
<< "error=" << r
;
6013 } else if (command
== "getomap") {
6014 //Debug: Output entire omap
6016 map
<string
, bufferlist
> keyvals
;
6017 auto ch
= store
->open_collection(coll_t(pgid
));
6019 ss
<< "unable to open collection for " << pgid
;
6022 r
= store
->omap_get(ch
, ghobject_t(obj
), &hdrbl
, &keyvals
);
6024 ss
<< "header=" << string(hdrbl
.c_str(), hdrbl
.length());
6025 for (map
<string
, bufferlist
>::iterator it
= keyvals
.begin();
6026 it
!= keyvals
.end(); ++it
)
6027 ss
<< " key=" << (*it
).first
<< " val="
6028 << string((*it
).second
.c_str(), (*it
).second
.length());
6030 ss
<< "error=" << r
;
6033 } else if (command
== "truncobj") {
6035 cmd_getval(cmdmap
, "len", trunclen
);
6036 t
.truncate(coll_t(pgid
), ghobject_t(obj
), trunclen
);
6037 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6039 ss
<< "error=" << r
;
6042 } else if (command
== "injectdataerr") {
6043 store
->inject_data_error(gobj
);
6045 } else if (command
== "injectmdataerr") {
6046 store
->inject_mdata_error(gobj
);
6051 if (command
== "set_recovery_delay") {
6053 cmd_getval(cmdmap
, "utime", delay
, (int64_t)0);
6056 int r
= service
->cct
->_conf
.set_val("osd_recovery_delay_start",
6059 ss
<< "set_recovery_delay: error setting "
6060 << "osd_recovery_delay_start to '" << delay
<< "': error "
6064 service
->cct
->_conf
.apply_changes(nullptr);
6065 ss
<< "set_recovery_delay: set osd_recovery_delay_start "
6066 << "to " << service
->cct
->_conf
->osd_recovery_delay_start
;
6069 if (command
== "injectfull") {
6072 OSDService::s_names state
;
6073 cmd_getval(cmdmap
, "type", type
, string("full"));
6074 cmd_getval(cmdmap
, "count", count
, (int64_t)-1);
6075 if (type
== "none" || count
== 0) {
6079 state
= service
->get_full_state(type
);
6080 if (state
== OSDService::s_names::INVALID
) {
6081 ss
<< "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
6084 service
->set_injectfull(state
, count
);
6087 ss
<< "Internal error - command=" << command
;
6090 // =========================================
6092 void OSD::ms_handle_connect(Connection
*con
)
6094 dout(10) << __func__
<< " con " << con
<< dendl
;
6095 if (con
->get_peer_type() == CEPH_ENTITY_TYPE_MON
) {
6096 std::lock_guard
l(osd_lock
);
6099 dout(10) << __func__
<< " on mon" << dendl
;
6103 } else if (is_booting()) {
6104 _send_boot(); // resend boot message
6106 map_lock
.lock_shared();
6107 std::lock_guard
l2(mon_report_lock
);
6109 utime_t now
= ceph_clock_now();
6110 last_mon_report
= now
;
6112 // resend everything, it's a new session
6115 service
.requeue_pg_temp();
6116 service
.clear_sent_ready_to_merge();
6117 service
.send_pg_temp();
6118 service
.send_ready_to_merge();
6119 service
.send_pg_created();
6123 map_lock
.unlock_shared();
6125 send_beacon(ceph::coarse_mono_clock::now());
6129 // full map requests may happen while active or pre-boot
6130 if (requested_full_first
) {
6131 rerequest_full_maps();
6136 void OSD::ms_handle_fast_connect(Connection
*con
)
6138 if (con
->get_peer_type() != CEPH_ENTITY_TYPE_MON
&&
6139 con
->get_peer_type() != CEPH_ENTITY_TYPE_MGR
) {
6140 if (auto s
= ceph::ref_cast
<Session
>(con
->get_priv()); !s
) {
6141 s
= ceph::make_ref
<Session
>(cct
, con
);
6143 dout(10) << " new session (outgoing) " << s
<< " con=" << s
->con
6144 << " addr=" << s
->con
->get_peer_addr() << dendl
;
6145 // we don't connect to clients
6146 ceph_assert(con
->get_peer_type() == CEPH_ENTITY_TYPE_OSD
);
6147 s
->entity_name
.set_type(CEPH_ENTITY_TYPE_OSD
);
6152 void OSD::ms_handle_fast_accept(Connection
*con
)
6154 if (con
->get_peer_type() != CEPH_ENTITY_TYPE_MON
&&
6155 con
->get_peer_type() != CEPH_ENTITY_TYPE_MGR
) {
6156 if (auto s
= ceph::ref_cast
<Session
>(con
->get_priv()); !s
) {
6157 s
= ceph::make_ref
<Session
>(cct
, con
);
6159 dout(10) << "new session (incoming)" << s
<< " con=" << con
6160 << " addr=" << con
->get_peer_addr()
6161 << " must have raced with connect" << dendl
;
6162 ceph_assert(con
->get_peer_type() == CEPH_ENTITY_TYPE_OSD
);
6163 s
->entity_name
.set_type(CEPH_ENTITY_TYPE_OSD
);
6168 bool OSD::ms_handle_reset(Connection
*con
)
6170 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
6171 dout(2) << "ms_handle_reset con " << con
<< " session " << session
.get() << dendl
;
6174 session
->wstate
.reset(con
);
6175 session
->con
->set_priv(nullptr);
6176 session
->con
.reset(); // break con <-> session ref cycle
6177 // note that we break session->con *before* the session_handle_reset
6178 // cleanup below. this avoids a race between us and
6179 // PG::add_backoff, Session::check_backoff, etc.
6180 session_handle_reset(session
);
6184 bool OSD::ms_handle_refused(Connection
*con
)
6186 if (!cct
->_conf
->osd_fast_fail_on_connection_refused
)
6189 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
6190 dout(2) << "ms_handle_refused con " << con
<< " session " << session
.get() << dendl
;
6193 int type
= con
->get_peer_type();
6194 // handle only OSD failures here
6195 if (monc
&& (type
== CEPH_ENTITY_TYPE_OSD
)) {
6196 OSDMapRef osdmap
= get_osdmap();
6198 int id
= osdmap
->identify_osd_on_all_channels(con
->get_peer_addr());
6199 if (id
>= 0 && osdmap
->is_up(id
)) {
6200 // I'm cheating mon heartbeat grace logic, because we know it's not going
6201 // to respawn alone. +1 so we won't hit any boundary case.
6202 monc
->send_mon_message(
6206 osdmap
->get_addrs(id
),
6207 cct
->_conf
->osd_heartbeat_grace
+ 1,
6208 osdmap
->get_epoch(),
6209 MOSDFailure::FLAG_IMMEDIATE
| MOSDFailure::FLAG_FAILED
6217 struct C_OSD_GetVersion
: public Context
{
6219 uint64_t oldest
, newest
;
6220 explicit C_OSD_GetVersion(OSD
*o
) : osd(o
), oldest(0), newest(0) {}
6221 void finish(int r
) override
{
6223 osd
->_got_mon_epochs(oldest
, newest
);
6227 void OSD::start_boot()
6229 if (!_is_healthy()) {
6230 // if we are not healthy, do not mark ourselves up (yet)
6231 dout(1) << "not healthy; waiting to boot" << dendl
;
6232 if (!is_waiting_for_healthy())
6233 start_waiting_for_healthy();
6234 // send pings sooner rather than later
6238 dout(1) << __func__
<< dendl
;
6239 set_state(STATE_PREBOOT
);
6240 dout(10) << "start_boot - have maps " << superblock
.oldest_map
6241 << ".." << superblock
.newest_map
<< dendl
;
6242 C_OSD_GetVersion
*c
= new C_OSD_GetVersion(this);
6243 monc
->get_version("osdmap", &c
->newest
, &c
->oldest
, c
);
6246 void OSD::_got_mon_epochs(epoch_t oldest
, epoch_t newest
)
6248 std::lock_guard
l(osd_lock
);
6250 _preboot(oldest
, newest
);
6254 void OSD::_preboot(epoch_t oldest
, epoch_t newest
)
6256 ceph_assert(is_preboot());
6257 dout(10) << __func__
<< " _preboot mon has osdmaps "
6258 << oldest
<< ".." << newest
<< dendl
;
6260 // ensure our local fullness awareness is accurate
6262 std::lock_guard
l(heartbeat_lock
);
6266 const auto& monmap
= monc
->monmap
;
6267 const auto osdmap
= get_osdmap();
6268 // if our map within recent history, try to add ourselves to the osdmap.
6269 if (osdmap
->get_epoch() == 0) {
6270 derr
<< "waiting for initial osdmap" << dendl
;
6271 } else if (osdmap
->is_destroyed(whoami
)) {
6272 derr
<< "osdmap says I am destroyed" << dendl
;
6273 // provide a small margin so we don't livelock seeing if we
6274 // un-destroyed ourselves.
6275 if (osdmap
->get_epoch() > newest
- 1) {
6278 } else if (osdmap
->is_noup(whoami
)) {
6279 derr
<< "osdmap NOUP flag is set, waiting for it to clear" << dendl
;
6280 } else if (!osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
6281 derr
<< "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
6283 } else if (osdmap
->require_osd_release
< ceph_release_t::luminous
) {
6284 derr
<< "osdmap require_osd_release < luminous; please upgrade to luminous"
6286 } else if (service
.need_fullness_update()) {
6287 derr
<< "osdmap fullness state needs update" << dendl
;
6289 } else if (monmap
.min_mon_release
>= ceph_release_t::octopus
&&
6290 superblock
.purged_snaps_last
< superblock
.current_epoch
) {
6291 dout(10) << __func__
<< " purged_snaps_last " << superblock
.purged_snaps_last
6292 << " < newest_map " << superblock
.current_epoch
<< dendl
;
6293 _get_purged_snaps();
6294 } else if (osdmap
->get_epoch() >= oldest
- 1 &&
6295 osdmap
->get_epoch() + cct
->_conf
->osd_map_message_max
> newest
) {
6297 // wait for pgs to fully catch up in a different thread, since
6298 // this thread might be required for splitting and merging PGs to
6300 boot_finisher
.queue(
6303 std::unique_lock
l(osd_lock
);
6305 dout(10) << __func__
<< " waiting for peering work to drain"
6308 for (auto shard
: shards
) {
6309 shard
->wait_min_pg_epoch(get_osdmap_epoch());
6320 // get all the latest maps
6321 if (osdmap
->get_epoch() + 1 >= oldest
)
6322 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
6324 osdmap_subscribe(oldest
- 1, true);
6327 void OSD::_get_purged_snaps()
6329 // NOTE: this is a naive, stateless implementaiton. it may send multiple
6330 // overlapping requests to the mon, which will be somewhat inefficient, but
6331 // it should be reliable.
6332 dout(10) << __func__
<< " purged_snaps_last " << superblock
.purged_snaps_last
6333 << ", newest_map " << superblock
.current_epoch
<< dendl
;
6334 MMonGetPurgedSnaps
*m
= new MMonGetPurgedSnaps(
6335 superblock
.purged_snaps_last
+ 1,
6336 superblock
.current_epoch
+ 1);
6337 monc
->send_mon_message(m
);
6340 void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply
*m
)
6342 dout(10) << __func__
<< " " << *m
<< dendl
;
6343 ObjectStore::Transaction t
;
6344 if (!is_preboot() ||
6345 m
->last
< superblock
.purged_snaps_last
) {
6348 SnapMapper::record_purged_snaps(cct
, store
, service
.meta_ch
,
6349 make_purged_snaps_oid(), &t
,
6351 superblock
.purged_snaps_last
= m
->last
;
6352 write_superblock(t
);
6353 store
->queue_transaction(
6356 service
.publish_superblock(superblock
);
6357 if (m
->last
< superblock
.current_epoch
) {
6358 _get_purged_snaps();
6366 void OSD::send_full_update()
6368 if (!service
.need_fullness_update())
6371 if (service
.is_full()) {
6372 state
= CEPH_OSD_FULL
;
6373 } else if (service
.is_backfillfull()) {
6374 state
= CEPH_OSD_BACKFILLFULL
;
6375 } else if (service
.is_nearfull()) {
6376 state
= CEPH_OSD_NEARFULL
;
6379 OSDMap::calc_state_set(state
, s
);
6380 dout(10) << __func__
<< " want state " << s
<< dendl
;
6381 monc
->send_mon_message(new MOSDFull(get_osdmap_epoch(), state
));
6384 void OSD::start_waiting_for_healthy()
6386 dout(1) << "start_waiting_for_healthy" << dendl
;
6387 set_state(STATE_WAITING_FOR_HEALTHY
);
6388 last_heartbeat_resample
= utime_t();
6390 // subscribe to osdmap updates, in case our peers really are known to be dead
6391 osdmap_subscribe(get_osdmap_epoch() + 1, false);
6394 bool OSD::_is_healthy()
6396 if (!cct
->get_heartbeat_map()->is_healthy()) {
6397 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl
;
6401 if (is_waiting_for_healthy()) {
6402 utime_t now
= ceph_clock_now();
6403 if (osd_markdown_log
.empty()) {
6404 dout(5) << __func__
<< " force returning true since last markdown"
6405 << " was " << cct
->_conf
->osd_max_markdown_period
6406 << "s ago" << dendl
;
6409 std::lock_guard
l(heartbeat_lock
);
6410 int num
= 0, up
= 0;
6411 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
6412 p
!= heartbeat_peers
.end();
6414 if (p
->second
.is_healthy(now
))
6418 if ((float)up
< (float)num
* cct
->_conf
->osd_heartbeat_min_healthy_ratio
) {
6419 dout(1) << "is_healthy false -- only " << up
<< "/" << num
<< " up peers (less than "
6420 << int(cct
->_conf
->osd_heartbeat_min_healthy_ratio
* 100.0) << "%)" << dendl
;
6428 void OSD::_send_boot()
6430 dout(10) << "_send_boot" << dendl
;
6431 Connection
*local_connection
=
6432 cluster_messenger
->get_loopback_connection().get();
6433 entity_addrvec_t client_addrs
= client_messenger
->get_myaddrs();
6434 entity_addrvec_t cluster_addrs
= cluster_messenger
->get_myaddrs();
6435 entity_addrvec_t hb_back_addrs
= hb_back_server_messenger
->get_myaddrs();
6436 entity_addrvec_t hb_front_addrs
= hb_front_server_messenger
->get_myaddrs();
6438 dout(20) << " initial client_addrs " << client_addrs
6439 << ", cluster_addrs " << cluster_addrs
6440 << ", hb_back_addrs " << hb_back_addrs
6441 << ", hb_front_addrs " << hb_front_addrs
6443 if (cluster_messenger
->set_addr_unknowns(client_addrs
)) {
6444 dout(10) << " assuming cluster_addrs match client_addrs "
6445 << client_addrs
<< dendl
;
6446 cluster_addrs
= cluster_messenger
->get_myaddrs();
6448 if (auto session
= local_connection
->get_priv(); !session
) {
6449 cluster_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6452 local_connection
= hb_back_server_messenger
->get_loopback_connection().get();
6453 if (hb_back_server_messenger
->set_addr_unknowns(cluster_addrs
)) {
6454 dout(10) << " assuming hb_back_addrs match cluster_addrs "
6455 << cluster_addrs
<< dendl
;
6456 hb_back_addrs
= hb_back_server_messenger
->get_myaddrs();
6458 if (auto session
= local_connection
->get_priv(); !session
) {
6459 hb_back_server_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6462 local_connection
= hb_front_server_messenger
->get_loopback_connection().get();
6463 if (hb_front_server_messenger
->set_addr_unknowns(client_addrs
)) {
6464 dout(10) << " assuming hb_front_addrs match client_addrs "
6465 << client_addrs
<< dendl
;
6466 hb_front_addrs
= hb_front_server_messenger
->get_myaddrs();
6468 if (auto session
= local_connection
->get_priv(); !session
) {
6469 hb_front_server_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6472 // we now know what our front and back addrs will be, and we are
6473 // about to tell the mon what our metadata (including numa bindings)
6474 // are, so now is a good time!
6475 set_numa_affinity();
6477 MOSDBoot
*mboot
= new MOSDBoot(
6478 superblock
, get_osdmap_epoch(), service
.get_boot_epoch(),
6479 hb_back_addrs
, hb_front_addrs
, cluster_addrs
,
6481 dout(10) << " final client_addrs " << client_addrs
6482 << ", cluster_addrs " << cluster_addrs
6483 << ", hb_back_addrs " << hb_back_addrs
6484 << ", hb_front_addrs " << hb_front_addrs
6486 _collect_metadata(&mboot
->metadata
);
6487 monc
->send_mon_message(mboot
);
6488 set_state(STATE_BOOTING
);
6491 void OSD::_collect_metadata(map
<string
,string
> *pm
)
6494 (*pm
)["osd_data"] = dev_path
;
6495 if (store
->get_type() == "filestore") {
6496 // not applicable for bluestore
6497 (*pm
)["osd_journal"] = journal_path
;
6499 (*pm
)["front_addr"] = stringify(client_messenger
->get_myaddrs());
6500 (*pm
)["back_addr"] = stringify(cluster_messenger
->get_myaddrs());
6501 (*pm
)["hb_front_addr"] = stringify(hb_front_server_messenger
->get_myaddrs());
6502 (*pm
)["hb_back_addr"] = stringify(hb_back_server_messenger
->get_myaddrs());
6505 (*pm
)["osd_objectstore"] = store
->get_type();
6506 (*pm
)["rotational"] = store_is_rotational
? "1" : "0";
6507 (*pm
)["journal_rotational"] = journal_is_rotational
? "1" : "0";
6508 (*pm
)["default_device_class"] = store
->get_default_device_class();
6509 store
->collect_metadata(pm
);
6511 collect_sys_info(pm
, cct
);
6513 (*pm
)["front_iface"] = pick_iface(
6515 client_messenger
->get_myaddrs().front().get_sockaddr_storage());
6516 (*pm
)["back_iface"] = pick_iface(
6518 cluster_messenger
->get_myaddrs().front().get_sockaddr_storage());
6524 set
<string
> unknown
;
6525 for (auto nm
: { "front_iface", "back_iface" }) {
6526 if (!(*pm
)[nm
].size()) {
6531 int r
= get_iface_numa_node((*pm
)[nm
], &n
);
6533 unknown
.insert((*pm
)[nm
]);
6541 if (unknown
.size()) {
6542 (*pm
)["network_numa_unknown_ifaces"] = stringify(unknown
);
6544 if (!nodes
.empty()) {
6545 (*pm
)["network_numa_nodes"] = stringify(nodes
);
6547 if (node
>= 0 && nodes
.size() == 1 && unknown
.empty()) {
6548 (*pm
)["network_numa_node"] = stringify(node
);
6552 if (numa_node
>= 0) {
6553 (*pm
)["numa_node"] = stringify(numa_node
);
6554 (*pm
)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size
,
6558 set
<string
> devnames
;
6559 store
->get_devices(&devnames
);
6560 map
<string
,string
> errs
;
6561 get_device_metadata(devnames
, pm
, &errs
);
6562 for (auto& i
: errs
) {
6563 dout(1) << __func__
<< " " << i
.first
<< ": " << i
.second
<< dendl
;
6565 dout(10) << __func__
<< " " << *pm
<< dendl
;
6568 void OSD::queue_want_up_thru(epoch_t want
)
6570 std::shared_lock map_locker
{map_lock
};
6571 epoch_t cur
= get_osdmap()->get_up_thru(whoami
);
6572 std::lock_guard
report_locker(mon_report_lock
);
6573 if (want
> up_thru_wanted
) {
6574 dout(10) << "queue_want_up_thru now " << want
<< " (was " << up_thru_wanted
<< ")"
6575 << ", currently " << cur
6577 up_thru_wanted
= want
;
6580 dout(10) << "queue_want_up_thru want " << want
<< " <= queued " << up_thru_wanted
6581 << ", currently " << cur
6586 void OSD::send_alive()
6588 ceph_assert(ceph_mutex_is_locked(mon_report_lock
));
6589 const auto osdmap
= get_osdmap();
6590 if (!osdmap
->exists(whoami
))
6592 epoch_t up_thru
= osdmap
->get_up_thru(whoami
);
6593 dout(10) << "send_alive up_thru currently " << up_thru
<< " want " << up_thru_wanted
<< dendl
;
6594 if (up_thru_wanted
> up_thru
) {
6595 dout(10) << "send_alive want " << up_thru_wanted
<< dendl
;
6596 monc
->send_mon_message(new MOSDAlive(osdmap
->get_epoch(), up_thru_wanted
));
6600 void OSD::request_full_map(epoch_t first
, epoch_t last
)
6602 dout(10) << __func__
<< " " << first
<< ".." << last
6603 << ", previously requested "
6604 << requested_full_first
<< ".." << requested_full_last
<< dendl
;
6605 ceph_assert(ceph_mutex_is_locked(osd_lock
));
6606 ceph_assert(first
> 0 && last
> 0);
6607 ceph_assert(first
<= last
);
6608 ceph_assert(first
>= requested_full_first
); // we shouldn't ever ask for older maps
6609 if (requested_full_first
== 0) {
6611 requested_full_first
= first
;
6612 requested_full_last
= last
;
6613 } else if (last
<= requested_full_last
) {
6617 // additional request
6618 first
= requested_full_last
+ 1;
6619 requested_full_last
= last
;
6621 MMonGetOSDMap
*req
= new MMonGetOSDMap
;
6622 req
->request_full(first
, last
);
6623 monc
->send_mon_message(req
);
6626 void OSD::got_full_map(epoch_t e
)
6628 ceph_assert(requested_full_first
<= requested_full_last
);
6629 ceph_assert(ceph_mutex_is_locked(osd_lock
));
6630 if (requested_full_first
== 0) {
6631 dout(20) << __func__
<< " " << e
<< ", nothing requested" << dendl
;
6634 if (e
< requested_full_first
) {
6635 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6636 << ".." << requested_full_last
6637 << ", ignoring" << dendl
;
6640 if (e
>= requested_full_last
) {
6641 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6642 << ".." << requested_full_last
<< ", resetting" << dendl
;
6643 requested_full_first
= requested_full_last
= 0;
6647 requested_full_first
= e
+ 1;
6649 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6650 << ".." << requested_full_last
6651 << ", still need more" << dendl
;
6654 void OSD::requeue_failures()
6656 std::lock_guard
l(heartbeat_lock
);
6657 unsigned old_queue
= failure_queue
.size();
6658 unsigned old_pending
= failure_pending
.size();
6659 for (auto p
= failure_pending
.begin(); p
!= failure_pending
.end(); ) {
6660 failure_queue
[p
->first
] = p
->second
.first
;
6661 failure_pending
.erase(p
++);
6663 dout(10) << __func__
<< " " << old_queue
<< " + " << old_pending
<< " -> "
6664 << failure_queue
.size() << dendl
;
6667 void OSD::send_failures()
6669 ceph_assert(ceph_mutex_is_locked(map_lock
));
6670 ceph_assert(ceph_mutex_is_locked(mon_report_lock
));
6671 std::lock_guard
l(heartbeat_lock
);
6672 utime_t now
= ceph_clock_now();
6673 const auto osdmap
= get_osdmap();
6674 while (!failure_queue
.empty()) {
6675 int osd
= failure_queue
.begin()->first
;
6676 if (!failure_pending
.count(osd
)) {
6677 int failed_for
= (int)(double)(now
- failure_queue
.begin()->second
);
6678 monc
->send_mon_message(
6682 osdmap
->get_addrs(osd
),
6684 osdmap
->get_epoch()));
6685 failure_pending
[osd
] = make_pair(failure_queue
.begin()->second
,
6686 osdmap
->get_addrs(osd
));
6688 failure_queue
.erase(osd
);
6692 void OSD::send_still_alive(epoch_t epoch
, int osd
, const entity_addrvec_t
&addrs
)
6694 MOSDFailure
*m
= new MOSDFailure(monc
->get_fsid(), osd
, addrs
, 0, epoch
,
6695 MOSDFailure::FLAG_ALIVE
);
6696 monc
->send_mon_message(m
);
6699 void OSD::cancel_pending_failures()
6701 std::lock_guard
l(heartbeat_lock
);
6702 auto it
= failure_pending
.begin();
6703 while (it
!= failure_pending
.end()) {
6704 dout(10) << __func__
<< " canceling in-flight failure report for osd."
6705 << it
->first
<< dendl
;
6706 send_still_alive(get_osdmap_epoch(), it
->first
, it
->second
.second
);
6707 failure_pending
.erase(it
++);
6711 void OSD::send_beacon(const ceph::coarse_mono_clock::time_point
& now
)
6713 const auto& monmap
= monc
->monmap
;
6714 // send beacon to mon even if we are just connected, and the monmap is not
6715 // initialized yet by then.
6716 if (monmap
.epoch
> 0 &&
6717 monmap
.get_required_features().contains_all(
6718 ceph::features::mon::FEATURE_LUMINOUS
)) {
6719 dout(20) << __func__
<< " sending" << dendl
;
6720 MOSDBeacon
* beacon
= nullptr;
6722 std::lock_guard l
{min_last_epoch_clean_lock
};
6723 beacon
= new MOSDBeacon(get_osdmap_epoch(),
6724 min_last_epoch_clean
,
6725 superblock
.last_purged_snaps_scrub
);
6726 beacon
->pgs
= min_last_epoch_clean_pgs
;
6727 last_sent_beacon
= now
;
6729 monc
->send_mon_message(beacon
);
6731 dout(20) << __func__
<< " not sending" << dendl
;
6735 void OSD::handle_command(MCommand
*m
)
6737 ConnectionRef con
= m
->get_connection();
6738 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
6740 con
->send_message(new MCommandReply(m
, -EACCES
));
6744 if (!session
->caps
.allow_all()) {
6745 con
->send_message(new MCommandReply(m
, -EACCES
));
6749 cct
->get_admin_socket()->queue_tell_command(m
);
6754 class unlock_guard
{
6757 explicit unlock_guard(ceph::mutex
& mutex
)
6762 unlock_guard(unlock_guard
&) = delete;
6769 void OSD::scrub_purged_snaps()
6771 dout(10) << __func__
<< dendl
;
6772 ceph_assert(ceph_mutex_is_locked(osd_lock
));
6773 SnapMapper::Scrubber
s(cct
, store
, service
.meta_ch
,
6774 make_snapmapper_oid(),
6775 make_purged_snaps_oid());
6776 clog
->debug() << "purged_snaps scrub starts";
6779 if (s
.stray
.size()) {
6780 clog
->debug() << "purged_snaps scrub found " << s
.stray
.size() << " strays";
6782 clog
->debug() << "purged_snaps scrub ok";
6784 set
<pair
<spg_t
,snapid_t
>> queued
;
6785 for (auto& [pool
, snap
, hash
, shard
] : s
.stray
) {
6786 const pg_pool_t
*pi
= get_osdmap()->get_pg_pool(pool
);
6788 dout(20) << __func__
<< " pool " << pool
<< " dne" << dendl
;
6791 pg_t
pgid(pi
->raw_hash_to_pg(hash
), pool
);
6792 spg_t
spgid(pgid
, shard
);
6793 pair
<spg_t
,snapid_t
> p(spgid
, snap
);
6794 if (queued
.count(p
)) {
6795 dout(20) << __func__
<< " pg " << spgid
<< " snap " << snap
6796 << " already queued" << dendl
;
6799 PGRef pg
= lookup_lock_pg(spgid
);
6801 dout(20) << __func__
<< " pg " << spgid
<< " not found" << dendl
;
6805 dout(10) << __func__
<< " requeue pg " << spgid
<< " " << pg
<< " snap "
6807 pg
->queue_snap_retrim(snap
);
6811 if (is_stopping()) {
6814 dout(10) << __func__
<< " done queueing pgs, updating superblock" << dendl
;
6815 ObjectStore::Transaction t
;
6816 superblock
.last_purged_snaps_scrub
= ceph_clock_now();
6817 write_superblock(t
);
6818 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
6819 ceph_assert(tr
== 0);
6821 send_beacon(ceph::coarse_mono_clock::now());
6823 dout(10) << __func__
<< " done" << dendl
;
6826 void OSD::probe_smart(const string
& only_devid
, ostream
& ss
)
6828 set
<string
> devnames
;
6829 store
->get_devices(&devnames
);
6830 uint64_t smart_timeout
= cct
->_conf
.get_val
<uint64_t>(
6831 "osd_smart_report_timeout");
6833 // == typedef std::map<std::string, mValue> mObject;
6834 json_spirit::mObject json_map
;
6836 for (auto dev
: devnames
) {
6837 // smartctl works only on physical devices; filter out any logical device
6838 if (dev
.find("dm-") == 0) {
6843 string devid
= get_device_id(dev
, &err
);
6844 if (devid
.size() == 0) {
6845 dout(10) << __func__
<< " no unique id for dev " << dev
<< " ("
6846 << err
<< "), skipping" << dendl
;
6849 if (only_devid
.size() && devid
!= only_devid
) {
6853 json_spirit::mValue smart_json
;
6854 if (block_device_get_metrics(dev
, smart_timeout
,
6856 dout(10) << "block_device_get_metrics failed for /dev/" << dev
<< dendl
;
6859 json_map
[devid
] = smart_json
;
6861 json_spirit::write(json_map
, ss
, json_spirit::pretty_print
);
6864 bool OSD::heartbeat_dispatch(Message
*m
)
6866 dout(30) << "heartbeat_dispatch " << m
<< dendl
;
6867 switch (m
->get_type()) {
6870 dout(10) << "ping from " << m
->get_source_inst() << dendl
;
6875 handle_osd_ping(static_cast<MOSDPing
*>(m
));
6879 dout(0) << "dropping unexpected message " << *m
<< " from " << m
->get_source_inst() << dendl
;
6886 bool OSD::ms_dispatch(Message
*m
)
6888 dout(20) << "OSD::ms_dispatch: " << *m
<< dendl
;
6889 if (m
->get_type() == MSG_OSD_MARK_ME_DOWN
) {
6890 service
.got_stop_ack();
6898 if (is_stopping()) {
6912 void OSDService::maybe_share_map(
6914 const OSDMapRef
& osdmap
,
6915 epoch_t peer_epoch_lb
)
6917 // NOTE: we assume caller hold something that keeps the Connection itself
6918 // pinned (e.g., an OpRequest's MessageRef).
6919 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
6924 // assume the peer has the newer of the op's sent_epoch and what
6925 // we think we sent them.
6926 session
->sent_epoch_lock
.lock();
6927 if (peer_epoch_lb
> session
->last_sent_epoch
) {
6928 dout(10) << __func__
<< " con " << con
6929 << " " << con
->get_peer_addr()
6930 << " map epoch " << session
->last_sent_epoch
6931 << " -> " << peer_epoch_lb
<< " (as per caller)" << dendl
;
6932 session
->last_sent_epoch
= peer_epoch_lb
;
6934 epoch_t last_sent_epoch
= session
->last_sent_epoch
;
6935 session
->sent_epoch_lock
.unlock();
6937 if (osdmap
->get_epoch() <= last_sent_epoch
) {
6941 send_incremental_map(last_sent_epoch
, con
, osdmap
);
6942 last_sent_epoch
= osdmap
->get_epoch();
6944 session
->sent_epoch_lock
.lock();
6945 if (session
->last_sent_epoch
< last_sent_epoch
) {
6946 dout(10) << __func__
<< " con " << con
6947 << " " << con
->get_peer_addr()
6948 << " map epoch " << session
->last_sent_epoch
6949 << " -> " << last_sent_epoch
<< " (shared)" << dendl
;
6950 session
->last_sent_epoch
= last_sent_epoch
;
6952 session
->sent_epoch_lock
.unlock();
6955 void OSD::dispatch_session_waiting(const ceph::ref_t
<Session
>& session
, OSDMapRef osdmap
)
6957 ceph_assert(ceph_mutex_is_locked(session
->session_dispatch_lock
));
6959 auto i
= session
->waiting_on_map
.begin();
6960 while (i
!= session
->waiting_on_map
.end()) {
6961 OpRequestRef op
= &(*i
);
6962 ceph_assert(ms_can_fast_dispatch(op
->get_req()));
6963 auto m
= op
->get_req
<MOSDFastDispatchOp
>();
6964 if (m
->get_min_epoch() > osdmap
->get_epoch()) {
6967 session
->waiting_on_map
.erase(i
++);
6971 if (m
->get_type() == CEPH_MSG_OSD_OP
) {
6972 pg_t actual_pgid
= osdmap
->raw_pg_to_pg(
6973 static_cast<const MOSDOp
*>(m
)->get_pg());
6974 if (!osdmap
->get_primary_shard(actual_pgid
, &pgid
)) {
6978 pgid
= m
->get_spg();
6980 enqueue_op(pgid
, std::move(op
), m
->get_map_epoch());
6983 if (session
->waiting_on_map
.empty()) {
6984 clear_session_waiting_on_map(session
);
6986 register_session_waiting_on_map(session
);
6990 void OSD::ms_fast_dispatch(Message
*m
)
6993 if (service
.is_stopping()) {
6999 switch (m
->get_type()) {
7001 dout(10) << "ping from " << m
->get_source() << dendl
;
7004 case MSG_OSD_FORCE_RECOVERY
:
7005 handle_fast_force_recovery(static_cast<MOSDForceRecovery
*>(m
));
7007 case MSG_OSD_SCRUB2
:
7008 handle_fast_scrub(static_cast<MOSDScrub2
*>(m
));
7011 case MSG_OSD_PG_CREATE2
:
7012 return handle_fast_pg_create(static_cast<MOSDPGCreate2
*>(m
));
7013 case MSG_OSD_PG_QUERY
:
7014 return handle_fast_pg_query(static_cast<MOSDPGQuery
*>(m
));
7015 case MSG_OSD_PG_NOTIFY
:
7016 return handle_fast_pg_notify(static_cast<MOSDPGNotify
*>(m
));
7017 case MSG_OSD_PG_INFO
:
7018 return handle_fast_pg_info(static_cast<MOSDPGInfo
*>(m
));
7019 case MSG_OSD_PG_REMOVE
:
7020 return handle_fast_pg_remove(static_cast<MOSDPGRemove
*>(m
));
7022 // these are single-pg messages that handle themselves
7023 case MSG_OSD_PG_LOG
:
7024 case MSG_OSD_PG_TRIM
:
7025 case MSG_OSD_PG_NOTIFY2
:
7026 case MSG_OSD_PG_QUERY2
:
7027 case MSG_OSD_PG_INFO2
:
7028 case MSG_OSD_BACKFILL_RESERVE
:
7029 case MSG_OSD_RECOVERY_RESERVE
:
7030 case MSG_OSD_PG_LEASE
:
7031 case MSG_OSD_PG_LEASE_ACK
:
7033 MOSDPeeringOp
*pm
= static_cast<MOSDPeeringOp
*>(m
);
7034 if (require_osd_peer(pm
)) {
7035 enqueue_peering_evt(
7037 PGPeeringEventRef(pm
->get_event()));
7044 OpRequestRef op
= op_tracker
.create_request
<OpRequest
, Message
*>(m
);
7047 osd_reqid_t reqid
= op
->get_reqid();
7049 tracepoint(osd
, ms_fast_dispatch
, reqid
.name
._type
,
7050 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
7054 op
->osd_trace
.init("osd op", &trace_endpoint
, &m
->trace
);
7056 // note sender epoch, min req's epoch
7057 op
->sent_epoch
= static_cast<MOSDFastDispatchOp
*>(m
)->get_map_epoch();
7058 op
->min_epoch
= static_cast<MOSDFastDispatchOp
*>(m
)->get_min_epoch();
7059 ceph_assert(op
->min_epoch
<= op
->sent_epoch
); // sanity check!
7061 service
.maybe_inject_dispatch_delay();
7063 if (m
->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT
) ||
7064 m
->get_type() != CEPH_MSG_OSD_OP
) {
7065 // queue it directly
7067 static_cast<MOSDFastDispatchOp
*>(m
)->get_spg(),
7069 static_cast<MOSDFastDispatchOp
*>(m
)->get_map_epoch());
7071 // legacy client, and this is an MOSDOp (the *only* fast dispatch
7072 // message that didn't have an explicit spg_t); we need to map
7073 // them to an spg_t while preserving delivery order.
7074 auto priv
= m
->get_connection()->get_priv();
7075 if (auto session
= static_cast<Session
*>(priv
.get()); session
) {
7076 std::lock_guard l
{session
->session_dispatch_lock
};
7078 session
->waiting_on_map
.push_back(*op
);
7079 OSDMapRef nextmap
= service
.get_nextmap_reserved();
7080 dispatch_session_waiting(session
, nextmap
);
7081 service
.release_map(nextmap
);
7084 OID_EVENT_TRACE_WITH_MSG(m
, "MS_FAST_DISPATCH_END", false);
7087 int OSD::ms_handle_authentication(Connection
*con
)
7090 auto s
= ceph::ref_cast
<Session
>(con
->get_priv());
7092 s
= ceph::make_ref
<Session
>(cct
, con
);
7094 s
->entity_name
= con
->get_peer_entity_name();
7095 dout(10) << __func__
<< " new session " << s
<< " con " << s
->con
7096 << " entity " << s
->entity_name
7097 << " addr " << con
->get_peer_addrs() << dendl
;
7099 dout(10) << __func__
<< " existing session " << s
<< " con " << s
->con
7100 << " entity " << s
->entity_name
7101 << " addr " << con
->get_peer_addrs() << dendl
;
7104 AuthCapsInfo
&caps_info
= con
->get_peer_caps_info();
7105 if (caps_info
.allow_all
) {
7106 s
->caps
.set_allow_all();
7107 } else if (caps_info
.caps
.length() > 0) {
7108 bufferlist::const_iterator p
= caps_info
.caps
.cbegin();
7113 catch (buffer::error
& e
) {
7114 dout(10) << __func__
<< " session " << s
<< " " << s
->entity_name
7115 << " failed to decode caps string" << dendl
;
7119 bool success
= s
->caps
.parse(str
);
7121 dout(10) << __func__
<< " session " << s
7122 << " " << s
->entity_name
7123 << " has caps " << s
->caps
<< " '" << str
<< "'" << dendl
;
7126 dout(10) << __func__
<< " session " << s
<< " " << s
->entity_name
7127 << " failed to parse caps '" << str
<< "'" << dendl
;
7135 void OSD::do_waiters()
7137 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7139 dout(10) << "do_waiters -- start" << dendl
;
7140 while (!finished
.empty()) {
7141 OpRequestRef next
= finished
.front();
7142 finished
.pop_front();
7145 dout(10) << "do_waiters -- finish" << dendl
;
7148 void OSD::dispatch_op(OpRequestRef op
)
7150 switch (op
->get_req()->get_type()) {
7152 case MSG_OSD_PG_CREATE
:
7153 handle_pg_create(op
);
7158 void OSD::_dispatch(Message
*m
)
7160 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7161 dout(20) << "_dispatch " << m
<< " " << *m
<< dendl
;
7163 switch (m
->get_type()) {
7164 // -- don't need OSDMap --
7166 // map and replication
7167 case CEPH_MSG_OSD_MAP
:
7168 handle_osd_map(static_cast<MOSDMap
*>(m
));
7170 case MSG_MON_GET_PURGED_SNAPS_REPLY
:
7171 handle_get_purged_snaps_reply(static_cast<MMonGetPurgedSnapsReply
*>(m
));
7176 handle_scrub(static_cast<MOSDScrub
*>(m
));
7180 handle_command(static_cast<MCommand
*>(m
));
7183 // -- need OSDMap --
7185 case MSG_OSD_PG_CREATE
:
7187 OpRequestRef op
= op_tracker
.create_request
<OpRequest
, Message
*>(m
);
7189 op
->osd_trace
.init("osd op", &trace_endpoint
, &m
->trace
);
7190 // no map? starting up?
7191 if (!get_osdmap()) {
7192 dout(7) << "no OSDMap, not booted" << dendl
;
7193 logger
->inc(l_osd_waiting_for_map
);
7194 waiting_for_osdmap
.push_back(op
);
7195 op
->mark_delayed("no osdmap");
7205 // remove me post-nautilus
7206 void OSD::handle_scrub(MOSDScrub
*m
)
7208 dout(10) << "handle_scrub " << *m
<< dendl
;
7209 if (!require_mon_or_mgr_peer(m
)) {
7213 if (m
->fsid
!= monc
->get_fsid()) {
7214 dout(0) << "handle_scrub fsid " << m
->fsid
<< " != " << monc
->get_fsid()
7223 if (!m
->scrub_pgs
.empty()) {
7225 for (auto pgid
: m
->scrub_pgs
) {
7227 if (get_osdmap()->get_primary_shard(pgid
, &pcand
) &&
7228 std::find(spgs
.begin(), spgs
.end(), pcand
) != spgs
.end()) {
7235 for (auto pgid
: spgs
) {
7236 enqueue_peering_evt(
7239 std::make_shared
<PGPeeringEvent
>(
7242 PeeringState::RequestScrub(m
->deep
, m
->repair
))));
7248 void OSD::handle_fast_scrub(MOSDScrub2
*m
)
7250 dout(10) << __func__
<< " " << *m
<< dendl
;
7251 if (!require_mon_or_mgr_peer(m
)) {
7255 if (m
->fsid
!= monc
->get_fsid()) {
7256 dout(0) << __func__
<< " fsid " << m
->fsid
<< " != " << monc
->get_fsid()
7261 for (auto pgid
: m
->scrub_pgs
) {
7262 enqueue_peering_evt(
7265 std::make_shared
<PGPeeringEvent
>(
7268 PeeringState::RequestScrub(m
->deep
, m
->repair
))));
7273 bool OSD::scrub_random_backoff()
7275 bool coin_flip
= (rand() / (double)RAND_MAX
>=
7276 cct
->_conf
->osd_scrub_backoff_ratio
);
7278 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl
;
7284 OSDService::ScrubJob::ScrubJob(CephContext
* cct
,
7285 const spg_t
& pg
, const utime_t
& timestamp
,
7286 double pool_scrub_min_interval
,
7287 double pool_scrub_max_interval
, bool must
)
7290 sched_time(timestamp
),
7293 // if not explicitly requested, postpone the scrub with a random delay
7295 double scrub_min_interval
= pool_scrub_min_interval
> 0 ?
7296 pool_scrub_min_interval
: cct
->_conf
->osd_scrub_min_interval
;
7297 double scrub_max_interval
= pool_scrub_max_interval
> 0 ?
7298 pool_scrub_max_interval
: cct
->_conf
->osd_scrub_max_interval
;
7300 sched_time
+= scrub_min_interval
;
7301 double r
= rand() / (double)RAND_MAX
;
7303 scrub_min_interval
* cct
->_conf
->osd_scrub_interval_randomize_ratio
* r
;
7304 if (scrub_max_interval
== 0) {
7305 deadline
= utime_t();
7307 deadline
+= scrub_max_interval
;
7313 bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob
& rhs
) const {
7314 if (sched_time
< rhs
.sched_time
)
7316 if (sched_time
> rhs
.sched_time
)
7318 return pgid
< rhs
.pgid
;
7321 double OSD::scrub_sleep_time(bool must_scrub
)
7324 return cct
->_conf
->osd_scrub_sleep
;
7326 utime_t now
= ceph_clock_now();
7327 if (scrub_time_permit(now
)) {
7328 return cct
->_conf
->osd_scrub_sleep
;
7330 double normal_sleep
= cct
->_conf
->osd_scrub_sleep
;
7331 double extended_sleep
= cct
->_conf
->osd_scrub_extended_sleep
;
7332 return std::max(extended_sleep
, normal_sleep
);
7335 bool OSD::scrub_time_permit(utime_t now
)
7338 time_t tt
= now
.sec();
7339 localtime_r(&tt
, &bdt
);
7341 bool day_permit
= false;
7342 if (cct
->_conf
->osd_scrub_begin_week_day
< cct
->_conf
->osd_scrub_end_week_day
) {
7343 if (bdt
.tm_wday
>= cct
->_conf
->osd_scrub_begin_week_day
&& bdt
.tm_wday
< cct
->_conf
->osd_scrub_end_week_day
) {
7347 if (bdt
.tm_wday
>= cct
->_conf
->osd_scrub_begin_week_day
|| bdt
.tm_wday
< cct
->_conf
->osd_scrub_end_week_day
) {
7353 dout(20) << __func__
<< " should run between week day " << cct
->_conf
->osd_scrub_begin_week_day
7354 << " - " << cct
->_conf
->osd_scrub_end_week_day
7355 << " now " << bdt
.tm_wday
<< " = no" << dendl
;
7359 bool time_permit
= false;
7360 if (cct
->_conf
->osd_scrub_begin_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7361 if (bdt
.tm_hour
>= cct
->_conf
->osd_scrub_begin_hour
&& bdt
.tm_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7365 if (bdt
.tm_hour
>= cct
->_conf
->osd_scrub_begin_hour
|| bdt
.tm_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7370 dout(20) << __func__
<< " should run between " << cct
->_conf
->osd_scrub_begin_hour
7371 << " - " << cct
->_conf
->osd_scrub_end_hour
7372 << " now " << bdt
.tm_hour
<< " = no" << dendl
;
7374 dout(20) << __func__
<< " should run between " << cct
->_conf
->osd_scrub_begin_hour
7375 << " - " << cct
->_conf
->osd_scrub_end_hour
7376 << " now " << bdt
.tm_hour
<< " = yes" << dendl
;
7381 bool OSD::scrub_load_below_threshold()
7384 if (getloadavg(loadavgs
, 3) != 3) {
7385 dout(10) << __func__
<< " couldn't read loadavgs\n" << dendl
;
7389 // allow scrub if below configured threshold
7390 long cpus
= sysconf(_SC_NPROCESSORS_ONLN
);
7391 double loadavg_per_cpu
= cpus
> 0 ? loadavgs
[0] / cpus
: loadavgs
[0];
7392 if (loadavg_per_cpu
< cct
->_conf
->osd_scrub_load_threshold
) {
7393 dout(20) << __func__
<< " loadavg per cpu " << loadavg_per_cpu
7394 << " < max " << cct
->_conf
->osd_scrub_load_threshold
7395 << " = yes" << dendl
;
7399 // allow scrub if below daily avg and currently decreasing
7400 if (loadavgs
[0] < daily_loadavg
&& loadavgs
[0] < loadavgs
[2]) {
7401 dout(20) << __func__
<< " loadavg " << loadavgs
[0]
7402 << " < daily_loadavg " << daily_loadavg
7403 << " and < 15m avg " << loadavgs
[2]
7404 << " = yes" << dendl
;
7408 dout(20) << __func__
<< " loadavg " << loadavgs
[0]
7409 << " >= max " << cct
->_conf
->osd_scrub_load_threshold
7410 << " and ( >= daily_loadavg " << daily_loadavg
7411 << " or >= 15m avg " << loadavgs
[2]
7412 << ") = no" << dendl
;
7416 void OSD::sched_scrub()
7418 // if not permitted, fail fast
7419 if (!service
.can_inc_scrubs()) {
7422 bool allow_requested_repair_only
= false;
7423 if (service
.is_recovery_active()) {
7424 if (!cct
->_conf
->osd_scrub_during_recovery
&& cct
->_conf
->osd_repair_during_recovery
) {
7425 dout(10) << __func__
7426 << " will only schedule explicitly requested repair due to active recovery"
7428 allow_requested_repair_only
= true;
7429 } else if (!cct
->_conf
->osd_scrub_during_recovery
&& !cct
->_conf
->osd_repair_during_recovery
) {
7430 dout(20) << __func__
<< " not scheduling scrubs due to active recovery" << dendl
;
7435 utime_t now
= ceph_clock_now();
7436 bool time_permit
= scrub_time_permit(now
);
7437 bool load_is_low
= scrub_load_below_threshold();
7438 dout(20) << "sched_scrub load_is_low=" << (int)load_is_low
<< dendl
;
7440 OSDService::ScrubJob scrub
;
7441 if (service
.first_scrub_stamp(&scrub
)) {
7443 dout(30) << "sched_scrub examine " << scrub
.pgid
<< " at " << scrub
.sched_time
<< dendl
;
7445 if (scrub
.sched_time
> now
) {
7446 // save ourselves some effort
7447 dout(10) << "sched_scrub " << scrub
.pgid
<< " scheduled at " << scrub
.sched_time
7448 << " > " << now
<< dendl
;
7452 if ((scrub
.deadline
.is_zero() || scrub
.deadline
>= now
) && !(time_permit
&& load_is_low
)) {
7453 dout(10) << __func__
<< " not scheduling scrub for " << scrub
.pgid
<< " due to "
7454 << (!time_permit
? "time not permit" : "high load") << dendl
;
7458 PGRef pg
= _lookup_lock_pg(scrub
.pgid
);
7461 // This has already started, so go on to the next scrub job
7462 if (pg
->scrubber
.active
) {
7464 dout(30) << __func__
<< ": already in progress pgid " << scrub
.pgid
<< dendl
;
7467 // Skip other kinds of scrubing if only explicitly requested repairing is allowed
7468 if (allow_requested_repair_only
&& !pg
->scrubber
.must_repair
) {
7470 dout(10) << __func__
<< " skip " << scrub
.pgid
7471 << " because repairing is not explicitly requested on it"
7475 // If it is reserving, let it resolve before going to the next scrub job
7476 if (pg
->scrubber
.local_reserved
&& !pg
->scrubber
.active
) {
7478 dout(30) << __func__
<< ": reserve in progress pgid " << scrub
.pgid
<< dendl
;
7481 dout(10) << "sched_scrub scrubbing " << scrub
.pgid
<< " at " << scrub
.sched_time
7482 << (pg
->get_must_scrub() ? ", explicitly requested" :
7483 (load_is_low
? ", load_is_low" : " deadline < now"))
7485 if (pg
->sched_scrub()) {
7490 } while (service
.next_scrub_stamp(scrub
, &scrub
));
7492 dout(20) << "sched_scrub done" << dendl
;
7495 void OSD::resched_all_scrubs()
7497 dout(10) << __func__
<< ": start" << dendl
;
7498 OSDService::ScrubJob scrub
;
7499 if (service
.first_scrub_stamp(&scrub
)) {
7501 dout(20) << __func__
<< ": examine " << scrub
.pgid
<< dendl
;
7503 PGRef pg
= _lookup_lock_pg(scrub
.pgid
);
7506 if (!pg
->scrubber
.must_scrub
&& !pg
->scrubber
.need_auto
) {
7507 dout(20) << __func__
<< ": reschedule " << scrub
.pgid
<< dendl
;
7508 pg
->on_info_history_change();
7511 } while (service
.next_scrub_stamp(scrub
, &scrub
));
7513 dout(10) << __func__
<< ": done" << dendl
;
7516 MPGStats
* OSD::collect_pg_stats()
7518 // This implementation unconditionally sends every is_primary PG's
7519 // stats every time we're called. This has equivalent cost to the
7520 // previous implementation's worst case where all PGs are busy and
7521 // their stats are always enqueued for sending.
7522 std::shared_lock l
{map_lock
};
7524 osd_stat_t cur_stat
= service
.get_osd_stat();
7525 cur_stat
.os_perf_stat
= store
->get_cur_stats();
7527 auto m
= new MPGStats(monc
->get_fsid(), get_osdmap_epoch());
7528 m
->osd_stat
= cur_stat
;
7530 std::lock_guard lec
{min_last_epoch_clean_lock
};
7531 min_last_epoch_clean
= get_osdmap_epoch();
7532 min_last_epoch_clean_pgs
.clear();
7534 std::set
<int64_t> pool_set
;
7537 for (auto& pg
: pgs
) {
7538 auto pool
= pg
->pg_id
.pgid
.pool();
7539 pool_set
.emplace((int64_t)pool
);
7540 if (!pg
->is_primary()) {
7543 pg
->get_pg_stats([&](const pg_stat_t
& s
, epoch_t lec
) {
7544 m
->pg_stat
[pg
->pg_id
.pgid
] = s
;
7545 min_last_epoch_clean
= min(min_last_epoch_clean
, lec
);
7546 min_last_epoch_clean_pgs
.push_back(pg
->pg_id
.pgid
);
7550 bool per_pool_stats
= false;
7551 bool per_pool_omap_stats
= false;
7552 for (auto p
: pool_set
) {
7553 int r
= store
->pool_statfs(p
, &st
, &per_pool_omap_stats
);
7554 if (r
== -ENOTSUP
) {
7558 m
->pool_stat
[p
] = st
;
7559 per_pool_stats
= true;
7563 // indicate whether we are reporting per-pool stats
7564 m
->osd_stat
.num_osds
= 1;
7565 m
->osd_stat
.num_per_pool_osds
= per_pool_stats
? 1 : 0;
7566 m
->osd_stat
.num_per_pool_omap_osds
= per_pool_omap_stats
? 1 : 0;
7571 vector
<DaemonHealthMetric
> OSD::get_health_metrics()
7573 vector
<DaemonHealthMetric
> metrics
;
7575 utime_t oldest_secs
;
7576 const utime_t now
= ceph_clock_now();
7578 too_old
-= cct
->_conf
.get_val
<double>("osd_op_complaint_time");
7580 TrackedOpRef oldest_op
;
7581 auto count_slow_ops
= [&](TrackedOp
& op
) {
7582 if (op
.get_initiated() < too_old
) {
7584 ss
<< "slow request " << op
.get_desc()
7586 << op
.get_initiated()
7588 << op
.state_string();
7589 lgeneric_subdout(cct
,osd
,20) << ss
.str() << dendl
;
7590 clog
->warn() << ss
.str();
7592 if (!oldest_op
|| op
.get_initiated() < oldest_op
->get_initiated()) {
7600 if (op_tracker
.visit_ops_in_flight(&oldest_secs
, count_slow_ops
)) {
7602 derr
<< __func__
<< " reporting " << slow
<< " slow ops, oldest is "
7603 << oldest_op
->get_desc() << dendl
;
7605 metrics
.emplace_back(daemon_metric::SLOW_OPS
, slow
, oldest_secs
);
7607 // no news is not good news.
7608 metrics
.emplace_back(daemon_metric::SLOW_OPS
, 0, 0);
7612 std::lock_guard
l(pending_creates_lock
);
7613 auto n_primaries
= pending_creates_from_mon
;
7614 for (const auto& create
: pending_creates_from_osd
) {
7615 if (create
.second
) {
7619 metrics
.emplace_back(daemon_metric::PENDING_CREATING_PGS
, n_primaries
);
7624 // =====================================================
7627 void OSD::wait_for_new_map(OpRequestRef op
)
7630 if (waiting_for_osdmap
.empty()) {
7631 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7634 logger
->inc(l_osd_waiting_for_map
);
7635 waiting_for_osdmap
.push_back(op
);
7636 op
->mark_delayed("wait for new map");
7641 * assimilate new OSDMap(s). scan pgs, etc.
7644 void OSD::note_down_osd(int peer
)
7646 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7647 cluster_messenger
->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer
));
7649 std::lock_guard l
{heartbeat_lock
};
7650 failure_queue
.erase(peer
);
7651 failure_pending
.erase(peer
);
7652 map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.find(peer
);
7653 if (p
!= heartbeat_peers
.end()) {
7654 p
->second
.clear_mark_down();
7655 heartbeat_peers
.erase(p
);
7659 void OSD::note_up_osd(int peer
)
7661 heartbeat_set_peers_need_update();
7664 struct C_OnMapCommit
: public Context
{
7666 epoch_t first
, last
;
7668 C_OnMapCommit(OSD
*o
, epoch_t f
, epoch_t l
, MOSDMap
*m
)
7669 : osd(o
), first(f
), last(l
), msg(m
) {}
7670 void finish(int r
) override
{
7671 osd
->_committed_osd_maps(first
, last
, msg
);
7676 void OSD::osdmap_subscribe(version_t epoch
, bool force_request
)
7678 std::lock_guard
l(osdmap_subscribe_lock
);
7679 if (latest_subscribed_epoch
>= epoch
&& !force_request
)
7682 latest_subscribed_epoch
= std::max
<uint64_t>(epoch
, latest_subscribed_epoch
);
7684 if (monc
->sub_want_increment("osdmap", epoch
, CEPH_SUBSCRIBE_ONETIME
) ||
7690 void OSD::trim_maps(epoch_t oldest
, int nreceived
, bool skip_maps
)
7692 epoch_t min
= std::min(oldest
, service
.map_cache
.cached_key_lower_bound());
7693 if (min
<= superblock
.oldest_map
)
7697 ObjectStore::Transaction t
;
7698 for (epoch_t e
= superblock
.oldest_map
; e
< min
; ++e
) {
7699 dout(20) << " removing old osdmap epoch " << e
<< dendl
;
7700 t
.remove(coll_t::meta(), get_osdmap_pobject_name(e
));
7701 t
.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e
));
7702 superblock
.oldest_map
= e
+ 1;
7704 if (num
>= cct
->_conf
->osd_target_transaction_size
&& num
>= nreceived
) {
7705 service
.publish_superblock(superblock
);
7706 write_superblock(t
);
7707 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
7708 ceph_assert(tr
== 0);
7711 // skip_maps leaves us with a range of old maps if we fail to remove all
7712 // of them before moving superblock.oldest_map forward to the first map
7713 // in the incoming MOSDMap msg. so we should continue removing them in
7714 // this case, even we could do huge series of delete transactions all at
7721 service
.publish_superblock(superblock
);
7722 write_superblock(t
);
7723 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
7724 ceph_assert(tr
== 0);
7726 // we should not remove the cached maps
7727 ceph_assert(min
<= service
.map_cache
.cached_key_lower_bound());
7730 void OSD::handle_osd_map(MOSDMap
*m
)
7732 // wait for pgs to catch up
7734 // we extend the map cache pins to accomodate pgs slow to consume maps
7735 // for some period, until we hit the max_lag_factor bound, at which point
7736 // we block here to stop injesting more maps than they are able to keep
7738 epoch_t max_lag
= cct
->_conf
->osd_map_cache_size
*
7739 m_osd_pg_epoch_max_lag_factor
;
7740 ceph_assert(max_lag
> 0);
7741 epoch_t osd_min
= 0;
7742 for (auto shard
: shards
) {
7743 epoch_t min
= shard
->get_min_pg_epoch();
7744 if (osd_min
== 0 || min
< osd_min
) {
7748 epoch_t osdmap_epoch
= get_osdmap_epoch();
7750 osdmap_epoch
> max_lag
&&
7751 osdmap_epoch
- max_lag
> osd_min
) {
7752 epoch_t need
= osdmap_epoch
- max_lag
;
7753 dout(10) << __func__
<< " waiting for pgs to catch up (need " << need
7754 << " max_lag " << max_lag
<< ")" << dendl
;
7755 for (auto shard
: shards
) {
7756 epoch_t min
= shard
->get_min_pg_epoch();
7758 dout(10) << __func__
<< " waiting for pgs to consume " << need
7759 << " (shard " << shard
->shard_id
<< " min " << min
7760 << ", map cache is " << cct
->_conf
->osd_map_cache_size
7761 << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
7763 unlock_guard unlock
{osd_lock
};
7764 shard
->wait_min_pg_epoch(need
);
7770 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7771 map
<epoch_t
,OSDMapRef
> added_maps
;
7772 map
<epoch_t
,bufferlist
> added_maps_bl
;
7773 if (m
->fsid
!= monc
->get_fsid()) {
7774 dout(0) << "handle_osd_map fsid " << m
->fsid
<< " != "
7775 << monc
->get_fsid() << dendl
;
7779 if (is_initializing()) {
7780 dout(0) << "ignoring osdmap until we have initialized" << dendl
;
7785 auto session
= ceph::ref_cast
<Session
>(m
->get_connection()->get_priv());
7786 if (session
&& !(session
->entity_name
.is_mon() ||
7787 session
->entity_name
.is_osd())) {
7789 dout(10) << "got osd map from Session " << session
7790 << " which we can't take maps from (not a mon or osd)" << dendl
;
7795 // share with the objecter
7797 service
.objecter
->handle_osd_map(m
);
7799 epoch_t first
= m
->get_first();
7800 epoch_t last
= m
->get_last();
7801 dout(3) << "handle_osd_map epochs [" << first
<< "," << last
<< "], i have "
7802 << superblock
.newest_map
7803 << ", src has [" << m
->oldest_map
<< "," << m
->newest_map
<< "]"
7806 logger
->inc(l_osd_map
);
7807 logger
->inc(l_osd_mape
, last
- first
+ 1);
7808 if (first
<= superblock
.newest_map
)
7809 logger
->inc(l_osd_mape_dup
, superblock
.newest_map
- first
+ 1);
7810 if (service
.max_oldest_map
< m
->oldest_map
) {
7811 service
.max_oldest_map
= m
->oldest_map
;
7812 ceph_assert(service
.max_oldest_map
>= superblock
.oldest_map
);
7815 // make sure there is something new, here, before we bother flushing
7816 // the queues and such
7817 if (last
<= superblock
.newest_map
) {
7818 dout(10) << " no new maps here, dropping" << dendl
;
7824 bool skip_maps
= false;
7825 if (first
> superblock
.newest_map
+ 1) {
7826 dout(10) << "handle_osd_map message skips epochs "
7827 << superblock
.newest_map
+ 1 << ".." << (first
-1) << dendl
;
7828 if (m
->oldest_map
<= superblock
.newest_map
+ 1) {
7829 osdmap_subscribe(superblock
.newest_map
+ 1, false);
7833 // always try to get the full range of maps--as many as we can. this
7834 // 1- is good to have
7835 // 2- is at present the only way to ensure that we get a *full* map as
7837 if (m
->oldest_map
< first
) {
7838 osdmap_subscribe(m
->oldest_map
- 1, true);
7845 ObjectStore::Transaction t
;
7846 uint64_t txn_size
= 0;
7848 map
<epoch_t
,mempool::osdmap::map
<int64_t,snap_interval_set_t
>> purged_snaps
;
7850 // store new maps: queue for disk and put in the osdmap cache
7851 epoch_t start
= std::max(superblock
.newest_map
+ 1, first
);
7852 for (epoch_t e
= start
; e
<= last
; e
++) {
7853 if (txn_size
>= t
.get_num_bytes()) {
7854 derr
<< __func__
<< " transaction size overflowed" << dendl
;
7855 ceph_assert(txn_size
< t
.get_num_bytes());
7857 txn_size
= t
.get_num_bytes();
7858 map
<epoch_t
,bufferlist
>::iterator p
;
7859 p
= m
->maps
.find(e
);
7860 if (p
!= m
->maps
.end()) {
7861 dout(10) << "handle_osd_map got full map for epoch " << e
<< dendl
;
7862 OSDMap
*o
= new OSDMap
;
7863 bufferlist
& bl
= p
->second
;
7867 purged_snaps
[e
] = o
->get_new_purged_snaps();
7869 ghobject_t fulloid
= get_osdmap_pobject_name(e
);
7870 t
.write(coll_t::meta(), fulloid
, 0, bl
.length(), bl
);
7871 added_maps
[e
] = add_map(o
);
7872 added_maps_bl
[e
] = bl
;
7877 p
= m
->incremental_maps
.find(e
);
7878 if (p
!= m
->incremental_maps
.end()) {
7879 dout(10) << "handle_osd_map got inc map for epoch " << e
<< dendl
;
7880 bufferlist
& bl
= p
->second
;
7881 ghobject_t oid
= get_inc_osdmap_pobject_name(e
);
7882 t
.write(coll_t::meta(), oid
, 0, bl
.length(), bl
);
7884 OSDMap
*o
= new OSDMap
;
7887 bool got
= get_map_bl(e
- 1, obl
);
7889 auto p
= added_maps_bl
.find(e
- 1);
7890 ceph_assert(p
!= added_maps_bl
.end());
7896 OSDMap::Incremental inc
;
7897 auto p
= bl
.cbegin();
7900 if (o
->apply_incremental(inc
) < 0) {
7901 derr
<< "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc
.fsid
<< dendl
;
7902 ceph_abort_msg("bad fsid");
7906 o
->encode(fbl
, inc
.encode_features
| CEPH_FEATURE_RESERVED
);
7908 bool injected_failure
= false;
7909 if (cct
->_conf
->osd_inject_bad_map_crc_probability
> 0 &&
7910 (rand() % 10000) < cct
->_conf
->osd_inject_bad_map_crc_probability
*10000.0) {
7911 derr
<< __func__
<< " injecting map crc failure" << dendl
;
7912 injected_failure
= true;
7915 if ((inc
.have_crc
&& o
->get_crc() != inc
.full_crc
) || injected_failure
) {
7916 dout(2) << "got incremental " << e
7917 << " but failed to encode full with correct crc; requesting"
7919 clog
->warn() << "failed to encode map e" << e
<< " with expected crc";
7920 dout(20) << "my encoded map was:\n";
7921 fbl
.hexdump(*_dout
);
7924 request_full_map(e
, last
);
7929 purged_snaps
[e
] = o
->get_new_purged_snaps();
7931 ghobject_t fulloid
= get_osdmap_pobject_name(e
);
7932 t
.write(coll_t::meta(), fulloid
, 0, fbl
.length(), fbl
);
7933 added_maps
[e
] = add_map(o
);
7934 added_maps_bl
[e
] = fbl
;
7938 ceph_abort_msg("MOSDMap lied about what maps it had?");
7941 // even if this map isn't from a mon, we may have satisfied our subscription
7942 monc
->sub_got("osdmap", last
);
7944 if (!m
->maps
.empty() && requested_full_first
) {
7945 dout(10) << __func__
<< " still missing full maps " << requested_full_first
7946 << ".." << requested_full_last
<< dendl
;
7947 rerequest_full_maps();
7950 if (superblock
.oldest_map
) {
7951 // make sure we at least keep pace with incoming maps
7952 trim_maps(m
->oldest_map
, last
- first
+ 1, skip_maps
);
7953 pg_num_history
.prune(superblock
.oldest_map
);
7956 if (!superblock
.oldest_map
|| skip_maps
)
7957 superblock
.oldest_map
= first
;
7958 superblock
.newest_map
= last
;
7959 superblock
.current_epoch
= last
;
7961 // note in the superblock that we were clean thru the prior epoch
7962 epoch_t boot_epoch
= service
.get_boot_epoch();
7963 if (boot_epoch
&& boot_epoch
>= superblock
.mounted
) {
7964 superblock
.mounted
= boot_epoch
;
7965 superblock
.clean_thru
= last
;
7968 // check for pg_num changes and deleted pools
7970 for (auto& i
: added_maps
) {
7972 if (!(lastmap
= service
.try_get_map(i
.first
- 1))) {
7973 dout(10) << __func__
<< " can't get previous map " << i
.first
- 1
7974 << " probably first start of this osd" << dendl
;
7978 ceph_assert(lastmap
->get_epoch() + 1 == i
.second
->get_epoch());
7979 for (auto& j
: lastmap
->get_pools()) {
7980 if (!i
.second
->have_pg_pool(j
.first
)) {
7981 pg_num_history
.log_pool_delete(i
.first
, j
.first
);
7982 dout(10) << __func__
<< " recording final pg_pool_t for pool "
7983 << j
.first
<< dendl
;
7984 // this information is needed by _make_pg() if have to restart before
7985 // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
7986 ghobject_t obj
= make_final_pool_info_oid(j
.first
);
7988 encode(j
.second
, bl
, CEPH_FEATURES_ALL
);
7989 string name
= lastmap
->get_pool_name(j
.first
);
7991 map
<string
,string
> profile
;
7992 if (lastmap
->get_pg_pool(j
.first
)->is_erasure()) {
7993 profile
= lastmap
->get_erasure_code_profile(
7994 lastmap
->get_pg_pool(j
.first
)->erasure_code_profile
);
7996 encode(profile
, bl
);
7997 t
.write(coll_t::meta(), obj
, 0, bl
.length(), bl
);
7998 } else if (unsigned new_pg_num
= i
.second
->get_pg_num(j
.first
);
7999 new_pg_num
!= j
.second
.get_pg_num()) {
8000 dout(10) << __func__
<< " recording pool " << j
.first
<< " pg_num "
8001 << j
.second
.get_pg_num() << " -> " << new_pg_num
<< dendl
;
8002 pg_num_history
.log_pg_num_change(i
.first
, j
.first
, new_pg_num
);
8005 for (auto& j
: i
.second
->get_pools()) {
8006 if (!lastmap
->have_pg_pool(j
.first
)) {
8007 dout(10) << __func__
<< " recording new pool " << j
.first
<< " pg_num "
8008 << j
.second
.get_pg_num() << dendl
;
8009 pg_num_history
.log_pg_num_change(i
.first
, j
.first
,
8010 j
.second
.get_pg_num());
8015 pg_num_history
.epoch
= last
;
8018 ::encode(pg_num_history
, bl
);
8019 t
.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl
.length(), bl
);
8020 dout(20) << __func__
<< " pg_num_history " << pg_num_history
<< dendl
;
8023 // record new purged_snaps
8024 if (superblock
.purged_snaps_last
== start
- 1) {
8025 SnapMapper::record_purged_snaps(cct
, store
, service
.meta_ch
,
8026 make_purged_snaps_oid(), &t
,
8028 superblock
.purged_snaps_last
= last
;
8030 dout(10) << __func__
<< " superblock purged_snaps_last is "
8031 << superblock
.purged_snaps_last
8032 << ", not recording new purged_snaps" << dendl
;
8035 // superblock and commit
8036 write_superblock(t
);
8037 t
.register_on_commit(new C_OnMapCommit(this, start
, last
, m
));
8038 store
->queue_transaction(
8041 service
.publish_superblock(superblock
);
8044 void OSD::_committed_osd_maps(epoch_t first
, epoch_t last
, MOSDMap
*m
)
8046 dout(10) << __func__
<< " " << first
<< ".." << last
<< dendl
;
8047 if (is_stopping()) {
8048 dout(10) << __func__
<< " bailing, we are shutting down" << dendl
;
8051 std::lock_guard
l(osd_lock
);
8052 if (is_stopping()) {
8053 dout(10) << __func__
<< " bailing, we are shutting down" << dendl
;
8058 bool do_shutdown
= false;
8059 bool do_restart
= false;
8060 bool network_error
= false;
8063 // advance through the new maps
8064 for (epoch_t cur
= first
; cur
<= last
; cur
++) {
8065 dout(10) << " advance to epoch " << cur
8066 << " (<= last " << last
8067 << " <= newest_map " << superblock
.newest_map
8070 OSDMapRef newmap
= get_map(cur
);
8071 ceph_assert(newmap
); // we just cached it above!
8073 // start blacklisting messages sent to peers that go down.
8074 service
.pre_publish_map(newmap
);
8076 // kill connections to newly down osds
8077 bool waited_for_reservations
= false;
8079 osdmap
= get_osdmap();
8080 osdmap
->get_all_osds(old
);
8081 for (set
<int>::iterator p
= old
.begin(); p
!= old
.end(); ++p
) {
8083 osdmap
->is_up(*p
) && // in old map
8084 newmap
->is_down(*p
)) { // but not the new one
8085 if (!waited_for_reservations
) {
8086 service
.await_reserved_maps();
8087 waited_for_reservations
= true;
8090 } else if (*p
!= whoami
&&
8091 osdmap
->is_down(*p
) &&
8092 newmap
->is_up(*p
)) {
8097 if (osdmap
->is_noup(whoami
) != newmap
->is_noup(whoami
)) {
8098 dout(10) << __func__
<< " NOUP flag changed in " << newmap
->get_epoch()
8101 // this captures the case where we sent the boot message while
8102 // NOUP was being set on the mon and our boot request was
8103 // dropped, and then later it is cleared. it imperfectly
8104 // handles the case where our original boot message was not
8105 // dropped and we restart even though we might have booted, but
8106 // that is harmless (boot will just take slightly longer).
8111 osdmap
= std::move(newmap
);
8115 service
.retrieve_epochs(&boot_epoch
, &up_epoch
, NULL
);
8117 osdmap
->is_up(whoami
) &&
8118 osdmap
->get_addrs(whoami
) == client_messenger
->get_myaddrs()) {
8119 up_epoch
= osdmap
->get_epoch();
8120 dout(10) << "up_epoch is " << up_epoch
<< dendl
;
8122 boot_epoch
= osdmap
->get_epoch();
8123 dout(10) << "boot_epoch is " << boot_epoch
<< dendl
;
8125 service
.set_epochs(&boot_epoch
, &up_epoch
, NULL
);
8129 epoch_t _bind_epoch
= service
.get_bind_epoch();
8130 if (osdmap
->is_up(whoami
) &&
8131 osdmap
->get_addrs(whoami
).legacy_equals(
8132 client_messenger
->get_myaddrs()) &&
8133 _bind_epoch
< osdmap
->get_up_from(whoami
)) {
8136 dout(1) << "state: booting -> active" << dendl
;
8137 set_state(STATE_ACTIVE
);
8140 // set incarnation so that osd_reqid_t's we generate for our
8141 // objecter requests are unique across restarts.
8142 service
.objecter
->set_client_incarnation(osdmap
->get_epoch());
8143 cancel_pending_failures();
8147 if (osdmap
->get_epoch() > 0 &&
8149 if (!osdmap
->exists(whoami
)) {
8150 derr
<< "map says i do not exist. shutting down." << dendl
;
8151 do_shutdown
= true; // don't call shutdown() while we have
8152 // everything paused
8153 } else if (osdmap
->is_stop(whoami
)) {
8154 derr
<< "map says i am stopped by admin. shutting down." << dendl
;
8156 } else if (!osdmap
->is_up(whoami
) ||
8157 !osdmap
->get_addrs(whoami
).legacy_equals(
8158 client_messenger
->get_myaddrs()) ||
8159 !osdmap
->get_cluster_addrs(whoami
).legacy_equals(
8160 cluster_messenger
->get_myaddrs()) ||
8161 !osdmap
->get_hb_back_addrs(whoami
).legacy_equals(
8162 hb_back_server_messenger
->get_myaddrs()) ||
8163 !osdmap
->get_hb_front_addrs(whoami
).legacy_equals(
8164 hb_front_server_messenger
->get_myaddrs())) {
8165 if (!osdmap
->is_up(whoami
)) {
8166 if (service
.is_preparing_to_stop() || service
.is_stopping()) {
8167 service
.got_stop_ack();
8169 clog
->warn() << "Monitor daemon marked osd." << whoami
<< " down, "
8170 "but it is still running";
8171 clog
->debug() << "map e" << osdmap
->get_epoch()
8172 << " wrongly marked me down at e"
8173 << osdmap
->get_down_at(whoami
);
8175 if (monc
->monmap
.min_mon_release
>= ceph_release_t::octopus
) {
8176 // note that this is best-effort...
8177 monc
->send_mon_message(
8181 osdmap
->get_epoch()));
8183 } else if (!osdmap
->get_addrs(whoami
).legacy_equals(
8184 client_messenger
->get_myaddrs())) {
8185 clog
->error() << "map e" << osdmap
->get_epoch()
8186 << " had wrong client addr (" << osdmap
->get_addrs(whoami
)
8187 << " != my " << client_messenger
->get_myaddrs() << ")";
8188 } else if (!osdmap
->get_cluster_addrs(whoami
).legacy_equals(
8189 cluster_messenger
->get_myaddrs())) {
8190 clog
->error() << "map e" << osdmap
->get_epoch()
8191 << " had wrong cluster addr ("
8192 << osdmap
->get_cluster_addrs(whoami
)
8193 << " != my " << cluster_messenger
->get_myaddrs() << ")";
8194 } else if (!osdmap
->get_hb_back_addrs(whoami
).legacy_equals(
8195 hb_back_server_messenger
->get_myaddrs())) {
8196 clog
->error() << "map e" << osdmap
->get_epoch()
8197 << " had wrong heartbeat back addr ("
8198 << osdmap
->get_hb_back_addrs(whoami
)
8199 << " != my " << hb_back_server_messenger
->get_myaddrs()
8201 } else if (!osdmap
->get_hb_front_addrs(whoami
).legacy_equals(
8202 hb_front_server_messenger
->get_myaddrs())) {
8203 clog
->error() << "map e" << osdmap
->get_epoch()
8204 << " had wrong heartbeat front addr ("
8205 << osdmap
->get_hb_front_addrs(whoami
)
8206 << " != my " << hb_front_server_messenger
->get_myaddrs()
8210 if (!service
.is_stopping()) {
8211 epoch_t up_epoch
= 0;
8212 epoch_t bind_epoch
= osdmap
->get_epoch();
8213 service
.set_epochs(NULL
,&up_epoch
, &bind_epoch
);
8217 utime_t now
= ceph_clock_now();
8218 utime_t grace
= utime_t(cct
->_conf
->osd_max_markdown_period
, 0);
8219 osd_markdown_log
.push_back(now
);
8220 if ((int)osd_markdown_log
.size() > cct
->_conf
->osd_max_markdown_count
) {
8221 derr
<< __func__
<< " marked down "
8222 << osd_markdown_log
.size()
8223 << " > osd_max_markdown_count "
8224 << cct
->_conf
->osd_max_markdown_count
8225 << " in last " << grace
<< " seconds, shutting down"
8231 start_waiting_for_healthy();
8233 set
<int> avoid_ports
;
8234 #if defined(__FreeBSD__)
8235 // prevent FreeBSD from grabbing the client_messenger port during
8236 // rebinding. In which case a cluster_meesneger will connect also
8238 client_messenger
->get_myaddrs().get_ports(&avoid_ports
);
8240 cluster_messenger
->get_myaddrs().get_ports(&avoid_ports
);
8242 int r
= cluster_messenger
->rebind(avoid_ports
);
8244 do_shutdown
= true; // FIXME: do_restart?
8245 network_error
= true;
8246 derr
<< __func__
<< " marked down:"
8247 << " rebind cluster_messenger failed" << dendl
;
8250 hb_back_server_messenger
->mark_down_all();
8251 hb_front_server_messenger
->mark_down_all();
8252 hb_front_client_messenger
->mark_down_all();
8253 hb_back_client_messenger
->mark_down_all();
8255 reset_heartbeat_peers(true);
8262 check_osdmap_features();
8267 if (is_active() || is_waiting_for_healthy())
8268 maybe_update_heartbeat_peers();
8275 if (network_error
) {
8276 cancel_pending_failures();
8278 // trigger shutdown in a different thread
8279 dout(0) << __func__
<< " shutdown OSD via async signal" << dendl
;
8280 queue_async_signal(SIGINT
);
8282 else if (m
->newest_map
&& m
->newest_map
> last
) {
8283 dout(10) << " msg say newest map is " << m
->newest_map
8284 << ", requesting more" << dendl
;
8285 osdmap_subscribe(osdmap
->get_epoch()+1, false);
8287 else if (is_preboot()) {
8288 if (m
->get_source().is_mon())
8289 _preboot(m
->oldest_map
, m
->newest_map
);
8293 else if (do_restart
)
8298 void OSD::check_osdmap_features()
8300 // adjust required feature bits?
8302 // we have to be a bit careful here, because we are accessing the
8303 // Policy structures without taking any lock. in particular, only
8304 // modify integer values that can safely be read by a racing CPU.
8305 // since we are only accessing existing Policy structures a their
8306 // current memory location, and setting or clearing bits in integer
8307 // fields, and we are the only writer, this is not a problem.
8309 const auto osdmap
= get_osdmap();
8311 Messenger::Policy p
= client_messenger
->get_default_policy();
8313 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_CLIENT
, &mask
);
8314 if ((p
.features_required
& mask
) != features
) {
8315 dout(0) << "crush map has features " << features
8316 << ", adjusting msgr requires for clients" << dendl
;
8317 p
.features_required
= (p
.features_required
& ~mask
) | features
;
8318 client_messenger
->set_default_policy(p
);
8322 Messenger::Policy p
= client_messenger
->get_policy(entity_name_t::TYPE_MON
);
8324 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_MON
, &mask
);
8325 if ((p
.features_required
& mask
) != features
) {
8326 dout(0) << "crush map has features " << features
8327 << " was " << p
.features_required
8328 << ", adjusting msgr requires for mons" << dendl
;
8329 p
.features_required
= (p
.features_required
& ~mask
) | features
;
8330 client_messenger
->set_policy(entity_name_t::TYPE_MON
, p
);
8334 Messenger::Policy p
= cluster_messenger
->get_policy(entity_name_t::TYPE_OSD
);
8336 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_OSD
, &mask
);
8338 if ((p
.features_required
& mask
) != features
) {
8339 dout(0) << "crush map has features " << features
8340 << ", adjusting msgr requires for osds" << dendl
;
8341 p
.features_required
= (p
.features_required
& ~mask
) | features
;
8342 cluster_messenger
->set_policy(entity_name_t::TYPE_OSD
, p
);
8345 if (!superblock
.compat_features
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
)) {
8346 dout(0) << __func__
<< " enabling on-disk ERASURE CODES compat feature" << dendl
;
8347 superblock
.compat_features
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
);
8348 ObjectStore::Transaction t
;
8349 write_superblock(t
);
8350 int err
= store
->queue_transaction(service
.meta_ch
, std::move(t
), NULL
);
8351 ceph_assert(err
== 0);
8355 if (osdmap
->require_osd_release
< ceph_release_t::nautilus
) {
8356 hb_front_server_messenger
->set_require_authorizer(false);
8357 hb_back_server_messenger
->set_require_authorizer(false);
8359 hb_front_server_messenger
->set_require_authorizer(true);
8360 hb_back_server_messenger
->set_require_authorizer(true);
8363 if (osdmap
->require_osd_release
!= last_require_osd_release
) {
8364 dout(1) << __func__
<< " require_osd_release " << last_require_osd_release
8365 << " -> " << to_string(osdmap
->require_osd_release
) << dendl
;
8366 store
->write_meta("require_osd_release",
8367 stringify((int)osdmap
->require_osd_release
));
8368 last_require_osd_release
= osdmap
->require_osd_release
;
8372 struct C_FinishSplits
: public Context
{
8375 C_FinishSplits(OSD
*osd
, const set
<PGRef
> &in
)
8376 : osd(osd
), pgs(in
) {}
8377 void finish(int r
) override
{
8378 osd
->_finish_splits(pgs
);
8382 void OSD::_finish_splits(set
<PGRef
>& pgs
)
8384 dout(10) << __func__
<< " " << pgs
<< dendl
;
8387 for (set
<PGRef
>::iterator i
= pgs
.begin();
8392 PeeringCtx rctx
= create_context();
8394 dout(10) << __func__
<< " " << *pg
<< dendl
;
8395 epoch_t e
= pg
->get_osdmap_epoch();
8396 pg
->handle_initialize(rctx
);
8397 pg
->queue_null(e
, e
);
8398 dispatch_context(rctx
, pg
, service
.get_osdmap());
8401 unsigned shard_index
= pg
->pg_id
.hash_to_shard(num_shards
);
8402 shards
[shard_index
]->register_and_wake_split_child(pg
);
8406 bool OSD::add_merge_waiter(OSDMapRef nextmap
, spg_t target
, PGRef src
,
8409 std::lock_guard
l(merge_lock
);
8410 auto& p
= merge_waiters
[nextmap
->get_epoch()][target
];
8411 p
[src
->pg_id
] = src
;
8412 dout(10) << __func__
<< " added merge_waiter " << src
->pg_id
8413 << " for " << target
<< ", have " << p
.size() << "/" << need
8415 return p
.size() == need
;
8418 bool OSD::advance_pg(
8421 ThreadPool::TPHandle
&handle
,
8424 if (osd_epoch
<= pg
->get_osdmap_epoch()) {
8427 ceph_assert(pg
->is_locked());
8428 OSDMapRef lastmap
= pg
->get_osdmap();
8429 ceph_assert(lastmap
->get_epoch() < osd_epoch
);
8430 set
<PGRef
> new_pgs
; // any split children
8433 unsigned old_pg_num
= lastmap
->have_pg_pool(pg
->pg_id
.pool()) ?
8434 lastmap
->get_pg_num(pg
->pg_id
.pool()) : 0;
8435 for (epoch_t next_epoch
= pg
->get_osdmap_epoch() + 1;
8436 next_epoch
<= osd_epoch
;
8438 OSDMapRef nextmap
= service
.try_get_map(next_epoch
);
8440 dout(20) << __func__
<< " missing map " << next_epoch
<< dendl
;
8444 unsigned new_pg_num
=
8445 (old_pg_num
&& nextmap
->have_pg_pool(pg
->pg_id
.pool())) ?
8446 nextmap
->get_pg_num(pg
->pg_id
.pool()) : 0;
8447 if (old_pg_num
&& new_pg_num
&& old_pg_num
!= new_pg_num
) {
8449 if (nextmap
->have_pg_pool(pg
->pg_id
.pool())) {
8451 if (pg
->pg_id
.is_merge_source(
8455 // we are merge source
8456 PGRef spg
= pg
; // carry a ref
8457 dout(1) << __func__
<< " " << pg
->pg_id
8458 << " is merge source, target is " << parent
8460 pg
->write_if_dirty(rctx
);
8461 if (!new_pgs
.empty()) {
8462 rctx
.transaction
.register_on_applied(new C_FinishSplits(this,
8466 dispatch_context(rctx
, pg
, pg
->get_osdmap(), &handle
);
8468 // release backoffs explicitly, since the on_shutdown path
8469 // aggressively tears down backoff state.
8470 if (pg
->is_primary()) {
8471 pg
->release_pg_backoffs();
8474 OSDShard
*sdata
= pg
->osd_shard
;
8476 std::lock_guard
l(sdata
->shard_lock
);
8478 sdata
->_detach_pg(pg
->pg_slot
);
8479 // update pg count now since we might not get an osdmap
8481 if (pg
->is_primary())
8482 logger
->dec(l_osd_pg_primary
);
8483 else if (pg
->is_nonprimary())
8484 logger
->dec(l_osd_pg_replica
); // misnomer
8486 logger
->dec(l_osd_pg_stray
);
8491 set
<spg_t
> children
;
8492 parent
.is_split(new_pg_num
, old_pg_num
, &children
);
8493 if (add_merge_waiter(nextmap
, parent
, pg
, children
.size())) {
8494 enqueue_peering_evt(
8497 std::make_shared
<PGPeeringEvent
>(
8498 nextmap
->get_epoch(),
8499 nextmap
->get_epoch(),
8504 } else if (pg
->pg_id
.is_merge_target(old_pg_num
, new_pg_num
)) {
8505 // we are merge target
8506 set
<spg_t
> children
;
8507 pg
->pg_id
.is_split(new_pg_num
, old_pg_num
, &children
);
8508 dout(20) << __func__
<< " " << pg
->pg_id
8509 << " is merge target, sources are " << children
8511 map
<spg_t
,PGRef
> sources
;
8513 std::lock_guard
l(merge_lock
);
8514 auto& s
= merge_waiters
[nextmap
->get_epoch()][pg
->pg_id
];
8515 unsigned need
= children
.size();
8516 dout(20) << __func__
<< " have " << s
.size() << "/"
8518 if (s
.size() == need
) {
8520 merge_waiters
[nextmap
->get_epoch()].erase(pg
->pg_id
);
8521 if (merge_waiters
[nextmap
->get_epoch()].empty()) {
8522 merge_waiters
.erase(nextmap
->get_epoch());
8526 if (!sources
.empty()) {
8527 unsigned new_pg_num
= nextmap
->get_pg_num(pg
->pg_id
.pool());
8528 unsigned split_bits
= pg
->pg_id
.get_split_bits(new_pg_num
);
8529 dout(1) << __func__
<< " merging " << pg
->pg_id
<< dendl
;
8531 sources
, rctx
, split_bits
,
8532 nextmap
->get_pg_pool(
8533 pg
->pg_id
.pool())->last_pg_merge_meta
);
8534 pg
->pg_slot
->waiting_for_merge_epoch
= 0;
8536 dout(20) << __func__
<< " not ready to merge yet" << dendl
;
8537 pg
->write_if_dirty(rctx
);
8538 if (!new_pgs
.empty()) {
8539 rctx
.transaction
.register_on_applied(new C_FinishSplits(this,
8543 dispatch_context(rctx
, pg
, pg
->get_osdmap(), &handle
);
8545 // kick source(s) to get them ready
8546 for (auto& i
: children
) {
8547 dout(20) << __func__
<< " kicking source " << i
<< dendl
;
8548 enqueue_peering_evt(
8551 std::make_shared
<PGPeeringEvent
>(
8552 nextmap
->get_epoch(),
8553 nextmap
->get_epoch(),
8563 vector
<int> newup
, newacting
;
8564 int up_primary
, acting_primary
;
8565 nextmap
->pg_to_up_acting_osds(
8567 &newup
, &up_primary
,
8568 &newacting
, &acting_primary
);
8569 pg
->handle_advance_map(
8570 nextmap
, lastmap
, newup
, up_primary
,
8571 newacting
, acting_primary
, rctx
);
8573 auto oldpool
= lastmap
->get_pools().find(pg
->pg_id
.pool());
8574 auto newpool
= nextmap
->get_pools().find(pg
->pg_id
.pool());
8575 if (oldpool
!= lastmap
->get_pools().end()
8576 && newpool
!= nextmap
->get_pools().end()) {
8577 dout(20) << __func__
8578 << " new pool opts " << newpool
->second
.opts
8579 << " old pool opts " << oldpool
->second
.opts
8582 double old_min_interval
= 0, new_min_interval
= 0;
8583 oldpool
->second
.opts
.get(pool_opts_t::SCRUB_MIN_INTERVAL
, &old_min_interval
);
8584 newpool
->second
.opts
.get(pool_opts_t::SCRUB_MIN_INTERVAL
, &new_min_interval
);
8586 double old_max_interval
= 0, new_max_interval
= 0;
8587 oldpool
->second
.opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &old_max_interval
);
8588 newpool
->second
.opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &new_max_interval
);
8590 // Assume if an interval is change from set to unset or vice versa the actual config
8591 // is different. Keep it simple even if it is possible to call resched_all_scrub()
8593 if (old_min_interval
!= new_min_interval
|| old_max_interval
!= new_max_interval
) {
8594 pg
->on_info_history_change();
8598 if (new_pg_num
&& old_pg_num
!= new_pg_num
) {
8600 set
<spg_t
> children
;
8601 if (pg
->pg_id
.is_split(
8606 pg
, children
, &new_pgs
, lastmap
, nextmap
,
8612 old_pg_num
= new_pg_num
;
8613 handle
.reset_tp_timeout();
8615 pg
->handle_activate_map(rctx
);
8619 if (!new_pgs
.empty()) {
8620 rctx
.transaction
.register_on_applied(new C_FinishSplits(this, new_pgs
));
8625 void OSD::consume_map()
8627 ceph_assert(ceph_mutex_is_locked(osd_lock
));
8628 auto osdmap
= get_osdmap();
8629 dout(7) << "consume_map version " << osdmap
->get_epoch() << dendl
;
8631 /** make sure the cluster is speaking in SORTBITWISE, because we don't
8632 * speak the older sorting version any more. Be careful not to force
8633 * a shutdown if we are merely processing old maps, though.
8635 if (!osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
) && is_active()) {
8636 derr
<< __func__
<< " SORTBITWISE flag is not set" << dendl
;
8640 service
.pre_publish_map(osdmap
);
8641 service
.await_reserved_maps();
8642 service
.publish_map(osdmap
);
8644 // prime splits and merges
8645 set
<pair
<spg_t
,epoch_t
>> newly_split
; // splits, and when
8646 set
<pair
<spg_t
,epoch_t
>> merge_pgs
; // merge participants, and when
8647 for (auto& shard
: shards
) {
8648 shard
->identify_splits_and_merges(osdmap
, &newly_split
, &merge_pgs
);
8650 if (!newly_split
.empty()) {
8651 for (auto& shard
: shards
) {
8652 shard
->prime_splits(osdmap
, &newly_split
);
8654 ceph_assert(newly_split
.empty());
8657 // prune sent_ready_to_merge
8658 service
.prune_sent_ready_to_merge(osdmap
);
8660 // FIXME, maybe: We could race against an incoming peering message
8661 // that instantiates a merge PG after identify_merges() below and
8662 // never set up its peer to complete the merge. An OSD restart
8663 // would clear it up. This is a hard race to resolve,
8664 // extraordinarily rare (we only merge PGs that are stable and
8665 // clean, so it'd have to be an imported PG to an OSD with a
8666 // slightly stale OSDMap...), so I'm ignoring it for now. We plan to
8667 // replace all of this with a seastar-based code soon anyway.
8668 if (!merge_pgs
.empty()) {
8669 // mark the pgs we already have, or create new and empty merge
8670 // participants for those we are missing. do this all under the
8671 // shard lock so we don't have to worry about racing pg creates
8673 for (auto& shard
: shards
) {
8674 shard
->prime_merges(osdmap
, &merge_pgs
);
8676 ceph_assert(merge_pgs
.empty());
8679 service
.prune_pg_created();
8681 unsigned pushes_to_free
= 0;
8682 for (auto& shard
: shards
) {
8683 shard
->consume_map(osdmap
, &pushes_to_free
);
8686 vector
<spg_t
> pgids
;
8689 // count (FIXME, probably during seastar rewrite)
8690 int num_pg_primary
= 0, num_pg_replica
= 0, num_pg_stray
= 0;
8693 for (auto& pg
: pgs
) {
8694 // FIXME (probably during seastar rewrite): this is lockless and
8695 // racy, but we don't want to take pg lock here.
8696 if (pg
->is_primary())
8698 else if (pg
->is_nonprimary())
8699 num_pg_replica
++; // misnomer
8705 // FIXME (as part of seastar rewrite): move to OSDShard
8706 std::lock_guard
l(pending_creates_lock
);
8707 for (auto pg
= pending_creates_from_osd
.begin();
8708 pg
!= pending_creates_from_osd
.end();) {
8709 if (osdmap
->get_pg_acting_role(pg
->first
, whoami
) < 0) {
8710 dout(10) << __func__
<< " pg " << pg
->first
<< " doesn't map here, "
8711 << "discarding pending_create_from_osd" << dendl
;
8712 pg
= pending_creates_from_osd
.erase(pg
);
8719 service
.maybe_inject_dispatch_delay();
8721 dispatch_sessions_waiting_on_map();
8723 service
.maybe_inject_dispatch_delay();
8725 service
.release_reserved_pushes(pushes_to_free
);
8727 // queue null events to push maps down to individual PGs
8728 for (auto pgid
: pgids
) {
8729 enqueue_peering_evt(
8732 std::make_shared
<PGPeeringEvent
>(
8733 osdmap
->get_epoch(),
8734 osdmap
->get_epoch(),
8737 logger
->set(l_osd_pg
, pgids
.size());
8738 logger
->set(l_osd_pg_primary
, num_pg_primary
);
8739 logger
->set(l_osd_pg_replica
, num_pg_replica
);
8740 logger
->set(l_osd_pg_stray
, num_pg_stray
);
8743 void OSD::activate_map()
8745 ceph_assert(ceph_mutex_is_locked(osd_lock
));
8746 auto osdmap
= get_osdmap();
8748 dout(7) << "activate_map version " << osdmap
->get_epoch() << dendl
;
8751 if (osdmap
->test_flag(CEPH_OSDMAP_NORECOVER
)) {
8752 if (!service
.recovery_is_paused()) {
8753 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl
;
8754 service
.pause_recovery();
8757 if (service
.recovery_is_paused()) {
8758 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl
;
8759 service
.unpause_recovery();
8763 service
.activate_map();
8766 take_waiters(waiting_for_osdmap
);
8769 bool OSD::require_mon_peer(const Message
*m
)
8771 if (!m
->get_connection()->peer_is_mon()) {
8772 dout(0) << "require_mon_peer received from non-mon "
8773 << m
->get_connection()->get_peer_addr()
8774 << " " << *m
<< dendl
;
8780 bool OSD::require_mon_or_mgr_peer(const Message
*m
)
8782 if (!m
->get_connection()->peer_is_mon() &&
8783 !m
->get_connection()->peer_is_mgr()) {
8784 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8785 << m
->get_connection()->get_peer_addr()
8786 << " " << *m
<< dendl
;
8792 bool OSD::require_osd_peer(const Message
*m
)
8794 if (!m
->get_connection()->peer_is_osd()) {
8795 dout(0) << "require_osd_peer received from non-osd "
8796 << m
->get_connection()->get_peer_addr()
8797 << " " << *m
<< dendl
;
8803 bool OSD::require_self_aliveness(const Message
*m
, epoch_t epoch
)
8805 epoch_t up_epoch
= service
.get_up_epoch();
8806 if (epoch
< up_epoch
) {
8807 dout(7) << "from pre-up epoch " << epoch
<< " < " << up_epoch
<< dendl
;
8812 dout(7) << "still in boot state, dropping message " << *m
<< dendl
;
8819 bool OSD::require_same_peer_instance(const Message
*m
, const OSDMapRef
& map
,
8820 bool is_fast_dispatch
)
8822 int from
= m
->get_source().num();
8824 if (map
->is_down(from
) ||
8825 (map
->get_cluster_addrs(from
) != m
->get_source_addrs())) {
8826 dout(5) << "from dead osd." << from
<< ", marking down, "
8827 << " msg was " << m
->get_source_inst().addr
8829 << (map
->is_up(from
) ?
8830 map
->get_cluster_addrs(from
) : entity_addrvec_t())
8832 ConnectionRef con
= m
->get_connection();
8834 if (auto s
= ceph::ref_cast
<Session
>(con
->get_priv()); s
) {
8835 if (!is_fast_dispatch
)
8836 s
->session_dispatch_lock
.lock();
8837 clear_session_waiting_on_map(s
);
8838 con
->set_priv(nullptr); // break ref <-> session cycle, if any
8840 if (!is_fast_dispatch
)
8841 s
->session_dispatch_lock
.unlock();
8850 * require that we have same (or newer) map, and that
8851 * the source is the pg primary.
8853 bool OSD::require_same_or_newer_map(OpRequestRef
& op
, epoch_t epoch
,
8854 bool is_fast_dispatch
)
8856 const Message
*m
= op
->get_req();
8857 const auto osdmap
= get_osdmap();
8858 dout(15) << "require_same_or_newer_map " << epoch
8859 << " (i am " << osdmap
->get_epoch() << ") " << m
<< dendl
;
8861 ceph_assert(ceph_mutex_is_locked(osd_lock
));
8863 // do they have a newer map?
8864 if (epoch
> osdmap
->get_epoch()) {
8865 dout(7) << "waiting for newer map epoch " << epoch
8866 << " > my " << osdmap
->get_epoch() << " with " << m
<< dendl
;
8867 wait_for_new_map(op
);
8871 if (!require_self_aliveness(op
->get_req(), epoch
)) {
8875 // ok, our map is same or newer.. do they still exist?
8876 if (m
->get_connection()->get_messenger() == cluster_messenger
&&
8877 !require_same_peer_instance(op
->get_req(), osdmap
, is_fast_dispatch
)) {
8888 // ----------------------------------------
8891 void OSD::split_pgs(
8893 const set
<spg_t
> &childpgids
, set
<PGRef
> *out_pgs
,
8898 unsigned pg_num
= nextmap
->get_pg_num(parent
->pg_id
.pool());
8899 parent
->update_snap_mapper_bits(parent
->get_pgid().get_split_bits(pg_num
));
8901 vector
<object_stat_sum_t
> updated_stats
;
8902 parent
->start_split_stats(childpgids
, &updated_stats
);
8904 vector
<object_stat_sum_t
>::iterator stat_iter
= updated_stats
.begin();
8905 for (set
<spg_t
>::const_iterator i
= childpgids
.begin();
8906 i
!= childpgids
.end();
8908 ceph_assert(stat_iter
!= updated_stats
.end());
8909 dout(10) << __func__
<< " splitting " << *parent
<< " into " << *i
<< dendl
;
8910 PG
* child
= _make_pg(nextmap
, *i
);
8912 out_pgs
->insert(child
);
8913 child
->ch
= store
->create_new_collection(child
->coll
);
8916 uint32_t shard_index
= i
->hash_to_shard(shards
.size());
8917 assert(NULL
!= shards
[shard_index
]);
8918 store
->set_collection_commit_queue(child
->coll
, &(shards
[shard_index
]->context_queue
));
8921 unsigned split_bits
= i
->get_split_bits(pg_num
);
8922 dout(10) << " pg_num is " << pg_num
8923 << ", m_seed " << i
->ps()
8924 << ", split_bits is " << split_bits
<< dendl
;
8925 parent
->split_colls(
8929 &child
->get_pool().info
,
8936 child
->init_collection_pool_opts();
8938 child
->finish_split_stats(*stat_iter
, rctx
.transaction
);
8941 ceph_assert(stat_iter
!= updated_stats
.end());
8942 parent
->finish_split_stats(*stat_iter
, rctx
.transaction
);
8948 void OSD::handle_pg_create(OpRequestRef op
)
8950 // NOTE: this can be removed in P release (mimic is the last version to
8951 // send MOSDPGCreate messages).
8953 auto m
= op
->get_req
<MOSDPGCreate
>();
8954 ceph_assert(m
->get_type() == MSG_OSD_PG_CREATE
);
8956 dout(10) << "handle_pg_create " << *m
<< dendl
;
8958 if (!require_mon_peer(op
->get_req())) {
8962 if (!require_same_or_newer_map(op
, m
->epoch
, false))
8967 const auto osdmap
= get_osdmap();
8968 map
<pg_t
,utime_t
>::const_iterator ci
= m
->ctimes
.begin();
8969 for (map
<pg_t
,pg_create_t
>::const_iterator p
= m
->mkpg
.begin();
8972 ceph_assert(ci
!= m
->ctimes
.end() && ci
->first
== p
->first
);
8973 epoch_t created
= p
->second
.created
;
8974 if (p
->second
.split_bits
) // Skip split pgs
8978 if (!osdmap
->have_pg_pool(on
.pool())) {
8979 dout(20) << "ignoring pg on deleted pool " << on
<< dendl
;
8983 dout(20) << "mkpg " << on
<< " e" << created
<< "@" << ci
->second
<< dendl
;
8986 bool mapped
= osdmap
->get_primary_shard(on
, &pgid
);
8987 ceph_assert(mapped
);
8989 // is it still ours?
8990 vector
<int> up
, acting
;
8991 int up_primary
= -1;
8992 int acting_primary
= -1;
8993 osdmap
->pg_to_up_acting_osds(on
, &up
, &up_primary
, &acting
, &acting_primary
);
8994 int role
= osdmap
->calc_pg_role(pg_shard_t(whoami
, pgid
.shard
), acting
);
8996 if (acting_primary
!= whoami
) {
8997 dout(10) << "mkpg " << on
<< " not acting_primary (" << acting_primary
8998 << "), my role=" << role
<< ", skipping" << dendl
;
9004 pg_history_t history
;
9005 build_initial_pg_history(pgid
, created
, ci
->second
, &history
, &pi
);
9007 // The mon won't resend unless the primary changed, so we ignore
9008 // same_interval_since. We'll pass this history with the current
9009 // epoch as the event.
9010 if (history
.same_primary_since
> m
->epoch
) {
9011 dout(10) << __func__
<< ": got obsolete pg create on pgid "
9012 << pgid
<< " from epoch " << m
->epoch
9013 << ", primary changed in " << history
.same_primary_since
9017 enqueue_peering_evt(
9020 std::make_shared
<PGPeeringEvent
>(
9021 osdmap
->get_epoch(),
9022 osdmap
->get_epoch(),
9027 osdmap
->get_epoch(),
9035 std::lock_guard
l(pending_creates_lock
);
9036 if (pending_creates_from_mon
== 0) {
9037 last_pg_create_epoch
= m
->epoch
;
9041 maybe_update_heartbeat_peers();
9045 // ----------------------------------------
9046 // peering and recovery
9048 PeeringCtx
OSD::create_context()
9050 return PeeringCtx(get_osdmap()->require_osd_release
);
9053 void OSD::dispatch_context(PeeringCtx
&ctx
, PG
*pg
, OSDMapRef curmap
,
9054 ThreadPool::TPHandle
*handle
)
9056 if (!service
.get_osdmap()->is_up(whoami
)) {
9057 dout(20) << __func__
<< " not up in osdmap" << dendl
;
9058 } else if (!is_active()) {
9059 dout(20) << __func__
<< " not active" << dendl
;
9061 for (auto& [osd
, ls
] : ctx
.message_map
) {
9062 if (!curmap
->is_up(osd
)) {
9063 dout(20) << __func__
<< " skipping down osd." << osd
<< dendl
;
9066 ConnectionRef con
= service
.get_con_osd_cluster(
9067 osd
, curmap
->get_epoch());
9069 dout(20) << __func__
<< " skipping osd." << osd
<< " (NULL con)"
9073 service
.maybe_share_map(con
.get(), curmap
);
9075 con
->send_message2(m
);
9080 if ((!ctx
.transaction
.empty() || ctx
.transaction
.has_contexts()) && pg
) {
9081 int tr
= store
->queue_transaction(
9083 std::move(ctx
.transaction
), TrackedOpRef(),
9085 ceph_assert(tr
== 0);
9089 void OSD::handle_fast_pg_create(MOSDPGCreate2
*m
)
9091 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9092 if (!require_mon_peer(m
)) {
9096 for (auto& p
: m
->pgs
) {
9097 spg_t pgid
= p
.first
;
9098 epoch_t created
= p
.second
.first
;
9099 utime_t created_stamp
= p
.second
.second
;
9100 auto q
= m
->pg_extra
.find(pgid
);
9101 if (q
== m
->pg_extra
.end()) {
9102 dout(20) << __func__
<< " " << pgid
<< " e" << created
9103 << "@" << created_stamp
9104 << " (no history or past_intervals)" << dendl
;
9105 // pre-octopus ... no pg history. this can be removed in Q release.
9106 enqueue_peering_evt(
9109 std::make_shared
<PGPeeringEvent
>(
9117 pg_history_t(created
, created_stamp
),
9122 dout(20) << __func__
<< " " << pgid
<< " e" << created
9123 << "@" << created_stamp
9124 << " history " << q
->second
.first
9125 << " pi " << q
->second
.second
<< dendl
;
9126 if (!q
->second
.second
.empty() &&
9127 m
->epoch
< q
->second
.second
.get_bounds().second
) {
9128 clog
->error() << "got pg_create on " << pgid
<< " epoch " << m
->epoch
9129 << " and unmatched past_intervals " << q
->second
.second
9130 << " (history " << q
->second
.first
<< ")";
9132 enqueue_peering_evt(
9135 std::make_shared
<PGPeeringEvent
>(
9152 std::lock_guard
l(pending_creates_lock
);
9153 if (pending_creates_from_mon
== 0) {
9154 last_pg_create_epoch
= m
->epoch
;
9161 void OSD::handle_fast_pg_query(MOSDPGQuery
*m
)
9163 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9164 if (!require_osd_peer(m
)) {
9168 int from
= m
->get_source().num();
9169 for (auto& p
: m
->pg_list
) {
9170 enqueue_peering_evt(
9173 std::make_shared
<PGPeeringEvent
>(
9174 p
.second
.epoch_sent
, p
.second
.epoch_sent
,
9177 pg_shard_t(from
, p
.second
.from
),
9179 p
.second
.epoch_sent
),
9186 void OSD::handle_fast_pg_notify(MOSDPGNotify
* m
)
9188 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9189 if (!require_osd_peer(m
)) {
9193 int from
= m
->get_source().num();
9194 for (auto& p
: m
->get_pg_list()) {
9195 spg_t
pgid(p
.info
.pgid
.pgid
, p
.to
);
9196 enqueue_peering_evt(
9199 std::make_shared
<PGPeeringEvent
>(
9203 pgid
, pg_shard_t(from
, p
.from
),
9205 m
->get_connection()->get_features()),
9218 void OSD::handle_fast_pg_info(MOSDPGInfo
* m
)
9220 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9221 if (!require_osd_peer(m
)) {
9225 int from
= m
->get_source().num();
9226 for (auto& p
: m
->pg_list
) {
9227 enqueue_peering_evt(
9228 spg_t(p
.info
.pgid
.pgid
, p
.to
),
9230 std::make_shared
<PGPeeringEvent
>(
9231 p
.epoch_sent
, p
.query_epoch
,
9233 pg_shard_t(from
, p
.from
),
9241 void OSD::handle_fast_pg_remove(MOSDPGRemove
*m
)
9243 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9244 if (!require_osd_peer(m
)) {
9248 for (auto& pgid
: m
->pg_list
) {
9249 enqueue_peering_evt(
9252 std::make_shared
<PGPeeringEvent
>(
9253 m
->get_epoch(), m
->get_epoch(),
9254 PeeringState::DeleteStart())));
9259 void OSD::handle_fast_force_recovery(MOSDForceRecovery
*m
)
9261 dout(10) << __func__
<< " " << *m
<< dendl
;
9262 if (!require_mon_or_mgr_peer(m
)) {
9266 epoch_t epoch
= get_osdmap_epoch();
9267 for (auto pgid
: m
->forced_pgs
) {
9268 if (m
->options
& OFR_BACKFILL
) {
9269 if (m
->options
& OFR_CANCEL
) {
9270 enqueue_peering_evt(
9273 std::make_shared
<PGPeeringEvent
>(
9275 PeeringState::UnsetForceBackfill())));
9277 enqueue_peering_evt(
9280 std::make_shared
<PGPeeringEvent
>(
9282 PeeringState::SetForceBackfill())));
9284 } else if (m
->options
& OFR_RECOVERY
) {
9285 if (m
->options
& OFR_CANCEL
) {
9286 enqueue_peering_evt(
9289 std::make_shared
<PGPeeringEvent
>(
9291 PeeringState::UnsetForceRecovery())));
9293 enqueue_peering_evt(
9296 std::make_shared
<PGPeeringEvent
>(
9298 PeeringState::SetForceRecovery())));
9305 void OSD::handle_pg_query_nopg(const MQuery
& q
)
9307 spg_t pgid
= q
.pgid
;
9308 dout(10) << __func__
<< " " << pgid
<< dendl
;
9310 OSDMapRef osdmap
= get_osdmap();
9311 if (!osdmap
->have_pg_pool(pgid
.pool()))
9314 dout(10) << " pg " << pgid
<< " dne" << dendl
;
9315 pg_info_t
empty(spg_t(pgid
.pgid
, q
.query
.to
));
9316 ConnectionRef con
= service
.get_con_osd_cluster(q
.from
.osd
, osdmap
->get_epoch());
9319 if (q
.query
.type
== pg_query_t::LOG
||
9320 q
.query
.type
== pg_query_t::FULLLOG
) {
9322 q
.query
.from
, q
.query
.to
,
9323 osdmap
->get_epoch(), empty
,
9324 q
.query
.epoch_sent
);
9326 vector
<pg_notify_t
> ls
;
9329 q
.query
.from
, q
.query
.to
,
9331 osdmap
->get_epoch(),
9334 m
= new MOSDPGNotify(osdmap
->get_epoch(), std::move(ls
));
9336 service
.maybe_share_map(con
.get(), osdmap
);
9337 con
->send_message(m
);
9341 void OSDService::queue_check_readable(spg_t spgid
,
9343 ceph::signedspan delay
)
9345 if (delay
== ceph::signedspan::zero()) {
9346 osd
->enqueue_peering_evt(
9349 std::make_shared
<PGPeeringEvent
>(
9351 PeeringState::CheckReadable())));
9353 mono_timer
.add_event(
9355 [this, spgid
, lpr
]() {
9356 queue_check_readable(spgid
, lpr
);
9362 // =========================================================
9365 void OSDService::_maybe_queue_recovery() {
9366 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock
));
9367 uint64_t available_pushes
;
9368 while (!awaiting_throttle
.empty() &&
9369 _recover_now(&available_pushes
)) {
9370 uint64_t to_start
= std::min(
9372 cct
->_conf
->osd_recovery_max_single_start
);
9373 _queue_for_recovery(awaiting_throttle
.front(), to_start
);
9374 awaiting_throttle
.pop_front();
9375 dout(10) << __func__
<< " starting " << to_start
9376 << ", recovery_ops_reserved " << recovery_ops_reserved
9377 << " -> " << (recovery_ops_reserved
+ to_start
) << dendl
;
9378 recovery_ops_reserved
+= to_start
;
9382 bool OSDService::_recover_now(uint64_t *available_pushes
)
9384 if (available_pushes
)
9385 *available_pushes
= 0;
9387 if (ceph_clock_now() < defer_recovery_until
) {
9388 dout(15) << __func__
<< " defer until " << defer_recovery_until
<< dendl
;
9392 if (recovery_paused
) {
9393 dout(15) << __func__
<< " paused" << dendl
;
9397 uint64_t max
= osd
->get_recovery_max_active();
9398 if (max
<= recovery_ops_active
+ recovery_ops_reserved
) {
9399 dout(15) << __func__
<< " active " << recovery_ops_active
9400 << " + reserved " << recovery_ops_reserved
9401 << " >= max " << max
<< dendl
;
9405 if (available_pushes
)
9406 *available_pushes
= max
- recovery_ops_active
- recovery_ops_reserved
;
9411 unsigned OSDService::get_target_pg_log_entries() const
9413 auto num_pgs
= osd
->get_num_pgs();
9414 auto target
= cct
->_conf
->osd_target_pg_log_entries_per_osd
;
9415 if (num_pgs
> 0 && target
> 0) {
9416 // target an even spread of our budgeted log entries across all
9417 // PGs. note that while we only get to control the entry count
9418 // for primary PGs, we'll normally be responsible for a mix of
9419 // primary and replica PGs (for the same pool(s) even), so this
9421 return std::max
<unsigned>(
9422 std::min
<unsigned>(target
/ num_pgs
,
9423 cct
->_conf
->osd_max_pg_log_entries
),
9424 cct
->_conf
->osd_min_pg_log_entries
);
9426 // fall back to a per-pg value.
9427 return cct
->_conf
->osd_min_pg_log_entries
;
9431 void OSD::do_recovery(
9432 PG
*pg
, epoch_t queued
, uint64_t reserved_pushes
,
9433 ThreadPool::TPHandle
&handle
)
9435 uint64_t started
= 0;
9438 * When the value of osd_recovery_sleep is set greater than zero, recovery
9439 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9440 * recovery event's schedule time. This is done by adding a
9441 * recovery_requeue_callback event, which re-queues the recovery op using
9442 * queue_recovery_after_sleep.
9444 float recovery_sleep
= get_osd_recovery_sleep();
9446 std::lock_guard
l(service
.sleep_lock
);
9447 if (recovery_sleep
> 0 && service
.recovery_needs_sleep
) {
9449 auto recovery_requeue_callback
= new LambdaContext([this, pgref
, queued
, reserved_pushes
](int r
) {
9450 dout(20) << "do_recovery wake up at "
9452 << ", re-queuing recovery" << dendl
;
9453 std::lock_guard
l(service
.sleep_lock
);
9454 service
.recovery_needs_sleep
= false;
9455 service
.queue_recovery_after_sleep(pgref
.get(), queued
, reserved_pushes
);
9458 // This is true for the first recovery op and when the previous recovery op
9459 // has been scheduled in the past. The next recovery op is scheduled after
9460 // completing the sleep from now.
9462 if (auto now
= ceph::real_clock::now();
9463 service
.recovery_schedule_time
< now
) {
9464 service
.recovery_schedule_time
= now
;
9466 service
.recovery_schedule_time
+= ceph::make_timespan(recovery_sleep
);
9467 service
.sleep_timer
.add_event_at(service
.recovery_schedule_time
,
9468 recovery_requeue_callback
);
9469 dout(20) << "Recovery event scheduled at "
9470 << service
.recovery_schedule_time
<< dendl
;
9477 std::lock_guard
l(service
.sleep_lock
);
9478 service
.recovery_needs_sleep
= true;
9481 if (pg
->pg_has_reset_since(queued
)) {
9485 dout(10) << "do_recovery starting " << reserved_pushes
<< " " << *pg
<< dendl
;
9486 #ifdef DEBUG_RECOVERY_OIDS
9487 dout(20) << " active was " << service
.recovery_oids
[pg
->pg_id
] << dendl
;
9490 bool do_unfound
= pg
->start_recovery_ops(reserved_pushes
, handle
, &started
);
9491 dout(10) << "do_recovery started " << started
<< "/" << reserved_pushes
9492 << " on " << *pg
<< dendl
;
9495 PeeringCtx rctx
= create_context();
9496 rctx
.handle
= &handle
;
9497 pg
->find_unfound(queued
, rctx
);
9498 dispatch_context(rctx
, pg
, pg
->get_osdmap());
9503 ceph_assert(started
<= reserved_pushes
);
9504 service
.release_reserved_pushes(reserved_pushes
);
9507 void OSDService::start_recovery_op(PG
*pg
, const hobject_t
& soid
)
9509 std::lock_guard
l(recovery_lock
);
9510 dout(10) << "start_recovery_op " << *pg
<< " " << soid
9511 << " (" << recovery_ops_active
<< "/"
9512 << osd
->get_recovery_max_active() << " rops)"
9514 recovery_ops_active
++;
9516 #ifdef DEBUG_RECOVERY_OIDS
9517 dout(20) << " active was " << recovery_oids
[pg
->pg_id
] << dendl
;
9518 ceph_assert(recovery_oids
[pg
->pg_id
].count(soid
) == 0);
9519 recovery_oids
[pg
->pg_id
].insert(soid
);
9523 void OSDService::finish_recovery_op(PG
*pg
, const hobject_t
& soid
, bool dequeue
)
9525 std::lock_guard
l(recovery_lock
);
9526 dout(10) << "finish_recovery_op " << *pg
<< " " << soid
9527 << " dequeue=" << dequeue
9528 << " (" << recovery_ops_active
<< "/"
9529 << osd
->get_recovery_max_active() << " rops)"
9533 ceph_assert(recovery_ops_active
> 0);
9534 recovery_ops_active
--;
9536 #ifdef DEBUG_RECOVERY_OIDS
9537 dout(20) << " active oids was " << recovery_oids
[pg
->pg_id
] << dendl
;
9538 ceph_assert(recovery_oids
[pg
->pg_id
].count(soid
));
9539 recovery_oids
[pg
->pg_id
].erase(soid
);
9542 _maybe_queue_recovery();
9545 bool OSDService::is_recovery_active()
9547 if (cct
->_conf
->osd_debug_pretend_recovery_active
) {
9550 return local_reserver
.has_reservation() || remote_reserver
.has_reservation();
9553 void OSDService::release_reserved_pushes(uint64_t pushes
)
9555 std::lock_guard
l(recovery_lock
);
9556 dout(10) << __func__
<< "(" << pushes
<< "), recovery_ops_reserved "
9557 << recovery_ops_reserved
<< " -> " << (recovery_ops_reserved
-pushes
)
9559 ceph_assert(recovery_ops_reserved
>= pushes
);
9560 recovery_ops_reserved
-= pushes
;
9561 _maybe_queue_recovery();
// =========================================================
9567 bool OSD::op_is_discardable(const MOSDOp
*op
)
9569 // drop client request if they are not connected and can't get the
9571 if (!op
->get_connection()->is_connected()) {
9577 void OSD::enqueue_op(spg_t pg
, OpRequestRef
&& op
, epoch_t epoch
)
9579 const utime_t stamp
= op
->get_req()->get_recv_stamp();
9580 const utime_t latency
= ceph_clock_now() - stamp
;
9581 const unsigned priority
= op
->get_req()->get_priority();
9582 const int cost
= op
->get_req()->get_cost();
9583 const uint64_t owner
= op
->get_req()->get_source().num();
9585 dout(15) << "enqueue_op " << op
<< " prio " << priority
9587 << " latency " << latency
9588 << " epoch " << epoch
9589 << " " << *(op
->get_req()) << dendl
;
9590 op
->osd_trace
.event("enqueue op");
9591 op
->osd_trace
.keyval("priority", priority
);
9592 op
->osd_trace
.keyval("cost", cost
);
9593 op
->mark_queued_for_pg();
9594 logger
->tinc(l_osd_op_before_queue_op_lat
, latency
);
9597 unique_ptr
<OpSchedulerItem::OpQueueable
>(new PGOpItem(pg
, std::move(op
))),
9598 cost
, priority
, stamp
, owner
, epoch
));
9601 void OSD::enqueue_peering_evt(spg_t pgid
, PGPeeringEventRef evt
)
9603 dout(15) << __func__
<< " " << pgid
<< " " << evt
->get_desc() << dendl
;
9606 unique_ptr
<OpSchedulerItem::OpQueueable
>(new PGPeeringItem(pgid
, evt
)),
9608 cct
->_conf
->osd_peering_op_priority
,
9611 evt
->get_epoch_sent()));
// NOTE: dequeue called in worker thread, with pg lock
9617 void OSD::dequeue_op(
9618 PGRef pg
, OpRequestRef op
,
9619 ThreadPool::TPHandle
&handle
)
9621 const Message
*m
= op
->get_req();
9624 OID_EVENT_TRACE_WITH_MSG(m
, "DEQUEUE_OP_BEGIN", false);
9626 utime_t now
= ceph_clock_now();
9627 op
->set_dequeued_time(now
);
9629 utime_t latency
= now
- m
->get_recv_stamp();
9630 dout(10) << "dequeue_op " << op
<< " prio " << m
->get_priority()
9631 << " cost " << m
->get_cost()
9632 << " latency " << latency
9634 << " pg " << *pg
<< dendl
;
9636 logger
->tinc(l_osd_op_before_dequeue_op_lat
, latency
);
9638 service
.maybe_share_map(m
->get_connection().get(),
9642 if (pg
->is_deleting())
9645 op
->mark_reached_pg();
9646 op
->osd_trace
.event("dequeue_op");
9648 pg
->do_request(op
, handle
);
9651 dout(10) << "dequeue_op " << op
<< " finish" << dendl
;
9652 OID_EVENT_TRACE_WITH_MSG(m
, "DEQUEUE_OP_END", false);
9656 void OSD::dequeue_peering_evt(
9659 PGPeeringEventRef evt
,
9660 ThreadPool::TPHandle
& handle
)
9662 PeeringCtx rctx
= create_context();
9663 auto curmap
= sdata
->get_osdmap();
9664 bool need_up_thru
= false;
9665 epoch_t same_interval_since
= 0;
9667 if (const MQuery
*q
= dynamic_cast<const MQuery
*>(evt
->evt
.get())) {
9668 handle_pg_query_nopg(*q
);
9670 derr
<< __func__
<< " unrecognized pg-less event " << evt
->get_desc() << dendl
;
9673 } else if (advance_pg(curmap
->get_epoch(), pg
, handle
, rctx
)) {
9674 pg
->do_peering_event(evt
, rctx
);
9675 if (pg
->is_deleted()) {
9679 dispatch_context(rctx
, pg
, curmap
, &handle
);
9680 need_up_thru
= pg
->get_need_up_thru();
9681 same_interval_since
= pg
->get_same_interval_since();
9686 queue_want_up_thru(same_interval_since
);
9689 service
.send_pg_temp();
9692 void OSD::dequeue_delete(
9696 ThreadPool::TPHandle
& handle
)
9698 dequeue_peering_evt(
9702 std::make_shared
<PGPeeringEvent
>(
9704 PeeringState::DeleteSome())),
// --------------------------------
9712 const char** OSD::get_tracked_conf_keys() const
9714 static const char* KEYS
[] = {
9715 "osd_max_backfills",
9716 "osd_min_recovery_priority",
9717 "osd_max_trimming_pgs",
9718 "osd_op_complaint_time",
9719 "osd_op_log_threshold",
9720 "osd_op_history_size",
9721 "osd_op_history_duration",
9722 "osd_op_history_slow_op_size",
9723 "osd_op_history_slow_op_threshold",
9724 "osd_enable_op_tracker",
9725 "osd_map_cache_size",
9726 "osd_pg_epoch_max_lag_factor",
9727 "osd_pg_epoch_persisted_max_stale",
9728 // clog & admin clog
9731 "clog_to_syslog_facility",
9732 "clog_to_syslog_level",
9733 "osd_objectstore_fuse",
9735 "clog_to_graylog_host",
9736 "clog_to_graylog_port",
9739 "osd_recovery_delay_start",
9740 "osd_client_message_size_cap",
9741 "osd_client_message_cap",
9742 "osd_heartbeat_min_size",
9743 "osd_heartbeat_interval",
9744 "osd_object_clean_region_max_num_intervals",
9745 "osd_scrub_min_interval",
9746 "osd_scrub_max_interval",
9752 void OSD::handle_conf_change(const ConfigProxy
& conf
,
9753 const std::set
<std::string
> &changed
)
9755 std::lock_guard l
{osd_lock
};
9756 if (changed
.count("osd_max_backfills")) {
9757 service
.local_reserver
.set_max(cct
->_conf
->osd_max_backfills
);
9758 service
.remote_reserver
.set_max(cct
->_conf
->osd_max_backfills
);
9760 if (changed
.count("osd_min_recovery_priority")) {
9761 service
.local_reserver
.set_min_priority(cct
->_conf
->osd_min_recovery_priority
);
9762 service
.remote_reserver
.set_min_priority(cct
->_conf
->osd_min_recovery_priority
);
9764 if (changed
.count("osd_max_trimming_pgs")) {
9765 service
.snap_reserver
.set_max(cct
->_conf
->osd_max_trimming_pgs
);
9767 if (changed
.count("osd_op_complaint_time") ||
9768 changed
.count("osd_op_log_threshold")) {
9769 op_tracker
.set_complaint_and_threshold(cct
->_conf
->osd_op_complaint_time
,
9770 cct
->_conf
->osd_op_log_threshold
);
9772 if (changed
.count("osd_op_history_size") ||
9773 changed
.count("osd_op_history_duration")) {
9774 op_tracker
.set_history_size_and_duration(cct
->_conf
->osd_op_history_size
,
9775 cct
->_conf
->osd_op_history_duration
);
9777 if (changed
.count("osd_op_history_slow_op_size") ||
9778 changed
.count("osd_op_history_slow_op_threshold")) {
9779 op_tracker
.set_history_slow_op_size_and_threshold(cct
->_conf
->osd_op_history_slow_op_size
,
9780 cct
->_conf
->osd_op_history_slow_op_threshold
);
9782 if (changed
.count("osd_enable_op_tracker")) {
9783 op_tracker
.set_tracking(cct
->_conf
->osd_enable_op_tracker
);
9785 if (changed
.count("osd_map_cache_size")) {
9786 service
.map_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
9787 service
.map_bl_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
9788 service
.map_bl_inc_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
9790 if (changed
.count("clog_to_monitors") ||
9791 changed
.count("clog_to_syslog") ||
9792 changed
.count("clog_to_syslog_level") ||
9793 changed
.count("clog_to_syslog_facility") ||
9794 changed
.count("clog_to_graylog") ||
9795 changed
.count("clog_to_graylog_host") ||
9796 changed
.count("clog_to_graylog_port") ||
9797 changed
.count("host") ||
9798 changed
.count("fsid")) {
9799 update_log_config();
9801 if (changed
.count("osd_pg_epoch_max_lag_factor")) {
9802 m_osd_pg_epoch_max_lag_factor
= conf
.get_val
<double>(
9803 "osd_pg_epoch_max_lag_factor");
9807 if (changed
.count("osd_objectstore_fuse")) {
9809 enable_disable_fuse(false);
9814 if (changed
.count("osd_recovery_delay_start")) {
9815 service
.defer_recovery(cct
->_conf
->osd_recovery_delay_start
);
9816 service
.kick_recovery_queue();
9819 if (changed
.count("osd_client_message_cap")) {
9820 uint64_t newval
= cct
->_conf
->osd_client_message_cap
;
9821 Messenger::Policy pol
= client_messenger
->get_policy(entity_name_t::TYPE_CLIENT
);
9822 if (pol
.throttler_messages
&& newval
> 0) {
9823 pol
.throttler_messages
->reset_max(newval
);
9826 if (changed
.count("osd_client_message_size_cap")) {
9827 uint64_t newval
= cct
->_conf
->osd_client_message_size_cap
;
9828 Messenger::Policy pol
= client_messenger
->get_policy(entity_name_t::TYPE_CLIENT
);
9829 if (pol
.throttler_bytes
&& newval
> 0) {
9830 pol
.throttler_bytes
->reset_max(newval
);
9833 if (changed
.count("osd_object_clean_region_max_num_intervals")) {
9834 ObjectCleanRegions::set_max_num_intervals(cct
->_conf
->osd_object_clean_region_max_num_intervals
);
9837 if (changed
.count("osd_scrub_min_interval") ||
9838 changed
.count("osd_scrub_max_interval")) {
9839 resched_all_scrubs();
9840 dout(0) << __func__
<< ": scrub interval change" << dendl
;
9845 void OSD::update_log_config()
9847 map
<string
,string
> log_to_monitors
;
9848 map
<string
,string
> log_to_syslog
;
9849 map
<string
,string
> log_channel
;
9850 map
<string
,string
> log_prio
;
9851 map
<string
,string
> log_to_graylog
;
9852 map
<string
,string
> log_to_graylog_host
;
9853 map
<string
,string
> log_to_graylog_port
;
9857 if (parse_log_client_options(cct
, log_to_monitors
, log_to_syslog
,
9858 log_channel
, log_prio
, log_to_graylog
,
9859 log_to_graylog_host
, log_to_graylog_port
,
9861 clog
->update_config(log_to_monitors
, log_to_syslog
,
9862 log_channel
, log_prio
, log_to_graylog
,
9863 log_to_graylog_host
, log_to_graylog_port
,
9865 derr
<< "log_to_monitors " << log_to_monitors
<< dendl
;
9868 void OSD::check_config()
9870 // some sanity checks
9871 if (cct
->_conf
->osd_map_cache_size
<= (int)cct
->_conf
->osd_pg_epoch_persisted_max_stale
+ 2) {
9872 clog
->warn() << "osd_map_cache_size (" << cct
->_conf
->osd_map_cache_size
<< ")"
9873 << " is not > osd_pg_epoch_persisted_max_stale ("
9874 << cct
->_conf
->osd_pg_epoch_persisted_max_stale
<< ")";
9876 if (cct
->_conf
->osd_object_clean_region_max_num_intervals
< 0) {
9877 clog
->warn() << "osd_object_clean_region_max_num_intervals ("
9878 << cct
->_conf
->osd_object_clean_region_max_num_intervals
// --------------------------------
9885 void OSD::get_latest_osdmap()
9887 dout(10) << __func__
<< " -- start" << dendl
;
9890 service
.objecter
->wait_for_latest_osdmap(&cond
);
9893 dout(10) << __func__
<< " -- finish" << dendl
;
// --------------------------------
9898 void OSD::set_perf_queries(const ConfigPayload
&config_payload
) {
9899 const OSDConfigPayload
&osd_config_payload
= boost::get
<OSDConfigPayload
>(config_payload
);
9900 const std::map
<OSDPerfMetricQuery
, OSDPerfMetricLimits
> &queries
= osd_config_payload
.config
;
9901 dout(10) << "setting " << queries
.size() << " queries" << dendl
;
9903 std::list
<OSDPerfMetricQuery
> supported_queries
;
9904 for (auto &it
: queries
) {
9905 auto &query
= it
.first
;
9906 if (!query
.key_descriptor
.empty()) {
9907 supported_queries
.push_back(query
);
9910 if (supported_queries
.size() < queries
.size()) {
9911 dout(1) << queries
.size() - supported_queries
.size()
9912 << " unsupported queries" << dendl
;
9915 std::lock_guard locker
{m_perf_queries_lock
};
9916 m_perf_queries
= supported_queries
;
9917 m_perf_limits
= queries
;
9919 std::vector
<PGRef
> pgs
;
9921 for (auto& pg
: pgs
) {
9922 std::scoped_lock l
{*pg
};
9923 pg
->set_dynamic_perf_stats_queries(supported_queries
);
9927 MetricPayload
OSD::get_perf_reports() {
9928 OSDMetricPayload payload
;
9929 std::map
<OSDPerfMetricQuery
, OSDPerfMetricReport
> &reports
= payload
.report
;
9931 std::vector
<PGRef
> pgs
;
9933 DynamicPerfStats dps
;
9934 for (auto& pg
: pgs
) {
9935 // m_perf_queries can be modified only in set_perf_queries by mgr client
9936 // request, and it is protected by by mgr client's lock, which is held
9937 // when set_perf_queries/get_perf_reports are called, so we may not hold
9938 // m_perf_queries_lock here.
9939 DynamicPerfStats
pg_dps(m_perf_queries
);
9941 pg
->get_dynamic_perf_stats(&pg_dps
);
9945 dps
.add_to_reports(m_perf_limits
, &reports
);
9946 dout(20) << "reports for " << reports
.size() << " queries" << dendl
;
// =============================================================

#undef dout_context
#define dout_context cct
#undef dout_prefix
#define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
9958 void OSDShard::_attach_pg(OSDShardPGSlot
*slot
, PG
*pg
)
9960 dout(10) << pg
->pg_id
<< " " << pg
<< dendl
;
9962 pg
->osd_shard
= this;
9966 slot
->epoch
= pg
->get_osdmap_epoch();
9967 pg_slots_by_epoch
.insert(*slot
);
9970 void OSDShard::_detach_pg(OSDShardPGSlot
*slot
)
9972 dout(10) << slot
->pg
->pg_id
<< " " << slot
->pg
<< dendl
;
9973 slot
->pg
->osd_shard
= nullptr;
9974 slot
->pg
->pg_slot
= nullptr;
9978 pg_slots_by_epoch
.erase(pg_slots_by_epoch
.iterator_to(*slot
));
9980 if (waiting_for_min_pg_epoch
) {
9981 min_pg_epoch_cond
.notify_all();
9985 void OSDShard::update_pg_epoch(OSDShardPGSlot
*slot
, epoch_t e
)
9987 std::lock_guard
l(shard_lock
);
9988 dout(30) << "min was " << pg_slots_by_epoch
.begin()->epoch
9989 << " on " << pg_slots_by_epoch
.begin()->pg
->pg_id
<< dendl
;
9990 pg_slots_by_epoch
.erase(pg_slots_by_epoch
.iterator_to(*slot
));
9991 dout(20) << slot
->pg
->pg_id
<< " " << slot
->epoch
<< " -> " << e
<< dendl
;
9993 pg_slots_by_epoch
.insert(*slot
);
9994 dout(30) << "min is now " << pg_slots_by_epoch
.begin()->epoch
9995 << " on " << pg_slots_by_epoch
.begin()->pg
->pg_id
<< dendl
;
9996 if (waiting_for_min_pg_epoch
) {
9997 min_pg_epoch_cond
.notify_all();
10001 epoch_t
OSDShard::get_min_pg_epoch()
10003 std::lock_guard
l(shard_lock
);
10004 auto p
= pg_slots_by_epoch
.begin();
10005 if (p
== pg_slots_by_epoch
.end()) {
10011 void OSDShard::wait_min_pg_epoch(epoch_t need
)
10013 std::unique_lock l
{shard_lock
};
10014 ++waiting_for_min_pg_epoch
;
10015 min_pg_epoch_cond
.wait(l
, [need
, this] {
10016 if (pg_slots_by_epoch
.empty()) {
10018 } else if (pg_slots_by_epoch
.begin()->epoch
>= need
) {
10021 dout(10) << need
<< " waiting on "
10022 << pg_slots_by_epoch
.begin()->epoch
<< dendl
;
10026 --waiting_for_min_pg_epoch
;
10029 epoch_t
OSDShard::get_max_waiting_epoch()
10031 std::lock_guard
l(shard_lock
);
10033 for (auto& i
: pg_slots
) {
10034 if (!i
.second
->waiting_peering
.empty()) {
10035 r
= std::max(r
, i
.second
->waiting_peering
.rbegin()->first
);
10041 void OSDShard::consume_map(
10042 const OSDMapRef
& new_osdmap
,
10043 unsigned *pushes_to_free
)
10045 std::lock_guard
l(shard_lock
);
10046 OSDMapRef old_osdmap
;
10048 std::lock_guard
l(osdmap_lock
);
10049 old_osdmap
= std::move(shard_osdmap
);
10050 shard_osdmap
= new_osdmap
;
10052 dout(10) << new_osdmap
->get_epoch()
10053 << " (was " << (old_osdmap
? old_osdmap
->get_epoch() : 0) << ")"
10055 bool queued
= false;
10058 auto p
= pg_slots
.begin();
10059 while (p
!= pg_slots
.end()) {
10060 OSDShardPGSlot
*slot
= p
->second
.get();
10061 const spg_t
& pgid
= p
->first
;
10062 dout(20) << __func__
<< " " << pgid
<< dendl
;
10063 if (!slot
->waiting_for_split
.empty()) {
10064 dout(20) << __func__
<< " " << pgid
10065 << " waiting for split " << slot
->waiting_for_split
<< dendl
;
10069 if (slot
->waiting_for_merge_epoch
> new_osdmap
->get_epoch()) {
10070 dout(20) << __func__
<< " " << pgid
10071 << " waiting for merge by epoch " << slot
->waiting_for_merge_epoch
10076 if (!slot
->waiting_peering
.empty()) {
10077 epoch_t first
= slot
->waiting_peering
.begin()->first
;
10078 if (first
<= new_osdmap
->get_epoch()) {
10079 dout(20) << __func__
<< " " << pgid
10080 << " pending_peering first epoch " << first
10081 << " <= " << new_osdmap
->get_epoch() << ", requeueing" << dendl
;
10082 _wake_pg_slot(pgid
, slot
);
10088 if (!slot
->waiting
.empty()) {
10089 if (new_osdmap
->is_up_acting_osd_shard(pgid
, osd
->get_nodeid())) {
10090 dout(20) << __func__
<< " " << pgid
<< " maps to us, keeping"
10095 while (!slot
->waiting
.empty() &&
10096 slot
->waiting
.front().get_map_epoch() <= new_osdmap
->get_epoch()) {
10097 auto& qi
= slot
->waiting
.front();
10098 dout(20) << __func__
<< " " << pgid
10099 << " waiting item " << qi
10100 << " epoch " << qi
.get_map_epoch()
10101 << " <= " << new_osdmap
->get_epoch()
10103 << (qi
.get_map_epoch() < new_osdmap
->get_epoch() ? "stale" :
10105 << ", dropping" << dendl
;
10106 *pushes_to_free
+= qi
.get_reserved_pushes();
10107 slot
->waiting
.pop_front();
10110 if (slot
->waiting
.empty() &&
10111 slot
->num_running
== 0 &&
10112 slot
->waiting_for_split
.empty() &&
10114 dout(20) << __func__
<< " " << pgid
<< " empty, pruning" << dendl
;
10115 p
= pg_slots
.erase(p
);
10122 std::lock_guard l
{sdata_wait_lock
};
10123 sdata_cond
.notify_one();
10127 void OSDShard::_wake_pg_slot(
10129 OSDShardPGSlot
*slot
)
10131 dout(20) << __func__
<< " " << pgid
10132 << " to_process " << slot
->to_process
10133 << " waiting " << slot
->waiting
10134 << " waiting_peering " << slot
->waiting_peering
<< dendl
;
10135 for (auto i
= slot
->to_process
.rbegin();
10136 i
!= slot
->to_process
.rend();
10138 scheduler
->enqueue_front(std::move(*i
));
10140 slot
->to_process
.clear();
10141 for (auto i
= slot
->waiting
.rbegin();
10142 i
!= slot
->waiting
.rend();
10144 scheduler
->enqueue_front(std::move(*i
));
10146 slot
->waiting
.clear();
10147 for (auto i
= slot
->waiting_peering
.rbegin();
10148 i
!= slot
->waiting_peering
.rend();
10150 // this is overkill; we requeue everything, even if some of these
10151 // items are waiting for maps we don't have yet. FIXME, maybe,
10152 // someday, if we decide this inefficiency matters
10153 for (auto j
= i
->second
.rbegin(); j
!= i
->second
.rend(); ++j
) {
10154 scheduler
->enqueue_front(std::move(*j
));
10157 slot
->waiting_peering
.clear();
10158 ++slot
->requeue_seq
;
10161 void OSDShard::identify_splits_and_merges(
10162 const OSDMapRef
& as_of_osdmap
,
10163 set
<pair
<spg_t
,epoch_t
>> *split_pgs
,
10164 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
10166 std::lock_guard
l(shard_lock
);
10167 if (shard_osdmap
) {
10168 for (auto& i
: pg_slots
) {
10169 const spg_t
& pgid
= i
.first
;
10170 auto *slot
= i
.second
.get();
10172 osd
->service
.identify_splits_and_merges(
10173 shard_osdmap
, as_of_osdmap
, pgid
,
10174 split_pgs
, merge_pgs
);
10175 } else if (!slot
->waiting_for_split
.empty()) {
10176 osd
->service
.identify_splits_and_merges(
10177 shard_osdmap
, as_of_osdmap
, pgid
,
10178 split_pgs
, nullptr);
10180 dout(20) << __func__
<< " slot " << pgid
10181 << " has no pg and waiting_for_split " << dendl
;
10187 void OSDShard::prime_splits(const OSDMapRef
& as_of_osdmap
,
10188 set
<pair
<spg_t
,epoch_t
>> *pgids
)
10190 std::lock_guard
l(shard_lock
);
10191 _prime_splits(pgids
);
10192 if (shard_osdmap
->get_epoch() > as_of_osdmap
->get_epoch()) {
10193 set
<pair
<spg_t
,epoch_t
>> newer_children
;
10194 for (auto i
: *pgids
) {
10195 osd
->service
.identify_splits_and_merges(
10196 as_of_osdmap
, shard_osdmap
, i
.first
,
10197 &newer_children
, nullptr);
10199 newer_children
.insert(pgids
->begin(), pgids
->end());
10200 dout(10) << "as_of_osdmap " << as_of_osdmap
->get_epoch() << " < shard "
10201 << shard_osdmap
->get_epoch() << ", new children " << newer_children
10203 _prime_splits(&newer_children
);
10204 // note: we don't care what is left over here for other shards.
10205 // if this shard is ahead of us and one isn't, e.g., one thread is
10206 // calling into prime_splits via _process (due to a newly created
10207 // pg) and this shard has a newer map due to a racing consume_map,
10208 // then any grandchildren left here will be identified (or were
10209 // identified) when the slower shard's osdmap is advanced.
10210 // _prime_splits() will tolerate the case where the pgid is
10215 void OSDShard::_prime_splits(set
<pair
<spg_t
,epoch_t
>> *pgids
)
10217 dout(10) << *pgids
<< dendl
;
10218 auto p
= pgids
->begin();
10219 while (p
!= pgids
->end()) {
10220 unsigned shard_index
= p
->first
.hash_to_shard(osd
->num_shards
);
10221 if (shard_index
== shard_id
) {
10222 auto r
= pg_slots
.emplace(p
->first
, nullptr);
10224 dout(10) << "priming slot " << p
->first
<< " e" << p
->second
<< dendl
;
10225 r
.first
->second
= make_unique
<OSDShardPGSlot
>();
10226 r
.first
->second
->waiting_for_split
.insert(p
->second
);
10229 ceph_assert(q
!= pg_slots
.end());
10230 dout(10) << "priming (existing) slot " << p
->first
<< " e" << p
->second
10232 q
->second
->waiting_for_split
.insert(p
->second
);
10234 p
= pgids
->erase(p
);
10241 void OSDShard::prime_merges(const OSDMapRef
& as_of_osdmap
,
10242 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
10244 std::lock_guard
l(shard_lock
);
10245 dout(20) << __func__
<< " checking shard " << shard_id
10246 << " for remaining merge pgs " << merge_pgs
<< dendl
;
10247 auto p
= merge_pgs
->begin();
10248 while (p
!= merge_pgs
->end()) {
10249 spg_t pgid
= p
->first
;
10250 epoch_t epoch
= p
->second
;
10251 unsigned shard_index
= pgid
.hash_to_shard(osd
->num_shards
);
10252 if (shard_index
!= shard_id
) {
10256 OSDShardPGSlot
*slot
;
10257 auto r
= pg_slots
.emplace(pgid
, nullptr);
10259 r
.first
->second
= make_unique
<OSDShardPGSlot
>();
10261 slot
= r
.first
->second
.get();
10264 dout(20) << __func__
<< " have merge participant pg " << pgid
10265 << " " << slot
->pg
<< dendl
;
10266 } else if (!slot
->waiting_for_split
.empty() &&
10267 *slot
->waiting_for_split
.begin() < epoch
) {
10268 dout(20) << __func__
<< " pending split on merge participant pg " << pgid
10269 << " " << slot
->waiting_for_split
<< dendl
;
10271 dout(20) << __func__
<< " creating empty merge participant " << pgid
10272 << " for merge in " << epoch
<< dendl
;
10273 // leave history zeroed; PG::merge_from() will fill it in.
10274 pg_history_t history
;
10275 PGCreateInfo
cinfo(pgid
, epoch
- 1,
10276 history
, PastIntervals(), false);
10277 PGRef pg
= osd
->handle_pg_create_info(shard_osdmap
, &cinfo
);
10278 _attach_pg(r
.first
->second
.get(), pg
.get());
10279 _wake_pg_slot(pgid
, slot
);
10282 // mark slot for merge
10283 dout(20) << __func__
<< " marking merge participant " << pgid
<< dendl
;
10284 slot
->waiting_for_merge_epoch
= epoch
;
10285 p
= merge_pgs
->erase(p
);
10289 void OSDShard::register_and_wake_split_child(PG
*pg
)
10293 std::lock_guard
l(shard_lock
);
10294 dout(10) << pg
->pg_id
<< " " << pg
<< dendl
;
10295 auto p
= pg_slots
.find(pg
->pg_id
);
10296 ceph_assert(p
!= pg_slots
.end());
10297 auto *slot
= p
->second
.get();
10298 dout(20) << pg
->pg_id
<< " waiting_for_split " << slot
->waiting_for_split
10300 ceph_assert(!slot
->pg
);
10301 ceph_assert(!slot
->waiting_for_split
.empty());
10302 _attach_pg(slot
, pg
);
10304 epoch
= pg
->get_osdmap_epoch();
10305 ceph_assert(slot
->waiting_for_split
.count(epoch
));
10306 slot
->waiting_for_split
.erase(epoch
);
10307 if (slot
->waiting_for_split
.empty()) {
10308 _wake_pg_slot(pg
->pg_id
, slot
);
10310 dout(10) << __func__
<< " still waiting for split on "
10311 << slot
->waiting_for_split
<< dendl
;
10315 // kick child to ensure it pulls up to the latest osdmap
10316 osd
->enqueue_peering_evt(
10319 std::make_shared
<PGPeeringEvent
>(
10324 std::lock_guard l
{sdata_wait_lock
};
10325 sdata_cond
.notify_one();
10328 void OSDShard::unprime_split_children(spg_t parent
, unsigned old_pg_num
)
10330 std::lock_guard
l(shard_lock
);
10331 vector
<spg_t
> to_delete
;
10332 for (auto& i
: pg_slots
) {
10333 if (i
.first
!= parent
&&
10334 i
.first
.get_ancestor(old_pg_num
) == parent
) {
10335 dout(10) << __func__
<< " parent " << parent
<< " clearing " << i
.first
10337 _wake_pg_slot(i
.first
, i
.second
.get());
10338 to_delete
.push_back(i
.first
);
10341 for (auto pgid
: to_delete
) {
10342 pg_slots
.erase(pgid
);
10346 OSDShard::OSDShard(
10353 shard_name(string("OSDShard.") + stringify(id
)),
10354 sdata_wait_lock_name(shard_name
+ "::sdata_wait_lock"),
10355 sdata_wait_lock
{make_mutex(sdata_wait_lock_name
)},
10356 osdmap_lock
{make_mutex(shard_name
+ "::osdmap_lock")},
10357 shard_lock_name(shard_name
+ "::shard_lock"),
10358 shard_lock
{make_mutex(shard_lock_name
)},
10359 scheduler(ceph::osd::scheduler::make_scheduler(cct
)),
10360 context_queue(sdata_wait_lock
, sdata_cond
)
10362 dout(0) << "using op scheduler " << *scheduler
<< dendl
;
// =============================================================

#undef dout_context
#define dout_context osd->cct
#undef dout_prefix
#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10373 void OSD::ShardedOpWQ::_add_slot_waiter(
10375 OSDShardPGSlot
*slot
,
10376 OpSchedulerItem
&& qi
)
10378 if (qi
.is_peering()) {
10379 dout(20) << __func__
<< " " << pgid
10380 << " peering, item epoch is "
10381 << qi
.get_map_epoch()
10382 << ", will wait on " << qi
<< dendl
;
10383 slot
->waiting_peering
[qi
.get_map_epoch()].push_back(std::move(qi
));
10385 dout(20) << __func__
<< " " << pgid
10386 << " item epoch is "
10387 << qi
.get_map_epoch()
10388 << ", will wait on " << qi
<< dendl
;
10389 slot
->waiting
.push_back(std::move(qi
));
#undef dout_prefix
#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10396 void OSD::ShardedOpWQ::_process(uint32_t thread_index
, heartbeat_handle_d
*hb
)
10398 uint32_t shard_index
= thread_index
% osd
->num_shards
;
10399 auto& sdata
= osd
->shards
[shard_index
];
10400 ceph_assert(sdata
);
10402 // If all threads of shards do oncommits, there is a out-of-order
10403 // problem. So we choose the thread which has the smallest
10404 // thread_index(thread_index < num_shards) of shard to do oncommit
10406 bool is_smallest_thread_index
= thread_index
< osd
->num_shards
;
10409 sdata
->shard_lock
.lock();
10410 if (sdata
->scheduler
->empty() &&
10411 (!is_smallest_thread_index
|| sdata
->context_queue
.empty())) {
10412 std::unique_lock wait_lock
{sdata
->sdata_wait_lock
};
10413 if (is_smallest_thread_index
&& !sdata
->context_queue
.empty()) {
10414 // we raced with a context_queue addition, don't wait
10415 wait_lock
.unlock();
10416 } else if (!sdata
->stop_waiting
) {
10417 dout(20) << __func__
<< " empty q, waiting" << dendl
;
10418 osd
->cct
->get_heartbeat_map()->clear_timeout(hb
);
10419 sdata
->shard_lock
.unlock();
10420 sdata
->sdata_cond
.wait(wait_lock
);
10421 wait_lock
.unlock();
10422 sdata
->shard_lock
.lock();
10423 if (sdata
->scheduler
->empty() &&
10424 !(is_smallest_thread_index
&& !sdata
->context_queue
.empty())) {
10425 sdata
->shard_lock
.unlock();
10428 osd
->cct
->get_heartbeat_map()->reset_timeout(hb
,
10429 osd
->cct
->_conf
->threadpool_default_timeout
, 0);
10431 dout(20) << __func__
<< " need return immediately" << dendl
;
10432 wait_lock
.unlock();
10433 sdata
->shard_lock
.unlock();
10438 list
<Context
*> oncommits
;
10439 if (is_smallest_thread_index
) {
10440 sdata
->context_queue
.move_to(oncommits
);
10443 if (sdata
->scheduler
->empty()) {
10444 if (osd
->is_stopping()) {
10445 sdata
->shard_lock
.unlock();
10446 for (auto c
: oncommits
) {
10447 dout(10) << __func__
<< " discarding in-flight oncommit " << c
<< dendl
;
10450 return; // OSD shutdown, discard.
10452 sdata
->shard_lock
.unlock();
10453 handle_oncommits(oncommits
);
10457 OpSchedulerItem item
= sdata
->scheduler
->dequeue();
10458 if (osd
->is_stopping()) {
10459 sdata
->shard_lock
.unlock();
10460 for (auto c
: oncommits
) {
10461 dout(10) << __func__
<< " discarding in-flight oncommit " << c
<< dendl
;
10464 return; // OSD shutdown, discard.
10467 const auto token
= item
.get_ordering_token();
10468 auto r
= sdata
->pg_slots
.emplace(token
, nullptr);
10470 r
.first
->second
= make_unique
<OSDShardPGSlot
>();
10472 OSDShardPGSlot
*slot
= r
.first
->second
.get();
10473 dout(20) << __func__
<< " " << token
10474 << (r
.second
? " (new)" : "")
10475 << " to_process " << slot
->to_process
10476 << " waiting " << slot
->waiting
10477 << " waiting_peering " << slot
->waiting_peering
10479 slot
->to_process
.push_back(std::move(item
));
10480 dout(20) << __func__
<< " " << slot
->to_process
.back()
10481 << " queued" << dendl
;
10484 PGRef pg
= slot
->pg
;
10486 // lock pg (if we have it)
10488 // note the requeue seq now...
10489 uint64_t requeue_seq
= slot
->requeue_seq
;
10490 ++slot
->num_running
;
10492 sdata
->shard_lock
.unlock();
10493 osd
->service
.maybe_inject_dispatch_delay();
10495 osd
->service
.maybe_inject_dispatch_delay();
10496 sdata
->shard_lock
.lock();
10498 auto q
= sdata
->pg_slots
.find(token
);
10499 if (q
== sdata
->pg_slots
.end()) {
10500 // this can happen if we race with pg removal.
10501 dout(20) << __func__
<< " slot " << token
<< " no longer there" << dendl
;
10503 sdata
->shard_lock
.unlock();
10504 handle_oncommits(oncommits
);
10507 slot
= q
->second
.get();
10508 --slot
->num_running
;
10510 if (slot
->to_process
.empty()) {
10511 // raced with _wake_pg_slot or consume_map
10512 dout(20) << __func__
<< " " << token
10513 << " nothing queued" << dendl
;
10515 sdata
->shard_lock
.unlock();
10516 handle_oncommits(oncommits
);
10519 if (requeue_seq
!= slot
->requeue_seq
) {
10520 dout(20) << __func__
<< " " << token
10521 << " requeue_seq " << slot
->requeue_seq
<< " > our "
10522 << requeue_seq
<< ", we raced with _wake_pg_slot"
10525 sdata
->shard_lock
.unlock();
10526 handle_oncommits(oncommits
);
10529 if (slot
->pg
!= pg
) {
10530 // this can happen if we race with pg removal.
10531 dout(20) << __func__
<< " slot " << token
<< " no longer attached to "
10538 dout(20) << __func__
<< " " << token
10539 << " to_process " << slot
->to_process
10540 << " waiting " << slot
->waiting
10541 << " waiting_peering " << slot
->waiting_peering
<< dendl
;
10543 ThreadPool::TPHandle
tp_handle(osd
->cct
, hb
, timeout_interval
,
10547 auto qi
= std::move(slot
->to_process
.front());
10548 slot
->to_process
.pop_front();
10549 dout(20) << __func__
<< " " << qi
<< " pg " << pg
<< dendl
;
10550 set
<pair
<spg_t
,epoch_t
>> new_children
;
10554 // should this pg shard exist on this osd in this (or a later) epoch?
10555 osdmap
= sdata
->shard_osdmap
;
10556 const PGCreateInfo
*create_info
= qi
.creates_pg();
10557 if (!slot
->waiting_for_split
.empty()) {
10558 dout(20) << __func__
<< " " << token
10559 << " splitting " << slot
->waiting_for_split
<< dendl
;
10560 _add_slot_waiter(token
, slot
, std::move(qi
));
10561 } else if (qi
.get_map_epoch() > osdmap
->get_epoch()) {
10562 dout(20) << __func__
<< " " << token
10563 << " map " << qi
.get_map_epoch() << " > "
10564 << osdmap
->get_epoch() << dendl
;
10565 _add_slot_waiter(token
, slot
, std::move(qi
));
10566 } else if (qi
.is_peering()) {
10567 if (!qi
.peering_requires_pg()) {
10568 // for pg-less events, we run them under the ordering lock, since
10569 // we don't have the pg lock to keep them ordered.
10570 qi
.run(osd
, sdata
, pg
, tp_handle
);
10571 } else if (osdmap
->is_up_acting_osd_shard(token
, osd
->whoami
)) {
10573 if (create_info
->by_mon
&&
10574 osdmap
->get_pg_acting_primary(token
.pgid
) != osd
->whoami
) {
10575 dout(20) << __func__
<< " " << token
10576 << " no pg, no longer primary, ignoring mon create on "
10579 dout(20) << __func__
<< " " << token
10580 << " no pg, should create on " << qi
<< dendl
;
10581 pg
= osd
->handle_pg_create_info(osdmap
, create_info
);
10583 // we created the pg! drop out and continue "normally"!
10584 sdata
->_attach_pg(slot
, pg
.get());
10585 sdata
->_wake_pg_slot(token
, slot
);
10587 // identify split children between create epoch and shard epoch.
10588 osd
->service
.identify_splits_and_merges(
10589 pg
->get_osdmap(), osdmap
, pg
->pg_id
, &new_children
, nullptr);
10590 sdata
->_prime_splits(&new_children
);
10591 // distribute remaining split children to other shards below!
10594 dout(20) << __func__
<< " ignored create on " << qi
<< dendl
;
10597 dout(20) << __func__
<< " " << token
10598 << " no pg, peering, !create, discarding " << qi
<< dendl
;
10601 dout(20) << __func__
<< " " << token
10602 << " no pg, peering, doesn't map here e" << osdmap
->get_epoch()
10603 << ", discarding " << qi
10606 } else if (osdmap
->is_up_acting_osd_shard(token
, osd
->whoami
)) {
10607 dout(20) << __func__
<< " " << token
10608 << " no pg, should exist e" << osdmap
->get_epoch()
10609 << ", will wait on " << qi
<< dendl
;
10610 _add_slot_waiter(token
, slot
, std::move(qi
));
10612 dout(20) << __func__
<< " " << token
10613 << " no pg, shouldn't exist e" << osdmap
->get_epoch()
10614 << ", dropping " << qi
<< dendl
;
10615 // share map with client?
10616 if (std::optional
<OpRequestRef
> _op
= qi
.maybe_get_op()) {
10617 osd
->service
.maybe_share_map((*_op
)->get_req()->get_connection().get(),
10618 sdata
->shard_osdmap
,
10619 (*_op
)->sent_epoch
);
10621 unsigned pushes_to_free
= qi
.get_reserved_pushes();
10622 if (pushes_to_free
> 0) {
10623 sdata
->shard_lock
.unlock();
10624 osd
->service
.release_reserved_pushes(pushes_to_free
);
10625 handle_oncommits(oncommits
);
10629 sdata
->shard_lock
.unlock();
10630 handle_oncommits(oncommits
);
10633 if (qi
.is_peering()) {
10634 OSDMapRef osdmap
= sdata
->shard_osdmap
;
10635 if (qi
.get_map_epoch() > osdmap
->get_epoch()) {
10636 _add_slot_waiter(token
, slot
, std::move(qi
));
10637 sdata
->shard_lock
.unlock();
10639 handle_oncommits(oncommits
);
10643 sdata
->shard_lock
.unlock();
10645 if (!new_children
.empty()) {
10646 for (auto shard
: osd
->shards
) {
10647 shard
->prime_splits(osdmap
, &new_children
);
10649 ceph_assert(new_children
.empty());
10652 // osd_opwq_process marks the point at which an operation has been dequeued
10653 // and will begin to be handled by a worker thread.
10657 if (std::optional
<OpRequestRef
> _op
= qi
.maybe_get_op()) {
10658 reqid
= (*_op
)->get_reqid();
10661 tracepoint(osd
, opwq_process_start
, reqid
.name
._type
,
10662 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
10665 lgeneric_subdout(osd
->cct
, osd
, 30) << "dequeue status: ";
10666 Formatter
*f
= Formatter::create("json");
10667 f
->open_object_section("q");
10669 f
->close_section();
10674 qi
.run(osd
, sdata
, pg
, tp_handle
);
10679 if (std::optional
<OpRequestRef
> _op
= qi
.maybe_get_op()) {
10680 reqid
= (*_op
)->get_reqid();
10683 tracepoint(osd
, opwq_process_finish
, reqid
.name
._type
,
10684 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
10687 handle_oncommits(oncommits
);
10690 void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem
&& item
) {
10691 uint32_t shard_index
=
10692 item
.get_ordering_token().hash_to_shard(osd
->shards
.size());
10694 dout(20) << __func__
<< " " << item
<< dendl
;
10696 OSDShard
* sdata
= osd
->shards
[shard_index
];
10697 assert (NULL
!= sdata
);
10701 std::lock_guard l
{sdata
->shard_lock
};
10702 empty
= sdata
->scheduler
->empty();
10703 sdata
->scheduler
->enqueue(std::move(item
));
10707 std::lock_guard l
{sdata
->sdata_wait_lock
};
10708 sdata
->sdata_cond
.notify_one();
10712 void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem
&& item
)
10714 auto shard_index
= item
.get_ordering_token().hash_to_shard(osd
->shards
.size());
10715 auto& sdata
= osd
->shards
[shard_index
];
10716 ceph_assert(sdata
);
10717 sdata
->shard_lock
.lock();
10718 auto p
= sdata
->pg_slots
.find(item
.get_ordering_token());
10719 if (p
!= sdata
->pg_slots
.end() &&
10720 !p
->second
->to_process
.empty()) {
10721 // we may be racing with _process, which has dequeued a new item
10722 // from scheduler, put it on to_process, and is now busy taking the
10723 // pg lock. ensure this old requeued item is ordered before any
10724 // such newer item in to_process.
10725 p
->second
->to_process
.push_front(std::move(item
));
10726 item
= std::move(p
->second
->to_process
.back());
10727 p
->second
->to_process
.pop_back();
10728 dout(20) << __func__
10729 << " " << p
->second
->to_process
.front()
10730 << " shuffled w/ " << item
<< dendl
;
10732 dout(20) << __func__
<< " " << item
<< dendl
;
10734 sdata
->scheduler
->enqueue_front(std::move(item
));
10735 sdata
->shard_lock
.unlock();
10736 std::lock_guard l
{sdata
->sdata_wait_lock
};
10737 sdata
->sdata_cond
.notify_one();
10741 namespace osd_cmds
{
10743 int heap(CephContext
& cct
, const cmdmap_t
& cmdmap
, Formatter
& f
,
10746 if (!ceph_using_tcmalloc()) {
10747 os
<< "could not issue heap profiler command -- not using tcmalloc!";
10748 return -EOPNOTSUPP
;
10752 if (!cmd_getval(cmdmap
, "heapcmd", cmd
)) {
10753 os
<< "unable to get value for command \"" << cmd
<< "\"";
10757 std::vector
<std::string
> cmd_vec
;
10758 get_str_vec(cmd
, cmd_vec
);
10761 if (cmd_getval(cmdmap
, "value", val
)) {
10762 cmd_vec
.push_back(val
);
10765 ceph_heap_profiler_handle_command(cmd_vec
, os
);
10770 }} // namespace ceph::osd_cmds