1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
27 #include <boost/scoped_ptr.hpp>
28 #include <boost/range/adaptor/reversed.hpp>
30 #ifdef HAVE_SYS_PARAM_H
31 #include <sys/param.h>
34 #ifdef HAVE_SYS_MOUNT_H
35 #include <sys/mount.h>
40 #include "include/types.h"
41 #include "include/compat.h"
42 #include "include/random.h"
47 #include "osdc/Objecter.h"
49 #include "common/errno.h"
50 #include "common/ceph_argparse.h"
51 #include "common/ceph_releases.h"
52 #include "common/ceph_time.h"
53 #include "common/version.h"
54 #include "common/pick_address.h"
55 #include "common/blkdev.h"
56 #include "common/numa.h"
58 #include "os/ObjectStore.h"
60 #include "os/FuseStore.h"
63 #include "PrimaryLogPG.h"
65 #include "msg/Messenger.h"
66 #include "msg/Message.h"
68 #include "mon/MonClient.h"
70 #include "messages/MLog.h"
72 #include "messages/MGenericMessage.h"
73 #include "messages/MOSDPing.h"
74 #include "messages/MOSDFailure.h"
75 #include "messages/MOSDMarkMeDown.h"
76 #include "messages/MOSDMarkMeDead.h"
77 #include "messages/MOSDFull.h"
78 #include "messages/MOSDOp.h"
79 #include "messages/MOSDOpReply.h"
80 #include "messages/MOSDBackoff.h"
81 #include "messages/MOSDBeacon.h"
82 #include "messages/MOSDRepOp.h"
83 #include "messages/MOSDRepOpReply.h"
84 #include "messages/MOSDBoot.h"
85 #include "messages/MOSDPGTemp.h"
86 #include "messages/MOSDPGReadyToMerge.h"
88 #include "messages/MOSDMap.h"
89 #include "messages/MMonGetOSDMap.h"
90 #include "messages/MOSDPGNotify.h"
91 #include "messages/MOSDPGNotify2.h"
92 #include "messages/MOSDPGQuery.h"
93 #include "messages/MOSDPGQuery2.h"
94 #include "messages/MOSDPGLog.h"
95 #include "messages/MOSDPGRemove.h"
96 #include "messages/MOSDPGInfo.h"
97 #include "messages/MOSDPGInfo2.h"
98 #include "messages/MOSDPGCreate.h"
99 #include "messages/MOSDPGCreate2.h"
100 #include "messages/MOSDPGScan.h"
101 #include "messages/MBackfillReserve.h"
102 #include "messages/MRecoveryReserve.h"
103 #include "messages/MOSDForceRecovery.h"
104 #include "messages/MOSDECSubOpWrite.h"
105 #include "messages/MOSDECSubOpWriteReply.h"
106 #include "messages/MOSDECSubOpRead.h"
107 #include "messages/MOSDECSubOpReadReply.h"
108 #include "messages/MOSDPGCreated.h"
109 #include "messages/MOSDPGUpdateLogMissing.h"
110 #include "messages/MOSDPGUpdateLogMissingReply.h"
112 #include "messages/MOSDPeeringOp.h"
114 #include "messages/MOSDAlive.h"
116 #include "messages/MOSDScrub.h"
117 #include "messages/MOSDScrub2.h"
118 #include "messages/MOSDRepScrub.h"
120 #include "messages/MCommand.h"
121 #include "messages/MCommandReply.h"
123 #include "messages/MPGStats.h"
124 #include "messages/MPGStatsAck.h"
126 #include "messages/MWatchNotify.h"
127 #include "messages/MOSDPGPush.h"
128 #include "messages/MOSDPGPushReply.h"
129 #include "messages/MOSDPGPull.h"
131 #include "messages/MMonGetPurgedSnaps.h"
132 #include "messages/MMonGetPurgedSnapsReply.h"
134 #include "common/perf_counters.h"
135 #include "common/Timer.h"
136 #include "common/LogClient.h"
137 #include "common/AsyncReserver.h"
138 #include "common/HeartbeatMap.h"
139 #include "common/admin_socket.h"
140 #include "common/ceph_context.h"
142 #include "global/signal_handler.h"
143 #include "global/pidfile.h"
145 #include "include/color.h"
146 #include "perfglue/cpu_profiler.h"
147 #include "perfglue/heap_profiler.h"
149 #include "osd/OpRequest.h"
151 #include "auth/AuthAuthorizeHandler.h"
152 #include "auth/RotatingKeyRing.h"
154 #include "objclass/objclass.h"
156 #include "common/cmdparse.h"
157 #include "include/str_list.h"
158 #include "include/util.h"
160 #include "include/ceph_assert.h"
161 #include "common/config.h"
162 #include "common/EventTrace.h"
164 #include "json_spirit/json_spirit_reader.h"
165 #include "json_spirit/json_spirit_writer.h"
168 #define TRACEPOINT_DEFINE
169 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
170 #include "tracing/osd.h"
171 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
172 #undef TRACEPOINT_DEFINE
174 #define tracepoint(...)
177 #define dout_context cct
178 #define dout_subsys ceph_subsys_osd
180 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
182 using namespace ceph::osd::scheduler
;
183 using TOPNSPC::common::cmd_getval
;
185 static ostream
& _prefix(std::ostream
* _dout
, int whoami
, epoch_t epoch
) {
186 return *_dout
<< "osd." << whoami
<< " " << epoch
<< " ";
189 //Initial features in new superblock.
190 //Features here are also automatically upgraded
191 CompatSet
OSD::get_osd_initial_compat_set() {
192 CompatSet::FeatureSet ceph_osd_feature_compat
;
193 CompatSet::FeatureSet ceph_osd_feature_ro_compat
;
194 CompatSet::FeatureSet ceph_osd_feature_incompat
;
195 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE
);
196 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO
);
197 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC
);
198 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC
);
199 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES
);
200 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL
);
201 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO
);
202 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO
);
203 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG
);
204 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER
);
205 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS
);
206 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA
);
207 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING
);
208 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO
);
209 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES
);
210 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2
);
211 return CompatSet(ceph_osd_feature_compat
, ceph_osd_feature_ro_compat
,
212 ceph_osd_feature_incompat
);
215 //Features are added here that this OSD supports.
216 CompatSet
OSD::get_osd_compat_set() {
217 CompatSet compat
= get_osd_initial_compat_set();
218 //Any features here can be set in code, but not in initial superblock
219 compat
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
);
223 OSDService::OSDService(OSD
*osd
) :
226 whoami(osd
->whoami
), store(osd
->store
),
227 log_client(osd
->log_client
), clog(osd
->clog
),
228 pg_recovery_stats(osd
->pg_recovery_stats
),
229 cluster_messenger(osd
->cluster_messenger
),
230 client_messenger(osd
->client_messenger
),
232 recoverystate_perf(osd
->recoverystate_perf
),
234 osd_max_object_size(cct
->_conf
, "osd_max_object_size"),
235 osd_skip_data_digest(cct
->_conf
, "osd_skip_data_digest"),
236 publish_lock
{ceph::make_mutex("OSDService::publish_lock")},
237 pre_publish_lock
{ceph::make_mutex("OSDService::pre_publish_lock")},
241 agent_valid_iterator(false),
243 flush_mode_high_count(0),
246 agent_stop_flag(false),
247 agent_timer(osd
->client_messenger
->cct
, agent_timer_lock
),
248 last_recalibrate(ceph_clock_now()),
249 promote_max_objects(0),
250 promote_max_bytes(0),
251 objecter(make_unique
<Objecter
>(osd
->client_messenger
->cct
,
252 osd
->objecter_messenger
,
253 osd
->monc
, nullptr, 0, 0)),
254 m_objecter_finishers(cct
->_conf
->osd_objecter_finishers
),
255 watch_timer(osd
->client_messenger
->cct
, watch_lock
),
257 recovery_request_timer(cct
, recovery_request_lock
, false),
258 sleep_timer(cct
, sleep_lock
, false),
259 reserver_finisher(cct
),
260 local_reserver(cct
, &reserver_finisher
, cct
->_conf
->osd_max_backfills
,
261 cct
->_conf
->osd_min_recovery_priority
),
262 remote_reserver(cct
, &reserver_finisher
, cct
->_conf
->osd_max_backfills
,
263 cct
->_conf
->osd_min_recovery_priority
),
264 snap_reserver(cct
, &reserver_finisher
,
265 cct
->_conf
->osd_max_trimming_pgs
),
266 recovery_ops_active(0),
267 recovery_ops_reserved(0),
268 recovery_paused(false),
269 map_cache(cct
, cct
->_conf
->osd_map_cache_size
),
270 map_bl_cache(cct
->_conf
->osd_map_cache_size
),
271 map_bl_inc_cache(cct
->_conf
->osd_map_cache_size
),
273 cur_ratio(0), physical_ratio(0),
274 boot_epoch(0), up_epoch(0), bind_epoch(0)
278 for (int i
= 0; i
< m_objecter_finishers
; i
++) {
280 str
<< "objecter-finisher-" << i
;
281 auto fin
= make_unique
<Finisher
>(osd
->client_messenger
->cct
, str
.str(), "finisher");
282 objecter_finishers
.push_back(std::move(fin
));
287 void OSDService::add_pgid(spg_t pgid
, PG
*pg
){
288 std::lock_guard
l(pgid_lock
);
289 if (!pgid_tracker
.count(pgid
)) {
292 pgid_tracker
[pgid
]++;
294 void OSDService::remove_pgid(spg_t pgid
, PG
*pg
)
296 std::lock_guard
l(pgid_lock
);
297 ceph_assert(pgid_tracker
.count(pgid
));
298 ceph_assert(pgid_tracker
[pgid
] > 0);
299 pgid_tracker
[pgid
]--;
300 if (pgid_tracker
[pgid
] == 0) {
301 pgid_tracker
.erase(pgid
);
302 live_pgs
.erase(pgid
);
305 void OSDService::dump_live_pgids()
307 std::lock_guard
l(pgid_lock
);
308 derr
<< "live pgids:" << dendl
;
309 for (map
<spg_t
, int>::const_iterator i
= pgid_tracker
.cbegin();
310 i
!= pgid_tracker
.cend();
312 derr
<< "\t" << *i
<< dendl
;
313 live_pgs
[i
->first
]->dump_live_ids();
319 ceph::signedspan
OSDService::get_mnow()
321 return ceph::mono_clock::now() - osd
->startup_time
;
324 void OSDService::identify_splits_and_merges(
328 set
<pair
<spg_t
,epoch_t
>> *split_children
,
329 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
331 if (!old_map
->have_pg_pool(pgid
.pool())) {
334 int old_pgnum
= old_map
->get_pg_num(pgid
.pool());
335 auto p
= osd
->pg_num_history
.pg_nums
.find(pgid
.pool());
336 if (p
== osd
->pg_num_history
.pg_nums
.end()) {
339 dout(20) << __func__
<< " " << pgid
<< " e" << old_map
->get_epoch()
340 << " to e" << new_map
->get_epoch()
341 << " pg_nums " << p
->second
<< dendl
;
343 queue
.push_back(pgid
);
345 while (!queue
.empty()) {
346 auto cur
= queue
.front();
349 unsigned pgnum
= old_pgnum
;
350 for (auto q
= p
->second
.lower_bound(old_map
->get_epoch());
351 q
!= p
->second
.end() &&
352 q
->first
<= new_map
->get_epoch();
354 if (pgnum
< q
->second
) {
356 if (cur
.ps() < pgnum
) {
358 if (cur
.is_split(pgnum
, q
->second
, &children
)) {
359 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
360 << " pg_num " << pgnum
<< " -> " << q
->second
361 << " children " << children
<< dendl
;
362 for (auto i
: children
) {
363 split_children
->insert(make_pair(i
, q
->first
));
368 } else if (cur
.ps() < q
->second
) {
369 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
370 << " pg_num " << pgnum
<< " -> " << q
->second
371 << " is a child" << dendl
;
372 // normally we'd capture this from the parent, but it's
373 // possible the parent doesn't exist yet (it will be
374 // fabricated to allow an intervening merge). note this PG
375 // as a split child here to be sure we catch it.
376 split_children
->insert(make_pair(cur
, q
->first
));
378 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
379 << " pg_num " << pgnum
<< " -> " << q
->second
380 << " is post-split, skipping" << dendl
;
382 } else if (merge_pgs
) {
384 if (cur
.ps() >= q
->second
) {
385 if (cur
.ps() < pgnum
) {
387 if (cur
.is_merge_source(pgnum
, q
->second
, &parent
)) {
389 parent
.is_split(q
->second
, pgnum
, &children
);
390 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
391 << " pg_num " << pgnum
<< " -> " << q
->second
392 << " is merge source, target " << parent
393 << ", source(s) " << children
<< dendl
;
394 merge_pgs
->insert(make_pair(parent
, q
->first
));
395 if (!did
.count(parent
)) {
396 // queue (and re-scan) parent in case it might not exist yet
397 // and there are some future splits pending on it
398 queue
.push_back(parent
);
400 for (auto c
: children
) {
401 merge_pgs
->insert(make_pair(c
, q
->first
));
407 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
408 << " pg_num " << pgnum
<< " -> " << q
->second
409 << " is beyond old pgnum, skipping" << dendl
;
413 if (cur
.is_split(q
->second
, pgnum
, &children
)) {
414 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
415 << " pg_num " << pgnum
<< " -> " << q
->second
416 << " is merge target, source " << children
<< dendl
;
417 for (auto c
: children
) {
418 merge_pgs
->insert(make_pair(c
, q
->first
));
422 merge_pgs
->insert(make_pair(cur
, q
->first
));
431 void OSDService::need_heartbeat_peer_update()
433 osd
->need_heartbeat_peer_update();
436 HeartbeatStampsRef
OSDService::get_hb_stamps(unsigned peer
)
438 std::lock_guard
l(hb_stamp_lock
);
439 if (peer
>= hb_stamps
.size()) {
440 hb_stamps
.resize(peer
+ 1);
442 if (!hb_stamps
[peer
]) {
443 hb_stamps
[peer
] = ceph::make_ref
<HeartbeatStamps
>(peer
);
445 return hb_stamps
[peer
];
448 void OSDService::queue_renew_lease(epoch_t epoch
, spg_t spgid
)
450 osd
->enqueue_peering_evt(
453 std::make_shared
<PGPeeringEvent
>(
458 void OSDService::start_shutdown()
461 std::lock_guard
l(agent_timer_lock
);
462 agent_timer
.shutdown();
466 std::lock_guard
l(sleep_lock
);
467 sleep_timer
.shutdown();
471 std::lock_guard
l(recovery_request_lock
);
472 recovery_request_timer
.shutdown();
476 void OSDService::shutdown_reserver()
478 reserver_finisher
.wait_for_empty();
479 reserver_finisher
.stop();
482 void OSDService::shutdown()
484 mono_timer
.suspend();
487 std::lock_guard
l(watch_lock
);
488 watch_timer
.shutdown();
491 objecter
->shutdown();
492 for (auto& f
: objecter_finishers
) {
497 publish_map(OSDMapRef());
498 next_osdmap
= OSDMapRef();
501 void OSDService::init()
503 reserver_finisher
.start();
504 for (auto& f
: objecter_finishers
) {
507 objecter
->set_client_incarnation(0);
509 // deprioritize objecter in daemonperf output
510 objecter
->get_logger()->set_prio_adjust(-3);
516 agent_thread
.create("osd_srv_agent");
518 if (cct
->_conf
->osd_recovery_delay_start
)
519 defer_recovery(cct
->_conf
->osd_recovery_delay_start
);
522 void OSDService::final_init()
524 objecter
->start(osdmap
.get());
527 void OSDService::activate_map()
529 // wake/unwake the tiering agent
530 std::lock_guard l
{agent_lock
};
532 !osdmap
->test_flag(CEPH_OSDMAP_NOTIERAGENT
) &&
534 agent_cond
.notify_all();
537 void OSDService::request_osdmap_update(epoch_t e
)
539 osd
->osdmap_subscribe(e
, false);
543 class AgentTimeoutCB
: public Context
{
546 explicit AgentTimeoutCB(PGRef _pg
) : pg(_pg
) {}
547 void finish(int) override
{
548 pg
->agent_choose_mode_restart();
552 void OSDService::agent_entry()
554 dout(10) << __func__
<< " start" << dendl
;
555 std::unique_lock agent_locker
{agent_lock
};
557 while (!agent_stop_flag
) {
558 if (agent_queue
.empty()) {
559 dout(20) << __func__
<< " empty queue" << dendl
;
560 agent_cond
.wait(agent_locker
);
563 uint64_t level
= agent_queue
.rbegin()->first
;
564 set
<PGRef
>& top
= agent_queue
.rbegin()->second
;
566 << " tiers " << agent_queue
.size()
567 << ", top is " << level
568 << " with pgs " << top
.size()
569 << ", ops " << agent_ops
<< "/"
570 << cct
->_conf
->osd_agent_max_ops
571 << (agent_active
? " active" : " NOT ACTIVE")
573 dout(20) << __func__
<< " oids " << agent_oids
<< dendl
;
574 int max
= cct
->_conf
->osd_agent_max_ops
- agent_ops
;
575 int agent_flush_quota
= max
;
576 if (!flush_mode_high_count
)
577 agent_flush_quota
= cct
->_conf
->osd_agent_max_low_ops
- agent_ops
;
578 if (agent_flush_quota
<= 0 || top
.empty() || !agent_active
) {
579 agent_cond
.wait(agent_locker
);
583 if (!agent_valid_iterator
|| agent_queue_pos
== top
.end()) {
584 agent_queue_pos
= top
.begin();
585 agent_valid_iterator
= true;
587 PGRef pg
= *agent_queue_pos
;
588 dout(10) << "high_count " << flush_mode_high_count
589 << " agent_ops " << agent_ops
590 << " flush_quota " << agent_flush_quota
<< dendl
;
591 agent_locker
.unlock();
592 if (!pg
->agent_work(max
, agent_flush_quota
)) {
593 dout(10) << __func__
<< " " << pg
->pg_id
594 << " no agent_work, delay for " << cct
->_conf
->osd_agent_delay_time
595 << " seconds" << dendl
;
597 osd
->logger
->inc(l_osd_tier_delay
);
598 // Queue a timer to call agent_choose_mode for this pg in 5 seconds
599 std::lock_guard timer_locker
{agent_timer_lock
};
600 Context
*cb
= new AgentTimeoutCB(pg
);
601 agent_timer
.add_event_after(cct
->_conf
->osd_agent_delay_time
, cb
);
605 dout(10) << __func__
<< " finish" << dendl
;
608 void OSDService::agent_stop()
611 std::lock_guard
l(agent_lock
);
613 // By this time all ops should be cancelled
614 ceph_assert(agent_ops
== 0);
615 // By this time all PGs are shutdown and dequeued
616 if (!agent_queue
.empty()) {
617 set
<PGRef
>& top
= agent_queue
.rbegin()->second
;
618 derr
<< "agent queue not empty, for example " << (*top
.begin())->get_pgid() << dendl
;
619 ceph_abort_msg("agent queue not empty");
622 agent_stop_flag
= true;
623 agent_cond
.notify_all();
628 // -------------------------------------
630 void OSDService::promote_throttle_recalibrate()
632 utime_t now
= ceph_clock_now();
633 double dur
= now
- last_recalibrate
;
634 last_recalibrate
= now
;
635 unsigned prob
= promote_probability_millis
;
637 uint64_t target_obj_sec
= cct
->_conf
->osd_tier_promote_max_objects_sec
;
638 uint64_t target_bytes_sec
= cct
->_conf
->osd_tier_promote_max_bytes_sec
;
640 unsigned min_prob
= 1;
642 uint64_t attempts
, obj
, bytes
;
643 promote_counter
.sample_and_attenuate(&attempts
, &obj
, &bytes
);
644 dout(10) << __func__
<< " " << attempts
<< " attempts, promoted "
645 << obj
<< " objects and " << byte_u_t(bytes
) << "; target "
646 << target_obj_sec
<< " obj/sec or "
647 << byte_u_t(target_bytes_sec
) << "/sec"
650 // calculate what the probability *should* be, given the targets
652 if (attempts
&& dur
> 0) {
653 uint64_t avg_size
= 1;
655 avg_size
= std::max
<uint64_t>(bytes
/ obj
, 1);
656 unsigned po
= (double)target_obj_sec
* dur
* 1000.0 / (double)attempts
;
657 unsigned pb
= (double)target_bytes_sec
/ (double)avg_size
* dur
* 1000.0
659 dout(20) << __func__
<< " po " << po
<< " pb " << pb
<< " avg_size "
660 << avg_size
<< dendl
;
661 if (target_obj_sec
&& target_bytes_sec
)
662 new_prob
= std::min(po
, pb
);
663 else if (target_obj_sec
)
665 else if (target_bytes_sec
)
672 dout(20) << __func__
<< " new_prob " << new_prob
<< dendl
;
674 // correct for persistent skew between target rate and actual rate, adjust
677 if (attempts
&& obj
) {
678 actual
= obj
* 1000 / attempts
;
679 ratio
= (double)actual
/ (double)prob
;
680 new_prob
= (double)new_prob
/ ratio
;
682 new_prob
= std::max(new_prob
, min_prob
);
683 new_prob
= std::min(new_prob
, 1000u);
686 prob
= (prob
+ new_prob
) / 2;
687 prob
= std::max(prob
, min_prob
);
688 prob
= std::min(prob
, 1000u);
689 dout(10) << __func__
<< " actual " << actual
690 << ", actual/prob ratio " << ratio
691 << ", adjusted new_prob " << new_prob
692 << ", prob " << promote_probability_millis
<< " -> " << prob
694 promote_probability_millis
= prob
;
696 // set hard limits for this interval to mitigate stampedes
697 promote_max_objects
= target_obj_sec
* osd
->OSD_TICK_INTERVAL
* 2;
698 promote_max_bytes
= target_bytes_sec
* osd
->OSD_TICK_INTERVAL
* 2;
701 // -------------------------------------
703 float OSDService::get_failsafe_full_ratio()
705 float full_ratio
= cct
->_conf
->osd_failsafe_full_ratio
;
706 if (full_ratio
> 1.0) full_ratio
/= 100.0;
710 OSDService::s_names
OSDService::recalc_full_state(float ratio
, float pratio
, string
&inject
)
712 // The OSDMap ratios take precendence. So if the failsafe is .95 and
713 // the admin sets the cluster full to .96, the failsafe moves up to .96
714 // too. (Not that having failsafe == full is ideal, but it's better than
715 // dropping writes before the clusters appears full.)
716 OSDMapRef osdmap
= get_osdmap();
717 if (!osdmap
|| osdmap
->get_epoch() == 0) {
720 float nearfull_ratio
= osdmap
->get_nearfull_ratio();
721 float backfillfull_ratio
= std::max(osdmap
->get_backfillfull_ratio(), nearfull_ratio
);
722 float full_ratio
= std::max(osdmap
->get_full_ratio(), backfillfull_ratio
);
723 float failsafe_ratio
= std::max(get_failsafe_full_ratio(), full_ratio
);
725 if (osdmap
->require_osd_release
< ceph_release_t::luminous
) {
726 // use the failsafe for nearfull and full; the mon isn't using the
727 // flags anyway because we're mid-upgrade.
728 full_ratio
= failsafe_ratio
;
729 backfillfull_ratio
= failsafe_ratio
;
730 nearfull_ratio
= failsafe_ratio
;
731 } else if (full_ratio
<= 0 ||
732 backfillfull_ratio
<= 0 ||
733 nearfull_ratio
<= 0) {
734 derr
<< __func__
<< " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl
;
735 // use failsafe flag. ick. the monitor did something wrong or the user
736 // did something stupid.
737 full_ratio
= failsafe_ratio
;
738 backfillfull_ratio
= failsafe_ratio
;
739 nearfull_ratio
= failsafe_ratio
;
742 if (injectfull_state
> NONE
&& injectfull
) {
743 inject
= "(Injected)";
744 return injectfull_state
;
745 } else if (pratio
> failsafe_ratio
) {
747 } else if (ratio
> full_ratio
) {
749 } else if (ratio
> backfillfull_ratio
) {
751 } else if (pratio
> nearfull_ratio
) {
757 void OSDService::check_full_status(float ratio
, float pratio
)
759 std::lock_guard
l(full_status_lock
);
762 physical_ratio
= pratio
;
766 new_state
= recalc_full_state(ratio
, pratio
, inject
);
768 dout(20) << __func__
<< " cur ratio " << ratio
769 << ", physical ratio " << pratio
770 << ", new state " << get_full_state_name(new_state
)
775 if (cur_state
!= new_state
) {
776 dout(10) << __func__
<< " " << get_full_state_name(cur_state
)
777 << " -> " << get_full_state_name(new_state
) << dendl
;
778 if (new_state
== FAILSAFE
) {
779 clog
->error() << "full status failsafe engaged, dropping updates, now "
780 << (int)roundf(ratio
* 100) << "% full";
781 } else if (cur_state
== FAILSAFE
) {
782 clog
->error() << "full status failsafe disengaged, no longer dropping "
783 << "updates, now " << (int)roundf(ratio
* 100) << "% full";
785 cur_state
= new_state
;
789 bool OSDService::need_fullness_update()
791 OSDMapRef osdmap
= get_osdmap();
793 if (osdmap
->exists(whoami
)) {
794 if (osdmap
->get_state(whoami
) & CEPH_OSD_FULL
) {
796 } else if (osdmap
->get_state(whoami
) & CEPH_OSD_BACKFILLFULL
) {
798 } else if (osdmap
->get_state(whoami
) & CEPH_OSD_NEARFULL
) {
805 else if (is_backfillfull())
807 else if (is_nearfull())
812 bool OSDService::_check_inject_full(DoutPrefixProvider
*dpp
, s_names type
) const
814 if (injectfull
&& injectfull_state
>= type
) {
815 // injectfull is either a count of the number of times to return failsafe full
816 // or if -1 then always return full
819 ldpp_dout(dpp
, 10) << __func__
<< " Injected " << get_full_state_name(type
) << " OSD ("
820 << (injectfull
< 0 ? "set" : std::to_string(injectfull
)) << ")"
827 bool OSDService::_check_full(DoutPrefixProvider
*dpp
, s_names type
) const
829 std::lock_guard
l(full_status_lock
);
831 if (_check_inject_full(dpp
, type
))
834 if (cur_state
>= type
)
835 ldpp_dout(dpp
, 10) << __func__
<< " current usage is " << cur_ratio
836 << " physical " << physical_ratio
<< dendl
;
838 return cur_state
>= type
;
841 bool OSDService::_tentative_full(DoutPrefixProvider
*dpp
, s_names type
, uint64_t adjust_used
, osd_stat_t adjusted_stat
)
843 ldpp_dout(dpp
, 20) << __func__
<< " type " << get_full_state_name(type
) << " adjust_used " << (adjust_used
>> 10) << "KiB" << dendl
;
845 std::lock_guard
l(full_status_lock
);
846 if (_check_inject_full(dpp
, type
)) {
852 float ratio
= compute_adjusted_ratio(adjusted_stat
, &pratio
, adjust_used
);
855 s_names tentative_state
= recalc_full_state(ratio
, pratio
, notused
);
857 if (tentative_state
>= type
)
858 ldpp_dout(dpp
, 10) << __func__
<< " tentative usage is " << ratio
<< dendl
;
860 return tentative_state
>= type
;
863 bool OSDService::check_failsafe_full(DoutPrefixProvider
*dpp
) const
865 return _check_full(dpp
, FAILSAFE
);
868 bool OSDService::check_full(DoutPrefixProvider
*dpp
) const
870 return _check_full(dpp
, FULL
);
873 bool OSDService::tentative_backfill_full(DoutPrefixProvider
*dpp
, uint64_t adjust_used
, osd_stat_t stats
)
875 return _tentative_full(dpp
, BACKFILLFULL
, adjust_used
, stats
);
878 bool OSDService::check_backfill_full(DoutPrefixProvider
*dpp
) const
880 return _check_full(dpp
, BACKFILLFULL
);
883 bool OSDService::check_nearfull(DoutPrefixProvider
*dpp
) const
885 return _check_full(dpp
, NEARFULL
);
888 bool OSDService::is_failsafe_full() const
890 std::lock_guard
l(full_status_lock
);
891 return cur_state
== FAILSAFE
;
894 bool OSDService::is_full() const
896 std::lock_guard
l(full_status_lock
);
897 return cur_state
>= FULL
;
900 bool OSDService::is_backfillfull() const
902 std::lock_guard
l(full_status_lock
);
903 return cur_state
>= BACKFILLFULL
;
906 bool OSDService::is_nearfull() const
908 std::lock_guard
l(full_status_lock
);
909 return cur_state
>= NEARFULL
;
912 void OSDService::set_injectfull(s_names type
, int64_t count
)
914 std::lock_guard
l(full_status_lock
);
915 injectfull_state
= type
;
919 void OSDService::set_statfs(const struct store_statfs_t
&stbuf
,
920 osd_alert_list_t
& alerts
)
922 uint64_t bytes
= stbuf
.total
;
923 uint64_t avail
= stbuf
.available
;
924 uint64_t used
= stbuf
.get_used_raw();
926 // For testing fake statfs values so it doesn't matter if all
927 // OSDs are using the same partition.
928 if (cct
->_conf
->fake_statfs_for_testing
) {
929 uint64_t total_num_bytes
= 0;
933 total_num_bytes
+= p
->get_stats_num_bytes();
935 bytes
= cct
->_conf
->fake_statfs_for_testing
;
936 if (total_num_bytes
< bytes
)
937 avail
= bytes
- total_num_bytes
;
940 dout(0) << __func__
<< " fake total " << cct
->_conf
->fake_statfs_for_testing
941 << " adjust available " << avail
943 used
= bytes
- avail
;
946 osd
->logger
->set(l_osd_stat_bytes
, bytes
);
947 osd
->logger
->set(l_osd_stat_bytes_used
, used
);
948 osd
->logger
->set(l_osd_stat_bytes_avail
, avail
);
950 std::lock_guard
l(stat_lock
);
951 osd_stat
.statfs
= stbuf
;
952 osd_stat
.os_alerts
.clear();
953 osd_stat
.os_alerts
[whoami
].swap(alerts
);
954 if (cct
->_conf
->fake_statfs_for_testing
) {
955 osd_stat
.statfs
.total
= bytes
;
956 osd_stat
.statfs
.available
= avail
;
957 // For testing don't want used to go negative, so clear reserved
958 osd_stat
.statfs
.internally_reserved
= 0;
962 osd_stat_t
OSDService::set_osd_stat(vector
<int>& hb_peers
,
965 utime_t now
= ceph_clock_now();
966 auto stale_time
= g_conf().get_val
<int64_t>("osd_mon_heartbeat_stat_stale");
967 std::lock_guard
l(stat_lock
);
968 osd_stat
.hb_peers
.swap(hb_peers
);
969 osd
->op_tracker
.get_age_ms_histogram(&osd_stat
.op_queue_age_hist
);
970 osd_stat
.num_pgs
= num_pgs
;
971 // Clean entries that aren't updated
972 // This is called often enough that we can just remove 1 at a time
973 for (auto i
: osd_stat
.hb_pingtime
) {
974 if (i
.second
.last_update
== 0)
976 if (stale_time
&& now
.sec() - i
.second
.last_update
> stale_time
) {
977 dout(20) << __func__
<< " time out heartbeat for osd " << i
.first
978 << " last_update " << i
.second
.last_update
<< dendl
;
979 osd_stat
.hb_pingtime
.erase(i
.first
);
986 void OSDService::inc_osd_stat_repaired()
988 std::lock_guard
l(stat_lock
);
989 osd_stat
.num_shards_repaired
++;
993 float OSDService::compute_adjusted_ratio(osd_stat_t new_stat
, float *pratio
,
994 uint64_t adjust_used
)
997 ((float)new_stat
.statfs
.get_used()) / ((float)new_stat
.statfs
.total
);
1000 dout(20) << __func__
<< " Before kb_used() " << new_stat
.statfs
.kb_used() << dendl
;
1001 if (new_stat
.statfs
.available
> adjust_used
)
1002 new_stat
.statfs
.available
-= adjust_used
;
1004 new_stat
.statfs
.available
= 0;
1005 dout(20) << __func__
<< " After kb_used() " << new_stat
.statfs
.kb_used() << dendl
;
1008 // Check all pgs and adjust kb_used to include all pending backfill data
1009 int backfill_adjusted
= 0;
1011 osd
->_get_pgs(&pgs
);
1012 for (auto p
: pgs
) {
1013 backfill_adjusted
+= p
->pg_stat_adjust(&new_stat
);
1015 if (backfill_adjusted
) {
1016 dout(20) << __func__
<< " backfill adjusted " << new_stat
<< dendl
;
1018 return ((float)new_stat
.statfs
.get_used()) / ((float)new_stat
.statfs
.total
);
1021 void OSDService::send_message_osd_cluster(int peer
, Message
*m
, epoch_t from_epoch
)
1023 OSDMapRef next_map
= get_nextmap_reserved();
1024 // service map is always newer/newest
1025 ceph_assert(from_epoch
<= next_map
->get_epoch());
1027 if (next_map
->is_down(peer
) ||
1028 next_map
->get_info(peer
).up_from
> from_epoch
) {
1030 release_map(next_map
);
1033 ConnectionRef peer_con
;
1034 if (peer
== whoami
) {
1035 peer_con
= osd
->cluster_messenger
->get_loopback_connection();
1037 peer_con
= osd
->cluster_messenger
->connect_to_osd(
1038 next_map
->get_cluster_addrs(peer
), false, true);
1040 maybe_share_map(peer_con
.get(), next_map
);
1041 peer_con
->send_message(m
);
1042 release_map(next_map
);
1045 void OSDService::send_message_osd_cluster(std::vector
<std::pair
<int, Message
*>>& messages
, epoch_t from_epoch
)
1047 OSDMapRef next_map
= get_nextmap_reserved();
1048 // service map is always newer/newest
1049 ceph_assert(from_epoch
<= next_map
->get_epoch());
1051 for (auto& iter
: messages
) {
1052 if (next_map
->is_down(iter
.first
) ||
1053 next_map
->get_info(iter
.first
).up_from
> from_epoch
) {
1057 ConnectionRef peer_con
;
1058 if (iter
.first
== whoami
) {
1059 peer_con
= osd
->cluster_messenger
->get_loopback_connection();
1061 peer_con
= osd
->cluster_messenger
->connect_to_osd(
1062 next_map
->get_cluster_addrs(iter
.first
), false, true);
1064 maybe_share_map(peer_con
.get(), next_map
);
1065 peer_con
->send_message(iter
.second
);
1067 release_map(next_map
);
1069 ConnectionRef
OSDService::get_con_osd_cluster(int peer
, epoch_t from_epoch
)
1071 OSDMapRef next_map
= get_nextmap_reserved();
1072 // service map is always newer/newest
1073 ceph_assert(from_epoch
<= next_map
->get_epoch());
1075 if (next_map
->is_down(peer
) ||
1076 next_map
->get_info(peer
).up_from
> from_epoch
) {
1077 release_map(next_map
);
1081 if (peer
== whoami
) {
1082 con
= osd
->cluster_messenger
->get_loopback_connection();
1084 con
= osd
->cluster_messenger
->connect_to_osd(
1085 next_map
->get_cluster_addrs(peer
), false, true);
1087 release_map(next_map
);
1091 pair
<ConnectionRef
,ConnectionRef
> OSDService::get_con_osd_hb(int peer
, epoch_t from_epoch
)
1093 OSDMapRef next_map
= get_nextmap_reserved();
1094 // service map is always newer/newest
1095 ceph_assert(from_epoch
<= next_map
->get_epoch());
1097 pair
<ConnectionRef
,ConnectionRef
> ret
;
1098 if (next_map
->is_down(peer
) ||
1099 next_map
->get_info(peer
).up_from
> from_epoch
) {
1100 release_map(next_map
);
1103 ret
.first
= osd
->hb_back_client_messenger
->connect_to_osd(
1104 next_map
->get_hb_back_addrs(peer
));
1105 ret
.second
= osd
->hb_front_client_messenger
->connect_to_osd(
1106 next_map
->get_hb_front_addrs(peer
));
1107 release_map(next_map
);
1111 entity_name_t
OSDService::get_cluster_msgr_name() const
1113 return cluster_messenger
->get_myname();
1116 void OSDService::queue_want_pg_temp(pg_t pgid
,
1117 const vector
<int>& want
,
1120 std::lock_guard
l(pg_temp_lock
);
1121 auto p
= pg_temp_pending
.find(pgid
);
1122 if (p
== pg_temp_pending
.end() ||
1123 p
->second
.acting
!= want
||
1125 pg_temp_wanted
[pgid
] = {want
, forced
};
1129 void OSDService::remove_want_pg_temp(pg_t pgid
)
1131 std::lock_guard
l(pg_temp_lock
);
1132 pg_temp_wanted
.erase(pgid
);
1133 pg_temp_pending
.erase(pgid
);
1136 void OSDService::_sent_pg_temp()
1138 #ifdef HAVE_STDLIB_MAP_SPLICING
1139 pg_temp_pending
.merge(pg_temp_wanted
);
1141 pg_temp_pending
.insert(make_move_iterator(begin(pg_temp_wanted
)),
1142 make_move_iterator(end(pg_temp_wanted
)));
1144 pg_temp_wanted
.clear();
1147 void OSDService::requeue_pg_temp()
1149 std::lock_guard
l(pg_temp_lock
);
1150 // wanted overrides pending. note that remove_want_pg_temp
1151 // clears the item out of both.
1152 unsigned old_wanted
= pg_temp_wanted
.size();
1153 unsigned old_pending
= pg_temp_pending
.size();
1155 pg_temp_wanted
.swap(pg_temp_pending
);
1156 dout(10) << __func__
<< " " << old_wanted
<< " + " << old_pending
<< " -> "
1157 << pg_temp_wanted
.size() << dendl
;
1160 std::ostream
& operator<<(std::ostream
& out
,
1161 const OSDService::pg_temp_t
& pg_temp
)
1163 out
<< pg_temp
.acting
;
1164 if (pg_temp
.forced
) {
1170 void OSDService::send_pg_temp()
1172 std::lock_guard
l(pg_temp_lock
);
1173 if (pg_temp_wanted
.empty())
1175 dout(10) << "send_pg_temp " << pg_temp_wanted
<< dendl
;
1176 MOSDPGTemp
*ms
[2] = {nullptr, nullptr};
1177 for (auto& [pgid
, pg_temp
] : pg_temp_wanted
) {
1178 auto& m
= ms
[pg_temp
.forced
];
1180 m
= new MOSDPGTemp(osdmap
->get_epoch());
1181 m
->forced
= pg_temp
.forced
;
1183 m
->pg_temp
.emplace(pgid
, pg_temp
.acting
);
1187 monc
->send_mon_message(m
);
1193 void OSDService::send_pg_created(pg_t pgid
)
1195 std::lock_guard
l(pg_created_lock
);
1196 dout(20) << __func__
<< dendl
;
1197 auto o
= get_osdmap();
1198 if (o
->require_osd_release
>= ceph_release_t::luminous
) {
1199 pg_created
.insert(pgid
);
1200 monc
->send_mon_message(new MOSDPGCreated(pgid
));
1204 void OSDService::send_pg_created()
1206 std::lock_guard
l(pg_created_lock
);
1207 dout(20) << __func__
<< dendl
;
1208 auto o
= get_osdmap();
1209 if (o
->require_osd_release
>= ceph_release_t::luminous
) {
1210 for (auto pgid
: pg_created
) {
1211 monc
->send_mon_message(new MOSDPGCreated(pgid
));
1216 void OSDService::prune_pg_created()
1218 std::lock_guard
l(pg_created_lock
);
1219 dout(20) << __func__
<< dendl
;
1220 auto o
= get_osdmap();
1221 auto i
= pg_created
.begin();
1222 while (i
!= pg_created
.end()) {
1223 auto p
= o
->get_pg_pool(i
->pool());
1224 if (!p
|| !p
->has_flag(pg_pool_t::FLAG_CREATING
)) {
1225 dout(20) << __func__
<< " pruning " << *i
<< dendl
;
1226 i
= pg_created
.erase(i
);
1228 dout(20) << __func__
<< " keeping " << *i
<< dendl
;
1235 // --------------------------------------
1238 bool OSDService::can_inc_scrubs()
1240 bool can_inc
= false;
1241 std::lock_guard
l(sched_scrub_lock
);
1243 if (scrubs_local
+ scrubs_remote
< cct
->_conf
->osd_max_scrubs
) {
1244 dout(20) << __func__
<< " == true " << scrubs_local
<< " local + " << scrubs_remote
1245 << " remote < max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1248 dout(20) << __func__
<< " == false " << scrubs_local
<< " local + " << scrubs_remote
1249 << " remote >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1255 bool OSDService::inc_scrubs_local()
1257 bool result
= false;
1258 std::lock_guard l
{sched_scrub_lock
};
1259 if (scrubs_local
+ scrubs_remote
< cct
->_conf
->osd_max_scrubs
) {
1260 dout(20) << __func__
<< " " << scrubs_local
<< " -> " << (scrubs_local
+1)
1261 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", remote " << scrubs_remote
<< ")" << dendl
;
1265 dout(20) << __func__
<< " " << scrubs_local
<< " local + " << scrubs_remote
<< " remote >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1270 void OSDService::dec_scrubs_local()
1272 std::lock_guard l
{sched_scrub_lock
};
1273 dout(20) << __func__
<< " " << scrubs_local
<< " -> " << (scrubs_local
-1)
1274 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", remote " << scrubs_remote
<< ")" << dendl
;
1276 ceph_assert(scrubs_local
>= 0);
1279 bool OSDService::inc_scrubs_remote()
1281 bool result
= false;
1282 std::lock_guard l
{sched_scrub_lock
};
1283 if (scrubs_local
+ scrubs_remote
< cct
->_conf
->osd_max_scrubs
) {
1284 dout(20) << __func__
<< " " << scrubs_remote
<< " -> " << (scrubs_remote
+1)
1285 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", local " << scrubs_local
<< ")" << dendl
;
1289 dout(20) << __func__
<< " " << scrubs_local
<< " local + " << scrubs_remote
<< " remote >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1294 void OSDService::dec_scrubs_remote()
1296 std::lock_guard l
{sched_scrub_lock
};
1297 dout(20) << __func__
<< " " << scrubs_remote
<< " -> " << (scrubs_remote
-1)
1298 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", local " << scrubs_local
<< ")" << dendl
;
1300 ceph_assert(scrubs_remote
>= 0);
1303 void OSDService::dump_scrub_reservations(Formatter
*f
)
1305 std::lock_guard l
{sched_scrub_lock
};
1306 f
->dump_int("scrubs_local", scrubs_local
);
1307 f
->dump_int("scrubs_remote", scrubs_remote
);
1308 f
->dump_int("osd_max_scrubs", cct
->_conf
->osd_max_scrubs
);
1311 void OSDService::retrieve_epochs(epoch_t
*_boot_epoch
, epoch_t
*_up_epoch
,
1312 epoch_t
*_bind_epoch
) const
1314 std::lock_guard
l(epoch_lock
);
1316 *_boot_epoch
= boot_epoch
;
1318 *_up_epoch
= up_epoch
;
1320 *_bind_epoch
= bind_epoch
;
1323 void OSDService::set_epochs(const epoch_t
*_boot_epoch
, const epoch_t
*_up_epoch
,
1324 const epoch_t
*_bind_epoch
)
1326 std::lock_guard
l(epoch_lock
);
1328 ceph_assert(*_boot_epoch
== 0 || *_boot_epoch
>= boot_epoch
);
1329 boot_epoch
= *_boot_epoch
;
1332 ceph_assert(*_up_epoch
== 0 || *_up_epoch
>= up_epoch
);
1333 up_epoch
= *_up_epoch
;
1336 ceph_assert(*_bind_epoch
== 0 || *_bind_epoch
>= bind_epoch
);
1337 bind_epoch
= *_bind_epoch
;
1341 bool OSDService::prepare_to_stop()
1343 std::unique_lock
l(is_stopping_lock
);
1344 if (get_state() != NOT_STOPPING
)
1347 OSDMapRef osdmap
= get_osdmap();
1348 if (osdmap
&& osdmap
->is_up(whoami
)) {
1349 dout(0) << __func__
<< " telling mon we are shutting down" << dendl
;
1350 set_state(PREPARING_TO_STOP
);
1351 monc
->send_mon_message(
1355 osdmap
->get_addrs(whoami
),
1356 osdmap
->get_epoch(),
1359 const auto timeout
= ceph::make_timespan(cct
->_conf
->osd_mon_shutdown_timeout
);
1360 is_stopping_cond
.wait_for(l
, timeout
,
1361 [this] { return get_state() == STOPPING
; });
1363 dout(0) << __func__
<< " starting shutdown" << dendl
;
1364 set_state(STOPPING
);
1368 void OSDService::got_stop_ack()
1370 std::scoped_lock
l(is_stopping_lock
);
1371 if (get_state() == PREPARING_TO_STOP
) {
1372 dout(0) << __func__
<< " starting shutdown" << dendl
;
1373 set_state(STOPPING
);
1374 is_stopping_cond
.notify_all();
1376 dout(10) << __func__
<< " ignoring msg" << dendl
;
1380 MOSDMap
*OSDService::build_incremental_map_msg(epoch_t since
, epoch_t to
,
1381 OSDSuperblock
& sblock
)
1383 MOSDMap
*m
= new MOSDMap(monc
->get_fsid(),
1384 osdmap
->get_encoding_features());
1385 m
->oldest_map
= max_oldest_map
;
1386 m
->newest_map
= sblock
.newest_map
;
1388 int max
= cct
->_conf
->osd_map_message_max
;
1389 ssize_t max_bytes
= cct
->_conf
->osd_map_message_max_bytes
;
1391 if (since
< m
->oldest_map
) {
1392 // we don't have the next map the target wants, so start with a
1395 dout(10) << __func__
<< " oldest map " << max_oldest_map
<< " > since "
1396 << since
<< ", starting with full map" << dendl
;
1397 since
= m
->oldest_map
;
1398 if (!get_map_bl(since
, bl
)) {
1399 derr
<< __func__
<< " missing full map " << since
<< dendl
;
1403 max_bytes
-= bl
.length();
1404 m
->maps
[since
].claim(bl
);
1406 for (epoch_t e
= since
+ 1; e
<= to
; ++e
) {
1408 if (get_inc_map_bl(e
, bl
)) {
1409 m
->incremental_maps
[e
].claim(bl
);
1411 dout(10) << __func__
<< " missing incremental map " << e
<< dendl
;
1412 if (!get_map_bl(e
, bl
)) {
1413 derr
<< __func__
<< " also missing full map " << e
<< dendl
;
1416 m
->maps
[e
].claim(bl
);
1419 max_bytes
-= bl
.length();
1420 if (max
<= 0 || max_bytes
<= 0) {
1427 if (!m
->maps
.empty() ||
1428 !m
->incremental_maps
.empty()) {
1429 // send what we have so far
1434 if (get_inc_map_bl(m
->newest_map
, bl
)) {
1435 m
->incremental_maps
[m
->newest_map
].claim(bl
);
1437 derr
<< __func__
<< " unable to load latest map " << m
->newest_map
<< dendl
;
1438 if (!get_map_bl(m
->newest_map
, bl
)) {
1439 derr
<< __func__
<< " unable to load latest full map " << m
->newest_map
1443 m
->maps
[m
->newest_map
].claim(bl
);
1448 void OSDService::send_map(MOSDMap
*m
, Connection
*con
)
1450 con
->send_message(m
);
1453 void OSDService::send_incremental_map(epoch_t since
, Connection
*con
,
1454 const OSDMapRef
& osdmap
)
1456 epoch_t to
= osdmap
->get_epoch();
1457 dout(10) << "send_incremental_map " << since
<< " -> " << to
1458 << " to " << con
<< " " << con
->get_peer_addr() << dendl
;
1462 OSDSuperblock
sblock(get_superblock());
1463 if (since
< sblock
.oldest_map
) {
1464 // just send latest full map
1465 MOSDMap
*m
= new MOSDMap(monc
->get_fsid(),
1466 osdmap
->get_encoding_features());
1467 m
->oldest_map
= max_oldest_map
;
1468 m
->newest_map
= sblock
.newest_map
;
1469 get_map_bl(to
, m
->maps
[to
]);
1474 if (to
> since
&& (int64_t)(to
- since
) > cct
->_conf
->osd_map_share_max_epochs
) {
1475 dout(10) << " " << (to
- since
) << " > max " << cct
->_conf
->osd_map_share_max_epochs
1476 << ", only sending most recent" << dendl
;
1477 since
= to
- cct
->_conf
->osd_map_share_max_epochs
;
1480 m
= build_incremental_map_msg(since
, to
, sblock
);
1485 bool OSDService::_get_map_bl(epoch_t e
, bufferlist
& bl
)
1487 bool found
= map_bl_cache
.lookup(e
, &bl
);
1490 logger
->inc(l_osd_map_bl_cache_hit
);
1494 logger
->inc(l_osd_map_bl_cache_miss
);
1495 found
= store
->read(meta_ch
,
1496 OSD::get_osdmap_pobject_name(e
), 0, 0, bl
,
1497 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
) >= 0;
1504 bool OSDService::get_inc_map_bl(epoch_t e
, bufferlist
& bl
)
1506 std::lock_guard
l(map_cache_lock
);
1507 bool found
= map_bl_inc_cache
.lookup(e
, &bl
);
1510 logger
->inc(l_osd_map_bl_cache_hit
);
1514 logger
->inc(l_osd_map_bl_cache_miss
);
1515 found
= store
->read(meta_ch
,
1516 OSD::get_inc_osdmap_pobject_name(e
), 0, 0, bl
,
1517 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
) >= 0;
1519 _add_map_inc_bl(e
, bl
);
1524 void OSDService::_add_map_bl(epoch_t e
, bufferlist
& bl
)
1526 dout(10) << "add_map_bl " << e
<< " " << bl
.length() << " bytes" << dendl
;
1527 // cache a contiguous buffer
1528 if (bl
.get_num_buffers() > 1) {
1531 bl
.try_assign_to_mempool(mempool::mempool_osd_mapbl
);
1532 map_bl_cache
.add(e
, bl
);
1535 void OSDService::_add_map_inc_bl(epoch_t e
, bufferlist
& bl
)
1537 dout(10) << "add_map_inc_bl " << e
<< " " << bl
.length() << " bytes" << dendl
;
1538 // cache a contiguous buffer
1539 if (bl
.get_num_buffers() > 1) {
1542 bl
.try_assign_to_mempool(mempool::mempool_osd_mapbl
);
1543 map_bl_inc_cache
.add(e
, bl
);
1546 OSDMapRef
OSDService::_add_map(OSDMap
*o
)
1548 epoch_t e
= o
->get_epoch();
1550 if (cct
->_conf
->osd_map_dedup
) {
1551 // Dedup against an existing map at a nearby epoch
1552 OSDMapRef for_dedup
= map_cache
.lower_bound(e
);
1554 OSDMap::dedup(for_dedup
.get(), o
);
1558 OSDMapRef l
= map_cache
.add(e
, o
, &existed
);
1565 OSDMapRef
OSDService::try_get_map(epoch_t epoch
)
1567 std::lock_guard
l(map_cache_lock
);
1568 OSDMapRef retval
= map_cache
.lookup(epoch
);
1570 dout(30) << "get_map " << epoch
<< " -cached" << dendl
;
1572 logger
->inc(l_osd_map_cache_hit
);
1577 logger
->inc(l_osd_map_cache_miss
);
1578 epoch_t lb
= map_cache
.cached_key_lower_bound();
1580 dout(30) << "get_map " << epoch
<< " - miss, below lower bound" << dendl
;
1581 logger
->inc(l_osd_map_cache_miss_low
);
1582 logger
->inc(l_osd_map_cache_miss_low_avg
, lb
- epoch
);
1586 OSDMap
*map
= new OSDMap
;
1588 dout(20) << "get_map " << epoch
<< " - loading and decoding " << map
<< dendl
;
1590 if (!_get_map_bl(epoch
, bl
) || bl
.length() == 0) {
1591 derr
<< "failed to load OSD map for epoch " << epoch
<< ", got " << bl
.length() << " bytes" << dendl
;
1597 dout(20) << "get_map " << epoch
<< " - return initial " << map
<< dendl
;
1599 return _add_map(map
);
1605 void OSDService::reply_op_error(OpRequestRef op
, int err
)
1607 reply_op_error(op
, err
, eversion_t(), 0, {});
1610 void OSDService::reply_op_error(OpRequestRef op
, int err
, eversion_t v
,
1612 vector
<pg_log_op_return_item_t
> op_returns
)
1614 auto m
= op
->get_req
<MOSDOp
>();
1615 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1617 flags
= m
->get_flags() & (CEPH_OSD_FLAG_ACK
|CEPH_OSD_FLAG_ONDISK
);
1619 MOSDOpReply
*reply
= new MOSDOpReply(m
, err
, osdmap
->get_epoch(), flags
,
1620 !m
->has_flag(CEPH_OSD_FLAG_RETURNVEC
));
1621 reply
->set_reply_versions(v
, uv
);
1622 reply
->set_op_returns(op_returns
);
1623 m
->get_connection()->send_message(reply
);
1626 void OSDService::handle_misdirected_op(PG
*pg
, OpRequestRef op
)
1628 if (!cct
->_conf
->osd_debug_misdirected_ops
) {
1632 auto m
= op
->get_req
<MOSDOp
>();
1633 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1635 ceph_assert(m
->get_map_epoch() >= pg
->get_history().same_primary_since
);
1637 if (pg
->is_ec_pg()) {
1639 * OSD recomputes op target based on current OSDMap. With an EC pg, we
1640 * can get this result:
1641 * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
1642 * [CRUSH_ITEM_NONE, 2, 3]/3
1643 * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
1645 * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
1647 * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
1650 * We can't compute the op target based on the sending map epoch due to
1651 * splitting. The simplest thing is to detect such cases here and drop
1652 * them without an error (the client will resend anyway).
1654 ceph_assert(m
->get_map_epoch() <= superblock
.newest_map
);
1655 OSDMapRef opmap
= try_get_map(m
->get_map_epoch());
1657 dout(7) << __func__
<< ": " << *pg
<< " no longer have map for "
1658 << m
->get_map_epoch() << ", dropping" << dendl
;
1661 pg_t _pgid
= m
->get_raw_pg();
1663 if ((m
->get_flags() & CEPH_OSD_FLAG_PGOP
) == 0)
1664 _pgid
= opmap
->raw_pg_to_pg(_pgid
);
1665 if (opmap
->get_primary_shard(_pgid
, &pgid
) &&
1666 pgid
.shard
!= pg
->pg_id
.shard
) {
1667 dout(7) << __func__
<< ": " << *pg
<< " primary changed since "
1668 << m
->get_map_epoch() << ", dropping" << dendl
;
1673 dout(7) << *pg
<< " misdirected op in " << m
->get_map_epoch() << dendl
;
1674 clog
->warn() << m
->get_source_inst() << " misdirected " << m
->get_reqid()
1675 << " pg " << m
->get_raw_pg()
1676 << " to osd." << whoami
1677 << " not " << pg
->get_acting()
1678 << " in e" << m
->get_map_epoch() << "/" << osdmap
->get_epoch();
1681 void OSDService::enqueue_back(OpSchedulerItem
&& qi
)
1683 osd
->op_shardedwq
.queue(std::move(qi
));
1686 void OSDService::enqueue_front(OpSchedulerItem
&& qi
)
1688 osd
->op_shardedwq
.queue_front(std::move(qi
));
1691 void OSDService::queue_recovery_context(
1693 GenContext
<ThreadPool::TPHandle
&> *c
)
1695 epoch_t e
= get_osdmap_epoch();
1698 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1699 new PGRecoveryContext(pg
->get_pgid(), c
, e
)),
1700 cct
->_conf
->osd_recovery_cost
,
1701 cct
->_conf
->osd_recovery_priority
,
1707 void OSDService::queue_for_snap_trim(PG
*pg
)
1709 dout(10) << "queueing " << *pg
<< " for snaptrim" << dendl
;
1712 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1713 new PGSnapTrim(pg
->get_pgid(), pg
->get_osdmap_epoch())),
1714 cct
->_conf
->osd_snap_trim_cost
,
1715 cct
->_conf
->osd_snap_trim_priority
,
1718 pg
->get_osdmap_epoch()));
1721 void OSDService::queue_for_scrub(PG
*pg
, bool with_high_priority
)
1723 unsigned scrub_queue_priority
= pg
->scrubber
.priority
;
1724 if (with_high_priority
&& scrub_queue_priority
< cct
->_conf
->osd_client_op_priority
) {
1725 scrub_queue_priority
= cct
->_conf
->osd_client_op_priority
;
1727 const auto epoch
= pg
->get_osdmap_epoch();
1730 unique_ptr
<OpSchedulerItem::OpQueueable
>(new PGScrub(pg
->get_pgid(), epoch
)),
1731 cct
->_conf
->osd_scrub_cost
,
1732 scrub_queue_priority
,
1738 void OSDService::queue_for_pg_delete(spg_t pgid
, epoch_t e
)
1740 dout(10) << __func__
<< " on " << pgid
<< " e " << e
<< dendl
;
1743 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1744 new PGDelete(pgid
, e
)),
1745 cct
->_conf
->osd_pg_delete_cost
,
1746 cct
->_conf
->osd_pg_delete_priority
,
1752 bool OSDService::try_finish_pg_delete(PG
*pg
, unsigned old_pg_num
)
1754 return osd
->try_finish_pg_delete(pg
, old_pg_num
);
1759 void OSDService::set_ready_to_merge_source(PG
*pg
, eversion_t version
)
1761 std::lock_guard
l(merge_lock
);
1762 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1763 ready_to_merge_source
[pg
->pg_id
.pgid
] = version
;
1764 assert(not_ready_to_merge_source
.count(pg
->pg_id
.pgid
) == 0);
1765 _send_ready_to_merge();
1768 void OSDService::set_ready_to_merge_target(PG
*pg
,
1770 epoch_t last_epoch_started
,
1771 epoch_t last_epoch_clean
)
1773 std::lock_guard
l(merge_lock
);
1774 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1775 ready_to_merge_target
.insert(make_pair(pg
->pg_id
.pgid
,
1778 last_epoch_clean
)));
1779 assert(not_ready_to_merge_target
.count(pg
->pg_id
.pgid
) == 0);
1780 _send_ready_to_merge();
1783 void OSDService::set_not_ready_to_merge_source(pg_t source
)
1785 std::lock_guard
l(merge_lock
);
1786 dout(10) << __func__
<< " " << source
<< dendl
;
1787 not_ready_to_merge_source
.insert(source
);
1788 assert(ready_to_merge_source
.count(source
) == 0);
1789 _send_ready_to_merge();
1792 void OSDService::set_not_ready_to_merge_target(pg_t target
, pg_t source
)
1794 std::lock_guard
l(merge_lock
);
1795 dout(10) << __func__
<< " " << target
<< " source " << source
<< dendl
;
1796 not_ready_to_merge_target
[target
] = source
;
1797 assert(ready_to_merge_target
.count(target
) == 0);
1798 _send_ready_to_merge();
1801 void OSDService::send_ready_to_merge()
1803 std::lock_guard
l(merge_lock
);
1804 _send_ready_to_merge();
1807 void OSDService::_send_ready_to_merge()
1809 dout(20) << __func__
1810 << " ready_to_merge_source " << ready_to_merge_source
1811 << " not_ready_to_merge_source " << not_ready_to_merge_source
1812 << " ready_to_merge_target " << ready_to_merge_target
1813 << " not_ready_to_merge_target " << not_ready_to_merge_target
1814 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1816 for (auto src
: not_ready_to_merge_source
) {
1817 if (sent_ready_to_merge_source
.count(src
) == 0) {
1818 monc
->send_mon_message(new MOSDPGReadyToMerge(
1822 osdmap
->get_epoch()));
1823 sent_ready_to_merge_source
.insert(src
);
1826 for (auto p
: not_ready_to_merge_target
) {
1827 if (sent_ready_to_merge_source
.count(p
.second
) == 0) {
1828 monc
->send_mon_message(new MOSDPGReadyToMerge(
1832 osdmap
->get_epoch()));
1833 sent_ready_to_merge_source
.insert(p
.second
);
1836 for (auto src
: ready_to_merge_source
) {
1837 if (not_ready_to_merge_source
.count(src
.first
) ||
1838 not_ready_to_merge_target
.count(src
.first
.get_parent())) {
1841 auto p
= ready_to_merge_target
.find(src
.first
.get_parent());
1842 if (p
!= ready_to_merge_target
.end() &&
1843 sent_ready_to_merge_source
.count(src
.first
) == 0) {
1844 monc
->send_mon_message(new MOSDPGReadyToMerge(
1845 src
.first
, // source pgid
1846 src
.second
, // src version
1847 std::get
<0>(p
->second
), // target version
1848 std::get
<1>(p
->second
), // PG's last_epoch_started
1849 std::get
<2>(p
->second
), // PG's last_epoch_clean
1851 osdmap
->get_epoch()));
1852 sent_ready_to_merge_source
.insert(src
.first
);
1857 void OSDService::clear_ready_to_merge(PG
*pg
)
1859 std::lock_guard
l(merge_lock
);
1860 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1861 ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
1862 ready_to_merge_target
.erase(pg
->pg_id
.pgid
);
1863 not_ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
1864 not_ready_to_merge_target
.erase(pg
->pg_id
.pgid
);
1865 sent_ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
1868 void OSDService::clear_sent_ready_to_merge()
1870 std::lock_guard
l(merge_lock
);
1871 sent_ready_to_merge_source
.clear();
1874 void OSDService::prune_sent_ready_to_merge(const OSDMapRef
& osdmap
)
1876 std::lock_guard
l(merge_lock
);
1877 auto i
= sent_ready_to_merge_source
.begin();
1878 while (i
!= sent_ready_to_merge_source
.end()) {
1879 if (!osdmap
->pg_exists(*i
)) {
1880 dout(10) << __func__
<< " " << *i
<< dendl
;
1881 i
= sent_ready_to_merge_source
.erase(i
);
1890 void OSDService::_queue_for_recovery(
1891 std::pair
<epoch_t
, PGRef
> p
,
1892 uint64_t reserved_pushes
)
1894 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock
));
1897 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1899 p
.second
->get_pgid(), p
.first
, reserved_pushes
)),
1900 cct
->_conf
->osd_recovery_cost
,
1901 cct
->_conf
->osd_recovery_priority
,
1907 // ====================================================================
1911 #define dout_prefix *_dout
1913 // Commands shared between OSD's console and admin console:
1915 namespace osd_cmds
{
1917 int heap(CephContext
& cct
, const cmdmap_t
& cmdmap
, Formatter
& f
, std::ostream
& os
);
1919 }} // namespace ceph::osd_cmds
1921 int OSD::mkfs(CephContext
*cct
, ObjectStore
*store
, uuid_d fsid
, int whoami
, string osdspec_affinity
)
1927 ObjectStore::CollectionHandle ch
;
1929 // if we are fed a uuid for this osd, use it.
1930 store
->set_fsid(cct
->_conf
->osd_uuid
);
1932 ret
= store
->mkfs();
1934 derr
<< "OSD::mkfs: ObjectStore::mkfs failed with error "
1935 << cpp_strerror(ret
) << dendl
;
1939 store
->set_cache_shards(1); // doesn't matter for mkfs!
1941 ret
= store
->mount();
1943 derr
<< "OSD::mkfs: couldn't mount ObjectStore: error "
1944 << cpp_strerror(ret
) << dendl
;
1948 ch
= store
->open_collection(coll_t::meta());
1950 ret
= store
->read(ch
, OSD_SUPERBLOCK_GOBJECT
, 0, 0, sbbl
);
1952 derr
<< "OSD::mkfs: have meta collection but no superblock" << dendl
;
1955 /* if we already have superblock, check content of superblock */
1956 dout(0) << " have superblock" << dendl
;
1957 auto p
= sbbl
.cbegin();
1959 if (whoami
!= sb
.whoami
) {
1960 derr
<< "provided osd id " << whoami
<< " != superblock's " << sb
.whoami
1965 if (fsid
!= sb
.cluster_fsid
) {
1966 derr
<< "provided cluster fsid " << fsid
1967 << " != superblock's " << sb
.cluster_fsid
<< dendl
;
1972 // create superblock
1973 sb
.cluster_fsid
= fsid
;
1974 sb
.osd_fsid
= store
->get_fsid();
1976 sb
.compat_features
= get_osd_initial_compat_set();
1981 ObjectStore::CollectionHandle ch
= store
->create_new_collection(
1983 ObjectStore::Transaction t
;
1984 t
.create_collection(coll_t::meta(), 0);
1985 t
.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT
, 0, bl
.length(), bl
);
1986 ret
= store
->queue_transaction(ch
, std::move(t
));
1988 derr
<< "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
1989 << "queue_transaction returned " << cpp_strerror(ret
) << dendl
;
1994 ret
= write_meta(cct
, store
, sb
.cluster_fsid
, sb
.osd_fsid
, whoami
, osdspec_affinity
);
1996 derr
<< "OSD::mkfs: failed to write fsid file: error "
1997 << cpp_strerror(ret
) << dendl
;
2011 int OSD::write_meta(CephContext
*cct
, ObjectStore
*store
, uuid_d
& cluster_fsid
, uuid_d
& osd_fsid
, int whoami
, string
& osdspec_affinity
)
2016 snprintf(val
, sizeof(val
), "%s", CEPH_OSD_ONDISK_MAGIC
);
2017 r
= store
->write_meta("magic", val
);
2021 snprintf(val
, sizeof(val
), "%d", whoami
);
2022 r
= store
->write_meta("whoami", val
);
2026 cluster_fsid
.print(val
);
2027 r
= store
->write_meta("ceph_fsid", val
);
2031 string key
= cct
->_conf
.get_val
<string
>("key");
2033 r
= store
->write_meta("osd_key", key
);
2037 string keyfile
= cct
->_conf
.get_val
<string
>("keyfile");
2038 if (!keyfile
.empty()) {
2041 r
= keybl
.read_file(keyfile
.c_str(), &err
);
2043 derr
<< __func__
<< " failed to read keyfile " << keyfile
<< ": "
2044 << err
<< ": " << cpp_strerror(r
) << dendl
;
2047 r
= store
->write_meta("osd_key", keybl
.to_str());
2052 if (!osdspec_affinity
.empty()) {
2053 r
= store
->write_meta("osdspec_affinity", osdspec_affinity
.c_str());
2058 r
= store
->write_meta("ready", "ready");
2065 int OSD::peek_meta(ObjectStore
*store
,
2067 uuid_d
*cluster_fsid
,
2070 ceph_release_t
*require_osd_release
)
2074 int r
= store
->read_meta("magic", &val
);
2079 r
= store
->read_meta("whoami", &val
);
2082 *whoami
= atoi(val
.c_str());
2084 r
= store
->read_meta("ceph_fsid", &val
);
2087 r
= cluster_fsid
->parse(val
.c_str());
2091 r
= store
->read_meta("fsid", &val
);
2093 *osd_fsid
= uuid_d();
2095 r
= osd_fsid
->parse(val
.c_str());
2100 r
= store
->read_meta("require_osd_release", &val
);
2102 *require_osd_release
= ceph_release_from_name(val
);
2110 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2114 OSD::OSD(CephContext
*cct_
, ObjectStore
*store_
,
2116 Messenger
*internal_messenger
,
2117 Messenger
*external_messenger
,
2118 Messenger
*hb_client_front
,
2119 Messenger
*hb_client_back
,
2120 Messenger
*hb_front_serverm
,
2121 Messenger
*hb_back_serverm
,
2122 Messenger
*osdc_messenger
,
2124 const std::string
&dev
, const std::string
&jdev
) :
2126 tick_timer(cct
, osd_lock
),
2127 tick_timer_without_osd_lock(cct
, tick_timer_lock
),
2128 gss_ktfile_client(cct
->_conf
.get_val
<std::string
>("gss_ktab_client_file")),
2129 cluster_messenger(internal_messenger
),
2130 client_messenger(external_messenger
),
2131 objecter_messenger(osdc_messenger
),
2133 mgrc(cct_
, client_messenger
, &mc
->monmap
),
2135 recoverystate_perf(NULL
),
2137 log_client(cct
, client_messenger
, &mc
->monmap
, LogClient::NO_FLAGS
),
2138 clog(log_client
.create_channel()),
2140 dev_path(dev
), journal_path(jdev
),
2141 store_is_rotational(store
->is_rotational()),
2142 trace_endpoint("0.0.0.0", 0, "osd"),
2144 m_osd_pg_epoch_max_lag_factor(cct
->_conf
.get_val
<double>(
2145 "osd_pg_epoch_max_lag_factor")),
2146 osd_compat(get_osd_compat_set()),
2147 osd_op_tp(cct
, "OSD::osd_op_tp", "tp_osd_tp",
2148 get_num_op_threads()),
2149 heartbeat_stop(false),
2150 heartbeat_need_update(true),
2151 hb_front_client_messenger(hb_client_front
),
2152 hb_back_client_messenger(hb_client_back
),
2153 hb_front_server_messenger(hb_front_serverm
),
2154 hb_back_server_messenger(hb_back_serverm
),
2156 heartbeat_thread(this),
2157 heartbeat_dispatcher(this),
2158 op_tracker(cct
, cct
->_conf
->osd_enable_op_tracker
,
2159 cct
->_conf
->osd_num_op_tracker_shard
),
2160 test_ops_hook(NULL
),
2163 cct
->_conf
->osd_op_thread_timeout
,
2164 cct
->_conf
->osd_op_thread_suicide_timeout
,
2166 last_pg_create_epoch(0),
2169 requested_full_first(0),
2170 requested_full_last(0),
2174 if (!gss_ktfile_client
.empty()) {
2175 // Assert we can export environment variable
2177 The default client keytab is used, if it is present and readable,
2178 to automatically obtain initial credentials for GSSAPI client
2179 applications. The principal name of the first entry in the client
2180 keytab is used by default when obtaining initial credentials.
2181 1. The KRB5_CLIENT_KTNAME environment variable.
2182 2. The default_client_keytab_name profile variable in [libdefaults].
2183 3. The hardcoded default, DEFCKTNAME.
2185 const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
2186 gss_ktfile_client
.c_str(), 1));
2187 ceph_assert(set_result
== 0);
2190 monc
->set_messenger(client_messenger
);
2191 op_tracker
.set_complaint_and_threshold(cct
->_conf
->osd_op_complaint_time
,
2192 cct
->_conf
->osd_op_log_threshold
);
2193 op_tracker
.set_history_size_and_duration(cct
->_conf
->osd_op_history_size
,
2194 cct
->_conf
->osd_op_history_duration
);
2195 op_tracker
.set_history_slow_op_size_and_threshold(cct
->_conf
->osd_op_history_slow_op_size
,
2196 cct
->_conf
->osd_op_history_slow_op_threshold
);
2197 ObjectCleanRegions::set_max_num_intervals(cct
->_conf
->osd_object_clean_region_max_num_intervals
);
2199 std::stringstream ss
;
2200 ss
<< "osd." << whoami
;
2201 trace_endpoint
.copy_name(ss
.str());
2204 // initialize shards
2205 num_shards
= get_num_op_shards();
2206 for (uint32_t i
= 0; i
< num_shards
; i
++) {
2207 OSDShard
*one_shard
= new OSDShard(
2211 shards
.push_back(one_shard
);
2217 while (!shards
.empty()) {
2218 delete shards
.back();
2221 cct
->get_perfcounters_collection()->remove(recoverystate_perf
);
2222 cct
->get_perfcounters_collection()->remove(logger
);
2223 delete recoverystate_perf
;
2228 double OSD::get_tick_interval() const
2230 // vary +/- 5% to avoid scrub scheduling livelocks
2231 constexpr auto delta
= 0.05;
2232 return (OSD_TICK_INTERVAL
*
2233 ceph::util::generate_random_number(1.0 - delta
, 1.0 + delta
));
2236 void OSD::handle_signal(int signum
)
2238 ceph_assert(signum
== SIGINT
|| signum
== SIGTERM
);
2239 derr
<< "*** Got signal " << sig_str(signum
) << " ***" << dendl
;
2245 std::lock_guard
lock(osd_lock
);
2249 if (store
->test_mount_in_use()) {
2250 derr
<< "OSD::pre_init: object store '" << dev_path
<< "' is "
2251 << "currently in use. (Is ceph-osd already running?)" << dendl
;
2255 cct
->_conf
.add_observer(this);
2259 int OSD::set_numa_affinity()
2261 // storage numa node
2262 int store_node
= -1;
2263 store
->get_numa_node(&store_node
, nullptr, nullptr);
2264 if (store_node
>= 0) {
2265 dout(1) << __func__
<< " storage numa node " << store_node
<< dendl
;
2268 // check network numa node(s)
2269 int front_node
= -1, back_node
= -1;
2270 string front_iface
= pick_iface(
2272 client_messenger
->get_myaddrs().front().get_sockaddr_storage());
2273 string back_iface
= pick_iface(
2275 cluster_messenger
->get_myaddrs().front().get_sockaddr_storage());
2276 int r
= get_iface_numa_node(front_iface
, &front_node
);
2277 if (r
>= 0 && front_node
>= 0) {
2278 dout(1) << __func__
<< " public network " << front_iface
<< " numa node "
2279 << front_node
<< dendl
;
2280 r
= get_iface_numa_node(back_iface
, &back_node
);
2281 if (r
>= 0 && back_node
>= 0) {
2282 dout(1) << __func__
<< " cluster network " << back_iface
<< " numa node "
2283 << back_node
<< dendl
;
2284 if (front_node
== back_node
&&
2285 front_node
== store_node
) {
2286 dout(1) << " objectstore and network numa nodes all match" << dendl
;
2287 if (g_conf().get_val
<bool>("osd_numa_auto_affinity")) {
2288 numa_node
= front_node
;
2290 } else if (front_node
!= back_node
) {
2291 dout(1) << __func__
<< " public and cluster network numa nodes do not match"
2294 dout(1) << __func__
<< " objectstore and network numa nodes do not match"
2297 } else if (back_node
== -2) {
2298 dout(1) << __func__
<< " cluster network " << back_iface
2299 << " ports numa nodes do not match" << dendl
;
2301 derr
<< __func__
<< " unable to identify cluster interface '" << back_iface
2302 << "' numa node: " << cpp_strerror(r
) << dendl
;
2304 } else if (front_node
== -2) {
2305 dout(1) << __func__
<< " public network " << front_iface
2306 << " ports numa nodes do not match" << dendl
;
2308 derr
<< __func__
<< " unable to identify public interface '" << front_iface
2309 << "' numa node: " << cpp_strerror(r
) << dendl
;
2311 if (int node
= g_conf().get_val
<int64_t>("osd_numa_node"); node
>= 0) {
2312 // this takes precedence over the automagic logic above
2315 if (numa_node
>= 0) {
2316 int r
= get_numa_node_cpu_set(numa_node
, &numa_cpu_set_size
, &numa_cpu_set
);
2318 dout(1) << __func__
<< " unable to determine numa node " << numa_node
2319 << " CPUs" << dendl
;
2322 dout(1) << __func__
<< " setting numa affinity to node " << numa_node
2324 << cpu_set_to_str_list(numa_cpu_set_size
, &numa_cpu_set
)
2326 r
= set_cpu_affinity_all_threads(numa_cpu_set_size
, &numa_cpu_set
);
2329 derr
<< __func__
<< " failed to set numa affinity: " << cpp_strerror(r
)
2335 dout(1) << __func__
<< " not setting numa affinity" << dendl
;
2342 class OSDSocketHook
: public AdminSocketHook
{
2345 explicit OSDSocketHook(OSD
*o
) : osd(o
) {}
2346 int call(std::string_view prefix
, const cmdmap_t
& cmdmap
,
2349 bufferlist
& out
) override
{
2350 ceph_abort("should use async hook");
2353 std::string_view prefix
,
2354 const cmdmap_t
& cmdmap
,
2356 const bufferlist
& inbl
,
2357 std::function
<void(int,const std::string
&,bufferlist
&)> on_finish
) override
{
2359 osd
->asok_command(prefix
, cmdmap
, f
, inbl
, on_finish
);
2360 } catch (const TOPNSPC::common::bad_cmd_get
& e
) {
2362 on_finish(-EINVAL
, e
.what(), empty
);
2367 std::set
<int64_t> OSD::get_mapped_pools()
2369 std::set
<int64_t> pools
;
2370 std::vector
<spg_t
> pgids
;
2372 for (const auto &pgid
: pgids
) {
2373 pools
.insert(pgid
.pool());
2378 void OSD::asok_command(
2379 std::string_view prefix
, const cmdmap_t
& cmdmap
,
2381 const bufferlist
& inbl
,
2382 std::function
<void(int,const std::string
&,bufferlist
&)> on_finish
)
2385 stringstream ss
; // stderr error message stream
2386 bufferlist outbl
; // if empty at end, we'll dump formatter as output
2388 // --- PG commands are routed here to PG::do_command ---
2389 if (prefix
== "pg" ||
2390 prefix
== "query" ||
2391 prefix
== "mark_unfound_lost" ||
2392 prefix
== "list_unfound" ||
2393 prefix
== "scrub" ||
2394 prefix
== "deep_scrub"
2398 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
2399 ss
<< "no pgid specified";
2403 if (!pgid
.parse(pgidstr
.c_str())) {
2404 ss
<< "couldn't parse pgid '" << pgidstr
<< "'";
2410 if (get_osdmap()->get_primary_shard(pgid
, &pcand
) &&
2411 (pg
= _lookup_lock_pg(pcand
))) {
2412 if (pg
->is_primary()) {
2413 cmdmap_t new_cmdmap
= cmdmap
;
2415 pg
->do_command(prefix
, new_cmdmap
, inbl
, on_finish
);
2417 return; // the pg handler calls on_finish directly
2418 } catch (const TOPNSPC::common::bad_cmd_get
& e
) {
2425 ss
<< "not primary for pgid " << pgid
;
2426 // do not reply; they will get newer maps and realize they
2433 ss
<< "i don't have pgid " << pgid
;
2438 // --- OSD commands follow ---
2440 else if (prefix
== "status") {
2441 lock_guard
l(osd_lock
);
2442 f
->open_object_section("status");
2443 f
->dump_stream("cluster_fsid") << superblock
.cluster_fsid
;
2444 f
->dump_stream("osd_fsid") << superblock
.osd_fsid
;
2445 f
->dump_unsigned("whoami", superblock
.whoami
);
2446 f
->dump_string("state", get_state_name(get_state()));
2447 f
->dump_unsigned("oldest_map", superblock
.oldest_map
);
2448 f
->dump_unsigned("newest_map", superblock
.newest_map
);
2449 f
->dump_unsigned("num_pgs", num_pgs
);
2451 } else if (prefix
== "flush_journal") {
2452 store
->flush_journal();
2453 } else if (prefix
== "dump_ops_in_flight" ||
2455 prefix
== "dump_blocked_ops" ||
2456 prefix
== "dump_historic_ops" ||
2457 prefix
== "dump_historic_ops_by_duration" ||
2458 prefix
== "dump_historic_slow_ops") {
2460 const string error_str
= "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2461 even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2462 will start to track new ops received afterwards.";
2464 set
<string
> filters
;
2465 vector
<string
> filter_str
;
2466 if (cmd_getval(cmdmap
, "filterstr", filter_str
)) {
2467 copy(filter_str
.begin(), filter_str
.end(),
2468 inserter(filters
, filters
.end()));
2471 if (prefix
== "dump_ops_in_flight" ||
2473 if (!op_tracker
.dump_ops_in_flight(f
, false, filters
)) {
2479 if (prefix
== "dump_blocked_ops") {
2480 if (!op_tracker
.dump_ops_in_flight(f
, true, filters
)) {
2486 if (prefix
== "dump_historic_ops") {
2487 if (!op_tracker
.dump_historic_ops(f
, false, filters
)) {
2493 if (prefix
== "dump_historic_ops_by_duration") {
2494 if (!op_tracker
.dump_historic_ops(f
, true, filters
)) {
2500 if (prefix
== "dump_historic_slow_ops") {
2501 if (!op_tracker
.dump_historic_slow_ops(f
, filters
)) {
2507 } else if (prefix
== "dump_op_pq_state") {
2508 f
->open_object_section("pq");
2509 op_shardedwq
.dump(f
);
2511 } else if (prefix
== "dump_blacklist") {
2512 list
<pair
<entity_addr_t
,utime_t
> > bl
;
2513 OSDMapRef curmap
= service
.get_osdmap();
2515 f
->open_array_section("blacklist");
2516 curmap
->get_blacklist(&bl
);
2517 for (list
<pair
<entity_addr_t
,utime_t
> >::iterator it
= bl
.begin();
2518 it
!= bl
.end(); ++it
) {
2519 f
->open_object_section("entry");
2520 f
->open_object_section("entity_addr_t");
2522 f
->close_section(); //entity_addr_t
2523 it
->second
.localtime(f
->dump_stream("expire_time"));
2524 f
->close_section(); //entry
2526 f
->close_section(); //blacklist
2527 } else if (prefix
== "dump_watchers") {
2528 list
<obj_watch_item_t
> watchers
;
2532 for (auto& pg
: pgs
) {
2533 list
<obj_watch_item_t
> pg_watchers
;
2534 pg
->get_watchers(&pg_watchers
);
2535 watchers
.splice(watchers
.end(), pg_watchers
);
2538 f
->open_array_section("watchers");
2539 for (list
<obj_watch_item_t
>::iterator it
= watchers
.begin();
2540 it
!= watchers
.end(); ++it
) {
2542 f
->open_object_section("watch");
2544 f
->dump_string("namespace", it
->obj
.nspace
);
2545 f
->dump_string("object", it
->obj
.oid
.name
);
2547 f
->open_object_section("entity_name");
2548 it
->wi
.name
.dump(f
);
2549 f
->close_section(); //entity_name_t
2551 f
->dump_unsigned("cookie", it
->wi
.cookie
);
2552 f
->dump_unsigned("timeout", it
->wi
.timeout_seconds
);
2554 f
->open_object_section("entity_addr_t");
2555 it
->wi
.addr
.dump(f
);
2556 f
->close_section(); //entity_addr_t
2558 f
->close_section(); //watch
2561 f
->close_section(); //watchers
2562 } else if (prefix
== "dump_recovery_reservations") {
2563 f
->open_object_section("reservations");
2564 f
->open_object_section("local_reservations");
2565 service
.local_reserver
.dump(f
);
2567 f
->open_object_section("remote_reservations");
2568 service
.remote_reserver
.dump(f
);
2571 } else if (prefix
== "dump_scrub_reservations") {
2572 f
->open_object_section("scrub_reservations");
2573 service
.dump_scrub_reservations(f
);
2575 } else if (prefix
== "get_latest_osdmap") {
2576 get_latest_osdmap();
2577 } else if (prefix
== "set_heap_property") {
2581 bool success
= false;
2582 if (!cmd_getval(cmdmap
, "property", property
)) {
2583 error
= "unable to get property";
2585 } else if (!cmd_getval(cmdmap
, "value", value
)) {
2586 error
= "unable to get value";
2588 } else if (value
< 0) {
2589 error
= "negative value not allowed";
2591 } else if (!ceph_heap_set_numeric_property(property
.c_str(), (size_t)value
)) {
2592 error
= "invalid property";
2597 f
->open_object_section("result");
2598 f
->dump_string("error", error
);
2599 f
->dump_bool("success", success
);
2601 } else if (prefix
== "get_heap_property") {
2605 bool success
= false;
2606 if (!cmd_getval(cmdmap
, "property", property
)) {
2607 error
= "unable to get property";
2609 } else if (!ceph_heap_get_numeric_property(property
.c_str(), &value
)) {
2610 error
= "invalid property";
2615 f
->open_object_section("result");
2616 f
->dump_string("error", error
);
2617 f
->dump_bool("success", success
);
2618 f
->dump_int("value", value
);
2620 } else if (prefix
== "dump_objectstore_kv_stats") {
2621 store
->get_db_statistics(f
);
2622 } else if (prefix
== "dump_scrubs") {
2623 service
.dumps_scrub(f
);
2624 } else if (prefix
== "calc_objectstore_db_histogram") {
2625 store
->generate_db_histogram(f
);
2626 } else if (prefix
== "flush_store_cache") {
2627 store
->flush_cache(&ss
);
2628 } else if (prefix
== "dump_pgstate_history") {
2629 f
->open_object_section("pgstate_history");
2630 f
->open_array_section("pgs");
2633 for (auto& pg
: pgs
) {
2634 f
->open_object_section("pg");
2635 f
->dump_stream("pg") << pg
->pg_id
;
2636 f
->dump_string("currently", pg
->get_current_state());
2637 pg
->dump_pgstate_history(f
);
2642 } else if (prefix
== "compact") {
2643 dout(1) << "triggering manual compaction" << dendl
;
2644 auto start
= ceph::coarse_mono_clock::now();
2646 auto end
= ceph::coarse_mono_clock::now();
2647 double duration
= std::chrono::duration
<double>(end
-start
).count();
2648 dout(1) << "finished manual compaction in "
2650 << " seconds" << dendl
;
2651 f
->open_object_section("compact_result");
2652 f
->dump_float("elapsed_time", duration
);
2654 } else if (prefix
== "get_mapped_pools") {
2655 f
->open_array_section("mapped_pools");
2656 set
<int64_t> poollist
= get_mapped_pools();
2657 for (auto pool
: poollist
) {
2658 f
->dump_int("pool_id", pool
);
2661 } else if (prefix
== "smart") {
2663 cmd_getval(cmdmap
, "devid", devid
);
2665 probe_smart(devid
, out
);
2666 outbl
.append(out
.str());
2667 } else if (prefix
== "list_devices") {
2668 set
<string
> devnames
;
2669 store
->get_devices(&devnames
);
2670 f
->open_array_section("list_devices");
2671 for (auto dev
: devnames
) {
2672 if (dev
.find("dm-") == 0) {
2676 f
->open_object_section("device");
2677 f
->dump_string("device", "/dev/" + dev
);
2678 f
->dump_string("device_id", get_device_id(dev
, &err
));
2682 } else if (prefix
== "send_beacon") {
2683 lock_guard
l(osd_lock
);
2685 send_beacon(ceph::coarse_mono_clock::now());
2689 else if (prefix
== "cluster_log") {
2691 cmd_getval(cmdmap
, "message", msg
);
2694 ss
<< "ignoring empty log message";
2697 string message
= msg
.front();
2698 for (vector
<string
>::iterator a
= ++msg
.begin(); a
!= msg
.end(); ++a
)
2699 message
+= " " + *a
;
2701 cmd_getval(cmdmap
, "level", lvl
);
2702 clog_type level
= string_to_clog_type(lvl
);
2705 ss
<< "unknown level '" << lvl
<< "'";
2708 clog
->do_log(level
, message
);
2711 else if (prefix
== "bench") {
2712 lock_guard
l(osd_lock
);
2715 int64_t osize
, onum
;
2716 // default count 1G, size 4MB
2717 cmd_getval(cmdmap
, "count", count
, (int64_t)1 << 30);
2718 cmd_getval(cmdmap
, "size", bsize
, (int64_t)4 << 20);
2719 cmd_getval(cmdmap
, "object_size", osize
, (int64_t)0);
2720 cmd_getval(cmdmap
, "object_num", onum
, (int64_t)0);
2722 uint32_t duration
= cct
->_conf
->osd_bench_duration
;
2724 if (bsize
> (int64_t) cct
->_conf
->osd_bench_max_block_size
) {
2725 // let us limit the block size because the next checks rely on it
2726 // having a sane value. If we allow any block size to be set things
2727 // can still go sideways.
2728 ss
<< "block 'size' values are capped at "
2729 << byte_u_t(cct
->_conf
->osd_bench_max_block_size
) << ". If you wish to use"
2730 << " a higher value, please adjust 'osd_bench_max_block_size'";
2733 } else if (bsize
< (int64_t) (1 << 20)) {
2734 // entering the realm of small block sizes.
2735 // limit the count to a sane value, assuming a configurable amount of
2736 // IOPS and duration, so that the OSD doesn't get hung up on this,
2737 // preventing timeouts from going off
2739 bsize
* duration
* cct
->_conf
->osd_bench_small_size_max_iops
;
2740 if (count
> max_count
) {
2741 ss
<< "'count' values greater than " << max_count
2742 << " for a block size of " << byte_u_t(bsize
) << ", assuming "
2743 << cct
->_conf
->osd_bench_small_size_max_iops
<< " IOPS,"
2744 << " for " << duration
<< " seconds,"
2745 << " can cause ill effects on osd. "
2746 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
2747 << " value if you wish to use a higher 'count'.";
2752 // 1MB block sizes are big enough so that we get more stuff done.
2753 // However, to avoid the osd from getting hung on this and having
2754 // timers being triggered, we are going to limit the count assuming
2755 // a configurable throughput and duration.
2756 // NOTE: max_count is the total amount of bytes that we believe we
2757 // will be able to write during 'duration' for the given
2758 // throughput. The block size hardly impacts this unless it's
2759 // way too big. Given we already check how big the block size
2760 // is, it's safe to assume everything will check out.
2762 cct
->_conf
->osd_bench_large_size_max_throughput
* duration
;
2763 if (count
> max_count
) {
2764 ss
<< "'count' values greater than " << max_count
2765 << " for a block size of " << byte_u_t(bsize
) << ", assuming "
2766 << byte_u_t(cct
->_conf
->osd_bench_large_size_max_throughput
) << "/s,"
2767 << " for " << duration
<< " seconds,"
2768 << " can cause ill effects on osd. "
2769 << " Please adjust 'osd_bench_large_size_max_throughput'"
2770 << " with a higher value if you wish to use a higher 'count'.";
2776 if (osize
&& bsize
> osize
)
2779 dout(1) << " bench count " << count
2780 << " bsize " << byte_u_t(bsize
) << dendl
;
2782 ObjectStore::Transaction cleanupt
;
2784 if (osize
&& onum
) {
2786 bufferptr
bp(osize
);
2788 bl
.push_back(std::move(bp
));
2789 bl
.rebuild_page_aligned();
2790 for (int i
=0; i
<onum
; ++i
) {
2792 snprintf(nm
, sizeof(nm
), "disk_bw_test_%d", i
);
2794 hobject_t
soid(sobject_t(oid
, 0));
2795 ObjectStore::Transaction t
;
2796 t
.write(coll_t(), ghobject_t(soid
), 0, osize
, bl
);
2797 store
->queue_transaction(service
.meta_ch
, std::move(t
), NULL
);
2798 cleanupt
.remove(coll_t(), ghobject_t(soid
));
2803 bufferptr
bp(bsize
);
2805 bl
.push_back(std::move(bp
));
2806 bl
.rebuild_page_aligned();
2810 if (!service
.meta_ch
->flush_commit(&waiter
)) {
2815 utime_t start
= ceph_clock_now();
2816 for (int64_t pos
= 0; pos
< count
; pos
+= bsize
) {
2818 unsigned offset
= 0;
2819 if (onum
&& osize
) {
2820 snprintf(nm
, sizeof(nm
), "disk_bw_test_%d", (int)(rand() % onum
));
2821 offset
= rand() % (osize
/ bsize
) * bsize
;
2823 snprintf(nm
, sizeof(nm
), "disk_bw_test_%lld", (long long)pos
);
2826 hobject_t
soid(sobject_t(oid
, 0));
2827 ObjectStore::Transaction t
;
2828 t
.write(coll_t::meta(), ghobject_t(soid
), offset
, bsize
, bl
);
2829 store
->queue_transaction(service
.meta_ch
, std::move(t
), NULL
);
2830 if (!onum
|| !osize
)
2831 cleanupt
.remove(coll_t::meta(), ghobject_t(soid
));
2836 if (!service
.meta_ch
->flush_commit(&waiter
)) {
2840 utime_t end
= ceph_clock_now();
2843 store
->queue_transaction(service
.meta_ch
, std::move(cleanupt
), NULL
);
2846 if (!service
.meta_ch
->flush_commit(&waiter
)) {
2851 double elapsed
= end
- start
;
2852 double rate
= count
/ elapsed
;
2853 double iops
= rate
/ bsize
;
2854 f
->open_object_section("osd_bench_results");
2855 f
->dump_int("bytes_written", count
);
2856 f
->dump_int("blocksize", bsize
);
2857 f
->dump_float("elapsed_sec", elapsed
);
2858 f
->dump_float("bytes_per_sec", rate
);
2859 f
->dump_float("iops", iops
);
2863 else if (prefix
== "flush_pg_stats") {
2864 mgrc
.send_pgstats();
2865 f
->dump_unsigned("stat_seq", service
.get_osd_stat_seq());
2868 else if (prefix
== "heap") {
2869 ret
= ceph::osd_cmds::heap(*cct
, cmdmap
, *f
, ss
);
2872 else if (prefix
== "debug dump_missing") {
2873 f
->open_array_section("pgs");
2876 for (auto& pg
: pgs
) {
2877 string s
= stringify(pg
->pg_id
);
2878 f
->open_array_section(s
.c_str());
2880 pg
->dump_missing(f
);
2887 else if (prefix
== "debug kick_recovery_wq") {
2889 cmd_getval(cmdmap
, "delay", delay
);
2892 ret
= cct
->_conf
.set_val("osd_recovery_delay_start", oss
.str().c_str());
2894 ss
<< "kick_recovery_wq: error setting "
2895 << "osd_recovery_delay_start to '" << delay
<< "': error "
2899 cct
->_conf
.apply_changes(nullptr);
2900 ss
<< "kicking recovery queue. set osd_recovery_delay_start "
2901 << "to " << cct
->_conf
->osd_recovery_delay_start
;
2904 else if (prefix
== "cpu_profiler") {
2907 cmd_getval(cmdmap
, "arg", arg
);
2908 vector
<string
> argvec
;
2909 get_str_vec(arg
, argvec
);
2910 cpu_profiler_handle_command(argvec
, ds
);
2911 outbl
.append(ds
.str());
2914 else if (prefix
== "dump_pg_recovery_stats") {
2915 lock_guard
l(osd_lock
);
2916 pg_recovery_stats
.dump_formatted(f
);
2919 else if (prefix
== "reset_pg_recovery_stats") {
2920 lock_guard
l(osd_lock
);
2921 pg_recovery_stats
.reset();
2924 else if (prefix
== "perf histogram dump") {
2926 std::string counter
;
2927 cmd_getval(cmdmap
, "logger", logger
);
2928 cmd_getval(cmdmap
, "counter", counter
);
2929 cct
->get_perfcounters_collection()->dump_formatted_histograms(
2930 f
, false, logger
, counter
);
2933 else if (prefix
== "cache drop") {
2934 lock_guard
l(osd_lock
);
2935 dout(20) << "clearing all caches" << dendl
;
2936 // Clear the objectstore's cache - onode and buffer for Bluestore,
2937 // system's pagecache for Filestore
2938 ret
= store
->flush_cache(&ss
);
2940 ss
<< "Error flushing objectstore cache: " << cpp_strerror(ret
);
2943 // Clear the objectcontext cache (per PG)
2946 for (auto& pg
: pgs
) {
2951 else if (prefix
== "cache status") {
2952 lock_guard
l(osd_lock
);
2953 int obj_ctx_count
= 0;
2956 for (auto& pg
: pgs
) {
2957 obj_ctx_count
+= pg
->get_cache_obj_count();
2959 f
->open_object_section("cache_status");
2960 f
->dump_int("object_ctx", obj_ctx_count
);
2961 store
->dump_cache_stats(f
);
2965 else if (prefix
== "scrub_purged_snaps") {
2966 lock_guard
l(osd_lock
);
2967 scrub_purged_snaps();
2970 else if (prefix
== "dump_osd_network") {
2971 lock_guard
l(osd_lock
);
2973 if (!(cmd_getval(cmdmap
, "value", value
))) {
2974 // Convert milliseconds to microseconds
2975 value
= static_cast<double>(g_conf().get_val
<double>(
2976 "mon_warn_on_slow_ping_time")) * 1000;
2978 double ratio
= g_conf().get_val
<double>("mon_warn_on_slow_ping_ratio");
2979 value
= g_conf().get_val
<int64_t>("osd_heartbeat_grace");
2980 value
*= 1000000 * ratio
; // Seconds of grace to microseconds at ratio
2983 // Convert user input to microseconds
2986 if (value
< 0) value
= 0;
2988 struct osd_ping_time_t
{
2992 std::array
<uint32_t,3> times
;
2993 std::array
<uint32_t,3> min
;
2994 std::array
<uint32_t,3> max
;
2996 uint32_t last_update
;
2998 bool operator<(const osd_ping_time_t
& rhs
) const {
2999 if (pingtime
< rhs
.pingtime
)
3001 if (pingtime
> rhs
.pingtime
)
3011 set
<osd_ping_time_t
> sorted
;
3012 // Get pingtimes under lock and not on the stack
3013 map
<int, osd_stat_t::Interfaces
> *pingtimes
= new map
<int, osd_stat_t::Interfaces
>;
3014 service
.get_hb_pingtime(pingtimes
);
3015 for (auto j
: *pingtimes
) {
3016 if (j
.second
.last_update
== 0)
3018 osd_ping_time_t item
;
3019 item
.pingtime
= std::max(j
.second
.back_pingtime
[0], j
.second
.back_pingtime
[1]);
3020 item
.pingtime
= std::max(item
.pingtime
, j
.second
.back_pingtime
[2]);
3021 if (item
.pingtime
>= value
) {
3023 item
.times
[0] = j
.second
.back_pingtime
[0];
3024 item
.times
[1] = j
.second
.back_pingtime
[1];
3025 item
.times
[2] = j
.second
.back_pingtime
[2];
3026 item
.min
[0] = j
.second
.back_min
[0];
3027 item
.min
[1] = j
.second
.back_min
[1];
3028 item
.min
[2] = j
.second
.back_min
[2];
3029 item
.max
[0] = j
.second
.back_max
[0];
3030 item
.max
[1] = j
.second
.back_max
[1];
3031 item
.max
[2] = j
.second
.back_max
[2];
3032 item
.last
= j
.second
.back_last
;
3034 item
.last_update
= j
.second
.last_update
;
3035 sorted
.emplace(item
);
3037 if (j
.second
.front_last
== 0)
3039 item
.pingtime
= std::max(j
.second
.front_pingtime
[0], j
.second
.front_pingtime
[1]);
3040 item
.pingtime
= std::max(item
.pingtime
, j
.second
.front_pingtime
[2]);
3041 if (item
.pingtime
>= value
) {
3043 item
.times
[0] = j
.second
.front_pingtime
[0];
3044 item
.times
[1] = j
.second
.front_pingtime
[1];
3045 item
.times
[2] = j
.second
.front_pingtime
[2];
3046 item
.min
[0] = j
.second
.front_min
[0];
3047 item
.min
[1] = j
.second
.front_min
[1];
3048 item
.min
[2] = j
.second
.front_min
[2];
3049 item
.max
[0] = j
.second
.front_max
[0];
3050 item
.max
[1] = j
.second
.front_max
[1];
3051 item
.max
[2] = j
.second
.front_max
[2];
3052 item
.last
= j
.second
.front_last
;
3053 item
.last_update
= j
.second
.last_update
;
3055 sorted
.emplace(item
);
3060 // Network ping times (1min 5min 15min)
3061 f
->open_object_section("network_ping_times");
3062 f
->dump_int("threshold", value
/ 1000);
3063 f
->open_array_section("entries");
3064 for (auto &sitem
: boost::adaptors::reverse(sorted
)) {
3065 ceph_assert(sitem
.pingtime
>= value
);
3066 f
->open_object_section("entry");
3068 const time_t lu(sitem
.last_update
);
3070 string
lustr(ctime_r(&lu
, buffer
));
3071 lustr
.pop_back(); // Remove trailing \n
3072 auto stale
= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_stale");
3073 f
->dump_string("last update", lustr
);
3074 f
->dump_bool("stale", ceph_clock_now().sec() - sitem
.last_update
> stale
);
3075 f
->dump_int("from osd", whoami
);
3076 f
->dump_int("to osd", sitem
.to
);
3077 f
->dump_string("interface", (sitem
.back
? "back" : "front"));
3078 f
->open_object_section("average");
3079 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.times
[0],3).c_str());
3080 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.times
[1],3).c_str());
3081 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.times
[2],3).c_str());
3082 f
->close_section(); // average
3083 f
->open_object_section("min");
3084 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.max
[0],3).c_str());
3085 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.max
[1],3).c_str());
3086 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.max
[2],3).c_str());
3087 f
->close_section(); // min
3088 f
->open_object_section("max");
3089 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.max
[0],3).c_str());
3090 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.max
[1],3).c_str());
3091 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.max
[2],3).c_str());
3092 f
->close_section(); // max
3093 f
->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem
.last
,3).c_str());
3094 f
->close_section(); // entry
3096 f
->close_section(); // entries
3097 f
->close_section(); // network_ping_times
3099 ceph_abort_msg("broken asok registration");
3103 on_finish(ret
, ss
.str(), outbl
);
3106 class TestOpsSocketHook
: public AdminSocketHook
{
3107 OSDService
*service
;
3110 TestOpsSocketHook(OSDService
*s
, ObjectStore
*st
) : service(s
), store(st
) {}
3111 int call(std::string_view command
, const cmdmap_t
& cmdmap
,
3113 std::ostream
& errss
,
3114 bufferlist
& out
) override
{
3118 test_ops(service
, store
, command
, cmdmap
, outss
);
3120 } catch (const TOPNSPC::common::bad_cmd_get
& e
) {
3126 void test_ops(OSDService
*service
, ObjectStore
*store
,
3127 std::string_view command
, const cmdmap_t
& cmdmap
, ostream
&ss
);
3131 class OSD::C_Tick
: public Context
{
3134 explicit C_Tick(OSD
*o
) : osd(o
) {}
3135 void finish(int r
) override
{
3140 class OSD::C_Tick_WithoutOSDLock
: public Context
{
3143 explicit C_Tick_WithoutOSDLock(OSD
*o
) : osd(o
) {}
3144 void finish(int r
) override
{
3145 osd
->tick_without_osd_lock();
3149 int OSD::enable_disable_fuse(bool stop
)
3153 string mntpath
= cct
->_conf
->osd_data
+ "/fuse";
3154 if (fuse_store
&& (stop
|| !cct
->_conf
->osd_objectstore_fuse
)) {
3155 dout(1) << __func__
<< " disabling" << dendl
;
3159 r
= ::rmdir(mntpath
.c_str());
3162 derr
<< __func__
<< " failed to rmdir " << mntpath
<< ": "
3163 << cpp_strerror(r
) << dendl
;
3168 if (!fuse_store
&& cct
->_conf
->osd_objectstore_fuse
) {
3169 dout(1) << __func__
<< " enabling" << dendl
;
3170 r
= ::mkdir(mntpath
.c_str(), 0700);
3173 if (r
< 0 && r
!= -EEXIST
) {
3174 derr
<< __func__
<< " unable to create " << mntpath
<< ": "
3175 << cpp_strerror(r
) << dendl
;
3178 fuse_store
= new FuseStore(store
, mntpath
);
3179 r
= fuse_store
->start();
3181 derr
<< __func__
<< " unable to start fuse: " << cpp_strerror(r
) << dendl
;
3187 #endif // HAVE_LIBFUSE
3191 size_t OSD::get_num_cache_shards()
3193 return cct
->_conf
.get_val
<Option::size_t>("osd_num_cache_shards");
3196 int OSD::get_num_op_shards()
3198 if (cct
->_conf
->osd_op_num_shards
)
3199 return cct
->_conf
->osd_op_num_shards
;
3200 if (store_is_rotational
)
3201 return cct
->_conf
->osd_op_num_shards_hdd
;
3203 return cct
->_conf
->osd_op_num_shards_ssd
;
3206 int OSD::get_num_op_threads()
3208 if (cct
->_conf
->osd_op_num_threads_per_shard
)
3209 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard
;
3210 if (store_is_rotational
)
3211 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard_hdd
;
3213 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard_ssd
;
3216 float OSD::get_osd_recovery_sleep()
3218 if (cct
->_conf
->osd_recovery_sleep
)
3219 return cct
->_conf
->osd_recovery_sleep
;
3220 if (!store_is_rotational
&& !journal_is_rotational
)
3221 return cct
->_conf
->osd_recovery_sleep_ssd
;
3222 else if (store_is_rotational
&& !journal_is_rotational
)
3223 return cct
->_conf
.get_val
<double>("osd_recovery_sleep_hybrid");
3225 return cct
->_conf
->osd_recovery_sleep_hdd
;
3228 float OSD::get_osd_delete_sleep()
3230 float osd_delete_sleep
= cct
->_conf
.get_val
<double>("osd_delete_sleep");
3231 if (osd_delete_sleep
> 0)
3232 return osd_delete_sleep
;
3233 if (!store_is_rotational
&& !journal_is_rotational
)
3234 return cct
->_conf
.get_val
<double>("osd_delete_sleep_ssd");
3235 if (store_is_rotational
&& !journal_is_rotational
)
3236 return cct
->_conf
.get_val
<double>("osd_delete_sleep_hybrid");
3237 return cct
->_conf
.get_val
<double>("osd_delete_sleep_hdd");
3240 int OSD::get_recovery_max_active()
3242 if (cct
->_conf
->osd_recovery_max_active
)
3243 return cct
->_conf
->osd_recovery_max_active
;
3244 if (store_is_rotational
)
3245 return cct
->_conf
->osd_recovery_max_active_hdd
;
3247 return cct
->_conf
->osd_recovery_max_active_ssd
;
3250 float OSD::get_osd_snap_trim_sleep()
3252 float osd_snap_trim_sleep
= cct
->_conf
.get_val
<double>("osd_snap_trim_sleep");
3253 if (osd_snap_trim_sleep
> 0)
3254 return osd_snap_trim_sleep
;
3255 if (!store_is_rotational
&& !journal_is_rotational
)
3256 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_ssd");
3257 if (store_is_rotational
&& !journal_is_rotational
)
3258 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_hybrid");
3259 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_hdd");
3265 CompatSet initial
, diff
;
3266 std::lock_guard
lock(osd_lock
);
3271 tick_timer_without_osd_lock
.init();
3272 service
.recovery_request_timer
.init();
3273 service
.sleep_timer
.init();
3275 boot_finisher
.start();
3279 store
->read_meta("require_osd_release", &val
);
3280 last_require_osd_release
= ceph_release_from_name(val
);
3284 dout(2) << "init " << dev_path
3285 << " (looks like " << (store_is_rotational
? "hdd" : "ssd") << ")"
3287 dout(2) << "journal " << journal_path
<< dendl
;
3288 ceph_assert(store
); // call pre_init() first!
3290 store
->set_cache_shards(get_num_cache_shards());
3292 int r
= store
->mount();
3294 derr
<< "OSD:init: unable to mount object store" << dendl
;
3297 journal_is_rotational
= store
->is_journal_rotational();
3298 dout(2) << "journal looks like " << (journal_is_rotational
? "hdd" : "ssd")
3301 enable_disable_fuse(false);
3303 dout(2) << "boot" << dendl
;
3305 service
.meta_ch
= store
->open_collection(coll_t::meta());
3307 // initialize the daily loadavg with current 15min loadavg
3309 if (getloadavg(loadavgs
, 3) == 3) {
3310 daily_loadavg
= loadavgs
[2];
3312 derr
<< "OSD::init() : couldn't read loadavgs\n" << dendl
;
3313 daily_loadavg
= 1.0;
3316 int rotating_auth_attempts
= 0;
3317 auto rotating_auth_timeout
=
3318 g_conf().get_val
<int64_t>("rotating_keys_bootstrap_timeout");
3320 // sanity check long object name handling
3323 l
.oid
.name
= string(cct
->_conf
->osd_max_object_name_len
, 'n');
3324 l
.set_key(string(cct
->_conf
->osd_max_object_name_len
, 'k'));
3325 l
.nspace
= string(cct
->_conf
->osd_max_object_namespace_len
, 's');
3326 r
= store
->validate_hobject_key(l
);
3328 derr
<< "backend (" << store
->get_type() << ") is unable to support max "
3329 << "object name[space] len" << dendl
;
3330 derr
<< " osd max object name len = "
3331 << cct
->_conf
->osd_max_object_name_len
<< dendl
;
3332 derr
<< " osd max object namespace len = "
3333 << cct
->_conf
->osd_max_object_namespace_len
<< dendl
;
3334 derr
<< cpp_strerror(r
) << dendl
;
3335 if (cct
->_conf
->osd_check_max_object_name_len_on_startup
) {
3338 derr
<< "osd_check_max_object_name_len_on_startup = false, starting anyway"
3341 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl
;
3346 r
= read_superblock();
3348 derr
<< "OSD::init() : unable to read osd superblock" << dendl
;
3353 if (osd_compat
.compare(superblock
.compat_features
) < 0) {
3354 derr
<< "The disk uses features unsupported by the executable." << dendl
;
3355 derr
<< " ondisk features " << superblock
.compat_features
<< dendl
;
3356 derr
<< " daemon features " << osd_compat
<< dendl
;
3358 if (osd_compat
.writeable(superblock
.compat_features
)) {
3359 CompatSet diff
= osd_compat
.unsupported(superblock
.compat_features
);
3360 derr
<< "it is still writeable, though. Missing features: " << diff
<< dendl
;
3365 CompatSet diff
= osd_compat
.unsupported(superblock
.compat_features
);
3366 derr
<< "Cannot write to disk! Missing features: " << diff
<< dendl
;
3372 assert_warn(whoami
== superblock
.whoami
);
3373 if (whoami
!= superblock
.whoami
) {
3374 derr
<< "OSD::init: superblock says osd"
3375 << superblock
.whoami
<< " but I am osd." << whoami
<< dendl
;
3380 startup_time
= ceph::mono_clock::now();
3382 // load up "current" osdmap
3383 assert_warn(!get_osdmap());
3385 derr
<< "OSD::init: unable to read current osdmap" << dendl
;
3389 osdmap
= get_map(superblock
.current_epoch
);
3392 // make sure we don't have legacy pgs deleting
3395 int r
= store
->list_collections(ls
);
3396 ceph_assert(r
>= 0);
3399 if (c
.is_pg(&pgid
) &&
3400 !osdmap
->have_pg_pool(pgid
.pool())) {
3401 ghobject_t oid
= make_final_pool_info_oid(pgid
.pool());
3402 if (!store
->exists(service
.meta_ch
, oid
)) {
3403 derr
<< __func__
<< " missing pg_pool_t for deleted pool "
3404 << pgid
.pool() << " for pg " << pgid
3405 << "; please downgrade to luminous and allow "
3406 << "pg deletion to complete before upgrading" << dendl
;
3413 initial
= get_osd_initial_compat_set();
3414 diff
= superblock
.compat_features
.unsupported(initial
);
3415 if (superblock
.compat_features
.merge(initial
)) {
3416 // Are we adding SNAPMAPPER2?
3417 if (diff
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2
)) {
3418 dout(1) << __func__
<< " upgrade snap_mapper (first start as octopus)"
3420 auto ch
= service
.meta_ch
;
3421 auto hoid
= make_snapmapper_oid();
3422 unsigned max
= cct
->_conf
->osd_target_transaction_size
;
3423 r
= SnapMapper::convert_legacy(cct
, store
, ch
, hoid
, max
);
3427 // We need to persist the new compat_set before we
3429 dout(5) << "Upgrading superblock adding: " << diff
<< dendl
;
3430 ObjectStore::Transaction t
;
3431 write_superblock(t
);
3432 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3437 // make sure snap mapper object exists
3438 if (!store
->exists(service
.meta_ch
, OSD::make_snapmapper_oid())) {
3439 dout(10) << "init creating/touching snapmapper object" << dendl
;
3440 ObjectStore::Transaction t
;
3441 t
.touch(coll_t::meta(), OSD::make_snapmapper_oid());
3442 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3446 if (!store
->exists(service
.meta_ch
, OSD::make_purged_snaps_oid())) {
3447 dout(10) << "init creating/touching purged_snaps object" << dendl
;
3448 ObjectStore::Transaction t
;
3449 t
.touch(coll_t::meta(), OSD::make_purged_snaps_oid());
3450 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3455 if (cct
->_conf
->osd_open_classes_on_start
) {
3456 int r
= ClassHandler::get_instance().open_all_classes();
3458 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r
) << dendl
;
3461 check_osdmap_features();
3463 create_recoverystate_perf();
3466 epoch_t bind_epoch
= osdmap
->get_epoch();
3467 service
.set_epochs(NULL
, NULL
, &bind_epoch
);
3470 clear_temp_objects();
3472 // initialize osdmap references in sharded wq
3473 for (auto& shard
: shards
) {
3474 std::lock_guard
l(shard
->osdmap_lock
);
3475 shard
->shard_osdmap
= osdmap
;
3478 // load up pgs (as they previously existed)
3481 dout(2) << "superblock: I am osd." << superblock
.whoami
<< dendl
;
3487 struct store_statfs_t stbuf
;
3488 osd_alert_list_t alerts
;
3489 int r
= store
->statfs(&stbuf
, &alerts
);
3490 ceph_assert(r
== 0);
3491 service
.set_statfs(stbuf
, alerts
);
3494 // client_messenger auth_client is already set up by monc.
3495 for (auto m
: { cluster_messenger
,
3497 hb_front_client_messenger
,
3498 hb_back_client_messenger
,
3499 hb_front_server_messenger
,
3500 hb_back_server_messenger
} ) {
3501 m
->set_auth_client(monc
);
3503 for (auto m
: { client_messenger
,
3505 hb_front_server_messenger
,
3506 hb_back_server_messenger
}) {
3507 m
->set_auth_server(monc
);
3509 monc
->set_handle_authentication_dispatcher(this);
3511 monc
->set_want_keys(CEPH_ENTITY_TYPE_MON
| CEPH_ENTITY_TYPE_OSD
3512 | CEPH_ENTITY_TYPE_MGR
);
3517 mgrc
.set_pgstats_cb([this](){ return collect_pg_stats(); });
3518 mgrc
.set_perf_metric_query_cb(
3519 [this](const ConfigPayload
&config_payload
) {
3520 set_perf_queries(config_payload
);
3523 return get_perf_reports();
3527 // tell monc about log_client so it will know about mon session resets
3528 monc
->set_log_client(&log_client
);
3529 update_log_config();
3532 client_messenger
->add_dispatcher_tail(&mgrc
);
3533 client_messenger
->add_dispatcher_tail(this);
3534 cluster_messenger
->add_dispatcher_head(this);
3536 hb_front_client_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3537 hb_back_client_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3538 hb_front_server_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3539 hb_back_server_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3541 objecter_messenger
->add_dispatcher_head(service
.objecter
.get());
3544 service
.publish_map(osdmap
);
3545 service
.publish_superblock(superblock
);
3546 service
.max_oldest_map
= superblock
.oldest_map
;
3548 for (auto& shard
: shards
) {
3549 // put PGs in a temporary set because we may modify pg_slots
3550 // unordered_map below.
3552 for (auto& i
: shard
->pg_slots
) {
3553 PGRef pg
= i
.second
->pg
;
3559 for (auto pg
: pgs
) {
3560 std::scoped_lock l
{*pg
};
3561 set
<pair
<spg_t
,epoch_t
>> new_children
;
3562 set
<pair
<spg_t
,epoch_t
>> merge_pgs
;
3563 service
.identify_splits_and_merges(pg
->get_osdmap(), osdmap
, pg
->pg_id
,
3564 &new_children
, &merge_pgs
);
3565 if (!new_children
.empty()) {
3566 for (auto shard
: shards
) {
3567 shard
->prime_splits(osdmap
, &new_children
);
3569 assert(new_children
.empty());
3571 if (!merge_pgs
.empty()) {
3572 for (auto shard
: shards
) {
3573 shard
->prime_merges(osdmap
, &merge_pgs
);
3575 assert(merge_pgs
.empty());
3582 // start the heartbeat
3583 heartbeat_thread
.create("osd_srv_heartbt");
3586 tick_timer
.add_event_after(get_tick_interval(),
3589 std::lock_guard
l(tick_timer_lock
);
3590 tick_timer_without_osd_lock
.add_event_after(get_tick_interval(),
3591 new C_Tick_WithoutOSDLock(this));
3596 r
= monc
->authenticate();
3598 derr
<< __func__
<< " authentication failed: " << cpp_strerror(r
)
3603 while (monc
->wait_auth_rotating(rotating_auth_timeout
) < 0) {
3604 derr
<< "unable to obtain rotating service keys; retrying" << dendl
;
3605 ++rotating_auth_attempts
;
3606 if (rotating_auth_attempts
> g_conf()->max_rotating_auth_attempts
) {
3607 derr
<< __func__
<< " wait_auth_rotating timed out" << dendl
;
3612 r
= update_crush_device_class();
3614 derr
<< __func__
<< " unable to update_crush_device_class: "
3615 << cpp_strerror(r
) << dendl
;
3619 r
= update_crush_location();
3621 derr
<< __func__
<< " unable to update_crush_location: "
3622 << cpp_strerror(r
) << dendl
;
3630 // start objecter *after* we have authenticated, so that we don't ignore
3631 // the OSDMaps it requests.
3632 service
.final_init();
3636 dout(10) << "ensuring pgs have consumed prior maps" << dendl
;
3639 dout(0) << "done with init, starting boot process" << dendl
;
3641 // subscribe to any pg creations
3642 monc
->sub_want("osd_pg_creates", last_pg_create_epoch
, 0);
3644 // MgrClient needs this (it doesn't have MonClient reference itself)
3645 monc
->sub_want("mgrmap", 0, 0);
3647 // we don't need to ask for an osdmap here; objecter will
3648 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
3657 enable_disable_fuse(true);
3664 void OSD::final_init()
3666 AdminSocket
*admin_socket
= cct
->get_admin_socket();
3667 asok_hook
= new OSDSocketHook(this);
3668 int r
= admin_socket
->register_command("status", asok_hook
,
3669 "high-level status of OSD");
3670 ceph_assert(r
== 0);
3671 r
= admin_socket
->register_command("flush_journal",
3673 "flush the journal to permanent store");
3674 ceph_assert(r
== 0);
3675 r
= admin_socket
->register_command("dump_ops_in_flight " \
3676 "name=filterstr,type=CephString,n=N,req=false",
3678 "show the ops currently in flight");
3679 ceph_assert(r
== 0);
3680 r
= admin_socket
->register_command("ops " \
3681 "name=filterstr,type=CephString,n=N,req=false",
3683 "show the ops currently in flight");
3684 ceph_assert(r
== 0);
3685 r
= admin_socket
->register_command("dump_blocked_ops " \
3686 "name=filterstr,type=CephString,n=N,req=false",
3688 "show the blocked ops currently in flight");
3689 ceph_assert(r
== 0);
3690 r
= admin_socket
->register_command("dump_historic_ops " \
3691 "name=filterstr,type=CephString,n=N,req=false",
3694 ceph_assert(r
== 0);
3695 r
= admin_socket
->register_command("dump_historic_slow_ops " \
3696 "name=filterstr,type=CephString,n=N,req=false",
3698 "show slowest recent ops");
3699 ceph_assert(r
== 0);
3700 r
= admin_socket
->register_command("dump_historic_ops_by_duration " \
3701 "name=filterstr,type=CephString,n=N,req=false",
3703 "show slowest recent ops, sorted by duration");
3704 ceph_assert(r
== 0);
3705 r
= admin_socket
->register_command("dump_op_pq_state",
3707 "dump op priority queue state");
3708 ceph_assert(r
== 0);
3709 r
= admin_socket
->register_command("dump_blacklist",
3711 "dump blacklisted clients and times");
3712 ceph_assert(r
== 0);
3713 r
= admin_socket
->register_command("dump_watchers",
3715 "show clients which have active watches,"
3716 " and on which objects");
3717 ceph_assert(r
== 0);
3718 r
= admin_socket
->register_command("dump_recovery_reservations",
3720 "show recovery reservations");
3721 ceph_assert(r
== 0);
3722 r
= admin_socket
->register_command("dump_scrub_reservations",
3724 "show recovery reservations");
3725 ceph_assert(r
== 0);
3726 r
= admin_socket
->register_command("get_latest_osdmap",
3728 "force osd to update the latest map from "
3730 ceph_assert(r
== 0);
3732 r
= admin_socket
->register_command("set_heap_property " \
3733 "name=property,type=CephString " \
3734 "name=value,type=CephInt",
3736 "update malloc extension heap property");
3737 ceph_assert(r
== 0);
3739 r
= admin_socket
->register_command("get_heap_property " \
3740 "name=property,type=CephString",
3742 "get malloc extension heap property");
3743 ceph_assert(r
== 0);
3745 r
= admin_socket
->register_command("dump_objectstore_kv_stats",
3747 "print statistics of kvdb which used by bluestore");
3748 ceph_assert(r
== 0);
3750 r
= admin_socket
->register_command("dump_scrubs",
3752 "print scheduled scrubs");
3753 ceph_assert(r
== 0);
3755 r
= admin_socket
->register_command("calc_objectstore_db_histogram",
3757 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
3758 ceph_assert(r
== 0);
3760 r
= admin_socket
->register_command("flush_store_cache",
3762 "Flush bluestore internal cache");
3763 ceph_assert(r
== 0);
3764 r
= admin_socket
->register_command("dump_pgstate_history",
3766 "show recent state history");
3767 ceph_assert(r
== 0);
3769 r
= admin_socket
->register_command("compact",
3771 "Commpact object store's omap."
3772 " WARNING: Compaction probably slows your requests");
3773 ceph_assert(r
== 0);
3775 r
= admin_socket
->register_command("get_mapped_pools",
3777 "dump pools whose PG(s) are mapped to this OSD.");
3779 ceph_assert(r
== 0);
3781 r
= admin_socket
->register_command("smart name=devid,type=CephString,req=false",
3783 "probe OSD devices for SMART data.");
3785 ceph_assert(r
== 0);
3787 r
= admin_socket
->register_command("list_devices",
3789 "list OSD devices.");
3790 r
= admin_socket
->register_command("send_beacon",
3792 "send OSD beacon to mon immediately");
3794 r
= admin_socket
->register_command(
3795 "dump_osd_network name=value,type=CephInt,req=false", asok_hook
,
3796 "Dump osd heartbeat network ping times");
3797 ceph_assert(r
== 0);
3799 test_ops_hook
= new TestOpsSocketHook(&(this->service
), this->store
);
3800 // Note: pools are CephString instead of CephPoolname because
3801 // these commands traditionally support both pool names and numbers
3802 r
= admin_socket
->register_command(
3804 "name=pool,type=CephString " \
3805 "name=objname,type=CephObjectname " \
3806 "name=key,type=CephString "\
3807 "name=val,type=CephString",
3810 ceph_assert(r
== 0);
3811 r
= admin_socket
->register_command(
3813 "name=pool,type=CephString " \
3814 "name=objname,type=CephObjectname " \
3815 "name=key,type=CephString",
3818 ceph_assert(r
== 0);
3819 r
= admin_socket
->register_command(
3821 "name=pool,type=CephString " \
3822 "name=objname,type=CephObjectname " \
3823 "name=header,type=CephString",
3826 ceph_assert(r
== 0);
3828 r
= admin_socket
->register_command(
3830 "name=pool,type=CephString " \
3831 "name=objname,type=CephObjectname",
3833 "output entire object map");
3834 ceph_assert(r
== 0);
3836 r
= admin_socket
->register_command(
3838 "name=pool,type=CephString " \
3839 "name=objname,type=CephObjectname " \
3840 "name=len,type=CephInt",
3842 "truncate object to length");
3843 ceph_assert(r
== 0);
3845 r
= admin_socket
->register_command(
3847 "name=pool,type=CephString " \
3848 "name=objname,type=CephObjectname " \
3849 "name=shardid,type=CephInt,req=false,range=0|255",
3851 "inject data error to an object");
3852 ceph_assert(r
== 0);
3854 r
= admin_socket
->register_command(
3856 "name=pool,type=CephString " \
3857 "name=objname,type=CephObjectname " \
3858 "name=shardid,type=CephInt,req=false,range=0|255",
3860 "inject metadata error to an object");
3861 ceph_assert(r
== 0);
3862 r
= admin_socket
->register_command(
3863 "set_recovery_delay " \
3864 "name=utime,type=CephInt,req=false",
3866 "Delay osd recovery by specified seconds");
3867 ceph_assert(r
== 0);
3868 r
= admin_socket
->register_command(
3870 "name=type,type=CephString,req=false " \
3871 "name=count,type=CephInt,req=false ",
3873 "Inject a full disk (optional count times)");
3874 ceph_assert(r
== 0);
3875 r
= admin_socket
->register_command(
3877 "name=count,type=CephInt,req=false " \
3878 "name=size,type=CephInt,req=false " \
3879 "name=object_size,type=CephInt,req=false " \
3880 "name=object_num,type=CephInt,req=false ",
3882 "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
3883 "(default count=1G default size=4MB). Results in log.");
3884 ceph_assert(r
== 0);
3885 r
= admin_socket
->register_command(
3887 "name=level,type=CephChoices,strings=error,warning,info,debug " \
3888 "name=message,type=CephString,n=N",
3890 "log a message to the cluster log");
3891 ceph_assert(r
== 0);
3892 r
= admin_socket
->register_command(
3896 ceph_assert(r
== 0);
3897 r
= admin_socket
->register_command(
3899 "name=heapcmd,type=CephChoices,strings=" \
3900 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
3901 "name=value,type=CephString,req=false",
3903 "show heap usage info (available only if compiled with tcmalloc)");
3904 ceph_assert(r
== 0);
3905 r
= admin_socket
->register_command(
3906 "debug dump_missing " \
3907 "name=filename,type=CephFilepath",
3909 "dump missing objects to a named file");
3910 ceph_assert(r
== 0);
3911 r
= admin_socket
->register_command(
3912 "debug kick_recovery_wq " \
3913 "name=delay,type=CephInt,range=0",
3915 "set osd_recovery_delay_start to <val>");
3916 ceph_assert(r
== 0);
3917 r
= admin_socket
->register_command(
3919 "name=arg,type=CephChoices,strings=status|flush",
3921 "run cpu profiling on daemon");
3922 ceph_assert(r
== 0);
3923 r
= admin_socket
->register_command(
3924 "dump_pg_recovery_stats",
3926 "dump pg recovery statistics");
3927 ceph_assert(r
== 0);
3928 r
= admin_socket
->register_command(
3929 "reset_pg_recovery_stats",
3931 "reset pg recovery statistics");
3932 ceph_assert(r
== 0);
3933 r
= admin_socket
->register_command(
3936 "Drop all OSD caches");
3937 ceph_assert(r
== 0);
3938 r
= admin_socket
->register_command(
3941 "Get OSD caches statistics");
3942 ceph_assert(r
== 0);
3943 r
= admin_socket
->register_command(
3944 "scrub_purged_snaps",
3946 "Scrub purged_snaps vs snapmapper index");
3947 ceph_assert(r
== 0);
3949 // -- pg commands --
3950 // old form: ceph pg <pgid> command ...
3951 r
= admin_socket
->register_command(
3953 "name=pgid,type=CephPgid " \
3954 "name=cmd,type=CephChoices,strings=query",
3957 ceph_assert(r
== 0);
3958 r
= admin_socket
->register_command(
3960 "name=pgid,type=CephPgid " \
3961 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
3962 "name=mulcmd,type=CephChoices,strings=revert|delete",
3965 ceph_assert(r
== 0);
3966 r
= admin_socket
->register_command(
3968 "name=pgid,type=CephPgid " \
3969 "name=cmd,type=CephChoices,strings=list_unfound " \
3970 "name=offset,type=CephString,req=false",
3973 ceph_assert(r
== 0);
3974 r
= admin_socket
->register_command(
3976 "name=pgid,type=CephPgid " \
3977 "name=cmd,type=CephChoices,strings=scrub " \
3978 "name=time,type=CephInt,req=false",
3981 ceph_assert(r
== 0);
3982 r
= admin_socket
->register_command(
3984 "name=pgid,type=CephPgid " \
3985 "name=cmd,type=CephChoices,strings=deep_scrub " \
3986 "name=time,type=CephInt,req=false",
3989 ceph_assert(r
== 0);
3990 // new form: tell <pgid> <cmd> for both cli and rest
3991 r
= admin_socket
->register_command(
3994 "show details of a specific pg");
3995 ceph_assert(r
== 0);
3996 r
= admin_socket
->register_command(
3997 "mark_unfound_lost " \
3998 "name=pgid,type=CephPgid,req=false " \
3999 "name=mulcmd,type=CephChoices,strings=revert|delete",
4001 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available");
4002 ceph_assert(r
== 0);
4003 r
= admin_socket
->register_command(
4005 "name=pgid,type=CephPgid,req=false " \
4006 "name=offset,type=CephString,req=false",
4008 "list unfound objects on this pg, perhaps starting at an offset given in JSON");
4009 ceph_assert(r
== 0);
4010 r
= admin_socket
->register_command(
4012 "name=pgid,type=CephPgid,req=false " \
4013 "name=time,type=CephInt,req=false",
4015 "Trigger a scheduled scrub ");
4016 ceph_assert(r
== 0);
4017 r
= admin_socket
->register_command(
4019 "name=pgid,type=CephPgid,req=false " \
4020 "name=time,type=CephInt,req=false",
4022 "Trigger a scheduled deep scrub ");
4023 ceph_assert(r
== 0);
4026 void OSD::create_logger()
4028 dout(10) << "create_logger" << dendl
;
4030 logger
= build_osd_logger(cct
);
4031 cct
->get_perfcounters_collection()->add(logger
);
4034 void OSD::create_recoverystate_perf()
4036 dout(10) << "create_recoverystate_perf" << dendl
;
4038 recoverystate_perf
= build_recoverystate_perf(cct
);
4039 cct
->get_perfcounters_collection()->add(recoverystate_perf
);
4044 if (cct
->_conf
->osd_fast_shutdown
) {
4045 derr
<< "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl
;
4050 if (!service
.prepare_to_stop())
4051 return 0; // already shutting down
4053 if (is_stopping()) {
4057 dout(0) << "shutdown" << dendl
;
4059 set_state(STATE_STOPPING
);
4062 if (cct
->_conf
.get_val
<bool>("osd_debug_shutdown")) {
4063 cct
->_conf
.set_val("debug_osd", "100");
4064 cct
->_conf
.set_val("debug_journal", "100");
4065 cct
->_conf
.set_val("debug_filestore", "100");
4066 cct
->_conf
.set_val("debug_bluestore", "100");
4067 cct
->_conf
.set_val("debug_ms", "100");
4068 cct
->_conf
.apply_changes(nullptr);
4071 // stop MgrClient earlier as it's more like an internal consumer of OSD
4074 service
.start_shutdown();
4076 // stop sending work to pgs. this just prevents any new work in _process
4077 // from racing with on_shutdown and potentially entering the pg after.
4078 op_shardedwq
.drain();
4084 for (auto pg
: pgs
) {
4089 // drain op queue again (in case PGs requeued something)
4090 op_shardedwq
.drain();
4092 finished
.clear(); // zap waiters (bleh, this is messy)
4093 waiting_for_osdmap
.clear();
4096 // unregister commands
4097 cct
->get_admin_socket()->unregister_commands(asok_hook
);
4101 cct
->get_admin_socket()->unregister_commands(test_ops_hook
);
4102 delete test_ops_hook
;
4103 test_ops_hook
= NULL
;
4108 std::lock_guard l
{heartbeat_lock
};
4109 heartbeat_stop
= true;
4110 heartbeat_cond
.notify_all();
4111 heartbeat_peers
.clear();
4113 heartbeat_thread
.join();
4115 hb_back_server_messenger
->mark_down_all();
4116 hb_front_server_messenger
->mark_down_all();
4117 hb_front_client_messenger
->mark_down_all();
4118 hb_back_client_messenger
->mark_down_all();
4122 dout(10) << "op sharded tp stopped" << dendl
;
4124 dout(10) << "stopping agent" << dendl
;
4125 service
.agent_stop();
4127 boot_finisher
.wait_for_empty();
4131 boot_finisher
.stop();
4132 reset_heartbeat_peers(true);
4134 tick_timer
.shutdown();
4137 std::lock_guard
l(tick_timer_lock
);
4138 tick_timer_without_osd_lock
.shutdown();
4141 // note unmount epoch
4142 dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl
;
4143 superblock
.mounted
= service
.get_boot_epoch();
4144 superblock
.clean_thru
= get_osdmap_epoch();
4145 ObjectStore::Transaction t
;
4146 write_superblock(t
);
4147 int r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
4149 derr
<< "OSD::shutdown: error writing superblock: "
4150 << cpp_strerror(r
) << dendl
;
4154 service
.shutdown_reserver();
4157 #ifdef PG_DEBUG_REFS
4158 service
.dump_live_pgids();
4162 _get_pgs(&pgs
, true);
4166 for (auto& pg
: pgs
) {
4167 if (pg
->is_deleted()) {
4170 dout(20) << " kicking pg " << pg
<< dendl
;
4172 if (pg
->get_num_ref() != 1) {
4173 derr
<< "pgid " << pg
->get_pgid() << " has ref count of "
4174 << pg
->get_num_ref() << dendl
;
4175 #ifdef PG_DEBUG_REFS
4176 pg
->dump_live_ids();
4178 if (cct
->_conf
->osd_shutdown_pgref_assert
) {
4186 #ifdef PG_DEBUG_REFS
4187 service
.dump_live_pgids();
4191 cct
->_conf
.remove_observer(this);
4194 service
.meta_ch
.reset();
4196 dout(10) << "syncing store" << dendl
;
4197 enable_disable_fuse(true);
4199 if (cct
->_conf
->osd_journal_flush_on_shutdown
) {
4200 dout(10) << "flushing journal" << dendl
;
4201 store
->flush_journal();
4207 std::unique_lock l
{map_lock
};
4208 set_osdmap(OSDMapRef());
4210 for (auto s
: shards
) {
4211 std::lock_guard
l(s
->osdmap_lock
);
4212 s
->shard_osdmap
= OSDMapRef();
4216 std::lock_guard
lock(osd_lock
);
4220 dout(10) << "Store synced" << dendl
;
4222 op_tracker
.on_shutdown();
4224 ClassHandler::get_instance().shutdown();
4225 client_messenger
->shutdown();
4226 cluster_messenger
->shutdown();
4227 hb_front_client_messenger
->shutdown();
4228 hb_back_client_messenger
->shutdown();
4229 objecter_messenger
->shutdown();
4230 hb_front_server_messenger
->shutdown();
4231 hb_back_server_messenger
->shutdown();
4236 int OSD::mon_cmd_maybe_osd_create(string
&cmd
)
4238 bool created
= false;
4240 dout(10) << __func__
<< " cmd: " << cmd
<< dendl
;
4241 vector
<string
> vcmd
{cmd
};
4245 monc
->start_mon_command(vcmd
, inbl
, NULL
, &outs
, &w
);
4248 if (r
== -ENOENT
&& !created
) {
4249 string newcmd
= "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami
)
4250 + ", \"uuid\": \"" + stringify(superblock
.osd_fsid
) + "\"}";
4251 vector
<string
> vnewcmd
{newcmd
};
4255 monc
->start_mon_command(vnewcmd
, inbl
, NULL
, &outs
, &w
);
4258 derr
<< __func__
<< " fail: osd does not exist and created failed: "
4259 << cpp_strerror(r
) << dendl
;
4265 derr
<< __func__
<< " fail: '" << outs
<< "': " << cpp_strerror(r
) << dendl
;
4274 int OSD::update_crush_location()
4276 if (!cct
->_conf
->osd_crush_update_on_start
) {
4277 dout(10) << __func__
<< " osd_crush_update_on_start = false" << dendl
;
4282 if (cct
->_conf
->osd_crush_initial_weight
>= 0) {
4283 snprintf(weight
, sizeof(weight
), "%.4lf", cct
->_conf
->osd_crush_initial_weight
);
4285 struct store_statfs_t st
;
4286 osd_alert_list_t alerts
;
4287 int r
= store
->statfs(&st
, &alerts
);
4289 derr
<< "statfs: " << cpp_strerror(r
) << dendl
;
4292 snprintf(weight
, sizeof(weight
), "%.4lf",
4295 double(1ull << 40 /* TB */)));
4298 dout(10) << __func__
<< " crush location is " << cct
->crush_location
<< dendl
;
4301 string("{\"prefix\": \"osd crush create-or-move\", ") +
4302 string("\"id\": ") + stringify(whoami
) + ", " +
4303 string("\"weight\":") + weight
+ ", " +
4304 string("\"args\": [") + stringify(cct
->crush_location
) + "]}";
4305 return mon_cmd_maybe_osd_create(cmd
);
4308 int OSD::update_crush_device_class()
4310 if (!cct
->_conf
->osd_class_update_on_start
) {
4311 dout(10) << __func__
<< " osd_class_update_on_start = false" << dendl
;
4315 string device_class
;
4316 int r
= store
->read_meta("crush_device_class", &device_class
);
4317 if (r
< 0 || device_class
.empty()) {
4318 device_class
= store
->get_default_device_class();
4321 if (device_class
.empty()) {
4322 dout(20) << __func__
<< " no device class stored locally" << dendl
;
4327 string("{\"prefix\": \"osd crush set-device-class\", ") +
4328 string("\"class\": \"") + device_class
+ string("\", ") +
4329 string("\"ids\": [\"") + stringify(whoami
) + string("\"]}");
4331 r
= mon_cmd_maybe_osd_create(cmd
);
4333 // good, already bound to a device-class
4340 void OSD::write_superblock(ObjectStore::Transaction
& t
)
4342 dout(10) << "write_superblock " << superblock
<< dendl
;
4344 //hack: at minimum it's using the baseline feature set
4345 if (!superblock
.compat_features
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE
))
4346 superblock
.compat_features
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE
);
4349 encode(superblock
, bl
);
4350 t
.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT
, 0, bl
.length(), bl
);
4353 int OSD::read_superblock()
4356 int r
= store
->read(service
.meta_ch
, OSD_SUPERBLOCK_GOBJECT
, 0, 0, bl
);
4360 auto p
= bl
.cbegin();
4361 decode(superblock
, p
);
4363 dout(10) << "read_superblock " << superblock
<< dendl
;
4368 void OSD::clear_temp_objects()
4370 dout(10) << __func__
<< dendl
;
4372 store
->list_collections(ls
);
4373 for (vector
<coll_t
>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
4375 if (!p
->is_pg(&pgid
))
4378 // list temp objects
4379 dout(20) << " clearing temps in " << *p
<< " pgid " << pgid
<< dendl
;
4381 vector
<ghobject_t
> temps
;
4384 vector
<ghobject_t
> objects
;
4385 auto ch
= store
->open_collection(*p
);
4387 store
->collection_list(ch
, next
, ghobject_t::get_max(),
4388 store
->get_ideal_list_max(),
4390 if (objects
.empty())
4392 vector
<ghobject_t
>::iterator q
;
4393 for (q
= objects
.begin(); q
!= objects
.end(); ++q
) {
4394 // Hammer set pool for temps to -1, so check for clean-up
4395 if (q
->hobj
.is_temp() || (q
->hobj
.pool
== -1)) {
4396 temps
.push_back(*q
);
4401 // If we saw a non-temp object and hit the break above we can
4402 // break out of the while loop too.
4403 if (q
!= objects
.end())
4406 if (!temps
.empty()) {
4407 ObjectStore::Transaction t
;
4409 for (vector
<ghobject_t
>::iterator q
= temps
.begin(); q
!= temps
.end(); ++q
) {
4410 dout(20) << " removing " << *p
<< " object " << *q
<< dendl
;
4412 if (++removed
> cct
->_conf
->osd_target_transaction_size
) {
4413 store
->queue_transaction(service
.meta_ch
, std::move(t
));
4414 t
= ObjectStore::Transaction();
4419 store
->queue_transaction(service
.meta_ch
, std::move(t
));
4425 void OSD::recursive_remove_collection(CephContext
* cct
,
4426 ObjectStore
*store
, spg_t pgid
,
4432 make_snapmapper_oid());
4434 ObjectStore::CollectionHandle ch
= store
->open_collection(tmp
);
4435 ObjectStore::Transaction t
;
4436 SnapMapper
mapper(cct
, &driver
, 0, 0, 0, pgid
.shard
);
4439 int max
= cct
->_conf
->osd_target_transaction_size
;
4440 vector
<ghobject_t
> objects
;
4441 objects
.reserve(max
);
4444 store
->collection_list(ch
, next
, ghobject_t::get_max(),
4445 max
, &objects
, &next
);
4446 generic_dout(10) << __func__
<< " " << objects
<< dendl
;
4447 if (objects
.empty())
4449 for (auto& p
: objects
) {
4450 OSDriver::OSTransaction
_t(driver
.get_transaction(&t
));
4451 int r
= mapper
.remove_oid(p
.hobj
, &_t
);
4452 if (r
!= 0 && r
!= -ENOENT
)
4456 int r
= store
->queue_transaction(ch
, std::move(t
));
4457 ceph_assert(r
== 0);
4458 t
= ObjectStore::Transaction();
4460 t
.remove_collection(tmp
);
4461 int r
= store
->queue_transaction(ch
, std::move(t
));
4462 ceph_assert(r
== 0);
4465 if (!ch
->flush_commit(&waiter
)) {
4471 // ======================================================
4475 OSDMapRef createmap
,
4478 dout(10) << __func__
<< " " << pgid
<< dendl
;
4480 map
<string
,string
> ec_profile
;
4482 if (createmap
->have_pg_pool(pgid
.pool())) {
4483 pi
= *createmap
->get_pg_pool(pgid
.pool());
4484 name
= createmap
->get_pool_name(pgid
.pool());
4485 if (pi
.is_erasure()) {
4486 ec_profile
= createmap
->get_erasure_code_profile(pi
.erasure_code_profile
);
4489 // pool was deleted; grab final pg_pool_t off disk.
4490 ghobject_t oid
= make_final_pool_info_oid(pgid
.pool());
4492 int r
= store
->read(service
.meta_ch
, oid
, 0, 0, bl
);
4494 derr
<< __func__
<< " missing pool " << pgid
.pool() << " tombstone"
4498 ceph_assert(r
>= 0);
4499 auto p
= bl
.cbegin();
4502 if (p
.end()) { // dev release v13.0.2 did not include ec_profile
4503 derr
<< __func__
<< " missing ec_profile from pool " << pgid
.pool()
4504 << " tombstone" << dendl
;
4507 decode(ec_profile
, p
);
4509 PGPool
pool(cct
, createmap
, pgid
.pool(), pi
, name
);
4511 if (pi
.type
== pg_pool_t::TYPE_REPLICATED
||
4512 pi
.type
== pg_pool_t::TYPE_ERASURE
)
4513 pg
= new PrimaryLogPG(&service
, createmap
, pool
, ec_profile
, pgid
);
4519 void OSD::_get_pgs(vector
<PGRef
> *v
, bool clear_too
)
4522 v
->reserve(get_num_pgs());
4523 for (auto& s
: shards
) {
4524 std::lock_guard
l(s
->shard_lock
);
4525 for (auto& j
: s
->pg_slots
) {
4527 !j
.second
->pg
->is_deleted()) {
4528 v
->push_back(j
.second
->pg
);
4530 s
->_detach_pg(j
.second
.get());
4537 void OSD::_get_pgids(vector
<spg_t
> *v
)
4540 v
->reserve(get_num_pgs());
4541 for (auto& s
: shards
) {
4542 std::lock_guard
l(s
->shard_lock
);
4543 for (auto& j
: s
->pg_slots
) {
4545 !j
.second
->pg
->is_deleted()) {
4546 v
->push_back(j
.first
);
4552 void OSD::register_pg(PGRef pg
)
4554 spg_t pgid
= pg
->get_pgid();
4555 uint32_t shard_index
= pgid
.hash_to_shard(num_shards
);
4556 auto sdata
= shards
[shard_index
];
4557 std::lock_guard
l(sdata
->shard_lock
);
4558 auto r
= sdata
->pg_slots
.emplace(pgid
, make_unique
<OSDShardPGSlot
>());
4559 ceph_assert(r
.second
);
4560 auto *slot
= r
.first
->second
.get();
4561 dout(20) << __func__
<< " " << pgid
<< " " << pg
<< dendl
;
4562 sdata
->_attach_pg(slot
, pg
.get());
4565 bool OSD::try_finish_pg_delete(PG
*pg
, unsigned old_pg_num
)
4567 auto sdata
= pg
->osd_shard
;
4570 std::lock_guard
l(sdata
->shard_lock
);
4571 auto p
= sdata
->pg_slots
.find(pg
->pg_id
);
4572 if (p
== sdata
->pg_slots
.end() ||
4574 dout(20) << __func__
<< " " << pg
->pg_id
<< " not found" << dendl
;
4577 if (p
->second
->waiting_for_merge_epoch
) {
4578 dout(20) << __func__
<< " " << pg
->pg_id
<< " waiting for merge" << dendl
;
4581 dout(20) << __func__
<< " " << pg
->pg_id
<< " " << pg
<< dendl
;
4582 sdata
->_detach_pg(p
->second
.get());
4585 for (auto shard
: shards
) {
4586 shard
->unprime_split_children(pg
->pg_id
, old_pg_num
);
4589 // update pg count now since we might not get an osdmap any time soon.
4590 if (pg
->is_primary())
4591 service
.logger
->dec(l_osd_pg_primary
);
4592 else if (pg
->is_nonprimary())
4593 service
.logger
->dec(l_osd_pg_replica
); // misnomver
4595 service
.logger
->dec(l_osd_pg_stray
);
4600 PGRef
OSD::_lookup_pg(spg_t pgid
)
4602 uint32_t shard_index
= pgid
.hash_to_shard(num_shards
);
4603 auto sdata
= shards
[shard_index
];
4604 std::lock_guard
l(sdata
->shard_lock
);
4605 auto p
= sdata
->pg_slots
.find(pgid
);
4606 if (p
== sdata
->pg_slots
.end()) {
4609 return p
->second
->pg
;
4612 PGRef
OSD::_lookup_lock_pg(spg_t pgid
)
4614 PGRef pg
= _lookup_pg(pgid
);
4619 if (!pg
->is_deleted()) {
4626 PGRef
OSD::lookup_lock_pg(spg_t pgid
)
4628 return _lookup_lock_pg(pgid
);
4631 void OSD::load_pgs()
4633 ceph_assert(ceph_mutex_is_locked(osd_lock
));
4634 dout(0) << "load_pgs" << dendl
;
4637 auto pghist
= make_pg_num_history_oid();
4639 int r
= store
->read(service
.meta_ch
, pghist
, 0, 0, bl
, 0);
4640 if (r
>= 0 && bl
.length() > 0) {
4641 auto p
= bl
.cbegin();
4642 decode(pg_num_history
, p
);
4644 dout(20) << __func__
<< " pg_num_history " << pg_num_history
<< dendl
;
4648 int r
= store
->list_collections(ls
);
4650 derr
<< "failed to list pgs: " << cpp_strerror(-r
) << dendl
;
4654 for (vector
<coll_t
>::iterator it
= ls
.begin();
4658 if (it
->is_temp(&pgid
) ||
4659 (it
->is_pg(&pgid
) && PG::_has_removal_flag(store
, pgid
))) {
4660 dout(10) << "load_pgs " << *it
4661 << " removing, legacy or flagged for removal pg" << dendl
;
4662 recursive_remove_collection(cct
, store
, pgid
, *it
);
4666 if (!it
->is_pg(&pgid
)) {
4667 dout(10) << "load_pgs ignoring unrecognized " << *it
<< dendl
;
4671 dout(10) << "pgid " << pgid
<< " coll " << coll_t(pgid
) << dendl
;
4672 epoch_t map_epoch
= 0;
4673 int r
= PG::peek_map_epoch(store
, pgid
, &map_epoch
);
4675 derr
<< __func__
<< " unable to peek at " << pgid
<< " metadata, skipping"
4681 if (map_epoch
> 0) {
4682 OSDMapRef pgosdmap
= service
.try_get_map(map_epoch
);
4684 if (!get_osdmap()->have_pg_pool(pgid
.pool())) {
4685 derr
<< __func__
<< ": could not find map for epoch " << map_epoch
4686 << " on pg " << pgid
<< ", but the pool is not present in the "
4687 << "current map, so this is probably a result of bug 10617. "
4688 << "Skipping the pg for now, you can use ceph-objectstore-tool "
4689 << "to clean it up later." << dendl
;
4692 derr
<< __func__
<< ": have pgid " << pgid
<< " at epoch "
4693 << map_epoch
<< ", but missing map. Crashing."
4695 ceph_abort_msg("Missing map in load_pgs");
4698 pg
= _make_pg(pgosdmap
, pgid
);
4700 pg
= _make_pg(get_osdmap(), pgid
);
4703 recursive_remove_collection(cct
, store
, pgid
, *it
);
4707 // there can be no waiters here, so we don't call _wake_pg_slot
4710 pg
->ch
= store
->open_collection(pg
->coll
);
4712 // read pg state, log
4713 pg
->read_state(store
);
4716 dout(10) << "load_pgs " << *it
<< " deleting dne" << dendl
;
4719 recursive_remove_collection(cct
, store
, pgid
, *it
);
4723 uint32_t shard_index
= pgid
.hash_to_shard(shards
.size());
4724 assert(NULL
!= shards
[shard_index
]);
4725 store
->set_collection_commit_queue(pg
->coll
, &(shards
[shard_index
]->context_queue
));
4728 pg
->reg_next_scrub();
4730 dout(10) << __func__
<< " loaded " << *pg
<< dendl
;
4736 dout(0) << __func__
<< " opened " << num
<< " pgs" << dendl
;
4740 PGRef
OSD::handle_pg_create_info(const OSDMapRef
& osdmap
,
4741 const PGCreateInfo
*info
)
4743 spg_t pgid
= info
->pgid
;
4745 if (maybe_wait_for_max_pg(osdmap
, pgid
, info
->by_mon
)) {
4746 dout(10) << __func__
<< " hit max pg, dropping" << dendl
;
4750 PeeringCtx rctx
= create_context();
4752 OSDMapRef startmap
= get_map(info
->epoch
);
4755 int64_t pool_id
= pgid
.pgid
.pool();
4756 const pg_pool_t
*pool
= osdmap
->get_pg_pool(pool_id
);
4758 dout(10) << __func__
<< " ignoring " << pgid
<< ", pool dne" << dendl
;
4761 if (osdmap
->require_osd_release
>= ceph_release_t::nautilus
&&
4762 !pool
->has_flag(pg_pool_t::FLAG_CREATING
)) {
4763 // this ensures we do not process old creating messages after the
4764 // pool's initial pgs have been created (and pg are subsequently
4765 // allowed to split or merge).
4766 dout(20) << __func__
<< " dropping " << pgid
4767 << "create, pool does not have CREATING flag set" << dendl
;
4772 int up_primary
, acting_primary
;
4773 vector
<int> up
, acting
;
4774 startmap
->pg_to_up_acting_osds(
4775 pgid
.pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
4777 const pg_pool_t
* pp
= startmap
->get_pg_pool(pgid
.pool());
4778 if (pp
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
) &&
4779 store
->get_type() != "bluestore") {
4780 clog
->warn() << "pg " << pgid
4781 << " is at risk of silent data corruption: "
4782 << "the pool allows ec overwrites but is not stored in "
4783 << "bluestore, so deep scrubbing will not detect bitrot";
4785 create_pg_collection(
4786 rctx
.transaction
, pgid
, pgid
.get_split_bits(pp
->get_pg_num()));
4787 init_pg_ondisk(rctx
.transaction
, pgid
, pp
);
4789 int role
= startmap
->calc_pg_role(pg_shard_t(whoami
, pgid
.shard
), acting
);
4791 PGRef pg
= _make_pg(startmap
, pgid
);
4792 pg
->ch
= store
->create_new_collection(pg
->coll
);
4795 uint32_t shard_index
= pgid
.hash_to_shard(shards
.size());
4796 assert(NULL
!= shards
[shard_index
]);
4797 store
->set_collection_commit_queue(pg
->coll
, &(shards
[shard_index
]->context_queue
));
4802 // we are holding the shard lock
4803 ceph_assert(!pg
->is_deleted());
4812 info
->past_intervals
,
4816 pg
->init_collection_pool_opts();
4818 if (pg
->is_primary()) {
4819 std::lock_guard locker
{m_perf_queries_lock
};
4820 pg
->set_dynamic_perf_stats_queries(m_perf_queries
);
4823 pg
->handle_initialize(rctx
);
4824 pg
->handle_activate_map(rctx
);
4826 dispatch_context(rctx
, pg
.get(), osdmap
, nullptr);
4828 dout(10) << __func__
<< " new pg " << *pg
<< dendl
;
// NOTE(review): elided rendering — the remaining parameters (presumably the
// spg_t and the is_mon_create flag used below — TODO confirm against the
// header) and the return statements are missing. Code kept byte-identical.
//
// Purpose (from visible code): returns whether PG creation should be withheld
// because num_pgs has reached mon_max_pg_per_osd * osd_max_pg_per_osd_hard_ratio;
// if so, the pending creation is remembered (per-mon counter or per-osd map)
// so resume_creating_pg() can re-solicit it later.
4832 bool OSD::maybe_wait_for_max_pg(const OSDMapRef
& osdmap
,
4836 const auto max_pgs_per_osd
=
4837 (cct
->_conf
.get_val
<uint64_t>("mon_max_pg_per_osd") *
4838 cct
->_conf
.get_val
<double>("osd_max_pg_per_osd_hard_ratio"))
4856 // to re-trigger a peering, we have to twiddle the pg mapping a little bit,
4857 // see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
4858 // to up set if pg_temp is empty. so an empty pg_temp won't work.
4859 static vector
<int32_t> twiddle(const vector
<int>& acting
) {
4860 if (acting
.size() > 1) {
4863 vector
<int32_t> twiddled(acting
.begin(), acting
.end());
4864 twiddled
.push_back(-1);
4869 void OSD::resume_creating_pg()
4871 bool do_sub_pg_creates
= false;
4872 bool have_pending_creates
= false;
4874 const auto max_pgs_per_osd
=
4875 (cct
->_conf
.get_val
<uint64_t>("mon_max_pg_per_osd") *
4876 cct
->_conf
.get_val
<double>("osd_max_pg_per_osd_hard_ratio"));
4877 if (max_pgs_per_osd
<= num_pgs
) {
4878 // this could happen if admin decreases this setting before a PG is removed
4881 unsigned spare_pgs
= max_pgs_per_osd
- num_pgs
;
4882 std::lock_guard
l(pending_creates_lock
);
4883 if (pending_creates_from_mon
> 0) {
4884 dout(20) << __func__
<< " pending_creates_from_mon "
4885 << pending_creates_from_mon
<< dendl
;
4886 do_sub_pg_creates
= true;
4887 if (pending_creates_from_mon
>= spare_pgs
) {
4888 spare_pgs
= pending_creates_from_mon
= 0;
4890 spare_pgs
-= pending_creates_from_mon
;
4891 pending_creates_from_mon
= 0;
4894 auto pg
= pending_creates_from_osd
.cbegin();
4895 while (spare_pgs
> 0 && pg
!= pending_creates_from_osd
.cend()) {
4896 dout(20) << __func__
<< " pg " << pg
->first
<< dendl
;
4898 get_osdmap()->pg_to_up_acting_osds(pg
->first
.pgid
, nullptr, nullptr, &acting
, nullptr);
4899 service
.queue_want_pg_temp(pg
->first
.pgid
, twiddle(acting
), true);
4900 pg
= pending_creates_from_osd
.erase(pg
);
4901 do_sub_pg_creates
= true;
4904 have_pending_creates
= (pending_creates_from_mon
> 0 ||
4905 !pending_creates_from_osd
.empty());
4908 bool do_renew_subs
= false;
4909 if (do_sub_pg_creates
) {
4910 if (monc
->sub_want("osd_pg_creates", last_pg_create_epoch
, 0)) {
4911 dout(4) << __func__
<< ": resolicit pg creates from mon since "
4912 << last_pg_create_epoch
<< dendl
;
4913 do_renew_subs
= true;
4916 version_t start
= get_osdmap_epoch() + 1;
4917 if (have_pending_creates
) {
4918 // don't miss any new osdmap deleting PGs
4919 if (monc
->sub_want("osdmap", start
, 0)) {
4920 dout(4) << __func__
<< ": resolicit osdmap from mon since "
4922 do_renew_subs
= true;
4924 } else if (do_sub_pg_creates
) {
4925 // no need to subscribe the osdmap continuously anymore
4926 // once the pgtemp and/or mon_subscribe(pg_creates) is sent
4927 if (monc
->sub_want_increment("osdmap", start
, CEPH_SUBSCRIBE_ONETIME
)) {
4928 dout(4) << __func__
<< ": re-subscribe osdmap(onetime) since "
4930 do_renew_subs
= true;
4934 if (do_renew_subs
) {
4938 service
.send_pg_temp();
4941 void OSD::build_initial_pg_history(
4944 utime_t created_stamp
,
4948 dout(10) << __func__
<< " " << pgid
<< " created " << created
<< dendl
;
4949 *h
= pg_history_t(created
, created_stamp
);
4951 OSDMapRef lastmap
= service
.get_map(created
);
4952 int up_primary
, acting_primary
;
4953 vector
<int> up
, acting
;
4954 lastmap
->pg_to_up_acting_osds(
4955 pgid
.pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
4957 ostringstream debug
;
4958 for (epoch_t e
= created
+ 1; e
<= get_osdmap_epoch(); ++e
) {
4959 OSDMapRef osdmap
= service
.get_map(e
);
4960 int new_up_primary
, new_acting_primary
;
4961 vector
<int> new_up
, new_acting
;
4962 osdmap
->pg_to_up_acting_osds(
4963 pgid
.pgid
, &new_up
, &new_up_primary
, &new_acting
, &new_acting_primary
);
4965 // this is a bit imprecise, but sufficient?
4966 struct min_size_predicate_t
: public IsPGRecoverablePredicate
{
4967 const pg_pool_t
*pi
;
4968 bool operator()(const set
<pg_shard_t
> &have
) const {
4969 return have
.size() >= pi
->min_size
;
4971 explicit min_size_predicate_t(const pg_pool_t
*i
) : pi(i
) {}
4972 } min_size_predicate(osdmap
->get_pg_pool(pgid
.pgid
.pool()));
4974 bool new_interval
= PastIntervals::check_new_interval(
4981 h
->same_interval_since
,
4982 h
->last_epoch_clean
,
4990 h
->same_interval_since
= e
;
4992 h
->same_up_since
= e
;
4994 if (acting_primary
!= new_acting_primary
) {
4995 h
->same_primary_since
= e
;
4997 if (pgid
.pgid
.is_split(lastmap
->get_pg_num(pgid
.pgid
.pool()),
4998 osdmap
->get_pg_num(pgid
.pgid
.pool()),
5000 h
->last_epoch_split
= e
;
5003 acting
= new_acting
;
5004 up_primary
= new_up_primary
;
5005 acting_primary
= new_acting_primary
;
5009 dout(20) << __func__
<< " " << debug
.str() << dendl
;
5010 dout(10) << __func__
<< " " << *h
<< " " << *pi
5011 << " [" << (pi
->empty() ? pair
<epoch_t
,epoch_t
>(0,0) :
5012 pi
->get_bounds()) << ")"
5016 void OSD::_add_heartbeat_peer(int p
)
5022 map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.find(p
);
5023 if (i
== heartbeat_peers
.end()) {
5024 pair
<ConnectionRef
,ConnectionRef
> cons
= service
.get_con_osd_hb(p
, get_osdmap_epoch());
5027 assert(cons
.second
);
5029 hi
= &heartbeat_peers
[p
];
5032 auto stamps
= service
.get_hb_stamps(p
);
5034 auto sb
= ceph::make_ref
<Session
>(cct
, cons
.first
.get());
5036 sb
->stamps
= stamps
;
5037 hi
->hb_interval_start
= ceph_clock_now();
5038 hi
->con_back
= cons
.first
.get();
5039 hi
->con_back
->set_priv(sb
);
5041 auto sf
= ceph::make_ref
<Session
>(cct
, cons
.second
.get());
5043 sf
->stamps
= stamps
;
5044 hi
->con_front
= cons
.second
.get();
5045 hi
->con_front
->set_priv(sf
);
5047 dout(10) << "_add_heartbeat_peer: new peer osd." << p
5048 << " " << hi
->con_back
->get_peer_addr()
5049 << " " << hi
->con_front
->get_peer_addr()
5054 hi
->epoch
= get_osdmap_epoch();
5057 void OSD::_remove_heartbeat_peer(int n
)
5059 map
<int,HeartbeatInfo
>::iterator q
= heartbeat_peers
.find(n
);
5060 ceph_assert(q
!= heartbeat_peers
.end());
5061 dout(20) << " removing heartbeat peer osd." << n
5062 << " " << q
->second
.con_back
->get_peer_addr()
5063 << " " << (q
->second
.con_front
? q
->second
.con_front
->get_peer_addr() : entity_addr_t())
5065 q
->second
.clear_mark_down();
5066 heartbeat_peers
.erase(q
);
5069 void OSD::need_heartbeat_peer_update()
5073 dout(20) << "need_heartbeat_peer_update" << dendl
;
5074 heartbeat_set_peers_need_update();
5077 void OSD::maybe_update_heartbeat_peers()
5079 ceph_assert(ceph_mutex_is_locked(osd_lock
));
5081 if (is_waiting_for_healthy() || is_active()) {
5082 utime_t now
= ceph_clock_now();
5083 if (last_heartbeat_resample
== utime_t()) {
5084 last_heartbeat_resample
= now
;
5085 heartbeat_set_peers_need_update();
5086 } else if (!heartbeat_peers_need_update()) {
5087 utime_t dur
= now
- last_heartbeat_resample
;
5088 if (dur
> cct
->_conf
->osd_heartbeat_grace
) {
5089 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur
<< " seconds" << dendl
;
5090 heartbeat_set_peers_need_update();
5091 last_heartbeat_resample
= now
;
5092 // automatically clean up any stale heartbeat peers
5093 // if we are unhealthy, then clean all
5094 reset_heartbeat_peers(is_waiting_for_healthy());
5099 if (!heartbeat_peers_need_update())
5101 heartbeat_clear_peers_need_update();
5103 std::lock_guard
l(heartbeat_lock
);
5105 dout(10) << "maybe_update_heartbeat_peers updating" << dendl
;
5108 // build heartbeat from set
5112 for (auto& pg
: pgs
) {
5113 pg
->with_heartbeat_peers([&](int peer
) {
5114 if (get_osdmap()->is_up(peer
)) {
5115 _add_heartbeat_peer(peer
);
5121 // include next and previous up osds to ensure we have a fully-connected set
5122 set
<int> want
, extras
;
5123 const int next
= get_osdmap()->get_next_up_osd_after(whoami
);
5126 int prev
= get_osdmap()->get_previous_up_osd_before(whoami
);
5127 if (prev
>= 0 && prev
!= next
)
5130 // make sure we have at least **min_down** osds coming from different
5131 // subtree level (e.g., hosts) for fast failure detection.
5132 auto min_down
= cct
->_conf
.get_val
<uint64_t>("mon_osd_min_down_reporters");
5133 auto subtree
= cct
->_conf
.get_val
<string
>("mon_osd_reporter_subtree_level");
5134 auto limit
= std::max(min_down
, (uint64_t)cct
->_conf
->osd_heartbeat_min_peers
);
5135 get_osdmap()->get_random_up_osds_by_subtree(
5136 whoami
, subtree
, limit
, want
, &want
);
5138 for (set
<int>::iterator p
= want
.begin(); p
!= want
.end(); ++p
) {
5139 dout(10) << " adding neighbor peer osd." << *p
<< dendl
;
5141 _add_heartbeat_peer(*p
);
5144 // remove down peers; enumerate extras
5145 map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5146 while (p
!= heartbeat_peers
.end()) {
5147 if (!get_osdmap()->is_up(p
->first
)) {
5150 _remove_heartbeat_peer(o
);
5153 if (p
->second
.epoch
< get_osdmap_epoch()) {
5154 extras
.insert(p
->first
);
5160 for (int n
= next
; n
>= 0; ) {
5161 if ((int)heartbeat_peers
.size() >= cct
->_conf
->osd_heartbeat_min_peers
)
5163 if (!extras
.count(n
) && !want
.count(n
) && n
!= whoami
) {
5164 dout(10) << " adding random peer osd." << n
<< dendl
;
5166 _add_heartbeat_peer(n
);
5168 n
= get_osdmap()->get_next_up_osd_after(n
);
5170 break; // came full circle; stop
5174 for (set
<int>::iterator p
= extras
.begin();
5175 (int)heartbeat_peers
.size() > cct
->_conf
->osd_heartbeat_min_peers
&& p
!= extras
.end();
5179 _remove_heartbeat_peer(*p
);
5182 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers
.size() << " peers, extras " << extras
<< dendl
;
5184 // clean up stale failure pending
5185 for (auto it
= failure_pending
.begin(); it
!= failure_pending
.end();) {
5186 if (heartbeat_peers
.count(it
->first
) == 0) {
5187 send_still_alive(get_osdmap_epoch(), it
->first
, it
->second
.second
);
5188 failure_pending
.erase(it
++);
5195 void OSD::reset_heartbeat_peers(bool all
)
5197 ceph_assert(ceph_mutex_is_locked(osd_lock
));
5198 dout(10) << "reset_heartbeat_peers" << dendl
;
5199 utime_t stale
= ceph_clock_now();
5200 stale
-= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_stale");
5201 std::lock_guard
l(heartbeat_lock
);
5202 for (auto it
= heartbeat_peers
.begin(); it
!= heartbeat_peers
.end();) {
5203 HeartbeatInfo
& hi
= it
->second
;
5204 if (all
|| hi
.is_stale(stale
)) {
5205 hi
.clear_mark_down();
5206 // stop sending failure_report to mon too
5207 failure_queue
.erase(it
->first
);
5208 heartbeat_peers
.erase(it
++);
5215 void OSD::handle_osd_ping(MOSDPing
*m
)
5217 if (superblock
.cluster_fsid
!= m
->fsid
) {
5218 dout(20) << "handle_osd_ping from " << m
->get_source_inst()
5219 << " bad fsid " << m
->fsid
<< " != " << superblock
.cluster_fsid
5225 int from
= m
->get_source().num();
5227 heartbeat_lock
.lock();
5228 if (is_stopping()) {
5229 heartbeat_lock
.unlock();
5234 utime_t now
= ceph_clock_now();
5235 auto mnow
= service
.get_mnow();
5236 ConnectionRef
con(m
->get_connection());
5237 OSDMapRef curmap
= service
.get_osdmap();
5239 heartbeat_lock
.unlock();
5244 auto sref
= con
->get_priv();
5245 Session
*s
= static_cast<Session
*>(sref
.get());
5247 heartbeat_lock
.unlock();
5253 s
->stamps
= service
.get_hb_stamps(from
);
5258 case MOSDPing::PING
:
5260 if (cct
->_conf
->osd_debug_drop_ping_probability
> 0) {
5261 auto heartbeat_drop
= debug_heartbeat_drops_remaining
.find(from
);
5262 if (heartbeat_drop
!= debug_heartbeat_drops_remaining
.end()) {
5263 if (heartbeat_drop
->second
== 0) {
5264 debug_heartbeat_drops_remaining
.erase(heartbeat_drop
);
5266 --heartbeat_drop
->second
;
5267 dout(5) << "Dropping heartbeat from " << from
5268 << ", " << heartbeat_drop
->second
5269 << " remaining to drop" << dendl
;
5272 } else if (cct
->_conf
->osd_debug_drop_ping_probability
>
5273 ((((double)(rand()%100))/100.0))) {
5275 debug_heartbeat_drops_remaining
.insert(std::make_pair(from
,
5276 cct
->_conf
->osd_debug_drop_ping_duration
)).first
;
5277 dout(5) << "Dropping heartbeat from " << from
5278 << ", " << heartbeat_drop
->second
5279 << " remaining to drop" << dendl
;
5284 ceph::signedspan sender_delta_ub
{};
5285 s
->stamps
->got_ping(
5291 dout(20) << __func__
<< " new stamps " << *s
->stamps
<< dendl
;
5293 if (!cct
->get_heartbeat_map()->is_healthy()) {
5294 dout(10) << "internal heartbeat not healthy, dropping ping request"
5299 Message
*r
= new MOSDPing(monc
->get_fsid(),
5300 curmap
->get_epoch(),
5301 MOSDPing::PING_REPLY
,
5305 service
.get_up_epoch(),
5306 cct
->_conf
->osd_heartbeat_min_size
,
5308 con
->send_message(r
);
5310 if (curmap
->is_up(from
)) {
5312 ConnectionRef cluster_con
= service
.get_con_osd_cluster(
5313 from
, curmap
->get_epoch());
5315 service
.maybe_share_map(cluster_con
.get(), curmap
, m
->map_epoch
);
5318 } else if (!curmap
->exists(from
) ||
5319 curmap
->get_down_at(from
) > m
->map_epoch
) {
5320 // tell them they have died
5321 Message
*r
= new MOSDPing(monc
->get_fsid(),
5322 curmap
->get_epoch(),
5327 service
.get_up_epoch(),
5328 cct
->_conf
->osd_heartbeat_min_size
);
5329 con
->send_message(r
);
5334 case MOSDPing::PING_REPLY
:
5336 map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.find(from
);
5337 if (i
!= heartbeat_peers
.end()) {
5338 auto acked
= i
->second
.ping_history
.find(m
->ping_stamp
);
5339 if (acked
!= i
->second
.ping_history
.end()) {
5340 int &unacknowledged
= acked
->second
.second
;
5341 if (con
== i
->second
.con_back
) {
5342 dout(25) << "handle_osd_ping got reply from osd." << from
5343 << " first_tx " << i
->second
.first_tx
5344 << " last_tx " << i
->second
.last_tx
5345 << " last_rx_back " << i
->second
.last_rx_back
5347 << " last_rx_front " << i
->second
.last_rx_front
5349 i
->second
.last_rx_back
= now
;
5350 ceph_assert(unacknowledged
> 0);
5352 // if there is no front con, set both stamps.
5353 if (i
->second
.con_front
== NULL
) {
5354 i
->second
.last_rx_front
= now
;
5355 ceph_assert(unacknowledged
> 0);
5358 } else if (con
== i
->second
.con_front
) {
5359 dout(25) << "handle_osd_ping got reply from osd." << from
5360 << " first_tx " << i
->second
.first_tx
5361 << " last_tx " << i
->second
.last_tx
5362 << " last_rx_back " << i
->second
.last_rx_back
5363 << " last_rx_front " << i
->second
.last_rx_front
5366 i
->second
.last_rx_front
= now
;
5367 ceph_assert(unacknowledged
> 0);
5371 if (unacknowledged
== 0) {
5372 // succeeded in getting all replies
5373 dout(25) << "handle_osd_ping got all replies from osd." << from
5374 << " , erase pending ping(sent at " << m
->ping_stamp
<< ")"
5375 << " and older pending ping(s)"
5378 #define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
5379 ++i
->second
.hb_average_count
;
5380 uint32_t back_pingtime
= ROUND_S_TO_USEC(i
->second
.last_rx_back
- m
->ping_stamp
);
5381 i
->second
.hb_total_back
+= back_pingtime
;
5382 if (back_pingtime
< i
->second
.hb_min_back
)
5383 i
->second
.hb_min_back
= back_pingtime
;
5384 if (back_pingtime
> i
->second
.hb_max_back
)
5385 i
->second
.hb_max_back
= back_pingtime
;
5386 uint32_t front_pingtime
= ROUND_S_TO_USEC(i
->second
.last_rx_front
- m
->ping_stamp
);
5387 i
->second
.hb_total_front
+= front_pingtime
;
5388 if (front_pingtime
< i
->second
.hb_min_front
)
5389 i
->second
.hb_min_front
= front_pingtime
;
5390 if (front_pingtime
> i
->second
.hb_max_front
)
5391 i
->second
.hb_max_front
= front_pingtime
;
5393 ceph_assert(i
->second
.hb_interval_start
!= utime_t());
5394 if (i
->second
.hb_interval_start
== utime_t())
5395 i
->second
.hb_interval_start
= now
;
5396 int64_t hb_avg_time_period
= 60;
5397 if (cct
->_conf
.get_val
<int64_t>("debug_heartbeat_testing_span")) {
5398 hb_avg_time_period
= cct
->_conf
.get_val
<int64_t>("debug_heartbeat_testing_span");
5400 if (now
- i
->second
.hb_interval_start
>= utime_t(hb_avg_time_period
, 0)) {
5401 uint32_t back_avg
= i
->second
.hb_total_back
/ i
->second
.hb_average_count
;
5402 uint32_t back_min
= i
->second
.hb_min_back
;
5403 uint32_t back_max
= i
->second
.hb_max_back
;
5404 uint32_t front_avg
= i
->second
.hb_total_front
/ i
->second
.hb_average_count
;
5405 uint32_t front_min
= i
->second
.hb_min_front
;
5406 uint32_t front_max
= i
->second
.hb_max_front
;
5408 // Reset for new interval
5409 i
->second
.hb_average_count
= 0;
5410 i
->second
.hb_interval_start
= now
;
5411 i
->second
.hb_total_back
= i
->second
.hb_max_back
= 0;
5412 i
->second
.hb_min_back
= UINT_MAX
;
5413 i
->second
.hb_total_front
= i
->second
.hb_max_front
= 0;
5414 i
->second
.hb_min_front
= UINT_MAX
;
5416 // Record per osd interace ping times
5417 // Based on osd_heartbeat_interval ignoring that it is randomly short than this interval
5418 if (i
->second
.hb_back_pingtime
.size() == 0) {
5419 ceph_assert(i
->second
.hb_front_pingtime
.size() == 0);
5420 for (unsigned k
= 0 ; k
< hb_vector_size
; ++k
) {
5421 i
->second
.hb_back_pingtime
.push_back(back_avg
);
5422 i
->second
.hb_back_min
.push_back(back_min
);
5423 i
->second
.hb_back_max
.push_back(back_max
);
5424 i
->second
.hb_front_pingtime
.push_back(front_avg
);
5425 i
->second
.hb_front_min
.push_back(front_min
);
5426 i
->second
.hb_front_max
.push_back(front_max
);
5427 ++i
->second
.hb_index
;
5430 int index
= i
->second
.hb_index
& (hb_vector_size
- 1);
5431 i
->second
.hb_back_pingtime
[index
] = back_avg
;
5432 i
->second
.hb_back_min
[index
] = back_min
;
5433 i
->second
.hb_back_max
[index
] = back_max
;
5434 i
->second
.hb_front_pingtime
[index
] = front_avg
;
5435 i
->second
.hb_front_min
[index
] = front_min
;
5436 i
->second
.hb_front_max
[index
] = front_max
;
5437 ++i
->second
.hb_index
;
5441 std::lock_guard
l(service
.stat_lock
);
5442 service
.osd_stat
.hb_pingtime
[from
].last_update
= now
.sec();
5443 service
.osd_stat
.hb_pingtime
[from
].back_last
= back_pingtime
;
5446 uint32_t min
= UINT_MAX
;
5450 uint32_t size
= (uint32_t)i
->second
.hb_back_pingtime
.size();
5451 for (int32_t k
= size
- 1 ; k
>= 0; --k
) {
5453 int index
= (i
->second
.hb_index
+ k
) % size
;
5454 total
+= i
->second
.hb_back_pingtime
[index
];
5455 if (i
->second
.hb_back_min
[index
] < min
)
5456 min
= i
->second
.hb_back_min
[index
];
5457 if (i
->second
.hb_back_max
[index
] > max
)
5458 max
= i
->second
.hb_back_max
[index
];
5459 if (count
== 1 || count
== 5 || count
== 15) {
5460 service
.osd_stat
.hb_pingtime
[from
].back_pingtime
[which
] = total
/ count
;
5461 service
.osd_stat
.hb_pingtime
[from
].back_min
[which
] = min
;
5462 service
.osd_stat
.hb_pingtime
[from
].back_max
[which
] = max
;
5469 if (i
->second
.con_front
!= NULL
) {
5470 service
.osd_stat
.hb_pingtime
[from
].front_last
= front_pingtime
;
5477 for (int32_t k
= size
- 1 ; k
>= 0; --k
) {
5479 int index
= (i
->second
.hb_index
+ k
) % size
;
5480 total
+= i
->second
.hb_front_pingtime
[index
];
5481 if (i
->second
.hb_front_min
[index
] < min
)
5482 min
= i
->second
.hb_front_min
[index
];
5483 if (i
->second
.hb_front_max
[index
] > max
)
5484 max
= i
->second
.hb_front_max
[index
];
5485 if (count
== 1 || count
== 5 || count
== 15) {
5486 service
.osd_stat
.hb_pingtime
[from
].front_pingtime
[which
] = total
/ count
;
5487 service
.osd_stat
.hb_pingtime
[from
].front_min
[which
] = min
;
5488 service
.osd_stat
.hb_pingtime
[from
].front_max
[which
] = max
;
5497 std::lock_guard
l(service
.stat_lock
);
5498 service
.osd_stat
.hb_pingtime
[from
].back_last
= back_pingtime
;
5499 if (i
->second
.con_front
!= NULL
)
5500 service
.osd_stat
.hb_pingtime
[from
].front_last
= front_pingtime
;
5502 i
->second
.ping_history
.erase(i
->second
.ping_history
.begin(), ++acked
);
5505 if (i
->second
.is_healthy(now
)) {
5506 // Cancel false reports
5507 auto failure_queue_entry
= failure_queue
.find(from
);
5508 if (failure_queue_entry
!= failure_queue
.end()) {
5509 dout(10) << "handle_osd_ping canceling queued "
5510 << "failure report for osd." << from
<< dendl
;
5511 failure_queue
.erase(failure_queue_entry
);
5514 auto failure_pending_entry
= failure_pending
.find(from
);
5515 if (failure_pending_entry
!= failure_pending
.end()) {
5516 dout(10) << "handle_osd_ping canceling in-flight "
5517 << "failure report for osd." << from
<< dendl
;
5518 send_still_alive(curmap
->get_epoch(),
5520 failure_pending_entry
->second
.second
);
5521 failure_pending
.erase(failure_pending_entry
);
5525 // old replies, deprecated by newly sent pings.
5526 dout(10) << "handle_osd_ping no pending ping(sent at " << m
->ping_stamp
5527 << ") is found, treat as covered by newly sent pings "
5534 curmap
->is_up(from
)) {
5536 ConnectionRef cluster_con
= service
.get_con_osd_cluster(
5537 from
, curmap
->get_epoch());
5539 service
.maybe_share_map(cluster_con
.get(), curmap
, m
->map_epoch
);
5544 s
->stamps
->got_ping_reply(
5548 dout(20) << __func__
<< " new stamps " << *s
->stamps
<< dendl
;
5552 case MOSDPing::YOU_DIED
:
5553 dout(10) << "handle_osd_ping " << m
->get_source_inst()
5554 << " says i am down in " << m
->map_epoch
<< dendl
;
5555 osdmap_subscribe(curmap
->get_epoch()+1, false);
5559 heartbeat_lock
.unlock();
5563 void OSD::heartbeat_entry()
5565 std::unique_lock
l(heartbeat_lock
);
5568 while (!heartbeat_stop
) {
5572 if (cct
->_conf
.get_val
<bool>("debug_disable_randomized_ping")) {
5573 wait
= (float)cct
->_conf
->osd_heartbeat_interval
;
5575 wait
= .5 + ((float)(rand() % 10)/10.0) * (float)cct
->_conf
->osd_heartbeat_interval
;
5577 auto w
= ceph::make_timespan(wait
);
5578 dout(30) << "heartbeat_entry sleeping for " << wait
<< dendl
;
5579 heartbeat_cond
.wait_for(l
, w
);
5582 dout(30) << "heartbeat_entry woke up" << dendl
;
5586 void OSD::heartbeat_check()
5588 ceph_assert(ceph_mutex_is_locked(heartbeat_lock
));
5589 utime_t now
= ceph_clock_now();
5591 // check for incoming heartbeats (move me elsewhere?)
5592 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5593 p
!= heartbeat_peers
.end();
5596 if (p
->second
.first_tx
== utime_t()) {
5597 dout(25) << "heartbeat_check we haven't sent ping to osd." << p
->first
5598 << " yet, skipping" << dendl
;
5602 dout(25) << "heartbeat_check osd." << p
->first
5603 << " first_tx " << p
->second
.first_tx
5604 << " last_tx " << p
->second
.last_tx
5605 << " last_rx_back " << p
->second
.last_rx_back
5606 << " last_rx_front " << p
->second
.last_rx_front
5608 if (p
->second
.is_unhealthy(now
)) {
5609 utime_t oldest_deadline
= p
->second
.ping_history
.begin()->second
.first
;
5610 if (p
->second
.last_rx_back
== utime_t() ||
5611 p
->second
.last_rx_front
== utime_t()) {
5612 derr
<< "heartbeat_check: no reply from "
5613 << p
->second
.con_front
->get_peer_addr().get_sockaddr()
5614 << " osd." << p
->first
5615 << " ever on either front or back, first ping sent "
5616 << p
->second
.first_tx
5617 << " (oldest deadline " << oldest_deadline
<< ")"
5620 failure_queue
[p
->first
] = p
->second
.first_tx
;
5622 derr
<< "heartbeat_check: no reply from "
5623 << p
->second
.con_front
->get_peer_addr().get_sockaddr()
5624 << " osd." << p
->first
<< " since back " << p
->second
.last_rx_back
5625 << " front " << p
->second
.last_rx_front
5626 << " (oldest deadline " << oldest_deadline
<< ")"
5629 failure_queue
[p
->first
] = std::min(p
->second
.last_rx_back
, p
->second
.last_rx_front
);
5635 void OSD::heartbeat()
5637 ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock
));
5638 dout(30) << "heartbeat" << dendl
;
5642 int hb_interval
= cct
->_conf
->osd_heartbeat_interval
;
5643 int n_samples
= 86400;
5644 if (hb_interval
> 1) {
5645 n_samples
/= hb_interval
;
5650 if (getloadavg(loadavgs
, 1) == 1) {
5651 logger
->set(l_osd_loadavg
, 100 * loadavgs
[0]);
5652 daily_loadavg
= (daily_loadavg
* (n_samples
- 1) + loadavgs
[0]) / n_samples
;
5653 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg
<< dendl
;
5656 dout(30) << "heartbeat checking stats" << dendl
;
5658 // refresh peer list and osd stats
5659 vector
<int> hb_peers
;
5660 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5661 p
!= heartbeat_peers
.end();
5663 hb_peers
.push_back(p
->first
);
5665 auto new_stat
= service
.set_osd_stat(hb_peers
, get_num_pgs());
5666 dout(5) << __func__
<< " " << new_stat
<< dendl
;
5667 ceph_assert(new_stat
.statfs
.total
);
5670 float ratio
= service
.compute_adjusted_ratio(new_stat
, &pratio
);
5672 service
.check_full_status(ratio
, pratio
);
5674 utime_t now
= ceph_clock_now();
5675 auto mnow
= service
.get_mnow();
5676 utime_t deadline
= now
;
5677 deadline
+= cct
->_conf
->osd_heartbeat_grace
;
5680 for (map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.begin();
5681 i
!= heartbeat_peers
.end();
5683 int peer
= i
->first
;
5684 dout(30) << "heartbeat sending ping to osd." << peer
<< dendl
;
5686 i
->second
.last_tx
= now
;
5687 if (i
->second
.first_tx
== utime_t())
5688 i
->second
.first_tx
= now
;
5689 i
->second
.ping_history
[now
] = make_pair(deadline
,
5690 HeartbeatInfo::HEARTBEAT_MAX_CONN
);
5691 if (i
->second
.hb_interval_start
== utime_t())
5692 i
->second
.hb_interval_start
= now
;
5694 Session
*s
= static_cast<Session
*>(i
->second
.con_back
->get_priv().get());
5695 std::optional
<ceph::signedspan
> delta_ub
;
5696 s
->stamps
->sent_ping(&delta_ub
);
5698 i
->second
.con_back
->send_message(
5699 new MOSDPing(monc
->get_fsid(),
5700 service
.get_osdmap_epoch(),
5705 service
.get_up_epoch(),
5706 cct
->_conf
->osd_heartbeat_min_size
,
5709 if (i
->second
.con_front
)
5710 i
->second
.con_front
->send_message(
5711 new MOSDPing(monc
->get_fsid(),
5712 service
.get_osdmap_epoch(),
5717 service
.get_up_epoch(),
5718 cct
->_conf
->osd_heartbeat_min_size
,
5722 logger
->set(l_osd_hb_to
, heartbeat_peers
.size());
5724 // hmm.. am i all alone?
5725 dout(30) << "heartbeat lonely?" << dendl
;
5726 if (heartbeat_peers
.empty()) {
5727 if (now
- last_mon_heartbeat
> cct
->_conf
->osd_mon_heartbeat_interval
&& is_active()) {
5728 last_mon_heartbeat
= now
;
5729 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl
;
5730 osdmap_subscribe(get_osdmap_epoch() + 1, false);
5734 dout(30) << "heartbeat done" << dendl
;
5737 bool OSD::heartbeat_reset(Connection
*con
)
5739 std::lock_guard
l(heartbeat_lock
);
5740 auto s
= con
->get_priv();
5741 dout(20) << __func__
<< " con " << con
<< " s " << s
.get() << dendl
;
5742 con
->set_priv(nullptr);
5744 if (is_stopping()) {
5747 auto session
= static_cast<Session
*>(s
.get());
5748 auto p
= heartbeat_peers
.find(session
->peer
);
5749 if (p
!= heartbeat_peers
.end() &&
5750 (p
->second
.con_back
== con
||
5751 p
->second
.con_front
== con
)) {
5752 dout(10) << "heartbeat_reset failed hb con " << con
<< " for osd." << p
->second
.peer
5753 << ", reopening" << dendl
;
5754 p
->second
.clear_mark_down(con
);
5755 pair
<ConnectionRef
,ConnectionRef
> newcon
= service
.get_con_osd_hb(p
->second
.peer
, p
->second
.epoch
);
5757 p
->second
.con_back
= newcon
.first
.get();
5758 p
->second
.con_back
->set_priv(s
);
5759 if (newcon
.second
) {
5760 p
->second
.con_front
= newcon
.second
.get();
5761 p
->second
.con_front
->set_priv(s
);
5763 p
->second
.ping_history
.clear();
5765 dout(10) << "heartbeat_reset failed hb con " << con
<< " for osd." << p
->second
.peer
5766 << ", raced with osdmap update, closing out peer" << dendl
;
5767 heartbeat_peers
.erase(p
);
5770 dout(10) << "heartbeat_reset closing (old) failed hb con " << con
<< dendl
;
5778 // =========================================
5782 ceph_assert(ceph_mutex_is_locked(osd_lock
));
5783 dout(10) << "tick" << dendl
;
5785 utime_t now
= ceph_clock_now();
5786 // throw out any obsolete markdown log
5787 utime_t grace
= utime_t(cct
->_conf
->osd_max_markdown_period
, 0);
5788 while (!osd_markdown_log
.empty() &&
5789 osd_markdown_log
.front() + grace
< now
)
5790 osd_markdown_log
.pop_front();
5792 if (is_active() || is_waiting_for_healthy()) {
5793 maybe_update_heartbeat_peers();
5796 if (is_waiting_for_healthy()) {
5800 if (is_waiting_for_healthy() || is_booting()) {
5801 std::lock_guard
l(heartbeat_lock
);
5802 if (now
- last_mon_heartbeat
> cct
->_conf
->osd_mon_heartbeat_interval
) {
5803 last_mon_heartbeat
= now
;
5804 dout(1) << __func__
<< " checking mon for new map" << dendl
;
5805 osdmap_subscribe(get_osdmap_epoch() + 1, false);
5811 // scrub purged_snaps every deep scrub interval
5813 const utime_t last
= superblock
.last_purged_snaps_scrub
;
5814 utime_t next
= last
;
5815 next
+= cct
->_conf
->osd_scrub_min_interval
;
5817 // use a seed that is stable for each scrub interval, but varies
5818 // by OSD to avoid any herds.
5819 rng
.seed(whoami
+ superblock
.last_purged_snaps_scrub
.sec());
5820 double r
= (rng() % 1024) / 1024;
5822 cct
->_conf
->osd_scrub_min_interval
*
5823 cct
->_conf
->osd_scrub_interval_randomize_ratio
* r
;
5824 if (next
< ceph_clock_now()) {
5825 dout(20) << __func__
<< " last_purged_snaps_scrub " << last
5826 << " next " << next
<< " ... now" << dendl
;
5827 scrub_purged_snaps();
5829 dout(20) << __func__
<< " last_purged_snaps_scrub " << last
5830 << " next " << next
<< dendl
;
5834 tick_timer
.add_event_after(get_tick_interval(), new C_Tick(this));
5837 void OSD::tick_without_osd_lock()
5839 ceph_assert(ceph_mutex_is_locked(tick_timer_lock
));
5840 dout(10) << "tick_without_osd_lock" << dendl
;
5842 logger
->set(l_osd_cached_crc
, buffer::get_cached_crc());
5843 logger
->set(l_osd_cached_crc_adjusted
, buffer::get_cached_crc_adjusted());
5844 logger
->set(l_osd_missed_crc
, buffer::get_missed_crc());
5846 // refresh osd stats
5847 struct store_statfs_t stbuf
;
5848 osd_alert_list_t alerts
;
5849 int r
= store
->statfs(&stbuf
, &alerts
);
5850 ceph_assert(r
== 0);
5851 service
.set_statfs(stbuf
, alerts
);
5853 // osd_lock is not being held, which means the OSD state
5854 // might change when doing the monitor report
5855 if (is_active() || is_waiting_for_healthy()) {
5857 std::lock_guard l
{heartbeat_lock
};
5860 map_lock
.lock_shared();
5861 std::lock_guard
l(mon_report_lock
);
5864 utime_t now
= ceph_clock_now();
5865 if (service
.need_fullness_update() ||
5866 now
- last_mon_report
> cct
->_conf
->osd_mon_report_interval
) {
5867 last_mon_report
= now
;
5871 map_lock
.unlock_shared();
5873 epoch_t max_waiting_epoch
= 0;
5874 for (auto s
: shards
) {
5875 max_waiting_epoch
= std::max(max_waiting_epoch
,
5876 s
->get_max_waiting_epoch());
5878 if (max_waiting_epoch
> get_osdmap()->get_epoch()) {
5879 dout(20) << __func__
<< " max_waiting_epoch " << max_waiting_epoch
5880 << ", requesting new map" << dendl
;
5881 osdmap_subscribe(superblock
.newest_map
+ 1, false);
5886 if (!scrub_random_backoff()) {
5889 service
.promote_throttle_recalibrate();
5890 resume_creating_pg();
5891 bool need_send_beacon
= false;
5892 const auto now
= ceph::coarse_mono_clock::now();
5894 // borrow lec lock to pretect last_sent_beacon from changing
5895 std::lock_guard l
{min_last_epoch_clean_lock
};
5896 const auto elapsed
= now
- last_sent_beacon
;
5897 if (chrono::duration_cast
<chrono::seconds
>(elapsed
).count() >
5898 cct
->_conf
->osd_beacon_report_interval
) {
5899 need_send_beacon
= true;
5902 if (need_send_beacon
) {
5907 mgrc
.update_daemon_health(get_health_metrics());
5908 service
.kick_recovery_queue();
5909 tick_timer_without_osd_lock
.add_event_after(get_tick_interval(),
5910 new C_Tick_WithoutOSDLock(this));
5914 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5915 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
5916 // setomapheader <pool-id> [namespace/]<obj-name> <header>
5917 // getomap <pool> [namespace/]<obj-name>
5918 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
5919 // injectmdataerr [namespace/]<obj-name> [shardid]
5920 // injectdataerr [namespace/]<obj-name> [shardid]
5922 // set_recovery_delay [utime]
5923 void TestOpsSocketHook::test_ops(OSDService
*service
, ObjectStore
*store
,
5924 std::string_view command
,
5925 const cmdmap_t
& cmdmap
, ostream
&ss
)
5928 //Support changing the omap on a single osd by using the Admin Socket to
5929 //directly request the osd make a change.
5930 if (command
== "setomapval" || command
== "rmomapkey" ||
5931 command
== "setomapheader" || command
== "getomap" ||
5932 command
== "truncobj" || command
== "injectmdataerr" ||
5933 command
== "injectdataerr"
5937 OSDMapRef curmap
= service
->get_osdmap();
5942 cmd_getval(cmdmap
, "pool", poolstr
);
5943 pool
= curmap
->lookup_pg_pool_name(poolstr
);
5944 //If we can't find it by name then maybe id specified
5945 if (pool
< 0 && isdigit(poolstr
[0]))
5946 pool
= atoll(poolstr
.c_str());
5948 ss
<< "Invalid pool '" << poolstr
<< "''";
5952 string objname
, nspace
;
5953 cmd_getval(cmdmap
, "objname", objname
);
5954 std::size_t found
= objname
.find_first_of('/');
5955 if (found
!= string::npos
) {
5956 nspace
= objname
.substr(0, found
);
5957 objname
= objname
.substr(found
+1);
5959 object_locator_t
oloc(pool
, nspace
);
5960 r
= curmap
->object_locator_to_pg(object_t(objname
), oloc
, rawpg
);
5963 ss
<< "Invalid namespace/objname";
5968 cmd_getval(cmdmap
, "shardid", shardid
, int64_t(shard_id_t::NO_SHARD
));
5969 hobject_t
obj(object_t(objname
), string(""), CEPH_NOSNAP
, rawpg
.ps(), pool
, nspace
);
5970 ghobject_t
gobj(obj
, ghobject_t::NO_GEN
, shard_id_t(uint8_t(shardid
)));
5971 spg_t
pgid(curmap
->raw_pg_to_pg(rawpg
), shard_id_t(shardid
));
5972 if (curmap
->pg_is_ec(rawpg
)) {
5973 if ((command
!= "injectdataerr") && (command
!= "injectmdataerr")) {
5974 ss
<< "Must not call on ec pool, except injectdataerr or injectmdataerr";
5979 ObjectStore::Transaction t
;
5981 if (command
== "setomapval") {
5982 map
<string
, bufferlist
> newattrs
;
5985 cmd_getval(cmdmap
, "key", key
);
5986 cmd_getval(cmdmap
, "val", valstr
);
5989 newattrs
[key
] = val
;
5990 t
.omap_setkeys(coll_t(pgid
), ghobject_t(obj
), newattrs
);
5991 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
5993 ss
<< "error=" << r
;
5996 } else if (command
== "rmomapkey") {
5998 cmd_getval(cmdmap
, "key", key
);
6000 t
.omap_rmkey(coll_t(pgid
), ghobject_t(obj
), key
);
6001 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6003 ss
<< "error=" << r
;
6006 } else if (command
== "setomapheader") {
6007 bufferlist newheader
;
6010 cmd_getval(cmdmap
, "header", headerstr
);
6011 newheader
.append(headerstr
);
6012 t
.omap_setheader(coll_t(pgid
), ghobject_t(obj
), newheader
);
6013 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6015 ss
<< "error=" << r
;
6018 } else if (command
== "getomap") {
6019 //Debug: Output entire omap
6021 map
<string
, bufferlist
> keyvals
;
6022 auto ch
= store
->open_collection(coll_t(pgid
));
6024 ss
<< "unable to open collection for " << pgid
;
6027 r
= store
->omap_get(ch
, ghobject_t(obj
), &hdrbl
, &keyvals
);
6029 ss
<< "header=" << string(hdrbl
.c_str(), hdrbl
.length());
6030 for (map
<string
, bufferlist
>::iterator it
= keyvals
.begin();
6031 it
!= keyvals
.end(); ++it
)
6032 ss
<< " key=" << (*it
).first
<< " val="
6033 << string((*it
).second
.c_str(), (*it
).second
.length());
6035 ss
<< "error=" << r
;
6038 } else if (command
== "truncobj") {
6040 cmd_getval(cmdmap
, "len", trunclen
);
6041 t
.truncate(coll_t(pgid
), ghobject_t(obj
), trunclen
);
6042 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6044 ss
<< "error=" << r
;
6047 } else if (command
== "injectdataerr") {
6048 store
->inject_data_error(gobj
);
6050 } else if (command
== "injectmdataerr") {
6051 store
->inject_mdata_error(gobj
);
6056 if (command
== "set_recovery_delay") {
6058 cmd_getval(cmdmap
, "utime", delay
, (int64_t)0);
6061 int r
= service
->cct
->_conf
.set_val("osd_recovery_delay_start",
6064 ss
<< "set_recovery_delay: error setting "
6065 << "osd_recovery_delay_start to '" << delay
<< "': error "
6069 service
->cct
->_conf
.apply_changes(nullptr);
6070 ss
<< "set_recovery_delay: set osd_recovery_delay_start "
6071 << "to " << service
->cct
->_conf
->osd_recovery_delay_start
;
6074 if (command
== "injectfull") {
6077 OSDService::s_names state
;
6078 cmd_getval(cmdmap
, "type", type
, string("full"));
6079 cmd_getval(cmdmap
, "count", count
, (int64_t)-1);
6080 if (type
== "none" || count
== 0) {
6084 state
= service
->get_full_state(type
);
6085 if (state
== OSDService::s_names::INVALID
) {
6086 ss
<< "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
6089 service
->set_injectfull(state
, count
);
6092 ss
<< "Internal error - command=" << command
;
6095 // =========================================
6097 void OSD::ms_handle_connect(Connection
*con
)
6099 dout(10) << __func__
<< " con " << con
<< dendl
;
6100 if (con
->get_peer_type() == CEPH_ENTITY_TYPE_MON
) {
6101 std::lock_guard
l(osd_lock
);
6104 dout(10) << __func__
<< " on mon" << dendl
;
6108 } else if (is_booting()) {
6109 _send_boot(); // resend boot message
6111 map_lock
.lock_shared();
6112 std::lock_guard
l2(mon_report_lock
);
6114 utime_t now
= ceph_clock_now();
6115 last_mon_report
= now
;
6117 // resend everything, it's a new session
6120 service
.requeue_pg_temp();
6121 service
.clear_sent_ready_to_merge();
6122 service
.send_pg_temp();
6123 service
.send_ready_to_merge();
6124 service
.send_pg_created();
6128 map_lock
.unlock_shared();
6130 send_beacon(ceph::coarse_mono_clock::now());
6134 // full map requests may happen while active or pre-boot
6135 if (requested_full_first
) {
6136 rerequest_full_maps();
6141 void OSD::ms_handle_fast_connect(Connection
*con
)
6143 if (con
->get_peer_type() != CEPH_ENTITY_TYPE_MON
&&
6144 con
->get_peer_type() != CEPH_ENTITY_TYPE_MGR
) {
6145 if (auto s
= ceph::ref_cast
<Session
>(con
->get_priv()); !s
) {
6146 s
= ceph::make_ref
<Session
>(cct
, con
);
6148 dout(10) << " new session (outgoing) " << s
<< " con=" << s
->con
6149 << " addr=" << s
->con
->get_peer_addr() << dendl
;
6150 // we don't connect to clients
6151 ceph_assert(con
->get_peer_type() == CEPH_ENTITY_TYPE_OSD
);
6152 s
->entity_name
.set_type(CEPH_ENTITY_TYPE_OSD
);
6157 void OSD::ms_handle_fast_accept(Connection
*con
)
6159 if (con
->get_peer_type() != CEPH_ENTITY_TYPE_MON
&&
6160 con
->get_peer_type() != CEPH_ENTITY_TYPE_MGR
) {
6161 if (auto s
= ceph::ref_cast
<Session
>(con
->get_priv()); !s
) {
6162 s
= ceph::make_ref
<Session
>(cct
, con
);
6164 dout(10) << "new session (incoming)" << s
<< " con=" << con
6165 << " addr=" << con
->get_peer_addr()
6166 << " must have raced with connect" << dendl
;
6167 ceph_assert(con
->get_peer_type() == CEPH_ENTITY_TYPE_OSD
);
6168 s
->entity_name
.set_type(CEPH_ENTITY_TYPE_OSD
);
6173 bool OSD::ms_handle_reset(Connection
*con
)
6175 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
6176 dout(2) << "ms_handle_reset con " << con
<< " session " << session
.get() << dendl
;
6179 session
->wstate
.reset(con
);
6180 session
->con
->set_priv(nullptr);
6181 session
->con
.reset(); // break con <-> session ref cycle
6182 // note that we break session->con *before* the session_handle_reset
6183 // cleanup below. this avoids a race between us and
6184 // PG::add_backoff, Session::check_backoff, etc.
6185 session_handle_reset(session
);
6189 bool OSD::ms_handle_refused(Connection
*con
)
6191 if (!cct
->_conf
->osd_fast_fail_on_connection_refused
)
6194 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
6195 dout(2) << "ms_handle_refused con " << con
<< " session " << session
.get() << dendl
;
6198 int type
= con
->get_peer_type();
6199 // handle only OSD failures here
6200 if (monc
&& (type
== CEPH_ENTITY_TYPE_OSD
)) {
6201 OSDMapRef osdmap
= get_osdmap();
6203 int id
= osdmap
->identify_osd_on_all_channels(con
->get_peer_addr());
6204 if (id
>= 0 && osdmap
->is_up(id
)) {
6205 // I'm cheating mon heartbeat grace logic, because we know it's not going
6206 // to respawn alone. +1 so we won't hit any boundary case.
6207 monc
->send_mon_message(
6211 osdmap
->get_addrs(id
),
6212 cct
->_conf
->osd_heartbeat_grace
+ 1,
6213 osdmap
->get_epoch(),
6214 MOSDFailure::FLAG_IMMEDIATE
| MOSDFailure::FLAG_FAILED
6222 struct C_OSD_GetVersion
: public Context
{
6224 uint64_t oldest
, newest
;
6225 explicit C_OSD_GetVersion(OSD
*o
) : osd(o
), oldest(0), newest(0) {}
6226 void finish(int r
) override
{
6228 osd
->_got_mon_epochs(oldest
, newest
);
6232 void OSD::start_boot()
6234 if (!_is_healthy()) {
6235 // if we are not healthy, do not mark ourselves up (yet)
6236 dout(1) << "not healthy; waiting to boot" << dendl
;
6237 if (!is_waiting_for_healthy())
6238 start_waiting_for_healthy();
6239 // send pings sooner rather than later
6243 dout(1) << __func__
<< dendl
;
6244 set_state(STATE_PREBOOT
);
6245 dout(10) << "start_boot - have maps " << superblock
.oldest_map
6246 << ".." << superblock
.newest_map
<< dendl
;
6247 C_OSD_GetVersion
*c
= new C_OSD_GetVersion(this);
6248 monc
->get_version("osdmap", &c
->newest
, &c
->oldest
, c
);
6251 void OSD::_got_mon_epochs(epoch_t oldest
, epoch_t newest
)
6253 std::lock_guard
l(osd_lock
);
6255 _preboot(oldest
, newest
);
6259 void OSD::_preboot(epoch_t oldest
, epoch_t newest
)
6261 ceph_assert(is_preboot());
6262 dout(10) << __func__
<< " _preboot mon has osdmaps "
6263 << oldest
<< ".." << newest
<< dendl
;
6265 // ensure our local fullness awareness is accurate
6267 std::lock_guard
l(heartbeat_lock
);
6271 const auto& monmap
= monc
->monmap
;
6272 const auto osdmap
= get_osdmap();
6273 // if our map within recent history, try to add ourselves to the osdmap.
6274 if (osdmap
->get_epoch() == 0) {
6275 derr
<< "waiting for initial osdmap" << dendl
;
6276 } else if (osdmap
->is_destroyed(whoami
)) {
6277 derr
<< "osdmap says I am destroyed" << dendl
;
6278 // provide a small margin so we don't livelock seeing if we
6279 // un-destroyed ourselves.
6280 if (osdmap
->get_epoch() > newest
- 1) {
6283 } else if (osdmap
->is_noup(whoami
)) {
6284 derr
<< "osdmap NOUP flag is set, waiting for it to clear" << dendl
;
6285 } else if (!osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
6286 derr
<< "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
6288 } else if (osdmap
->require_osd_release
< ceph_release_t::luminous
) {
6289 derr
<< "osdmap require_osd_release < luminous; please upgrade to luminous"
6291 } else if (service
.need_fullness_update()) {
6292 derr
<< "osdmap fullness state needs update" << dendl
;
6294 } else if (monmap
.min_mon_release
>= ceph_release_t::octopus
&&
6295 superblock
.purged_snaps_last
< superblock
.current_epoch
) {
6296 dout(10) << __func__
<< " purged_snaps_last " << superblock
.purged_snaps_last
6297 << " < newest_map " << superblock
.current_epoch
<< dendl
;
6298 _get_purged_snaps();
6299 } else if (osdmap
->get_epoch() >= oldest
- 1 &&
6300 osdmap
->get_epoch() + cct
->_conf
->osd_map_message_max
> newest
) {
6302 // wait for pgs to fully catch up in a different thread, since
6303 // this thread might be required for splitting and merging PGs to
6305 boot_finisher
.queue(
6308 std::unique_lock
l(osd_lock
);
6310 dout(10) << __func__
<< " waiting for peering work to drain"
6313 for (auto shard
: shards
) {
6314 shard
->wait_min_pg_epoch(get_osdmap_epoch());
6325 // get all the latest maps
6326 if (osdmap
->get_epoch() + 1 >= oldest
)
6327 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
6329 osdmap_subscribe(oldest
- 1, true);
6332 void OSD::_get_purged_snaps()
6334 // NOTE: this is a naive, stateless implementaiton. it may send multiple
6335 // overlapping requests to the mon, which will be somewhat inefficient, but
6336 // it should be reliable.
6337 dout(10) << __func__
<< " purged_snaps_last " << superblock
.purged_snaps_last
6338 << ", newest_map " << superblock
.current_epoch
<< dendl
;
6339 MMonGetPurgedSnaps
*m
= new MMonGetPurgedSnaps(
6340 superblock
.purged_snaps_last
+ 1,
6341 superblock
.current_epoch
+ 1);
6342 monc
->send_mon_message(m
);
6345 void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply
*m
)
6347 dout(10) << __func__
<< " " << *m
<< dendl
;
6348 ObjectStore::Transaction t
;
6349 if (!is_preboot() ||
6350 m
->last
< superblock
.purged_snaps_last
) {
6353 SnapMapper::record_purged_snaps(cct
, store
, service
.meta_ch
,
6354 make_purged_snaps_oid(), &t
,
6356 superblock
.purged_snaps_last
= m
->last
;
6357 write_superblock(t
);
6358 store
->queue_transaction(
6361 service
.publish_superblock(superblock
);
6362 if (m
->last
< superblock
.current_epoch
) {
6363 _get_purged_snaps();
6371 void OSD::send_full_update()
6373 if (!service
.need_fullness_update())
6376 if (service
.is_full()) {
6377 state
= CEPH_OSD_FULL
;
6378 } else if (service
.is_backfillfull()) {
6379 state
= CEPH_OSD_BACKFILLFULL
;
6380 } else if (service
.is_nearfull()) {
6381 state
= CEPH_OSD_NEARFULL
;
6384 OSDMap::calc_state_set(state
, s
);
6385 dout(10) << __func__
<< " want state " << s
<< dendl
;
6386 monc
->send_mon_message(new MOSDFull(get_osdmap_epoch(), state
));
6389 void OSD::start_waiting_for_healthy()
6391 dout(1) << "start_waiting_for_healthy" << dendl
;
6392 set_state(STATE_WAITING_FOR_HEALTHY
);
6393 last_heartbeat_resample
= utime_t();
6395 // subscribe to osdmap updates, in case our peers really are known to be dead
6396 osdmap_subscribe(get_osdmap_epoch() + 1, false);
6399 bool OSD::_is_healthy()
6401 if (!cct
->get_heartbeat_map()->is_healthy()) {
6402 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl
;
6406 if (is_waiting_for_healthy()) {
6407 utime_t now
= ceph_clock_now();
6408 if (osd_markdown_log
.empty()) {
6409 dout(5) << __func__
<< " force returning true since last markdown"
6410 << " was " << cct
->_conf
->osd_max_markdown_period
6411 << "s ago" << dendl
;
6414 std::lock_guard
l(heartbeat_lock
);
6415 int num
= 0, up
= 0;
6416 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
6417 p
!= heartbeat_peers
.end();
6419 if (p
->second
.is_healthy(now
))
6423 if ((float)up
< (float)num
* cct
->_conf
->osd_heartbeat_min_healthy_ratio
) {
6424 dout(1) << "is_healthy false -- only " << up
<< "/" << num
<< " up peers (less than "
6425 << int(cct
->_conf
->osd_heartbeat_min_healthy_ratio
* 100.0) << "%)" << dendl
;
6433 void OSD::_send_boot()
6435 dout(10) << "_send_boot" << dendl
;
6436 Connection
*local_connection
=
6437 cluster_messenger
->get_loopback_connection().get();
6438 entity_addrvec_t client_addrs
= client_messenger
->get_myaddrs();
6439 entity_addrvec_t cluster_addrs
= cluster_messenger
->get_myaddrs();
6440 entity_addrvec_t hb_back_addrs
= hb_back_server_messenger
->get_myaddrs();
6441 entity_addrvec_t hb_front_addrs
= hb_front_server_messenger
->get_myaddrs();
6443 dout(20) << " initial client_addrs " << client_addrs
6444 << ", cluster_addrs " << cluster_addrs
6445 << ", hb_back_addrs " << hb_back_addrs
6446 << ", hb_front_addrs " << hb_front_addrs
6448 if (cluster_messenger
->set_addr_unknowns(client_addrs
)) {
6449 dout(10) << " assuming cluster_addrs match client_addrs "
6450 << client_addrs
<< dendl
;
6451 cluster_addrs
= cluster_messenger
->get_myaddrs();
6453 if (auto session
= local_connection
->get_priv(); !session
) {
6454 cluster_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6457 local_connection
= hb_back_server_messenger
->get_loopback_connection().get();
6458 if (hb_back_server_messenger
->set_addr_unknowns(cluster_addrs
)) {
6459 dout(10) << " assuming hb_back_addrs match cluster_addrs "
6460 << cluster_addrs
<< dendl
;
6461 hb_back_addrs
= hb_back_server_messenger
->get_myaddrs();
6463 if (auto session
= local_connection
->get_priv(); !session
) {
6464 hb_back_server_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6467 local_connection
= hb_front_server_messenger
->get_loopback_connection().get();
6468 if (hb_front_server_messenger
->set_addr_unknowns(client_addrs
)) {
6469 dout(10) << " assuming hb_front_addrs match client_addrs "
6470 << client_addrs
<< dendl
;
6471 hb_front_addrs
= hb_front_server_messenger
->get_myaddrs();
6473 if (auto session
= local_connection
->get_priv(); !session
) {
6474 hb_front_server_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6477 // we now know what our front and back addrs will be, and we are
6478 // about to tell the mon what our metadata (including numa bindings)
6479 // are, so now is a good time!
6480 set_numa_affinity();
6482 MOSDBoot
*mboot
= new MOSDBoot(
6483 superblock
, get_osdmap_epoch(), service
.get_boot_epoch(),
6484 hb_back_addrs
, hb_front_addrs
, cluster_addrs
,
6486 dout(10) << " final client_addrs " << client_addrs
6487 << ", cluster_addrs " << cluster_addrs
6488 << ", hb_back_addrs " << hb_back_addrs
6489 << ", hb_front_addrs " << hb_front_addrs
6491 _collect_metadata(&mboot
->metadata
);
6492 monc
->send_mon_message(mboot
);
6493 set_state(STATE_BOOTING
);
6496 void OSD::_collect_metadata(map
<string
,string
> *pm
)
6499 (*pm
)["osd_data"] = dev_path
;
6500 if (store
->get_type() == "filestore") {
6501 // not applicable for bluestore
6502 (*pm
)["osd_journal"] = journal_path
;
6504 (*pm
)["front_addr"] = stringify(client_messenger
->get_myaddrs());
6505 (*pm
)["back_addr"] = stringify(cluster_messenger
->get_myaddrs());
6506 (*pm
)["hb_front_addr"] = stringify(hb_front_server_messenger
->get_myaddrs());
6507 (*pm
)["hb_back_addr"] = stringify(hb_back_server_messenger
->get_myaddrs());
6510 (*pm
)["osd_objectstore"] = store
->get_type();
6511 (*pm
)["rotational"] = store_is_rotational
? "1" : "0";
6512 (*pm
)["journal_rotational"] = journal_is_rotational
? "1" : "0";
6513 (*pm
)["default_device_class"] = store
->get_default_device_class();
6514 store
->collect_metadata(pm
);
6516 collect_sys_info(pm
, cct
);
6518 (*pm
)["front_iface"] = pick_iface(
6520 client_messenger
->get_myaddrs().front().get_sockaddr_storage());
6521 (*pm
)["back_iface"] = pick_iface(
6523 cluster_messenger
->get_myaddrs().front().get_sockaddr_storage());
6529 set
<string
> unknown
;
6530 for (auto nm
: { "front_iface", "back_iface" }) {
6531 if (!(*pm
)[nm
].size()) {
6536 int r
= get_iface_numa_node((*pm
)[nm
], &n
);
6538 unknown
.insert((*pm
)[nm
]);
6546 if (unknown
.size()) {
6547 (*pm
)["network_numa_unknown_ifaces"] = stringify(unknown
);
6549 if (!nodes
.empty()) {
6550 (*pm
)["network_numa_nodes"] = stringify(nodes
);
6552 if (node
>= 0 && nodes
.size() == 1 && unknown
.empty()) {
6553 (*pm
)["network_numa_node"] = stringify(node
);
6557 if (numa_node
>= 0) {
6558 (*pm
)["numa_node"] = stringify(numa_node
);
6559 (*pm
)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size
,
6563 set
<string
> devnames
;
6564 store
->get_devices(&devnames
);
6565 map
<string
,string
> errs
;
6566 get_device_metadata(devnames
, pm
, &errs
);
6567 for (auto& i
: errs
) {
6568 dout(1) << __func__
<< " " << i
.first
<< ": " << i
.second
<< dendl
;
6570 dout(10) << __func__
<< " " << *pm
<< dendl
;
6573 void OSD::queue_want_up_thru(epoch_t want
)
6575 std::shared_lock map_locker
{map_lock
};
6576 epoch_t cur
= get_osdmap()->get_up_thru(whoami
);
6577 std::lock_guard
report_locker(mon_report_lock
);
6578 if (want
> up_thru_wanted
) {
6579 dout(10) << "queue_want_up_thru now " << want
<< " (was " << up_thru_wanted
<< ")"
6580 << ", currently " << cur
6582 up_thru_wanted
= want
;
6585 dout(10) << "queue_want_up_thru want " << want
<< " <= queued " << up_thru_wanted
6586 << ", currently " << cur
6591 void OSD::send_alive()
6593 ceph_assert(ceph_mutex_is_locked(mon_report_lock
));
6594 const auto osdmap
= get_osdmap();
6595 if (!osdmap
->exists(whoami
))
6597 epoch_t up_thru
= osdmap
->get_up_thru(whoami
);
6598 dout(10) << "send_alive up_thru currently " << up_thru
<< " want " << up_thru_wanted
<< dendl
;
6599 if (up_thru_wanted
> up_thru
) {
6600 dout(10) << "send_alive want " << up_thru_wanted
<< dendl
;
6601 monc
->send_mon_message(new MOSDAlive(osdmap
->get_epoch(), up_thru_wanted
));
6605 void OSD::request_full_map(epoch_t first
, epoch_t last
)
6607 dout(10) << __func__
<< " " << first
<< ".." << last
6608 << ", previously requested "
6609 << requested_full_first
<< ".." << requested_full_last
<< dendl
;
6610 ceph_assert(ceph_mutex_is_locked(osd_lock
));
6611 ceph_assert(first
> 0 && last
> 0);
6612 ceph_assert(first
<= last
);
6613 ceph_assert(first
>= requested_full_first
); // we shouldn't ever ask for older maps
6614 if (requested_full_first
== 0) {
6616 requested_full_first
= first
;
6617 requested_full_last
= last
;
6618 } else if (last
<= requested_full_last
) {
6622 // additional request
6623 first
= requested_full_last
+ 1;
6624 requested_full_last
= last
;
6626 MMonGetOSDMap
*req
= new MMonGetOSDMap
;
6627 req
->request_full(first
, last
);
6628 monc
->send_mon_message(req
);
6631 void OSD::got_full_map(epoch_t e
)
6633 ceph_assert(requested_full_first
<= requested_full_last
);
6634 ceph_assert(ceph_mutex_is_locked(osd_lock
));
6635 if (requested_full_first
== 0) {
6636 dout(20) << __func__
<< " " << e
<< ", nothing requested" << dendl
;
6639 if (e
< requested_full_first
) {
6640 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6641 << ".." << requested_full_last
6642 << ", ignoring" << dendl
;
6645 if (e
>= requested_full_last
) {
6646 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6647 << ".." << requested_full_last
<< ", resetting" << dendl
;
6648 requested_full_first
= requested_full_last
= 0;
6652 requested_full_first
= e
+ 1;
6654 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6655 << ".." << requested_full_last
6656 << ", still need more" << dendl
;
6659 void OSD::requeue_failures()
6661 std::lock_guard
l(heartbeat_lock
);
6662 unsigned old_queue
= failure_queue
.size();
6663 unsigned old_pending
= failure_pending
.size();
6664 for (auto p
= failure_pending
.begin(); p
!= failure_pending
.end(); ) {
6665 failure_queue
[p
->first
] = p
->second
.first
;
6666 failure_pending
.erase(p
++);
6668 dout(10) << __func__
<< " " << old_queue
<< " + " << old_pending
<< " -> "
6669 << failure_queue
.size() << dendl
;
6672 void OSD::send_failures()
6674 ceph_assert(ceph_mutex_is_locked(map_lock
));
6675 ceph_assert(ceph_mutex_is_locked(mon_report_lock
));
6676 std::lock_guard
l(heartbeat_lock
);
6677 utime_t now
= ceph_clock_now();
6678 const auto osdmap
= get_osdmap();
6679 while (!failure_queue
.empty()) {
6680 int osd
= failure_queue
.begin()->first
;
6681 if (!failure_pending
.count(osd
)) {
6682 int failed_for
= (int)(double)(now
- failure_queue
.begin()->second
);
6683 monc
->send_mon_message(
6687 osdmap
->get_addrs(osd
),
6689 osdmap
->get_epoch()));
6690 failure_pending
[osd
] = make_pair(failure_queue
.begin()->second
,
6691 osdmap
->get_addrs(osd
));
6693 failure_queue
.erase(osd
);
6697 void OSD::send_still_alive(epoch_t epoch
, int osd
, const entity_addrvec_t
&addrs
)
6699 MOSDFailure
*m
= new MOSDFailure(monc
->get_fsid(), osd
, addrs
, 0, epoch
,
6700 MOSDFailure::FLAG_ALIVE
);
6701 monc
->send_mon_message(m
);
6704 void OSD::cancel_pending_failures()
6706 std::lock_guard
l(heartbeat_lock
);
6707 auto it
= failure_pending
.begin();
6708 while (it
!= failure_pending
.end()) {
6709 dout(10) << __func__
<< " canceling in-flight failure report for osd."
6710 << it
->first
<< dendl
;
6711 send_still_alive(get_osdmap_epoch(), it
->first
, it
->second
.second
);
6712 failure_pending
.erase(it
++);
6716 void OSD::send_beacon(const ceph::coarse_mono_clock::time_point
& now
)
6718 const auto& monmap
= monc
->monmap
;
6719 // send beacon to mon even if we are just connected, and the monmap is not
6720 // initialized yet by then.
6721 if (monmap
.epoch
> 0 &&
6722 monmap
.get_required_features().contains_all(
6723 ceph::features::mon::FEATURE_LUMINOUS
)) {
6724 dout(20) << __func__
<< " sending" << dendl
;
6725 MOSDBeacon
* beacon
= nullptr;
6727 std::lock_guard l
{min_last_epoch_clean_lock
};
6728 beacon
= new MOSDBeacon(get_osdmap_epoch(),
6729 min_last_epoch_clean
,
6730 superblock
.last_purged_snaps_scrub
);
6731 beacon
->pgs
= min_last_epoch_clean_pgs
;
6732 last_sent_beacon
= now
;
6734 monc
->send_mon_message(beacon
);
6736 dout(20) << __func__
<< " not sending" << dendl
;
6740 void OSD::handle_command(MCommand
*m
)
6742 ConnectionRef con
= m
->get_connection();
6743 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
6745 con
->send_message(new MCommandReply(m
, -EACCES
));
6749 if (!session
->caps
.allow_all()) {
6750 con
->send_message(new MCommandReply(m
, -EACCES
));
6754 cct
->get_admin_socket()->queue_tell_command(m
);
6759 class unlock_guard
{
6762 explicit unlock_guard(ceph::mutex
& mutex
)
6767 unlock_guard(unlock_guard
&) = delete;
6774 void OSD::scrub_purged_snaps()
6776 dout(10) << __func__
<< dendl
;
6777 ceph_assert(ceph_mutex_is_locked(osd_lock
));
6778 SnapMapper::Scrubber
s(cct
, store
, service
.meta_ch
,
6779 make_snapmapper_oid(),
6780 make_purged_snaps_oid());
6781 clog
->debug() << "purged_snaps scrub starts";
6784 if (s
.stray
.size()) {
6785 clog
->debug() << "purged_snaps scrub found " << s
.stray
.size() << " strays";
6787 clog
->debug() << "purged_snaps scrub ok";
6789 set
<pair
<spg_t
,snapid_t
>> queued
;
6790 for (auto& [pool
, snap
, hash
, shard
] : s
.stray
) {
6791 const pg_pool_t
*pi
= get_osdmap()->get_pg_pool(pool
);
6793 dout(20) << __func__
<< " pool " << pool
<< " dne" << dendl
;
6796 pg_t
pgid(pi
->raw_hash_to_pg(hash
), pool
);
6797 spg_t
spgid(pgid
, shard
);
6798 pair
<spg_t
,snapid_t
> p(spgid
, snap
);
6799 if (queued
.count(p
)) {
6800 dout(20) << __func__
<< " pg " << spgid
<< " snap " << snap
6801 << " already queued" << dendl
;
6804 PGRef pg
= lookup_lock_pg(spgid
);
6806 dout(20) << __func__
<< " pg " << spgid
<< " not found" << dendl
;
6810 dout(10) << __func__
<< " requeue pg " << spgid
<< " " << pg
<< " snap "
6812 pg
->queue_snap_retrim(snap
);
6816 if (is_stopping()) {
6819 dout(10) << __func__
<< " done queueing pgs, updating superblock" << dendl
;
6820 ObjectStore::Transaction t
;
6821 superblock
.last_purged_snaps_scrub
= ceph_clock_now();
6822 write_superblock(t
);
6823 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
6824 ceph_assert(tr
== 0);
6826 send_beacon(ceph::coarse_mono_clock::now());
6828 dout(10) << __func__
<< " done" << dendl
;
6831 void OSD::probe_smart(const string
& only_devid
, ostream
& ss
)
6833 set
<string
> devnames
;
6834 store
->get_devices(&devnames
);
6835 uint64_t smart_timeout
= cct
->_conf
.get_val
<uint64_t>(
6836 "osd_smart_report_timeout");
6838 // == typedef std::map<std::string, mValue> mObject;
6839 json_spirit::mObject json_map
;
6841 for (auto dev
: devnames
) {
6842 // smartctl works only on physical devices; filter out any logical device
6843 if (dev
.find("dm-") == 0) {
6848 string devid
= get_device_id(dev
, &err
);
6849 if (devid
.size() == 0) {
6850 dout(10) << __func__
<< " no unique id for dev " << dev
<< " ("
6851 << err
<< "), skipping" << dendl
;
6854 if (only_devid
.size() && devid
!= only_devid
) {
6858 json_spirit::mValue smart_json
;
6859 if (block_device_get_metrics(dev
, smart_timeout
,
6861 dout(10) << "block_device_get_metrics failed for /dev/" << dev
<< dendl
;
6864 json_map
[devid
] = smart_json
;
6866 json_spirit::write(json_map
, ss
, json_spirit::pretty_print
);
6869 bool OSD::heartbeat_dispatch(Message
*m
)
6871 dout(30) << "heartbeat_dispatch " << m
<< dendl
;
6872 switch (m
->get_type()) {
6875 dout(10) << "ping from " << m
->get_source_inst() << dendl
;
6880 handle_osd_ping(static_cast<MOSDPing
*>(m
));
6884 dout(0) << "dropping unexpected message " << *m
<< " from " << m
->get_source_inst() << dendl
;
6891 bool OSD::ms_dispatch(Message
*m
)
6893 dout(20) << "OSD::ms_dispatch: " << *m
<< dendl
;
6894 if (m
->get_type() == MSG_OSD_MARK_ME_DOWN
) {
6895 service
.got_stop_ack();
6903 if (is_stopping()) {
6917 void OSDService::maybe_share_map(
6919 const OSDMapRef
& osdmap
,
6920 epoch_t peer_epoch_lb
)
6922 // NOTE: we assume caller hold something that keeps the Connection itself
6923 // pinned (e.g., an OpRequest's MessageRef).
6924 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
6929 // assume the peer has the newer of the op's sent_epoch and what
6930 // we think we sent them.
6931 session
->sent_epoch_lock
.lock();
6932 if (peer_epoch_lb
> session
->last_sent_epoch
) {
6933 dout(10) << __func__
<< " con " << con
6934 << " " << con
->get_peer_addr()
6935 << " map epoch " << session
->last_sent_epoch
6936 << " -> " << peer_epoch_lb
<< " (as per caller)" << dendl
;
6937 session
->last_sent_epoch
= peer_epoch_lb
;
6939 epoch_t last_sent_epoch
= session
->last_sent_epoch
;
6940 session
->sent_epoch_lock
.unlock();
6942 if (osdmap
->get_epoch() <= last_sent_epoch
) {
6946 send_incremental_map(last_sent_epoch
, con
, osdmap
);
6947 last_sent_epoch
= osdmap
->get_epoch();
6949 session
->sent_epoch_lock
.lock();
6950 if (session
->last_sent_epoch
< last_sent_epoch
) {
6951 dout(10) << __func__
<< " con " << con
6952 << " " << con
->get_peer_addr()
6953 << " map epoch " << session
->last_sent_epoch
6954 << " -> " << last_sent_epoch
<< " (shared)" << dendl
;
6955 session
->last_sent_epoch
= last_sent_epoch
;
6957 session
->sent_epoch_lock
.unlock();
6960 void OSD::dispatch_session_waiting(const ceph::ref_t
<Session
>& session
, OSDMapRef osdmap
)
6962 ceph_assert(ceph_mutex_is_locked(session
->session_dispatch_lock
));
6964 auto i
= session
->waiting_on_map
.begin();
6965 while (i
!= session
->waiting_on_map
.end()) {
6966 OpRequestRef op
= &(*i
);
6967 ceph_assert(ms_can_fast_dispatch(op
->get_req()));
6968 auto m
= op
->get_req
<MOSDFastDispatchOp
>();
6969 if (m
->get_min_epoch() > osdmap
->get_epoch()) {
6972 session
->waiting_on_map
.erase(i
++);
6976 if (m
->get_type() == CEPH_MSG_OSD_OP
) {
6977 pg_t actual_pgid
= osdmap
->raw_pg_to_pg(
6978 static_cast<const MOSDOp
*>(m
)->get_pg());
6979 if (!osdmap
->get_primary_shard(actual_pgid
, &pgid
)) {
6983 pgid
= m
->get_spg();
6985 enqueue_op(pgid
, std::move(op
), m
->get_map_epoch());
6988 if (session
->waiting_on_map
.empty()) {
6989 clear_session_waiting_on_map(session
);
6991 register_session_waiting_on_map(session
);
6995 void OSD::ms_fast_dispatch(Message
*m
)
6998 if (service
.is_stopping()) {
7004 switch (m
->get_type()) {
7006 dout(10) << "ping from " << m
->get_source() << dendl
;
7009 case MSG_OSD_FORCE_RECOVERY
:
7010 handle_fast_force_recovery(static_cast<MOSDForceRecovery
*>(m
));
7012 case MSG_OSD_SCRUB2
:
7013 handle_fast_scrub(static_cast<MOSDScrub2
*>(m
));
7016 case MSG_OSD_PG_CREATE2
:
7017 return handle_fast_pg_create(static_cast<MOSDPGCreate2
*>(m
));
7018 case MSG_OSD_PG_QUERY
:
7019 return handle_fast_pg_query(static_cast<MOSDPGQuery
*>(m
));
7020 case MSG_OSD_PG_NOTIFY
:
7021 return handle_fast_pg_notify(static_cast<MOSDPGNotify
*>(m
));
7022 case MSG_OSD_PG_INFO
:
7023 return handle_fast_pg_info(static_cast<MOSDPGInfo
*>(m
));
7024 case MSG_OSD_PG_REMOVE
:
7025 return handle_fast_pg_remove(static_cast<MOSDPGRemove
*>(m
));
7027 // these are single-pg messages that handle themselves
7028 case MSG_OSD_PG_LOG
:
7029 case MSG_OSD_PG_TRIM
:
7030 case MSG_OSD_PG_NOTIFY2
:
7031 case MSG_OSD_PG_QUERY2
:
7032 case MSG_OSD_PG_INFO2
:
7033 case MSG_OSD_BACKFILL_RESERVE
:
7034 case MSG_OSD_RECOVERY_RESERVE
:
7035 case MSG_OSD_PG_LEASE
:
7036 case MSG_OSD_PG_LEASE_ACK
:
7038 MOSDPeeringOp
*pm
= static_cast<MOSDPeeringOp
*>(m
);
7039 if (require_osd_peer(pm
)) {
7040 enqueue_peering_evt(
7042 PGPeeringEventRef(pm
->get_event()));
7049 OpRequestRef op
= op_tracker
.create_request
<OpRequest
, Message
*>(m
);
7052 osd_reqid_t reqid
= op
->get_reqid();
7054 tracepoint(osd
, ms_fast_dispatch
, reqid
.name
._type
,
7055 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
7059 op
->osd_trace
.init("osd op", &trace_endpoint
, &m
->trace
);
7061 // note sender epoch, min req's epoch
7062 op
->sent_epoch
= static_cast<MOSDFastDispatchOp
*>(m
)->get_map_epoch();
7063 op
->min_epoch
= static_cast<MOSDFastDispatchOp
*>(m
)->get_min_epoch();
7064 ceph_assert(op
->min_epoch
<= op
->sent_epoch
); // sanity check!
7066 service
.maybe_inject_dispatch_delay();
7068 if (m
->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT
) ||
7069 m
->get_type() != CEPH_MSG_OSD_OP
) {
7070 // queue it directly
7072 static_cast<MOSDFastDispatchOp
*>(m
)->get_spg(),
7074 static_cast<MOSDFastDispatchOp
*>(m
)->get_map_epoch());
7076 // legacy client, and this is an MOSDOp (the *only* fast dispatch
7077 // message that didn't have an explicit spg_t); we need to map
7078 // them to an spg_t while preserving delivery order.
7079 auto priv
= m
->get_connection()->get_priv();
7080 if (auto session
= static_cast<Session
*>(priv
.get()); session
) {
7081 std::lock_guard l
{session
->session_dispatch_lock
};
7083 session
->waiting_on_map
.push_back(*op
);
7084 OSDMapRef nextmap
= service
.get_nextmap_reserved();
7085 dispatch_session_waiting(session
, nextmap
);
7086 service
.release_map(nextmap
);
7089 OID_EVENT_TRACE_WITH_MSG(m
, "MS_FAST_DISPATCH_END", false);
7092 int OSD::ms_handle_authentication(Connection
*con
)
7095 auto s
= ceph::ref_cast
<Session
>(con
->get_priv());
7097 s
= ceph::make_ref
<Session
>(cct
, con
);
7099 s
->entity_name
= con
->get_peer_entity_name();
7100 dout(10) << __func__
<< " new session " << s
<< " con " << s
->con
7101 << " entity " << s
->entity_name
7102 << " addr " << con
->get_peer_addrs() << dendl
;
7104 dout(10) << __func__
<< " existing session " << s
<< " con " << s
->con
7105 << " entity " << s
->entity_name
7106 << " addr " << con
->get_peer_addrs() << dendl
;
7109 AuthCapsInfo
&caps_info
= con
->get_peer_caps_info();
7110 if (caps_info
.allow_all
) {
7111 s
->caps
.set_allow_all();
7112 } else if (caps_info
.caps
.length() > 0) {
7113 bufferlist::const_iterator p
= caps_info
.caps
.cbegin();
7118 catch (buffer::error
& e
) {
7119 dout(10) << __func__
<< " session " << s
<< " " << s
->entity_name
7120 << " failed to decode caps string" << dendl
;
7124 bool success
= s
->caps
.parse(str
);
7126 dout(10) << __func__
<< " session " << s
7127 << " " << s
->entity_name
7128 << " has caps " << s
->caps
<< " '" << str
<< "'" << dendl
;
7131 dout(10) << __func__
<< " session " << s
<< " " << s
->entity_name
7132 << " failed to parse caps '" << str
<< "'" << dendl
;
7140 void OSD::do_waiters()
7142 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7144 dout(10) << "do_waiters -- start" << dendl
;
7145 while (!finished
.empty()) {
7146 OpRequestRef next
= finished
.front();
7147 finished
.pop_front();
7150 dout(10) << "do_waiters -- finish" << dendl
;
7153 void OSD::dispatch_op(OpRequestRef op
)
7155 switch (op
->get_req()->get_type()) {
7157 case MSG_OSD_PG_CREATE
:
7158 handle_pg_create(op
);
7163 void OSD::_dispatch(Message
*m
)
7165 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7166 dout(20) << "_dispatch " << m
<< " " << *m
<< dendl
;
7168 switch (m
->get_type()) {
7169 // -- don't need OSDMap --
7171 // map and replication
7172 case CEPH_MSG_OSD_MAP
:
7173 handle_osd_map(static_cast<MOSDMap
*>(m
));
7175 case MSG_MON_GET_PURGED_SNAPS_REPLY
:
7176 handle_get_purged_snaps_reply(static_cast<MMonGetPurgedSnapsReply
*>(m
));
7181 handle_scrub(static_cast<MOSDScrub
*>(m
));
7185 handle_command(static_cast<MCommand
*>(m
));
7188 // -- need OSDMap --
7190 case MSG_OSD_PG_CREATE
:
7192 OpRequestRef op
= op_tracker
.create_request
<OpRequest
, Message
*>(m
);
7194 op
->osd_trace
.init("osd op", &trace_endpoint
, &m
->trace
);
7195 // no map? starting up?
7196 if (!get_osdmap()) {
7197 dout(7) << "no OSDMap, not booted" << dendl
;
7198 logger
->inc(l_osd_waiting_for_map
);
7199 waiting_for_osdmap
.push_back(op
);
7200 op
->mark_delayed("no osdmap");
7210 // remove me post-nautilus
7211 void OSD::handle_scrub(MOSDScrub
*m
)
7213 dout(10) << "handle_scrub " << *m
<< dendl
;
7214 if (!require_mon_or_mgr_peer(m
)) {
7218 if (m
->fsid
!= monc
->get_fsid()) {
7219 dout(0) << "handle_scrub fsid " << m
->fsid
<< " != " << monc
->get_fsid()
7228 if (!m
->scrub_pgs
.empty()) {
7230 for (auto pgid
: m
->scrub_pgs
) {
7232 if (get_osdmap()->get_primary_shard(pgid
, &pcand
) &&
7233 std::find(spgs
.begin(), spgs
.end(), pcand
) != spgs
.end()) {
7240 for (auto pgid
: spgs
) {
7241 enqueue_peering_evt(
7244 std::make_shared
<PGPeeringEvent
>(
7247 PeeringState::RequestScrub(m
->deep
, m
->repair
))));
7253 void OSD::handle_fast_scrub(MOSDScrub2
*m
)
7255 dout(10) << __func__
<< " " << *m
<< dendl
;
7256 if (!require_mon_or_mgr_peer(m
)) {
7260 if (m
->fsid
!= monc
->get_fsid()) {
7261 dout(0) << __func__
<< " fsid " << m
->fsid
<< " != " << monc
->get_fsid()
7266 for (auto pgid
: m
->scrub_pgs
) {
7267 enqueue_peering_evt(
7270 std::make_shared
<PGPeeringEvent
>(
7273 PeeringState::RequestScrub(m
->deep
, m
->repair
))));
7278 bool OSD::scrub_random_backoff()
7280 bool coin_flip
= (rand() / (double)RAND_MAX
>=
7281 cct
->_conf
->osd_scrub_backoff_ratio
);
7283 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl
;
7289 OSDService::ScrubJob::ScrubJob(CephContext
* cct
,
7290 const spg_t
& pg
, const utime_t
& timestamp
,
7291 double pool_scrub_min_interval
,
7292 double pool_scrub_max_interval
, bool must
)
7295 sched_time(timestamp
),
7298 // if not explicitly requested, postpone the scrub with a random delay
7300 double scrub_min_interval
= pool_scrub_min_interval
> 0 ?
7301 pool_scrub_min_interval
: cct
->_conf
->osd_scrub_min_interval
;
7302 double scrub_max_interval
= pool_scrub_max_interval
> 0 ?
7303 pool_scrub_max_interval
: cct
->_conf
->osd_scrub_max_interval
;
7305 sched_time
+= scrub_min_interval
;
7306 double r
= rand() / (double)RAND_MAX
;
7308 scrub_min_interval
* cct
->_conf
->osd_scrub_interval_randomize_ratio
* r
;
7309 if (scrub_max_interval
== 0) {
7310 deadline
= utime_t();
7312 deadline
+= scrub_max_interval
;
7318 bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob
& rhs
) const {
7319 if (sched_time
< rhs
.sched_time
)
7321 if (sched_time
> rhs
.sched_time
)
7323 return pgid
< rhs
.pgid
;
7326 double OSD::scrub_sleep_time(bool must_scrub
)
7329 return cct
->_conf
->osd_scrub_sleep
;
7331 utime_t now
= ceph_clock_now();
7332 if (scrub_time_permit(now
)) {
7333 return cct
->_conf
->osd_scrub_sleep
;
7335 double normal_sleep
= cct
->_conf
->osd_scrub_sleep
;
7336 double extended_sleep
= cct
->_conf
->osd_scrub_extended_sleep
;
7337 return std::max(extended_sleep
, normal_sleep
);
7340 bool OSD::scrub_time_permit(utime_t now
)
7343 time_t tt
= now
.sec();
7344 localtime_r(&tt
, &bdt
);
7346 bool day_permit
= false;
7347 if (cct
->_conf
->osd_scrub_begin_week_day
< cct
->_conf
->osd_scrub_end_week_day
) {
7348 if (bdt
.tm_wday
>= cct
->_conf
->osd_scrub_begin_week_day
&& bdt
.tm_wday
< cct
->_conf
->osd_scrub_end_week_day
) {
7352 if (bdt
.tm_wday
>= cct
->_conf
->osd_scrub_begin_week_day
|| bdt
.tm_wday
< cct
->_conf
->osd_scrub_end_week_day
) {
7358 dout(20) << __func__
<< " should run between week day " << cct
->_conf
->osd_scrub_begin_week_day
7359 << " - " << cct
->_conf
->osd_scrub_end_week_day
7360 << " now " << bdt
.tm_wday
<< " = no" << dendl
;
7364 bool time_permit
= false;
7365 if (cct
->_conf
->osd_scrub_begin_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7366 if (bdt
.tm_hour
>= cct
->_conf
->osd_scrub_begin_hour
&& bdt
.tm_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7370 if (bdt
.tm_hour
>= cct
->_conf
->osd_scrub_begin_hour
|| bdt
.tm_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7375 dout(20) << __func__
<< " should run between " << cct
->_conf
->osd_scrub_begin_hour
7376 << " - " << cct
->_conf
->osd_scrub_end_hour
7377 << " now " << bdt
.tm_hour
<< " = no" << dendl
;
7379 dout(20) << __func__
<< " should run between " << cct
->_conf
->osd_scrub_begin_hour
7380 << " - " << cct
->_conf
->osd_scrub_end_hour
7381 << " now " << bdt
.tm_hour
<< " = yes" << dendl
;
7386 bool OSD::scrub_load_below_threshold()
7389 if (getloadavg(loadavgs
, 3) != 3) {
7390 dout(10) << __func__
<< " couldn't read loadavgs\n" << dendl
;
7394 // allow scrub if below configured threshold
7395 long cpus
= sysconf(_SC_NPROCESSORS_ONLN
);
7396 double loadavg_per_cpu
= cpus
> 0 ? loadavgs
[0] / cpus
: loadavgs
[0];
7397 if (loadavg_per_cpu
< cct
->_conf
->osd_scrub_load_threshold
) {
7398 dout(20) << __func__
<< " loadavg per cpu " << loadavg_per_cpu
7399 << " < max " << cct
->_conf
->osd_scrub_load_threshold
7400 << " = yes" << dendl
;
7404 // allow scrub if below daily avg and currently decreasing
7405 if (loadavgs
[0] < daily_loadavg
&& loadavgs
[0] < loadavgs
[2]) {
7406 dout(20) << __func__
<< " loadavg " << loadavgs
[0]
7407 << " < daily_loadavg " << daily_loadavg
7408 << " and < 15m avg " << loadavgs
[2]
7409 << " = yes" << dendl
;
7413 dout(20) << __func__
<< " loadavg " << loadavgs
[0]
7414 << " >= max " << cct
->_conf
->osd_scrub_load_threshold
7415 << " and ( >= daily_loadavg " << daily_loadavg
7416 << " or >= 15m avg " << loadavgs
[2]
7417 << ") = no" << dendl
;
7421 void OSD::sched_scrub()
7423 // if not permitted, fail fast
7424 if (!service
.can_inc_scrubs()) {
7427 bool allow_requested_repair_only
= false;
7428 if (service
.is_recovery_active()) {
7429 if (!cct
->_conf
->osd_scrub_during_recovery
&& cct
->_conf
->osd_repair_during_recovery
) {
7430 dout(10) << __func__
7431 << " will only schedule explicitly requested repair due to active recovery"
7433 allow_requested_repair_only
= true;
7434 } else if (!cct
->_conf
->osd_scrub_during_recovery
&& !cct
->_conf
->osd_repair_during_recovery
) {
7435 dout(20) << __func__
<< " not scheduling scrubs due to active recovery" << dendl
;
7440 utime_t now
= ceph_clock_now();
7441 bool time_permit
= scrub_time_permit(now
);
7442 bool load_is_low
= scrub_load_below_threshold();
7443 dout(20) << "sched_scrub load_is_low=" << (int)load_is_low
<< dendl
;
7445 OSDService::ScrubJob scrub
;
7446 if (service
.first_scrub_stamp(&scrub
)) {
7448 dout(30) << "sched_scrub examine " << scrub
.pgid
<< " at " << scrub
.sched_time
<< dendl
;
7450 if (scrub
.sched_time
> now
) {
7451 // save ourselves some effort
7452 dout(10) << "sched_scrub " << scrub
.pgid
<< " scheduled at " << scrub
.sched_time
7453 << " > " << now
<< dendl
;
7457 if ((scrub
.deadline
.is_zero() || scrub
.deadline
>= now
) && !(time_permit
&& load_is_low
)) {
7458 dout(10) << __func__
<< " not scheduling scrub for " << scrub
.pgid
<< " due to "
7459 << (!time_permit
? "time not permit" : "high load") << dendl
;
7463 PGRef pg
= _lookup_lock_pg(scrub
.pgid
);
7466 // This has already started, so go on to the next scrub job
7467 if (pg
->scrubber
.active
) {
7469 dout(30) << __func__
<< ": already in progress pgid " << scrub
.pgid
<< dendl
;
7472 // Skip other kinds of scrubing if only explicitly requested repairing is allowed
7473 if (allow_requested_repair_only
&& !pg
->scrubber
.must_repair
) {
7475 dout(10) << __func__
<< " skip " << scrub
.pgid
7476 << " because repairing is not explicitly requested on it"
7480 // If it is reserving, let it resolve before going to the next scrub job
7481 if (pg
->scrubber
.local_reserved
&& !pg
->scrubber
.active
) {
7483 dout(30) << __func__
<< ": reserve in progress pgid " << scrub
.pgid
<< dendl
;
7486 dout(10) << "sched_scrub scrubbing " << scrub
.pgid
<< " at " << scrub
.sched_time
7487 << (pg
->get_must_scrub() ? ", explicitly requested" :
7488 (load_is_low
? ", load_is_low" : " deadline < now"))
7490 if (pg
->sched_scrub()) {
7495 } while (service
.next_scrub_stamp(scrub
, &scrub
));
7497 dout(20) << "sched_scrub done" << dendl
;
7500 void OSD::resched_all_scrubs()
7502 dout(10) << __func__
<< ": start" << dendl
;
7503 OSDService::ScrubJob scrub
;
7504 if (service
.first_scrub_stamp(&scrub
)) {
7506 dout(20) << __func__
<< ": examine " << scrub
.pgid
<< dendl
;
7508 PGRef pg
= _lookup_lock_pg(scrub
.pgid
);
7511 if (!pg
->scrubber
.must_scrub
&& !pg
->scrubber
.need_auto
) {
7512 dout(20) << __func__
<< ": reschedule " << scrub
.pgid
<< dendl
;
7513 pg
->on_info_history_change();
7516 } while (service
.next_scrub_stamp(scrub
, &scrub
));
7518 dout(10) << __func__
<< ": done" << dendl
;
7521 MPGStats
* OSD::collect_pg_stats()
7523 // This implementation unconditionally sends every is_primary PG's
7524 // stats every time we're called. This has equivalent cost to the
7525 // previous implementation's worst case where all PGs are busy and
7526 // their stats are always enqueued for sending.
7527 std::shared_lock l
{map_lock
};
7529 osd_stat_t cur_stat
= service
.get_osd_stat();
7530 cur_stat
.os_perf_stat
= store
->get_cur_stats();
7532 auto m
= new MPGStats(monc
->get_fsid(), get_osdmap_epoch());
7533 m
->osd_stat
= cur_stat
;
7535 std::lock_guard lec
{min_last_epoch_clean_lock
};
7536 min_last_epoch_clean
= get_osdmap_epoch();
7537 min_last_epoch_clean_pgs
.clear();
7539 std::set
<int64_t> pool_set
;
7542 for (auto& pg
: pgs
) {
7543 auto pool
= pg
->pg_id
.pgid
.pool();
7544 pool_set
.emplace((int64_t)pool
);
7545 if (!pg
->is_primary()) {
7548 pg
->get_pg_stats([&](const pg_stat_t
& s
, epoch_t lec
) {
7549 m
->pg_stat
[pg
->pg_id
.pgid
] = s
;
7550 min_last_epoch_clean
= min(min_last_epoch_clean
, lec
);
7551 min_last_epoch_clean_pgs
.push_back(pg
->pg_id
.pgid
);
7555 bool per_pool_stats
= false;
7556 bool per_pool_omap_stats
= false;
7557 for (auto p
: pool_set
) {
7558 int r
= store
->pool_statfs(p
, &st
, &per_pool_omap_stats
);
7559 if (r
== -ENOTSUP
) {
7563 m
->pool_stat
[p
] = st
;
7564 per_pool_stats
= true;
7568 // indicate whether we are reporting per-pool stats
7569 m
->osd_stat
.num_osds
= 1;
7570 m
->osd_stat
.num_per_pool_osds
= per_pool_stats
? 1 : 0;
7571 m
->osd_stat
.num_per_pool_omap_osds
= per_pool_omap_stats
? 1 : 0;
7576 vector
<DaemonHealthMetric
> OSD::get_health_metrics()
7578 vector
<DaemonHealthMetric
> metrics
;
7580 utime_t oldest_secs
;
7581 const utime_t now
= ceph_clock_now();
7583 too_old
-= cct
->_conf
.get_val
<double>("osd_op_complaint_time");
7585 TrackedOpRef oldest_op
;
7586 auto count_slow_ops
= [&](TrackedOp
& op
) {
7587 if (op
.get_initiated() < too_old
) {
7589 ss
<< "slow request " << op
.get_desc()
7591 << op
.get_initiated()
7593 << op
.state_string();
7594 lgeneric_subdout(cct
,osd
,20) << ss
.str() << dendl
;
7595 clog
->warn() << ss
.str();
7597 if (!oldest_op
|| op
.get_initiated() < oldest_op
->get_initiated()) {
7605 if (op_tracker
.visit_ops_in_flight(&oldest_secs
, count_slow_ops
)) {
7607 derr
<< __func__
<< " reporting " << slow
<< " slow ops, oldest is "
7608 << oldest_op
->get_desc() << dendl
;
7610 metrics
.emplace_back(daemon_metric::SLOW_OPS
, slow
, oldest_secs
);
7612 // no news is not good news.
7613 metrics
.emplace_back(daemon_metric::SLOW_OPS
, 0, 0);
7617 std::lock_guard
l(pending_creates_lock
);
7618 auto n_primaries
= pending_creates_from_mon
;
7619 for (const auto& create
: pending_creates_from_osd
) {
7620 if (create
.second
) {
7624 metrics
.emplace_back(daemon_metric::PENDING_CREATING_PGS
, n_primaries
);
7629 // =====================================================
7632 void OSD::wait_for_new_map(OpRequestRef op
)
7635 if (waiting_for_osdmap
.empty()) {
7636 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7639 logger
->inc(l_osd_waiting_for_map
);
7640 waiting_for_osdmap
.push_back(op
);
7641 op
->mark_delayed("wait for new map");
7646 * assimilate new OSDMap(s). scan pgs, etc.
7649 void OSD::note_down_osd(int peer
)
7651 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7652 cluster_messenger
->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer
));
7654 std::lock_guard l
{heartbeat_lock
};
7655 failure_queue
.erase(peer
);
7656 failure_pending
.erase(peer
);
7657 map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.find(peer
);
7658 if (p
!= heartbeat_peers
.end()) {
7659 p
->second
.clear_mark_down();
7660 heartbeat_peers
.erase(p
);
7664 void OSD::note_up_osd(int peer
)
7666 heartbeat_set_peers_need_update();
7669 struct C_OnMapCommit
: public Context
{
7671 epoch_t first
, last
;
7673 C_OnMapCommit(OSD
*o
, epoch_t f
, epoch_t l
, MOSDMap
*m
)
7674 : osd(o
), first(f
), last(l
), msg(m
) {}
7675 void finish(int r
) override
{
7676 osd
->_committed_osd_maps(first
, last
, msg
);
7681 void OSD::osdmap_subscribe(version_t epoch
, bool force_request
)
7683 std::lock_guard
l(osdmap_subscribe_lock
);
7684 if (latest_subscribed_epoch
>= epoch
&& !force_request
)
7687 latest_subscribed_epoch
= std::max
<uint64_t>(epoch
, latest_subscribed_epoch
);
7689 if (monc
->sub_want_increment("osdmap", epoch
, CEPH_SUBSCRIBE_ONETIME
) ||
7695 void OSD::trim_maps(epoch_t oldest
, int nreceived
, bool skip_maps
)
7697 epoch_t min
= std::min(oldest
, service
.map_cache
.cached_key_lower_bound());
7698 if (min
<= superblock
.oldest_map
)
7702 ObjectStore::Transaction t
;
7703 for (epoch_t e
= superblock
.oldest_map
; e
< min
; ++e
) {
7704 dout(20) << " removing old osdmap epoch " << e
<< dendl
;
7705 t
.remove(coll_t::meta(), get_osdmap_pobject_name(e
));
7706 t
.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e
));
7707 superblock
.oldest_map
= e
+ 1;
7709 if (num
>= cct
->_conf
->osd_target_transaction_size
&& num
>= nreceived
) {
7710 service
.publish_superblock(superblock
);
7711 write_superblock(t
);
7712 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
7713 ceph_assert(tr
== 0);
7716 // skip_maps leaves us with a range of old maps if we fail to remove all
7717 // of them before moving superblock.oldest_map forward to the first map
7718 // in the incoming MOSDMap msg. so we should continue removing them in
7719 // this case, even we could do huge series of delete transactions all at
7726 service
.publish_superblock(superblock
);
7727 write_superblock(t
);
7728 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
7729 ceph_assert(tr
== 0);
7731 // we should not remove the cached maps
7732 ceph_assert(min
<= service
.map_cache
.cached_key_lower_bound());
7735 void OSD::handle_osd_map(MOSDMap
*m
)
7737 // wait for pgs to catch up
7739 // we extend the map cache pins to accomodate pgs slow to consume maps
7740 // for some period, until we hit the max_lag_factor bound, at which point
7741 // we block here to stop injesting more maps than they are able to keep
7743 epoch_t max_lag
= cct
->_conf
->osd_map_cache_size
*
7744 m_osd_pg_epoch_max_lag_factor
;
7745 ceph_assert(max_lag
> 0);
7746 epoch_t osd_min
= 0;
7747 for (auto shard
: shards
) {
7748 epoch_t min
= shard
->get_min_pg_epoch();
7749 if (osd_min
== 0 || min
< osd_min
) {
7753 epoch_t osdmap_epoch
= get_osdmap_epoch();
7755 osdmap_epoch
> max_lag
&&
7756 osdmap_epoch
- max_lag
> osd_min
) {
7757 epoch_t need
= osdmap_epoch
- max_lag
;
7758 dout(10) << __func__
<< " waiting for pgs to catch up (need " << need
7759 << " max_lag " << max_lag
<< ")" << dendl
;
7760 for (auto shard
: shards
) {
7761 epoch_t min
= shard
->get_min_pg_epoch();
7763 dout(10) << __func__
<< " waiting for pgs to consume " << need
7764 << " (shard " << shard
->shard_id
<< " min " << min
7765 << ", map cache is " << cct
->_conf
->osd_map_cache_size
7766 << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
7768 unlock_guard unlock
{osd_lock
};
7769 shard
->wait_min_pg_epoch(need
);
7775 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7776 map
<epoch_t
,OSDMapRef
> added_maps
;
7777 map
<epoch_t
,bufferlist
> added_maps_bl
;
7778 if (m
->fsid
!= monc
->get_fsid()) {
7779 dout(0) << "handle_osd_map fsid " << m
->fsid
<< " != "
7780 << monc
->get_fsid() << dendl
;
7784 if (is_initializing()) {
7785 dout(0) << "ignoring osdmap until we have initialized" << dendl
;
7790 auto session
= ceph::ref_cast
<Session
>(m
->get_connection()->get_priv());
7791 if (session
&& !(session
->entity_name
.is_mon() ||
7792 session
->entity_name
.is_osd())) {
7794 dout(10) << "got osd map from Session " << session
7795 << " which we can't take maps from (not a mon or osd)" << dendl
;
7800 // share with the objecter
7802 service
.objecter
->handle_osd_map(m
);
7804 epoch_t first
= m
->get_first();
7805 epoch_t last
= m
->get_last();
7806 dout(3) << "handle_osd_map epochs [" << first
<< "," << last
<< "], i have "
7807 << superblock
.newest_map
7808 << ", src has [" << m
->oldest_map
<< "," << m
->newest_map
<< "]"
7811 logger
->inc(l_osd_map
);
7812 logger
->inc(l_osd_mape
, last
- first
+ 1);
7813 if (first
<= superblock
.newest_map
)
7814 logger
->inc(l_osd_mape_dup
, superblock
.newest_map
- first
+ 1);
7815 if (service
.max_oldest_map
< m
->oldest_map
) {
7816 service
.max_oldest_map
= m
->oldest_map
;
7817 ceph_assert(service
.max_oldest_map
>= superblock
.oldest_map
);
7820 // make sure there is something new, here, before we bother flushing
7821 // the queues and such
7822 if (last
<= superblock
.newest_map
) {
7823 dout(10) << " no new maps here, dropping" << dendl
;
7829 bool skip_maps
= false;
7830 if (first
> superblock
.newest_map
+ 1) {
7831 dout(10) << "handle_osd_map message skips epochs "
7832 << superblock
.newest_map
+ 1 << ".." << (first
-1) << dendl
;
7833 if (m
->oldest_map
<= superblock
.newest_map
+ 1) {
7834 osdmap_subscribe(superblock
.newest_map
+ 1, false);
7838 // always try to get the full range of maps--as many as we can. this
7839 // 1- is good to have
7840 // 2- is at present the only way to ensure that we get a *full* map as
7842 if (m
->oldest_map
< first
) {
7843 osdmap_subscribe(m
->oldest_map
- 1, true);
7850 ObjectStore::Transaction t
;
7851 uint64_t txn_size
= 0;
7853 map
<epoch_t
,mempool::osdmap::map
<int64_t,snap_interval_set_t
>> purged_snaps
;
7855 // store new maps: queue for disk and put in the osdmap cache
7856 epoch_t start
= std::max(superblock
.newest_map
+ 1, first
);
7857 for (epoch_t e
= start
; e
<= last
; e
++) {
7858 if (txn_size
>= t
.get_num_bytes()) {
7859 derr
<< __func__
<< " transaction size overflowed" << dendl
;
7860 ceph_assert(txn_size
< t
.get_num_bytes());
7862 txn_size
= t
.get_num_bytes();
7863 map
<epoch_t
,bufferlist
>::iterator p
;
7864 p
= m
->maps
.find(e
);
7865 if (p
!= m
->maps
.end()) {
7866 dout(10) << "handle_osd_map got full map for epoch " << e
<< dendl
;
7867 OSDMap
*o
= new OSDMap
;
7868 bufferlist
& bl
= p
->second
;
7872 purged_snaps
[e
] = o
->get_new_purged_snaps();
7874 ghobject_t fulloid
= get_osdmap_pobject_name(e
);
7875 t
.write(coll_t::meta(), fulloid
, 0, bl
.length(), bl
);
7876 added_maps
[e
] = add_map(o
);
7877 added_maps_bl
[e
] = bl
;
7882 p
= m
->incremental_maps
.find(e
);
7883 if (p
!= m
->incremental_maps
.end()) {
7884 dout(10) << "handle_osd_map got inc map for epoch " << e
<< dendl
;
7885 bufferlist
& bl
= p
->second
;
7886 ghobject_t oid
= get_inc_osdmap_pobject_name(e
);
7887 t
.write(coll_t::meta(), oid
, 0, bl
.length(), bl
);
7889 OSDMap
*o
= new OSDMap
;
7892 bool got
= get_map_bl(e
- 1, obl
);
7894 auto p
= added_maps_bl
.find(e
- 1);
7895 ceph_assert(p
!= added_maps_bl
.end());
7901 OSDMap::Incremental inc
;
7902 auto p
= bl
.cbegin();
7905 if (o
->apply_incremental(inc
) < 0) {
7906 derr
<< "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc
.fsid
<< dendl
;
7907 ceph_abort_msg("bad fsid");
7911 o
->encode(fbl
, inc
.encode_features
| CEPH_FEATURE_RESERVED
);
7913 bool injected_failure
= false;
7914 if (cct
->_conf
->osd_inject_bad_map_crc_probability
> 0 &&
7915 (rand() % 10000) < cct
->_conf
->osd_inject_bad_map_crc_probability
*10000.0) {
7916 derr
<< __func__
<< " injecting map crc failure" << dendl
;
7917 injected_failure
= true;
7920 if ((inc
.have_crc
&& o
->get_crc() != inc
.full_crc
) || injected_failure
) {
7921 dout(2) << "got incremental " << e
7922 << " but failed to encode full with correct crc; requesting"
7924 clog
->warn() << "failed to encode map e" << e
<< " with expected crc";
7925 dout(20) << "my encoded map was:\n";
7926 fbl
.hexdump(*_dout
);
7929 request_full_map(e
, last
);
7934 purged_snaps
[e
] = o
->get_new_purged_snaps();
7936 ghobject_t fulloid
= get_osdmap_pobject_name(e
);
7937 t
.write(coll_t::meta(), fulloid
, 0, fbl
.length(), fbl
);
7938 added_maps
[e
] = add_map(o
);
7939 added_maps_bl
[e
] = fbl
;
7943 ceph_abort_msg("MOSDMap lied about what maps it had?");
7946 // even if this map isn't from a mon, we may have satisfied our subscription
7947 monc
->sub_got("osdmap", last
);
7949 if (!m
->maps
.empty() && requested_full_first
) {
7950 dout(10) << __func__
<< " still missing full maps " << requested_full_first
7951 << ".." << requested_full_last
<< dendl
;
7952 rerequest_full_maps();
7955 if (superblock
.oldest_map
) {
7956 // make sure we at least keep pace with incoming maps
7957 trim_maps(m
->oldest_map
, last
- first
+ 1, skip_maps
);
7958 pg_num_history
.prune(superblock
.oldest_map
);
7961 if (!superblock
.oldest_map
|| skip_maps
)
7962 superblock
.oldest_map
= first
;
7963 superblock
.newest_map
= last
;
7964 superblock
.current_epoch
= last
;
7966 // note in the superblock that we were clean thru the prior epoch
7967 epoch_t boot_epoch
= service
.get_boot_epoch();
7968 if (boot_epoch
&& boot_epoch
>= superblock
.mounted
) {
7969 superblock
.mounted
= boot_epoch
;
7970 superblock
.clean_thru
= last
;
7973 // check for pg_num changes and deleted pools
7975 for (auto& i
: added_maps
) {
7977 if (!(lastmap
= service
.try_get_map(i
.first
- 1))) {
7978 dout(10) << __func__
<< " can't get previous map " << i
.first
- 1
7979 << " probably first start of this osd" << dendl
;
7983 ceph_assert(lastmap
->get_epoch() + 1 == i
.second
->get_epoch());
7984 for (auto& j
: lastmap
->get_pools()) {
7985 if (!i
.second
->have_pg_pool(j
.first
)) {
7986 pg_num_history
.log_pool_delete(i
.first
, j
.first
);
7987 dout(10) << __func__
<< " recording final pg_pool_t for pool "
7988 << j
.first
<< dendl
;
7989 // this information is needed by _make_pg() if have to restart before
7990 // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
7991 ghobject_t obj
= make_final_pool_info_oid(j
.first
);
7993 encode(j
.second
, bl
, CEPH_FEATURES_ALL
);
7994 string name
= lastmap
->get_pool_name(j
.first
);
7996 map
<string
,string
> profile
;
7997 if (lastmap
->get_pg_pool(j
.first
)->is_erasure()) {
7998 profile
= lastmap
->get_erasure_code_profile(
7999 lastmap
->get_pg_pool(j
.first
)->erasure_code_profile
);
8001 encode(profile
, bl
);
8002 t
.write(coll_t::meta(), obj
, 0, bl
.length(), bl
);
8003 } else if (unsigned new_pg_num
= i
.second
->get_pg_num(j
.first
);
8004 new_pg_num
!= j
.second
.get_pg_num()) {
8005 dout(10) << __func__
<< " recording pool " << j
.first
<< " pg_num "
8006 << j
.second
.get_pg_num() << " -> " << new_pg_num
<< dendl
;
8007 pg_num_history
.log_pg_num_change(i
.first
, j
.first
, new_pg_num
);
8010 for (auto& j
: i
.second
->get_pools()) {
8011 if (!lastmap
->have_pg_pool(j
.first
)) {
8012 dout(10) << __func__
<< " recording new pool " << j
.first
<< " pg_num "
8013 << j
.second
.get_pg_num() << dendl
;
8014 pg_num_history
.log_pg_num_change(i
.first
, j
.first
,
8015 j
.second
.get_pg_num());
8020 pg_num_history
.epoch
= last
;
8023 ::encode(pg_num_history
, bl
);
8024 t
.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl
.length(), bl
);
8025 dout(20) << __func__
<< " pg_num_history " << pg_num_history
<< dendl
;
8028 // record new purged_snaps
8029 if (superblock
.purged_snaps_last
== start
- 1) {
8030 SnapMapper::record_purged_snaps(cct
, store
, service
.meta_ch
,
8031 make_purged_snaps_oid(), &t
,
8033 superblock
.purged_snaps_last
= last
;
8035 dout(10) << __func__
<< " superblock purged_snaps_last is "
8036 << superblock
.purged_snaps_last
8037 << ", not recording new purged_snaps" << dendl
;
8040 // superblock and commit
8041 write_superblock(t
);
8042 t
.register_on_commit(new C_OnMapCommit(this, start
, last
, m
));
8043 store
->queue_transaction(
8046 service
.publish_superblock(superblock
);
8049 void OSD::_committed_osd_maps(epoch_t first
, epoch_t last
, MOSDMap
*m
)
8051 dout(10) << __func__
<< " " << first
<< ".." << last
<< dendl
;
8052 if (is_stopping()) {
8053 dout(10) << __func__
<< " bailing, we are shutting down" << dendl
;
8056 std::lock_guard
l(osd_lock
);
8057 if (is_stopping()) {
8058 dout(10) << __func__
<< " bailing, we are shutting down" << dendl
;
8063 bool do_shutdown
= false;
8064 bool do_restart
= false;
8065 bool network_error
= false;
8068 // advance through the new maps
8069 for (epoch_t cur
= first
; cur
<= last
; cur
++) {
8070 dout(10) << " advance to epoch " << cur
8071 << " (<= last " << last
8072 << " <= newest_map " << superblock
.newest_map
8075 OSDMapRef newmap
= get_map(cur
);
8076 ceph_assert(newmap
); // we just cached it above!
8078 // start blacklisting messages sent to peers that go down.
8079 service
.pre_publish_map(newmap
);
8081 // kill connections to newly down osds
8082 bool waited_for_reservations
= false;
8084 osdmap
= get_osdmap();
8085 osdmap
->get_all_osds(old
);
8086 for (set
<int>::iterator p
= old
.begin(); p
!= old
.end(); ++p
) {
8088 osdmap
->is_up(*p
) && // in old map
8089 newmap
->is_down(*p
)) { // but not the new one
8090 if (!waited_for_reservations
) {
8091 service
.await_reserved_maps();
8092 waited_for_reservations
= true;
8095 } else if (*p
!= whoami
&&
8096 osdmap
->is_down(*p
) &&
8097 newmap
->is_up(*p
)) {
8102 if (osdmap
->is_noup(whoami
) != newmap
->is_noup(whoami
)) {
8103 dout(10) << __func__
<< " NOUP flag changed in " << newmap
->get_epoch()
8106 // this captures the case where we sent the boot message while
8107 // NOUP was being set on the mon and our boot request was
8108 // dropped, and then later it is cleared. it imperfectly
8109 // handles the case where our original boot message was not
8110 // dropped and we restart even though we might have booted, but
8111 // that is harmless (boot will just take slightly longer).
8116 osdmap
= std::move(newmap
);
8120 service
.retrieve_epochs(&boot_epoch
, &up_epoch
, NULL
);
8122 osdmap
->is_up(whoami
) &&
8123 osdmap
->get_addrs(whoami
) == client_messenger
->get_myaddrs()) {
8124 up_epoch
= osdmap
->get_epoch();
8125 dout(10) << "up_epoch is " << up_epoch
<< dendl
;
8127 boot_epoch
= osdmap
->get_epoch();
8128 dout(10) << "boot_epoch is " << boot_epoch
<< dendl
;
8130 service
.set_epochs(&boot_epoch
, &up_epoch
, NULL
);
8134 epoch_t _bind_epoch
= service
.get_bind_epoch();
8135 if (osdmap
->is_up(whoami
) &&
8136 osdmap
->get_addrs(whoami
).legacy_equals(
8137 client_messenger
->get_myaddrs()) &&
8138 _bind_epoch
< osdmap
->get_up_from(whoami
)) {
8141 dout(1) << "state: booting -> active" << dendl
;
8142 set_state(STATE_ACTIVE
);
8145 // set incarnation so that osd_reqid_t's we generate for our
8146 // objecter requests are unique across restarts.
8147 service
.objecter
->set_client_incarnation(osdmap
->get_epoch());
8148 cancel_pending_failures();
8152 if (osdmap
->get_epoch() > 0 &&
8154 if (!osdmap
->exists(whoami
)) {
8155 derr
<< "map says i do not exist. shutting down." << dendl
;
8156 do_shutdown
= true; // don't call shutdown() while we have
8157 // everything paused
8158 } else if (osdmap
->is_stop(whoami
)) {
8159 derr
<< "map says i am stopped by admin. shutting down." << dendl
;
8161 } else if (!osdmap
->is_up(whoami
) ||
8162 !osdmap
->get_addrs(whoami
).legacy_equals(
8163 client_messenger
->get_myaddrs()) ||
8164 !osdmap
->get_cluster_addrs(whoami
).legacy_equals(
8165 cluster_messenger
->get_myaddrs()) ||
8166 !osdmap
->get_hb_back_addrs(whoami
).legacy_equals(
8167 hb_back_server_messenger
->get_myaddrs()) ||
8168 !osdmap
->get_hb_front_addrs(whoami
).legacy_equals(
8169 hb_front_server_messenger
->get_myaddrs())) {
8170 if (!osdmap
->is_up(whoami
)) {
8171 if (service
.is_preparing_to_stop() || service
.is_stopping()) {
8172 service
.got_stop_ack();
8174 clog
->warn() << "Monitor daemon marked osd." << whoami
<< " down, "
8175 "but it is still running";
8176 clog
->debug() << "map e" << osdmap
->get_epoch()
8177 << " wrongly marked me down at e"
8178 << osdmap
->get_down_at(whoami
);
8180 if (monc
->monmap
.min_mon_release
>= ceph_release_t::octopus
) {
8181 // note that this is best-effort...
8182 monc
->send_mon_message(
8186 osdmap
->get_epoch()));
8188 } else if (!osdmap
->get_addrs(whoami
).legacy_equals(
8189 client_messenger
->get_myaddrs())) {
8190 clog
->error() << "map e" << osdmap
->get_epoch()
8191 << " had wrong client addr (" << osdmap
->get_addrs(whoami
)
8192 << " != my " << client_messenger
->get_myaddrs() << ")";
8193 } else if (!osdmap
->get_cluster_addrs(whoami
).legacy_equals(
8194 cluster_messenger
->get_myaddrs())) {
8195 clog
->error() << "map e" << osdmap
->get_epoch()
8196 << " had wrong cluster addr ("
8197 << osdmap
->get_cluster_addrs(whoami
)
8198 << " != my " << cluster_messenger
->get_myaddrs() << ")";
8199 } else if (!osdmap
->get_hb_back_addrs(whoami
).legacy_equals(
8200 hb_back_server_messenger
->get_myaddrs())) {
8201 clog
->error() << "map e" << osdmap
->get_epoch()
8202 << " had wrong heartbeat back addr ("
8203 << osdmap
->get_hb_back_addrs(whoami
)
8204 << " != my " << hb_back_server_messenger
->get_myaddrs()
8206 } else if (!osdmap
->get_hb_front_addrs(whoami
).legacy_equals(
8207 hb_front_server_messenger
->get_myaddrs())) {
8208 clog
->error() << "map e" << osdmap
->get_epoch()
8209 << " had wrong heartbeat front addr ("
8210 << osdmap
->get_hb_front_addrs(whoami
)
8211 << " != my " << hb_front_server_messenger
->get_myaddrs()
8215 if (!service
.is_stopping()) {
8216 epoch_t up_epoch
= 0;
8217 epoch_t bind_epoch
= osdmap
->get_epoch();
8218 service
.set_epochs(NULL
,&up_epoch
, &bind_epoch
);
8222 utime_t now
= ceph_clock_now();
8223 utime_t grace
= utime_t(cct
->_conf
->osd_max_markdown_period
, 0);
8224 osd_markdown_log
.push_back(now
);
8225 if ((int)osd_markdown_log
.size() > cct
->_conf
->osd_max_markdown_count
) {
8226 derr
<< __func__
<< " marked down "
8227 << osd_markdown_log
.size()
8228 << " > osd_max_markdown_count "
8229 << cct
->_conf
->osd_max_markdown_count
8230 << " in last " << grace
<< " seconds, shutting down"
8236 start_waiting_for_healthy();
8238 set
<int> avoid_ports
;
8239 #if defined(__FreeBSD__)
8240 // prevent FreeBSD from grabbing the client_messenger port during
8241 // rebinding. In which case a cluster_meesneger will connect also
8243 client_messenger
->get_myaddrs().get_ports(&avoid_ports
);
8245 cluster_messenger
->get_myaddrs().get_ports(&avoid_ports
);
8247 int r
= cluster_messenger
->rebind(avoid_ports
);
8249 do_shutdown
= true; // FIXME: do_restart?
8250 network_error
= true;
8251 derr
<< __func__
<< " marked down:"
8252 << " rebind cluster_messenger failed" << dendl
;
8255 hb_back_server_messenger
->mark_down_all();
8256 hb_front_server_messenger
->mark_down_all();
8257 hb_front_client_messenger
->mark_down_all();
8258 hb_back_client_messenger
->mark_down_all();
8260 reset_heartbeat_peers(true);
8267 check_osdmap_features();
8272 if (is_active() || is_waiting_for_healthy())
8273 maybe_update_heartbeat_peers();
8280 if (network_error
) {
8281 cancel_pending_failures();
8283 // trigger shutdown in a different thread
8284 dout(0) << __func__
<< " shutdown OSD via async signal" << dendl
;
8285 queue_async_signal(SIGINT
);
8287 else if (m
->newest_map
&& m
->newest_map
> last
) {
8288 dout(10) << " msg say newest map is " << m
->newest_map
8289 << ", requesting more" << dendl
;
8290 osdmap_subscribe(osdmap
->get_epoch()+1, false);
8292 else if (is_preboot()) {
8293 if (m
->get_source().is_mon())
8294 _preboot(m
->oldest_map
, m
->newest_map
);
8298 else if (do_restart
)
8303 void OSD::check_osdmap_features()
8305 // adjust required feature bits?
8307 // we have to be a bit careful here, because we are accessing the
8308 // Policy structures without taking any lock. in particular, only
8309 // modify integer values that can safely be read by a racing CPU.
8310 // since we are only accessing existing Policy structures a their
8311 // current memory location, and setting or clearing bits in integer
8312 // fields, and we are the only writer, this is not a problem.
8314 const auto osdmap
= get_osdmap();
8316 Messenger::Policy p
= client_messenger
->get_default_policy();
8318 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_CLIENT
, &mask
);
8319 if ((p
.features_required
& mask
) != features
) {
8320 dout(0) << "crush map has features " << features
8321 << ", adjusting msgr requires for clients" << dendl
;
8322 p
.features_required
= (p
.features_required
& ~mask
) | features
;
8323 client_messenger
->set_default_policy(p
);
8327 Messenger::Policy p
= client_messenger
->get_policy(entity_name_t::TYPE_MON
);
8329 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_MON
, &mask
);
8330 if ((p
.features_required
& mask
) != features
) {
8331 dout(0) << "crush map has features " << features
8332 << " was " << p
.features_required
8333 << ", adjusting msgr requires for mons" << dendl
;
8334 p
.features_required
= (p
.features_required
& ~mask
) | features
;
8335 client_messenger
->set_policy(entity_name_t::TYPE_MON
, p
);
8339 Messenger::Policy p
= cluster_messenger
->get_policy(entity_name_t::TYPE_OSD
);
8341 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_OSD
, &mask
);
8343 if ((p
.features_required
& mask
) != features
) {
8344 dout(0) << "crush map has features " << features
8345 << ", adjusting msgr requires for osds" << dendl
;
8346 p
.features_required
= (p
.features_required
& ~mask
) | features
;
8347 cluster_messenger
->set_policy(entity_name_t::TYPE_OSD
, p
);
8350 if (!superblock
.compat_features
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
)) {
8351 dout(0) << __func__
<< " enabling on-disk ERASURE CODES compat feature" << dendl
;
8352 superblock
.compat_features
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
);
8353 ObjectStore::Transaction t
;
8354 write_superblock(t
);
8355 int err
= store
->queue_transaction(service
.meta_ch
, std::move(t
), NULL
);
8356 ceph_assert(err
== 0);
8360 if (osdmap
->require_osd_release
< ceph_release_t::nautilus
) {
8361 hb_front_server_messenger
->set_require_authorizer(false);
8362 hb_back_server_messenger
->set_require_authorizer(false);
8364 hb_front_server_messenger
->set_require_authorizer(true);
8365 hb_back_server_messenger
->set_require_authorizer(true);
8368 if (osdmap
->require_osd_release
!= last_require_osd_release
) {
8369 dout(1) << __func__
<< " require_osd_release " << last_require_osd_release
8370 << " -> " << to_string(osdmap
->require_osd_release
) << dendl
;
8371 store
->write_meta("require_osd_release",
8372 stringify((int)osdmap
->require_osd_release
));
8373 last_require_osd_release
= osdmap
->require_osd_release
;
8377 struct C_FinishSplits
: public Context
{
8380 C_FinishSplits(OSD
*osd
, const set
<PGRef
> &in
)
8381 : osd(osd
), pgs(in
) {}
8382 void finish(int r
) override
{
8383 osd
->_finish_splits(pgs
);
8387 void OSD::_finish_splits(set
<PGRef
>& pgs
)
8389 dout(10) << __func__
<< " " << pgs
<< dendl
;
8392 for (set
<PGRef
>::iterator i
= pgs
.begin();
8397 PeeringCtx rctx
= create_context();
8399 dout(10) << __func__
<< " " << *pg
<< dendl
;
8400 epoch_t e
= pg
->get_osdmap_epoch();
8401 pg
->handle_initialize(rctx
);
8402 pg
->queue_null(e
, e
);
8403 dispatch_context(rctx
, pg
, service
.get_osdmap());
8406 unsigned shard_index
= pg
->pg_id
.hash_to_shard(num_shards
);
8407 shards
[shard_index
]->register_and_wake_split_child(pg
);
8411 bool OSD::add_merge_waiter(OSDMapRef nextmap
, spg_t target
, PGRef src
,
8414 std::lock_guard
l(merge_lock
);
8415 auto& p
= merge_waiters
[nextmap
->get_epoch()][target
];
8416 p
[src
->pg_id
] = src
;
8417 dout(10) << __func__
<< " added merge_waiter " << src
->pg_id
8418 << " for " << target
<< ", have " << p
.size() << "/" << need
8420 return p
.size() == need
;
8423 bool OSD::advance_pg(
8426 ThreadPool::TPHandle
&handle
,
8429 if (osd_epoch
<= pg
->get_osdmap_epoch()) {
8432 ceph_assert(pg
->is_locked());
8433 OSDMapRef lastmap
= pg
->get_osdmap();
8434 ceph_assert(lastmap
->get_epoch() < osd_epoch
);
8435 set
<PGRef
> new_pgs
; // any split children
8438 unsigned old_pg_num
= lastmap
->have_pg_pool(pg
->pg_id
.pool()) ?
8439 lastmap
->get_pg_num(pg
->pg_id
.pool()) : 0;
8440 for (epoch_t next_epoch
= pg
->get_osdmap_epoch() + 1;
8441 next_epoch
<= osd_epoch
;
8443 OSDMapRef nextmap
= service
.try_get_map(next_epoch
);
8445 dout(20) << __func__
<< " missing map " << next_epoch
<< dendl
;
8449 unsigned new_pg_num
=
8450 (old_pg_num
&& nextmap
->have_pg_pool(pg
->pg_id
.pool())) ?
8451 nextmap
->get_pg_num(pg
->pg_id
.pool()) : 0;
8452 if (old_pg_num
&& new_pg_num
&& old_pg_num
!= new_pg_num
) {
8454 if (nextmap
->have_pg_pool(pg
->pg_id
.pool())) {
8456 if (pg
->pg_id
.is_merge_source(
8460 // we are merge source
8461 PGRef spg
= pg
; // carry a ref
8462 dout(1) << __func__
<< " " << pg
->pg_id
8463 << " is merge source, target is " << parent
8465 pg
->write_if_dirty(rctx
);
8466 if (!new_pgs
.empty()) {
8467 rctx
.transaction
.register_on_applied(new C_FinishSplits(this,
8471 dispatch_context(rctx
, pg
, pg
->get_osdmap(), &handle
);
8473 // release backoffs explicitly, since the on_shutdown path
8474 // aggressively tears down backoff state.
8475 if (pg
->is_primary()) {
8476 pg
->release_pg_backoffs();
8479 OSDShard
*sdata
= pg
->osd_shard
;
8481 std::lock_guard
l(sdata
->shard_lock
);
8483 sdata
->_detach_pg(pg
->pg_slot
);
8484 // update pg count now since we might not get an osdmap
8486 if (pg
->is_primary())
8487 logger
->dec(l_osd_pg_primary
);
8488 else if (pg
->is_nonprimary())
8489 logger
->dec(l_osd_pg_replica
); // misnomer
8491 logger
->dec(l_osd_pg_stray
);
8496 set
<spg_t
> children
;
8497 parent
.is_split(new_pg_num
, old_pg_num
, &children
);
8498 if (add_merge_waiter(nextmap
, parent
, pg
, children
.size())) {
8499 enqueue_peering_evt(
8502 std::make_shared
<PGPeeringEvent
>(
8503 nextmap
->get_epoch(),
8504 nextmap
->get_epoch(),
8509 } else if (pg
->pg_id
.is_merge_target(old_pg_num
, new_pg_num
)) {
8510 // we are merge target
8511 set
<spg_t
> children
;
8512 pg
->pg_id
.is_split(new_pg_num
, old_pg_num
, &children
);
8513 dout(20) << __func__
<< " " << pg
->pg_id
8514 << " is merge target, sources are " << children
8516 map
<spg_t
,PGRef
> sources
;
8518 std::lock_guard
l(merge_lock
);
8519 auto& s
= merge_waiters
[nextmap
->get_epoch()][pg
->pg_id
];
8520 unsigned need
= children
.size();
8521 dout(20) << __func__
<< " have " << s
.size() << "/"
8523 if (s
.size() == need
) {
8525 merge_waiters
[nextmap
->get_epoch()].erase(pg
->pg_id
);
8526 if (merge_waiters
[nextmap
->get_epoch()].empty()) {
8527 merge_waiters
.erase(nextmap
->get_epoch());
8531 if (!sources
.empty()) {
8532 unsigned new_pg_num
= nextmap
->get_pg_num(pg
->pg_id
.pool());
8533 unsigned split_bits
= pg
->pg_id
.get_split_bits(new_pg_num
);
8534 dout(1) << __func__
<< " merging " << pg
->pg_id
<< dendl
;
8536 sources
, rctx
, split_bits
,
8537 nextmap
->get_pg_pool(
8538 pg
->pg_id
.pool())->last_pg_merge_meta
);
8539 pg
->pg_slot
->waiting_for_merge_epoch
= 0;
8541 dout(20) << __func__
<< " not ready to merge yet" << dendl
;
8542 pg
->write_if_dirty(rctx
);
8543 if (!new_pgs
.empty()) {
8544 rctx
.transaction
.register_on_applied(new C_FinishSplits(this,
8548 dispatch_context(rctx
, pg
, pg
->get_osdmap(), &handle
);
8550 // kick source(s) to get them ready
8551 for (auto& i
: children
) {
8552 dout(20) << __func__
<< " kicking source " << i
<< dendl
;
8553 enqueue_peering_evt(
8556 std::make_shared
<PGPeeringEvent
>(
8557 nextmap
->get_epoch(),
8558 nextmap
->get_epoch(),
8568 vector
<int> newup
, newacting
;
8569 int up_primary
, acting_primary
;
8570 nextmap
->pg_to_up_acting_osds(
8572 &newup
, &up_primary
,
8573 &newacting
, &acting_primary
);
8574 pg
->handle_advance_map(
8575 nextmap
, lastmap
, newup
, up_primary
,
8576 newacting
, acting_primary
, rctx
);
8578 auto oldpool
= lastmap
->get_pools().find(pg
->pg_id
.pool());
8579 auto newpool
= nextmap
->get_pools().find(pg
->pg_id
.pool());
8580 if (oldpool
!= lastmap
->get_pools().end()
8581 && newpool
!= nextmap
->get_pools().end()) {
8582 dout(20) << __func__
8583 << " new pool opts " << newpool
->second
.opts
8584 << " old pool opts " << oldpool
->second
.opts
8587 double old_min_interval
= 0, new_min_interval
= 0;
8588 oldpool
->second
.opts
.get(pool_opts_t::SCRUB_MIN_INTERVAL
, &old_min_interval
);
8589 newpool
->second
.opts
.get(pool_opts_t::SCRUB_MIN_INTERVAL
, &new_min_interval
);
8591 double old_max_interval
= 0, new_max_interval
= 0;
8592 oldpool
->second
.opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &old_max_interval
);
8593 newpool
->second
.opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &new_max_interval
);
8595 // Assume if an interval is change from set to unset or vice versa the actual config
8596 // is different. Keep it simple even if it is possible to call resched_all_scrub()
8598 if (old_min_interval
!= new_min_interval
|| old_max_interval
!= new_max_interval
) {
8599 pg
->on_info_history_change();
8603 if (new_pg_num
&& old_pg_num
!= new_pg_num
) {
8605 set
<spg_t
> children
;
8606 if (pg
->pg_id
.is_split(
8611 pg
, children
, &new_pgs
, lastmap
, nextmap
,
8617 old_pg_num
= new_pg_num
;
8618 handle
.reset_tp_timeout();
8620 pg
->handle_activate_map(rctx
);
8624 if (!new_pgs
.empty()) {
8625 rctx
.transaction
.register_on_applied(new C_FinishSplits(this, new_pgs
));
8630 void OSD::consume_map()
8632 ceph_assert(ceph_mutex_is_locked(osd_lock
));
8633 auto osdmap
= get_osdmap();
8634 dout(7) << "consume_map version " << osdmap
->get_epoch() << dendl
;
8636 /** make sure the cluster is speaking in SORTBITWISE, because we don't
8637 * speak the older sorting version any more. Be careful not to force
8638 * a shutdown if we are merely processing old maps, though.
8640 if (!osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
) && is_active()) {
8641 derr
<< __func__
<< " SORTBITWISE flag is not set" << dendl
;
8645 service
.pre_publish_map(osdmap
);
8646 service
.await_reserved_maps();
8647 service
.publish_map(osdmap
);
8649 // prime splits and merges
8650 set
<pair
<spg_t
,epoch_t
>> newly_split
; // splits, and when
8651 set
<pair
<spg_t
,epoch_t
>> merge_pgs
; // merge participants, and when
8652 for (auto& shard
: shards
) {
8653 shard
->identify_splits_and_merges(osdmap
, &newly_split
, &merge_pgs
);
8655 if (!newly_split
.empty()) {
8656 for (auto& shard
: shards
) {
8657 shard
->prime_splits(osdmap
, &newly_split
);
8659 ceph_assert(newly_split
.empty());
8662 // prune sent_ready_to_merge
8663 service
.prune_sent_ready_to_merge(osdmap
);
8665 // FIXME, maybe: We could race against an incoming peering message
8666 // that instantiates a merge PG after identify_merges() below and
8667 // never set up its peer to complete the merge. An OSD restart
8668 // would clear it up. This is a hard race to resolve,
8669 // extraordinarily rare (we only merge PGs that are stable and
8670 // clean, so it'd have to be an imported PG to an OSD with a
8671 // slightly stale OSDMap...), so I'm ignoring it for now. We plan to
8672 // replace all of this with a seastar-based code soon anyway.
8673 if (!merge_pgs
.empty()) {
8674 // mark the pgs we already have, or create new and empty merge
8675 // participants for those we are missing. do this all under the
8676 // shard lock so we don't have to worry about racing pg creates
8678 for (auto& shard
: shards
) {
8679 shard
->prime_merges(osdmap
, &merge_pgs
);
8681 ceph_assert(merge_pgs
.empty());
8684 service
.prune_pg_created();
8686 unsigned pushes_to_free
= 0;
8687 for (auto& shard
: shards
) {
8688 shard
->consume_map(osdmap
, &pushes_to_free
);
8691 vector
<spg_t
> pgids
;
8694 // count (FIXME, probably during seastar rewrite)
8695 int num_pg_primary
= 0, num_pg_replica
= 0, num_pg_stray
= 0;
8698 for (auto& pg
: pgs
) {
8699 // FIXME (probably during seastar rewrite): this is lockless and
8700 // racy, but we don't want to take pg lock here.
8701 if (pg
->is_primary())
8703 else if (pg
->is_nonprimary())
8704 num_pg_replica
++; // misnomer
8710 // FIXME (as part of seastar rewrite): move to OSDShard
8711 std::lock_guard
l(pending_creates_lock
);
8712 for (auto pg
= pending_creates_from_osd
.begin();
8713 pg
!= pending_creates_from_osd
.end();) {
8714 if (osdmap
->get_pg_acting_role(pg
->first
, whoami
) < 0) {
8715 dout(10) << __func__
<< " pg " << pg
->first
<< " doesn't map here, "
8716 << "discarding pending_create_from_osd" << dendl
;
8717 pg
= pending_creates_from_osd
.erase(pg
);
8724 service
.maybe_inject_dispatch_delay();
8726 dispatch_sessions_waiting_on_map();
8728 service
.maybe_inject_dispatch_delay();
8730 service
.release_reserved_pushes(pushes_to_free
);
8732 // queue null events to push maps down to individual PGs
8733 for (auto pgid
: pgids
) {
8734 enqueue_peering_evt(
8737 std::make_shared
<PGPeeringEvent
>(
8738 osdmap
->get_epoch(),
8739 osdmap
->get_epoch(),
8742 logger
->set(l_osd_pg
, pgids
.size());
8743 logger
->set(l_osd_pg_primary
, num_pg_primary
);
8744 logger
->set(l_osd_pg_replica
, num_pg_replica
);
8745 logger
->set(l_osd_pg_stray
, num_pg_stray
);
8748 void OSD::activate_map()
8750 ceph_assert(ceph_mutex_is_locked(osd_lock
));
8751 auto osdmap
= get_osdmap();
8753 dout(7) << "activate_map version " << osdmap
->get_epoch() << dendl
;
8756 if (osdmap
->test_flag(CEPH_OSDMAP_NORECOVER
)) {
8757 if (!service
.recovery_is_paused()) {
8758 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl
;
8759 service
.pause_recovery();
8762 if (service
.recovery_is_paused()) {
8763 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl
;
8764 service
.unpause_recovery();
8768 service
.activate_map();
8771 take_waiters(waiting_for_osdmap
);
8774 bool OSD::require_mon_peer(const Message
*m
)
8776 if (!m
->get_connection()->peer_is_mon()) {
8777 dout(0) << "require_mon_peer received from non-mon "
8778 << m
->get_connection()->get_peer_addr()
8779 << " " << *m
<< dendl
;
8785 bool OSD::require_mon_or_mgr_peer(const Message
*m
)
8787 if (!m
->get_connection()->peer_is_mon() &&
8788 !m
->get_connection()->peer_is_mgr()) {
8789 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8790 << m
->get_connection()->get_peer_addr()
8791 << " " << *m
<< dendl
;
8797 bool OSD::require_osd_peer(const Message
*m
)
8799 if (!m
->get_connection()->peer_is_osd()) {
8800 dout(0) << "require_osd_peer received from non-osd "
8801 << m
->get_connection()->get_peer_addr()
8802 << " " << *m
<< dendl
;
8808 bool OSD::require_self_aliveness(const Message
*m
, epoch_t epoch
)
8810 epoch_t up_epoch
= service
.get_up_epoch();
8811 if (epoch
< up_epoch
) {
8812 dout(7) << "from pre-up epoch " << epoch
<< " < " << up_epoch
<< dendl
;
8817 dout(7) << "still in boot state, dropping message " << *m
<< dendl
;
8824 bool OSD::require_same_peer_instance(const Message
*m
, const OSDMapRef
& map
,
8825 bool is_fast_dispatch
)
8827 int from
= m
->get_source().num();
8829 if (map
->is_down(from
) ||
8830 (map
->get_cluster_addrs(from
) != m
->get_source_addrs())) {
8831 dout(5) << "from dead osd." << from
<< ", marking down, "
8832 << " msg was " << m
->get_source_inst().addr
8834 << (map
->is_up(from
) ?
8835 map
->get_cluster_addrs(from
) : entity_addrvec_t())
8837 ConnectionRef con
= m
->get_connection();
8839 if (auto s
= ceph::ref_cast
<Session
>(con
->get_priv()); s
) {
8840 if (!is_fast_dispatch
)
8841 s
->session_dispatch_lock
.lock();
8842 clear_session_waiting_on_map(s
);
8843 con
->set_priv(nullptr); // break ref <-> session cycle, if any
8845 if (!is_fast_dispatch
)
8846 s
->session_dispatch_lock
.unlock();
8855 * require that we have same (or newer) map, and that
8856 * the source is the pg primary.
8858 bool OSD::require_same_or_newer_map(OpRequestRef
& op
, epoch_t epoch
,
8859 bool is_fast_dispatch
)
8861 const Message
*m
= op
->get_req();
8862 const auto osdmap
= get_osdmap();
8863 dout(15) << "require_same_or_newer_map " << epoch
8864 << " (i am " << osdmap
->get_epoch() << ") " << m
<< dendl
;
8866 ceph_assert(ceph_mutex_is_locked(osd_lock
));
8868 // do they have a newer map?
8869 if (epoch
> osdmap
->get_epoch()) {
8870 dout(7) << "waiting for newer map epoch " << epoch
8871 << " > my " << osdmap
->get_epoch() << " with " << m
<< dendl
;
8872 wait_for_new_map(op
);
8876 if (!require_self_aliveness(op
->get_req(), epoch
)) {
8880 // ok, our map is same or newer.. do they still exist?
8881 if (m
->get_connection()->get_messenger() == cluster_messenger
&&
8882 !require_same_peer_instance(op
->get_req(), osdmap
, is_fast_dispatch
)) {
8893 // ----------------------------------------
8896 void OSD::split_pgs(
8898 const set
<spg_t
> &childpgids
, set
<PGRef
> *out_pgs
,
8903 unsigned pg_num
= nextmap
->get_pg_num(parent
->pg_id
.pool());
8904 parent
->update_snap_mapper_bits(parent
->get_pgid().get_split_bits(pg_num
));
8906 vector
<object_stat_sum_t
> updated_stats
;
8907 parent
->start_split_stats(childpgids
, &updated_stats
);
8909 vector
<object_stat_sum_t
>::iterator stat_iter
= updated_stats
.begin();
8910 for (set
<spg_t
>::const_iterator i
= childpgids
.begin();
8911 i
!= childpgids
.end();
8913 ceph_assert(stat_iter
!= updated_stats
.end());
8914 dout(10) << __func__
<< " splitting " << *parent
<< " into " << *i
<< dendl
;
8915 PG
* child
= _make_pg(nextmap
, *i
);
8917 out_pgs
->insert(child
);
8918 child
->ch
= store
->create_new_collection(child
->coll
);
8921 uint32_t shard_index
= i
->hash_to_shard(shards
.size());
8922 assert(NULL
!= shards
[shard_index
]);
8923 store
->set_collection_commit_queue(child
->coll
, &(shards
[shard_index
]->context_queue
));
8926 unsigned split_bits
= i
->get_split_bits(pg_num
);
8927 dout(10) << " pg_num is " << pg_num
8928 << ", m_seed " << i
->ps()
8929 << ", split_bits is " << split_bits
<< dendl
;
8930 parent
->split_colls(
8934 &child
->get_pool().info
,
8941 child
->init_collection_pool_opts();
8943 child
->finish_split_stats(*stat_iter
, rctx
.transaction
);
8946 ceph_assert(stat_iter
!= updated_stats
.end());
8947 parent
->finish_split_stats(*stat_iter
, rctx
.transaction
);
8953 void OSD::handle_pg_create(OpRequestRef op
)
8955 // NOTE: this can be removed in P release (mimic is the last version to
8956 // send MOSDPGCreate messages).
8958 auto m
= op
->get_req
<MOSDPGCreate
>();
8959 ceph_assert(m
->get_type() == MSG_OSD_PG_CREATE
);
8961 dout(10) << "handle_pg_create " << *m
<< dendl
;
8963 if (!require_mon_peer(op
->get_req())) {
8967 if (!require_same_or_newer_map(op
, m
->epoch
, false))
8972 const auto osdmap
= get_osdmap();
8973 map
<pg_t
,utime_t
>::const_iterator ci
= m
->ctimes
.begin();
8974 for (map
<pg_t
,pg_create_t
>::const_iterator p
= m
->mkpg
.begin();
8977 ceph_assert(ci
!= m
->ctimes
.end() && ci
->first
== p
->first
);
8978 epoch_t created
= p
->second
.created
;
8979 if (p
->second
.split_bits
) // Skip split pgs
8983 if (!osdmap
->have_pg_pool(on
.pool())) {
8984 dout(20) << "ignoring pg on deleted pool " << on
<< dendl
;
8988 dout(20) << "mkpg " << on
<< " e" << created
<< "@" << ci
->second
<< dendl
;
8991 bool mapped
= osdmap
->get_primary_shard(on
, &pgid
);
8992 ceph_assert(mapped
);
8994 // is it still ours?
8995 vector
<int> up
, acting
;
8996 int up_primary
= -1;
8997 int acting_primary
= -1;
8998 osdmap
->pg_to_up_acting_osds(on
, &up
, &up_primary
, &acting
, &acting_primary
);
8999 int role
= osdmap
->calc_pg_role(pg_shard_t(whoami
, pgid
.shard
), acting
);
9001 if (acting_primary
!= whoami
) {
9002 dout(10) << "mkpg " << on
<< " not acting_primary (" << acting_primary
9003 << "), my role=" << role
<< ", skipping" << dendl
;
9009 pg_history_t history
;
9010 build_initial_pg_history(pgid
, created
, ci
->second
, &history
, &pi
);
9012 // The mon won't resend unless the primary changed, so we ignore
9013 // same_interval_since. We'll pass this history with the current
9014 // epoch as the event.
9015 if (history
.same_primary_since
> m
->epoch
) {
9016 dout(10) << __func__
<< ": got obsolete pg create on pgid "
9017 << pgid
<< " from epoch " << m
->epoch
9018 << ", primary changed in " << history
.same_primary_since
9022 enqueue_peering_evt(
9025 std::make_shared
<PGPeeringEvent
>(
9026 osdmap
->get_epoch(),
9027 osdmap
->get_epoch(),
9032 osdmap
->get_epoch(),
9040 std::lock_guard
l(pending_creates_lock
);
9041 if (pending_creates_from_mon
== 0) {
9042 last_pg_create_epoch
= m
->epoch
;
9046 maybe_update_heartbeat_peers();
9050 // ----------------------------------------
9051 // peering and recovery
9053 PeeringCtx
OSD::create_context()
9055 return PeeringCtx(get_osdmap()->require_osd_release
);
9058 void OSD::dispatch_context(PeeringCtx
&ctx
, PG
*pg
, OSDMapRef curmap
,
9059 ThreadPool::TPHandle
*handle
)
9061 if (!service
.get_osdmap()->is_up(whoami
)) {
9062 dout(20) << __func__
<< " not up in osdmap" << dendl
;
9063 } else if (!is_active()) {
9064 dout(20) << __func__
<< " not active" << dendl
;
9066 for (auto& [osd
, ls
] : ctx
.message_map
) {
9067 if (!curmap
->is_up(osd
)) {
9068 dout(20) << __func__
<< " skipping down osd." << osd
<< dendl
;
9071 ConnectionRef con
= service
.get_con_osd_cluster(
9072 osd
, curmap
->get_epoch());
9074 dout(20) << __func__
<< " skipping osd." << osd
<< " (NULL con)"
9078 service
.maybe_share_map(con
.get(), curmap
);
9080 con
->send_message2(m
);
9085 if ((!ctx
.transaction
.empty() || ctx
.transaction
.has_contexts()) && pg
) {
9086 int tr
= store
->queue_transaction(
9088 std::move(ctx
.transaction
), TrackedOpRef(),
9090 ceph_assert(tr
== 0);
9094 void OSD::handle_fast_pg_create(MOSDPGCreate2
*m
)
9096 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9097 if (!require_mon_peer(m
)) {
9101 for (auto& p
: m
->pgs
) {
9102 spg_t pgid
= p
.first
;
9103 epoch_t created
= p
.second
.first
;
9104 utime_t created_stamp
= p
.second
.second
;
9105 auto q
= m
->pg_extra
.find(pgid
);
9106 if (q
== m
->pg_extra
.end()) {
9107 dout(20) << __func__
<< " " << pgid
<< " e" << created
9108 << "@" << created_stamp
9109 << " (no history or past_intervals)" << dendl
;
9110 // pre-octopus ... no pg history. this can be removed in Q release.
9111 enqueue_peering_evt(
9114 std::make_shared
<PGPeeringEvent
>(
9122 pg_history_t(created
, created_stamp
),
9127 dout(20) << __func__
<< " " << pgid
<< " e" << created
9128 << "@" << created_stamp
9129 << " history " << q
->second
.first
9130 << " pi " << q
->second
.second
<< dendl
;
9131 if (!q
->second
.second
.empty() &&
9132 m
->epoch
< q
->second
.second
.get_bounds().second
) {
9133 clog
->error() << "got pg_create on " << pgid
<< " epoch " << m
->epoch
9134 << " and unmatched past_intervals " << q
->second
.second
9135 << " (history " << q
->second
.first
<< ")";
9137 enqueue_peering_evt(
9140 std::make_shared
<PGPeeringEvent
>(
9157 std::lock_guard
l(pending_creates_lock
);
9158 if (pending_creates_from_mon
== 0) {
9159 last_pg_create_epoch
= m
->epoch
;
9166 void OSD::handle_fast_pg_query(MOSDPGQuery
*m
)
9168 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9169 if (!require_osd_peer(m
)) {
9173 int from
= m
->get_source().num();
9174 for (auto& p
: m
->pg_list
) {
9175 enqueue_peering_evt(
9178 std::make_shared
<PGPeeringEvent
>(
9179 p
.second
.epoch_sent
, p
.second
.epoch_sent
,
9182 pg_shard_t(from
, p
.second
.from
),
9184 p
.second
.epoch_sent
),
9191 void OSD::handle_fast_pg_notify(MOSDPGNotify
* m
)
9193 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9194 if (!require_osd_peer(m
)) {
9198 int from
= m
->get_source().num();
9199 for (auto& p
: m
->get_pg_list()) {
9200 spg_t
pgid(p
.info
.pgid
.pgid
, p
.to
);
9201 enqueue_peering_evt(
9204 std::make_shared
<PGPeeringEvent
>(
9208 pgid
, pg_shard_t(from
, p
.from
),
9210 m
->get_connection()->get_features()),
9223 void OSD::handle_fast_pg_info(MOSDPGInfo
* m
)
9225 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9226 if (!require_osd_peer(m
)) {
9230 int from
= m
->get_source().num();
9231 for (auto& p
: m
->pg_list
) {
9232 enqueue_peering_evt(
9233 spg_t(p
.info
.pgid
.pgid
, p
.to
),
9235 std::make_shared
<PGPeeringEvent
>(
9236 p
.epoch_sent
, p
.query_epoch
,
9238 pg_shard_t(from
, p
.from
),
9246 void OSD::handle_fast_pg_remove(MOSDPGRemove
*m
)
9248 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9249 if (!require_osd_peer(m
)) {
9253 for (auto& pgid
: m
->pg_list
) {
9254 enqueue_peering_evt(
9257 std::make_shared
<PGPeeringEvent
>(
9258 m
->get_epoch(), m
->get_epoch(),
9259 PeeringState::DeleteStart())));
9264 void OSD::handle_fast_force_recovery(MOSDForceRecovery
*m
)
9266 dout(10) << __func__
<< " " << *m
<< dendl
;
9267 if (!require_mon_or_mgr_peer(m
)) {
9271 epoch_t epoch
= get_osdmap_epoch();
9272 for (auto pgid
: m
->forced_pgs
) {
9273 if (m
->options
& OFR_BACKFILL
) {
9274 if (m
->options
& OFR_CANCEL
) {
9275 enqueue_peering_evt(
9278 std::make_shared
<PGPeeringEvent
>(
9280 PeeringState::UnsetForceBackfill())));
9282 enqueue_peering_evt(
9285 std::make_shared
<PGPeeringEvent
>(
9287 PeeringState::SetForceBackfill())));
9289 } else if (m
->options
& OFR_RECOVERY
) {
9290 if (m
->options
& OFR_CANCEL
) {
9291 enqueue_peering_evt(
9294 std::make_shared
<PGPeeringEvent
>(
9296 PeeringState::UnsetForceRecovery())));
9298 enqueue_peering_evt(
9301 std::make_shared
<PGPeeringEvent
>(
9303 PeeringState::SetForceRecovery())));
9310 void OSD::handle_pg_query_nopg(const MQuery
& q
)
9312 spg_t pgid
= q
.pgid
;
9313 dout(10) << __func__
<< " " << pgid
<< dendl
;
9315 OSDMapRef osdmap
= get_osdmap();
9316 if (!osdmap
->have_pg_pool(pgid
.pool()))
9319 dout(10) << " pg " << pgid
<< " dne" << dendl
;
9320 pg_info_t
empty(spg_t(pgid
.pgid
, q
.query
.to
));
9321 ConnectionRef con
= service
.get_con_osd_cluster(q
.from
.osd
, osdmap
->get_epoch());
9324 if (q
.query
.type
== pg_query_t::LOG
||
9325 q
.query
.type
== pg_query_t::FULLLOG
) {
9327 q
.query
.from
, q
.query
.to
,
9328 osdmap
->get_epoch(), empty
,
9329 q
.query
.epoch_sent
);
9331 vector
<pg_notify_t
> ls
;
9334 q
.query
.from
, q
.query
.to
,
9336 osdmap
->get_epoch(),
9339 m
= new MOSDPGNotify(osdmap
->get_epoch(), std::move(ls
));
9341 service
.maybe_share_map(con
.get(), osdmap
);
9342 con
->send_message(m
);
9346 void OSDService::queue_check_readable(spg_t spgid
,
9348 ceph::signedspan delay
)
9350 if (delay
== ceph::signedspan::zero()) {
9351 osd
->enqueue_peering_evt(
9354 std::make_shared
<PGPeeringEvent
>(
9356 PeeringState::CheckReadable())));
9358 mono_timer
.add_event(
9360 [this, spgid
, lpr
]() {
9361 queue_check_readable(spgid
, lpr
);
9367 // =========================================================
9370 void OSDService::_maybe_queue_recovery() {
9371 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock
));
9372 uint64_t available_pushes
;
9373 while (!awaiting_throttle
.empty() &&
9374 _recover_now(&available_pushes
)) {
9375 uint64_t to_start
= std::min(
9377 cct
->_conf
->osd_recovery_max_single_start
);
9378 _queue_for_recovery(awaiting_throttle
.front(), to_start
);
9379 awaiting_throttle
.pop_front();
9380 dout(10) << __func__
<< " starting " << to_start
9381 << ", recovery_ops_reserved " << recovery_ops_reserved
9382 << " -> " << (recovery_ops_reserved
+ to_start
) << dendl
;
9383 recovery_ops_reserved
+= to_start
;
9387 bool OSDService::_recover_now(uint64_t *available_pushes
)
9389 if (available_pushes
)
9390 *available_pushes
= 0;
9392 if (ceph_clock_now() < defer_recovery_until
) {
9393 dout(15) << __func__
<< " defer until " << defer_recovery_until
<< dendl
;
9397 if (recovery_paused
) {
9398 dout(15) << __func__
<< " paused" << dendl
;
9402 uint64_t max
= osd
->get_recovery_max_active();
9403 if (max
<= recovery_ops_active
+ recovery_ops_reserved
) {
9404 dout(15) << __func__
<< " active " << recovery_ops_active
9405 << " + reserved " << recovery_ops_reserved
9406 << " >= max " << max
<< dendl
;
9410 if (available_pushes
)
9411 *available_pushes
= max
- recovery_ops_active
- recovery_ops_reserved
;
9416 unsigned OSDService::get_target_pg_log_entries() const
9418 auto num_pgs
= osd
->get_num_pgs();
9419 auto target
= cct
->_conf
->osd_target_pg_log_entries_per_osd
;
9420 if (num_pgs
> 0 && target
> 0) {
9421 // target an even spread of our budgeted log entries across all
9422 // PGs. note that while we only get to control the entry count
9423 // for primary PGs, we'll normally be responsible for a mix of
9424 // primary and replica PGs (for the same pool(s) even), so this
9426 return std::max
<unsigned>(
9427 std::min
<unsigned>(target
/ num_pgs
,
9428 cct
->_conf
->osd_max_pg_log_entries
),
9429 cct
->_conf
->osd_min_pg_log_entries
);
9431 // fall back to a per-pg value.
9432 return cct
->_conf
->osd_min_pg_log_entries
;
9436 void OSD::do_recovery(
9437 PG
*pg
, epoch_t queued
, uint64_t reserved_pushes
,
9438 ThreadPool::TPHandle
&handle
)
9440 uint64_t started
= 0;
9443 * When the value of osd_recovery_sleep is set greater than zero, recovery
9444 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9445 * recovery event's schedule time. This is done by adding a
9446 * recovery_requeue_callback event, which re-queues the recovery op using
9447 * queue_recovery_after_sleep.
9449 float recovery_sleep
= get_osd_recovery_sleep();
9451 std::lock_guard
l(service
.sleep_lock
);
9452 if (recovery_sleep
> 0 && service
.recovery_needs_sleep
) {
9454 auto recovery_requeue_callback
= new LambdaContext([this, pgref
, queued
, reserved_pushes
](int r
) {
9455 dout(20) << "do_recovery wake up at "
9457 << ", re-queuing recovery" << dendl
;
9458 std::lock_guard
l(service
.sleep_lock
);
9459 service
.recovery_needs_sleep
= false;
9460 service
.queue_recovery_after_sleep(pgref
.get(), queued
, reserved_pushes
);
9463 // This is true for the first recovery op and when the previous recovery op
9464 // has been scheduled in the past. The next recovery op is scheduled after
9465 // completing the sleep from now.
9467 if (auto now
= ceph::real_clock::now();
9468 service
.recovery_schedule_time
< now
) {
9469 service
.recovery_schedule_time
= now
;
9471 service
.recovery_schedule_time
+= ceph::make_timespan(recovery_sleep
);
9472 service
.sleep_timer
.add_event_at(service
.recovery_schedule_time
,
9473 recovery_requeue_callback
);
9474 dout(20) << "Recovery event scheduled at "
9475 << service
.recovery_schedule_time
<< dendl
;
9482 std::lock_guard
l(service
.sleep_lock
);
9483 service
.recovery_needs_sleep
= true;
9486 if (pg
->pg_has_reset_since(queued
)) {
9490 dout(10) << "do_recovery starting " << reserved_pushes
<< " " << *pg
<< dendl
;
9491 #ifdef DEBUG_RECOVERY_OIDS
9492 dout(20) << " active was " << service
.recovery_oids
[pg
->pg_id
] << dendl
;
9495 bool do_unfound
= pg
->start_recovery_ops(reserved_pushes
, handle
, &started
);
9496 dout(10) << "do_recovery started " << started
<< "/" << reserved_pushes
9497 << " on " << *pg
<< dendl
;
9500 PeeringCtx rctx
= create_context();
9501 rctx
.handle
= &handle
;
9502 pg
->find_unfound(queued
, rctx
);
9503 dispatch_context(rctx
, pg
, pg
->get_osdmap());
9508 ceph_assert(started
<= reserved_pushes
);
9509 service
.release_reserved_pushes(reserved_pushes
);
9512 void OSDService::start_recovery_op(PG
*pg
, const hobject_t
& soid
)
9514 std::lock_guard
l(recovery_lock
);
9515 dout(10) << "start_recovery_op " << *pg
<< " " << soid
9516 << " (" << recovery_ops_active
<< "/"
9517 << osd
->get_recovery_max_active() << " rops)"
9519 recovery_ops_active
++;
9521 #ifdef DEBUG_RECOVERY_OIDS
9522 dout(20) << " active was " << recovery_oids
[pg
->pg_id
] << dendl
;
9523 ceph_assert(recovery_oids
[pg
->pg_id
].count(soid
) == 0);
9524 recovery_oids
[pg
->pg_id
].insert(soid
);
9528 void OSDService::finish_recovery_op(PG
*pg
, const hobject_t
& soid
, bool dequeue
)
9530 std::lock_guard
l(recovery_lock
);
9531 dout(10) << "finish_recovery_op " << *pg
<< " " << soid
9532 << " dequeue=" << dequeue
9533 << " (" << recovery_ops_active
<< "/"
9534 << osd
->get_recovery_max_active() << " rops)"
9538 ceph_assert(recovery_ops_active
> 0);
9539 recovery_ops_active
--;
9541 #ifdef DEBUG_RECOVERY_OIDS
9542 dout(20) << " active oids was " << recovery_oids
[pg
->pg_id
] << dendl
;
9543 ceph_assert(recovery_oids
[pg
->pg_id
].count(soid
));
9544 recovery_oids
[pg
->pg_id
].erase(soid
);
9547 _maybe_queue_recovery();
9550 bool OSDService::is_recovery_active()
9552 if (cct
->_conf
->osd_debug_pretend_recovery_active
) {
9555 return local_reserver
.has_reservation() || remote_reserver
.has_reservation();
9558 void OSDService::release_reserved_pushes(uint64_t pushes
)
9560 std::lock_guard
l(recovery_lock
);
9561 dout(10) << __func__
<< "(" << pushes
<< "), recovery_ops_reserved "
9562 << recovery_ops_reserved
<< " -> " << (recovery_ops_reserved
-pushes
)
9564 ceph_assert(recovery_ops_reserved
>= pushes
);
9565 recovery_ops_reserved
-= pushes
;
9566 _maybe_queue_recovery();
9569 // =========================================================
9572 bool OSD::op_is_discardable(const MOSDOp
*op
)
9574 // drop client request if they are not connected and can't get the
9576 if (!op
->get_connection()->is_connected()) {
9582 void OSD::enqueue_op(spg_t pg
, OpRequestRef
&& op
, epoch_t epoch
)
9584 const utime_t stamp
= op
->get_req()->get_recv_stamp();
9585 const utime_t latency
= ceph_clock_now() - stamp
;
9586 const unsigned priority
= op
->get_req()->get_priority();
9587 const int cost
= op
->get_req()->get_cost();
9588 const uint64_t owner
= op
->get_req()->get_source().num();
9590 dout(15) << "enqueue_op " << op
<< " prio " << priority
9592 << " latency " << latency
9593 << " epoch " << epoch
9594 << " " << *(op
->get_req()) << dendl
;
9595 op
->osd_trace
.event("enqueue op");
9596 op
->osd_trace
.keyval("priority", priority
);
9597 op
->osd_trace
.keyval("cost", cost
);
9598 op
->mark_queued_for_pg();
9599 logger
->tinc(l_osd_op_before_queue_op_lat
, latency
);
9602 unique_ptr
<OpSchedulerItem::OpQueueable
>(new PGOpItem(pg
, std::move(op
))),
9603 cost
, priority
, stamp
, owner
, epoch
));
9606 void OSD::enqueue_peering_evt(spg_t pgid
, PGPeeringEventRef evt
)
9608 dout(15) << __func__
<< " " << pgid
<< " " << evt
->get_desc() << dendl
;
9611 unique_ptr
<OpSchedulerItem::OpQueueable
>(new PGPeeringItem(pgid
, evt
)),
9613 cct
->_conf
->osd_peering_op_priority
,
9616 evt
->get_epoch_sent()));
9620 * NOTE: dequeue called in worker thread, with pg lock
9622 void OSD::dequeue_op(
9623 PGRef pg
, OpRequestRef op
,
9624 ThreadPool::TPHandle
&handle
)
9626 const Message
*m
= op
->get_req();
9629 OID_EVENT_TRACE_WITH_MSG(m
, "DEQUEUE_OP_BEGIN", false);
9631 utime_t now
= ceph_clock_now();
9632 op
->set_dequeued_time(now
);
9634 utime_t latency
= now
- m
->get_recv_stamp();
9635 dout(10) << "dequeue_op " << op
<< " prio " << m
->get_priority()
9636 << " cost " << m
->get_cost()
9637 << " latency " << latency
9639 << " pg " << *pg
<< dendl
;
9641 logger
->tinc(l_osd_op_before_dequeue_op_lat
, latency
);
9643 service
.maybe_share_map(m
->get_connection().get(),
9647 if (pg
->is_deleting())
9650 op
->mark_reached_pg();
9651 op
->osd_trace
.event("dequeue_op");
9653 pg
->do_request(op
, handle
);
9656 dout(10) << "dequeue_op " << op
<< " finish" << dendl
;
9657 OID_EVENT_TRACE_WITH_MSG(m
, "DEQUEUE_OP_END", false);
9661 void OSD::dequeue_peering_evt(
9664 PGPeeringEventRef evt
,
9665 ThreadPool::TPHandle
& handle
)
9667 PeeringCtx rctx
= create_context();
9668 auto curmap
= sdata
->get_osdmap();
9669 bool need_up_thru
= false;
9670 epoch_t same_interval_since
= 0;
9672 if (const MQuery
*q
= dynamic_cast<const MQuery
*>(evt
->evt
.get())) {
9673 handle_pg_query_nopg(*q
);
9675 derr
<< __func__
<< " unrecognized pg-less event " << evt
->get_desc() << dendl
;
9678 } else if (advance_pg(curmap
->get_epoch(), pg
, handle
, rctx
)) {
9679 pg
->do_peering_event(evt
, rctx
);
9680 if (pg
->is_deleted()) {
9684 dispatch_context(rctx
, pg
, curmap
, &handle
);
9685 need_up_thru
= pg
->get_need_up_thru();
9686 same_interval_since
= pg
->get_same_interval_since();
9691 queue_want_up_thru(same_interval_since
);
9694 service
.send_pg_temp();
9697 void OSD::dequeue_delete(
9701 ThreadPool::TPHandle
& handle
)
9703 dequeue_peering_evt(
9707 std::make_shared
<PGPeeringEvent
>(
9709 PeeringState::DeleteSome())),
9715 // --------------------------------
9717 const char** OSD::get_tracked_conf_keys() const
9719 static const char* KEYS
[] = {
9720 "osd_max_backfills",
9721 "osd_min_recovery_priority",
9722 "osd_max_trimming_pgs",
9723 "osd_op_complaint_time",
9724 "osd_op_log_threshold",
9725 "osd_op_history_size",
9726 "osd_op_history_duration",
9727 "osd_op_history_slow_op_size",
9728 "osd_op_history_slow_op_threshold",
9729 "osd_enable_op_tracker",
9730 "osd_map_cache_size",
9731 "osd_pg_epoch_max_lag_factor",
9732 "osd_pg_epoch_persisted_max_stale",
9733 // clog & admin clog
9736 "clog_to_syslog_facility",
9737 "clog_to_syslog_level",
9738 "osd_objectstore_fuse",
9740 "clog_to_graylog_host",
9741 "clog_to_graylog_port",
9744 "osd_recovery_delay_start",
9745 "osd_client_message_size_cap",
9746 "osd_client_message_cap",
9747 "osd_heartbeat_min_size",
9748 "osd_heartbeat_interval",
9749 "osd_object_clean_region_max_num_intervals",
9750 "osd_scrub_min_interval",
9751 "osd_scrub_max_interval",
9757 void OSD::handle_conf_change(const ConfigProxy
& conf
,
9758 const std::set
<std::string
> &changed
)
9760 std::lock_guard l
{osd_lock
};
9761 if (changed
.count("osd_max_backfills")) {
9762 service
.local_reserver
.set_max(cct
->_conf
->osd_max_backfills
);
9763 service
.remote_reserver
.set_max(cct
->_conf
->osd_max_backfills
);
9765 if (changed
.count("osd_min_recovery_priority")) {
9766 service
.local_reserver
.set_min_priority(cct
->_conf
->osd_min_recovery_priority
);
9767 service
.remote_reserver
.set_min_priority(cct
->_conf
->osd_min_recovery_priority
);
9769 if (changed
.count("osd_max_trimming_pgs")) {
9770 service
.snap_reserver
.set_max(cct
->_conf
->osd_max_trimming_pgs
);
9772 if (changed
.count("osd_op_complaint_time") ||
9773 changed
.count("osd_op_log_threshold")) {
9774 op_tracker
.set_complaint_and_threshold(cct
->_conf
->osd_op_complaint_time
,
9775 cct
->_conf
->osd_op_log_threshold
);
9777 if (changed
.count("osd_op_history_size") ||
9778 changed
.count("osd_op_history_duration")) {
9779 op_tracker
.set_history_size_and_duration(cct
->_conf
->osd_op_history_size
,
9780 cct
->_conf
->osd_op_history_duration
);
9782 if (changed
.count("osd_op_history_slow_op_size") ||
9783 changed
.count("osd_op_history_slow_op_threshold")) {
9784 op_tracker
.set_history_slow_op_size_and_threshold(cct
->_conf
->osd_op_history_slow_op_size
,
9785 cct
->_conf
->osd_op_history_slow_op_threshold
);
9787 if (changed
.count("osd_enable_op_tracker")) {
9788 op_tracker
.set_tracking(cct
->_conf
->osd_enable_op_tracker
);
9790 if (changed
.count("osd_map_cache_size")) {
9791 service
.map_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
9792 service
.map_bl_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
9793 service
.map_bl_inc_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
9795 if (changed
.count("clog_to_monitors") ||
9796 changed
.count("clog_to_syslog") ||
9797 changed
.count("clog_to_syslog_level") ||
9798 changed
.count("clog_to_syslog_facility") ||
9799 changed
.count("clog_to_graylog") ||
9800 changed
.count("clog_to_graylog_host") ||
9801 changed
.count("clog_to_graylog_port") ||
9802 changed
.count("host") ||
9803 changed
.count("fsid")) {
9804 update_log_config();
9806 if (changed
.count("osd_pg_epoch_max_lag_factor")) {
9807 m_osd_pg_epoch_max_lag_factor
= conf
.get_val
<double>(
9808 "osd_pg_epoch_max_lag_factor");
9812 if (changed
.count("osd_objectstore_fuse")) {
9814 enable_disable_fuse(false);
9819 if (changed
.count("osd_recovery_delay_start")) {
9820 service
.defer_recovery(cct
->_conf
->osd_recovery_delay_start
);
9821 service
.kick_recovery_queue();
9824 if (changed
.count("osd_client_message_cap")) {
9825 uint64_t newval
= cct
->_conf
->osd_client_message_cap
;
9826 Messenger::Policy pol
= client_messenger
->get_policy(entity_name_t::TYPE_CLIENT
);
9827 if (pol
.throttler_messages
&& newval
> 0) {
9828 pol
.throttler_messages
->reset_max(newval
);
9831 if (changed
.count("osd_client_message_size_cap")) {
9832 uint64_t newval
= cct
->_conf
->osd_client_message_size_cap
;
9833 Messenger::Policy pol
= client_messenger
->get_policy(entity_name_t::TYPE_CLIENT
);
9834 if (pol
.throttler_bytes
&& newval
> 0) {
9835 pol
.throttler_bytes
->reset_max(newval
);
9838 if (changed
.count("osd_object_clean_region_max_num_intervals")) {
9839 ObjectCleanRegions::set_max_num_intervals(cct
->_conf
->osd_object_clean_region_max_num_intervals
);
9842 if (changed
.count("osd_scrub_min_interval") ||
9843 changed
.count("osd_scrub_max_interval")) {
9844 resched_all_scrubs();
9845 dout(0) << __func__
<< ": scrub interval change" << dendl
;
9850 void OSD::update_log_config()
9852 map
<string
,string
> log_to_monitors
;
9853 map
<string
,string
> log_to_syslog
;
9854 map
<string
,string
> log_channel
;
9855 map
<string
,string
> log_prio
;
9856 map
<string
,string
> log_to_graylog
;
9857 map
<string
,string
> log_to_graylog_host
;
9858 map
<string
,string
> log_to_graylog_port
;
9862 if (parse_log_client_options(cct
, log_to_monitors
, log_to_syslog
,
9863 log_channel
, log_prio
, log_to_graylog
,
9864 log_to_graylog_host
, log_to_graylog_port
,
9866 clog
->update_config(log_to_monitors
, log_to_syslog
,
9867 log_channel
, log_prio
, log_to_graylog
,
9868 log_to_graylog_host
, log_to_graylog_port
,
9870 derr
<< "log_to_monitors " << log_to_monitors
<< dendl
;
9873 void OSD::check_config()
9875 // some sanity checks
9876 if (cct
->_conf
->osd_map_cache_size
<= (int)cct
->_conf
->osd_pg_epoch_persisted_max_stale
+ 2) {
9877 clog
->warn() << "osd_map_cache_size (" << cct
->_conf
->osd_map_cache_size
<< ")"
9878 << " is not > osd_pg_epoch_persisted_max_stale ("
9879 << cct
->_conf
->osd_pg_epoch_persisted_max_stale
<< ")";
9881 if (cct
->_conf
->osd_object_clean_region_max_num_intervals
< 0) {
9882 clog
->warn() << "osd_object_clean_region_max_num_intervals ("
9883 << cct
->_conf
->osd_object_clean_region_max_num_intervals
9888 // --------------------------------
9890 void OSD::get_latest_osdmap()
9892 dout(10) << __func__
<< " -- start" << dendl
;
9895 service
.objecter
->wait_for_latest_osdmap(&cond
);
9898 dout(10) << __func__
<< " -- finish" << dendl
;
9901 // --------------------------------
9903 void OSD::set_perf_queries(const ConfigPayload
&config_payload
) {
9904 const OSDConfigPayload
&osd_config_payload
= boost::get
<OSDConfigPayload
>(config_payload
);
9905 const std::map
<OSDPerfMetricQuery
, OSDPerfMetricLimits
> &queries
= osd_config_payload
.config
;
9906 dout(10) << "setting " << queries
.size() << " queries" << dendl
;
9908 std::list
<OSDPerfMetricQuery
> supported_queries
;
9909 for (auto &it
: queries
) {
9910 auto &query
= it
.first
;
9911 if (!query
.key_descriptor
.empty()) {
9912 supported_queries
.push_back(query
);
9915 if (supported_queries
.size() < queries
.size()) {
9916 dout(1) << queries
.size() - supported_queries
.size()
9917 << " unsupported queries" << dendl
;
9920 std::lock_guard locker
{m_perf_queries_lock
};
9921 m_perf_queries
= supported_queries
;
9922 m_perf_limits
= queries
;
9924 std::vector
<PGRef
> pgs
;
9926 for (auto& pg
: pgs
) {
9927 std::scoped_lock l
{*pg
};
9928 pg
->set_dynamic_perf_stats_queries(supported_queries
);
9932 MetricPayload
OSD::get_perf_reports() {
9933 OSDMetricPayload payload
;
9934 std::map
<OSDPerfMetricQuery
, OSDPerfMetricReport
> &reports
= payload
.report
;
9936 std::vector
<PGRef
> pgs
;
9938 DynamicPerfStats dps
;
9939 for (auto& pg
: pgs
) {
9940 // m_perf_queries can be modified only in set_perf_queries by mgr client
9941 // request, and it is protected by by mgr client's lock, which is held
9942 // when set_perf_queries/get_perf_reports are called, so we may not hold
9943 // m_perf_queries_lock here.
9944 DynamicPerfStats
pg_dps(m_perf_queries
);
9946 pg
->get_dynamic_perf_stats(&pg_dps
);
9950 dps
.add_to_reports(m_perf_limits
, &reports
);
9951 dout(20) << "reports for " << reports
.size() << " queries" << dendl
;
9956 // =============================================================
9959 #define dout_context cct
9961 #define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
9963 void OSDShard::_attach_pg(OSDShardPGSlot
*slot
, PG
*pg
)
9965 dout(10) << pg
->pg_id
<< " " << pg
<< dendl
;
9967 pg
->osd_shard
= this;
9971 slot
->epoch
= pg
->get_osdmap_epoch();
9972 pg_slots_by_epoch
.insert(*slot
);
9975 void OSDShard::_detach_pg(OSDShardPGSlot
*slot
)
9977 dout(10) << slot
->pg
->pg_id
<< " " << slot
->pg
<< dendl
;
9978 slot
->pg
->osd_shard
= nullptr;
9979 slot
->pg
->pg_slot
= nullptr;
9983 pg_slots_by_epoch
.erase(pg_slots_by_epoch
.iterator_to(*slot
));
9985 if (waiting_for_min_pg_epoch
) {
9986 min_pg_epoch_cond
.notify_all();
9990 void OSDShard::update_pg_epoch(OSDShardPGSlot
*slot
, epoch_t e
)
9992 std::lock_guard
l(shard_lock
);
9993 dout(30) << "min was " << pg_slots_by_epoch
.begin()->epoch
9994 << " on " << pg_slots_by_epoch
.begin()->pg
->pg_id
<< dendl
;
9995 pg_slots_by_epoch
.erase(pg_slots_by_epoch
.iterator_to(*slot
));
9996 dout(20) << slot
->pg
->pg_id
<< " " << slot
->epoch
<< " -> " << e
<< dendl
;
9998 pg_slots_by_epoch
.insert(*slot
);
9999 dout(30) << "min is now " << pg_slots_by_epoch
.begin()->epoch
10000 << " on " << pg_slots_by_epoch
.begin()->pg
->pg_id
<< dendl
;
10001 if (waiting_for_min_pg_epoch
) {
10002 min_pg_epoch_cond
.notify_all();
10006 epoch_t
OSDShard::get_min_pg_epoch()
10008 std::lock_guard
l(shard_lock
);
10009 auto p
= pg_slots_by_epoch
.begin();
10010 if (p
== pg_slots_by_epoch
.end()) {
10016 void OSDShard::wait_min_pg_epoch(epoch_t need
)
10018 std::unique_lock l
{shard_lock
};
10019 ++waiting_for_min_pg_epoch
;
10020 min_pg_epoch_cond
.wait(l
, [need
, this] {
10021 if (pg_slots_by_epoch
.empty()) {
10023 } else if (pg_slots_by_epoch
.begin()->epoch
>= need
) {
10026 dout(10) << need
<< " waiting on "
10027 << pg_slots_by_epoch
.begin()->epoch
<< dendl
;
10031 --waiting_for_min_pg_epoch
;
10034 epoch_t
OSDShard::get_max_waiting_epoch()
10036 std::lock_guard
l(shard_lock
);
10038 for (auto& i
: pg_slots
) {
10039 if (!i
.second
->waiting_peering
.empty()) {
10040 r
= std::max(r
, i
.second
->waiting_peering
.rbegin()->first
);
10046 void OSDShard::consume_map(
10047 const OSDMapRef
& new_osdmap
,
10048 unsigned *pushes_to_free
)
10050 std::lock_guard
l(shard_lock
);
10051 OSDMapRef old_osdmap
;
10053 std::lock_guard
l(osdmap_lock
);
10054 old_osdmap
= std::move(shard_osdmap
);
10055 shard_osdmap
= new_osdmap
;
10057 dout(10) << new_osdmap
->get_epoch()
10058 << " (was " << (old_osdmap
? old_osdmap
->get_epoch() : 0) << ")"
10060 bool queued
= false;
10063 auto p
= pg_slots
.begin();
10064 while (p
!= pg_slots
.end()) {
10065 OSDShardPGSlot
*slot
= p
->second
.get();
10066 const spg_t
& pgid
= p
->first
;
10067 dout(20) << __func__
<< " " << pgid
<< dendl
;
10068 if (!slot
->waiting_for_split
.empty()) {
10069 dout(20) << __func__
<< " " << pgid
10070 << " waiting for split " << slot
->waiting_for_split
<< dendl
;
10074 if (slot
->waiting_for_merge_epoch
> new_osdmap
->get_epoch()) {
10075 dout(20) << __func__
<< " " << pgid
10076 << " waiting for merge by epoch " << slot
->waiting_for_merge_epoch
10081 if (!slot
->waiting_peering
.empty()) {
10082 epoch_t first
= slot
->waiting_peering
.begin()->first
;
10083 if (first
<= new_osdmap
->get_epoch()) {
10084 dout(20) << __func__
<< " " << pgid
10085 << " pending_peering first epoch " << first
10086 << " <= " << new_osdmap
->get_epoch() << ", requeueing" << dendl
;
10087 _wake_pg_slot(pgid
, slot
);
10093 if (!slot
->waiting
.empty()) {
10094 if (new_osdmap
->is_up_acting_osd_shard(pgid
, osd
->get_nodeid())) {
10095 dout(20) << __func__
<< " " << pgid
<< " maps to us, keeping"
10100 while (!slot
->waiting
.empty() &&
10101 slot
->waiting
.front().get_map_epoch() <= new_osdmap
->get_epoch()) {
10102 auto& qi
= slot
->waiting
.front();
10103 dout(20) << __func__
<< " " << pgid
10104 << " waiting item " << qi
10105 << " epoch " << qi
.get_map_epoch()
10106 << " <= " << new_osdmap
->get_epoch()
10108 << (qi
.get_map_epoch() < new_osdmap
->get_epoch() ? "stale" :
10110 << ", dropping" << dendl
;
10111 *pushes_to_free
+= qi
.get_reserved_pushes();
10112 slot
->waiting
.pop_front();
10115 if (slot
->waiting
.empty() &&
10116 slot
->num_running
== 0 &&
10117 slot
->waiting_for_split
.empty() &&
10119 dout(20) << __func__
<< " " << pgid
<< " empty, pruning" << dendl
;
10120 p
= pg_slots
.erase(p
);
10127 std::lock_guard l
{sdata_wait_lock
};
10128 sdata_cond
.notify_one();
10132 void OSDShard::_wake_pg_slot(
10134 OSDShardPGSlot
*slot
)
10136 dout(20) << __func__
<< " " << pgid
10137 << " to_process " << slot
->to_process
10138 << " waiting " << slot
->waiting
10139 << " waiting_peering " << slot
->waiting_peering
<< dendl
;
10140 for (auto i
= slot
->to_process
.rbegin();
10141 i
!= slot
->to_process
.rend();
10143 scheduler
->enqueue_front(std::move(*i
));
10145 slot
->to_process
.clear();
10146 for (auto i
= slot
->waiting
.rbegin();
10147 i
!= slot
->waiting
.rend();
10149 scheduler
->enqueue_front(std::move(*i
));
10151 slot
->waiting
.clear();
10152 for (auto i
= slot
->waiting_peering
.rbegin();
10153 i
!= slot
->waiting_peering
.rend();
10155 // this is overkill; we requeue everything, even if some of these
10156 // items are waiting for maps we don't have yet. FIXME, maybe,
10157 // someday, if we decide this inefficiency matters
10158 for (auto j
= i
->second
.rbegin(); j
!= i
->second
.rend(); ++j
) {
10159 scheduler
->enqueue_front(std::move(*j
));
10162 slot
->waiting_peering
.clear();
10163 ++slot
->requeue_seq
;
10166 void OSDShard::identify_splits_and_merges(
10167 const OSDMapRef
& as_of_osdmap
,
10168 set
<pair
<spg_t
,epoch_t
>> *split_pgs
,
10169 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
10171 std::lock_guard
l(shard_lock
);
10172 if (shard_osdmap
) {
10173 for (auto& i
: pg_slots
) {
10174 const spg_t
& pgid
= i
.first
;
10175 auto *slot
= i
.second
.get();
10177 osd
->service
.identify_splits_and_merges(
10178 shard_osdmap
, as_of_osdmap
, pgid
,
10179 split_pgs
, merge_pgs
);
10180 } else if (!slot
->waiting_for_split
.empty()) {
10181 osd
->service
.identify_splits_and_merges(
10182 shard_osdmap
, as_of_osdmap
, pgid
,
10183 split_pgs
, nullptr);
10185 dout(20) << __func__
<< " slot " << pgid
10186 << " has no pg and waiting_for_split " << dendl
;
10192 void OSDShard::prime_splits(const OSDMapRef
& as_of_osdmap
,
10193 set
<pair
<spg_t
,epoch_t
>> *pgids
)
10195 std::lock_guard
l(shard_lock
);
10196 _prime_splits(pgids
);
10197 if (shard_osdmap
->get_epoch() > as_of_osdmap
->get_epoch()) {
10198 set
<pair
<spg_t
,epoch_t
>> newer_children
;
10199 for (auto i
: *pgids
) {
10200 osd
->service
.identify_splits_and_merges(
10201 as_of_osdmap
, shard_osdmap
, i
.first
,
10202 &newer_children
, nullptr);
10204 newer_children
.insert(pgids
->begin(), pgids
->end());
10205 dout(10) << "as_of_osdmap " << as_of_osdmap
->get_epoch() << " < shard "
10206 << shard_osdmap
->get_epoch() << ", new children " << newer_children
10208 _prime_splits(&newer_children
);
10209 // note: we don't care what is left over here for other shards.
10210 // if this shard is ahead of us and one isn't, e.g., one thread is
10211 // calling into prime_splits via _process (due to a newly created
10212 // pg) and this shard has a newer map due to a racing consume_map,
10213 // then any grandchildren left here will be identified (or were
10214 // identified) when the slower shard's osdmap is advanced.
10215 // _prime_splits() will tolerate the case where the pgid is
10220 void OSDShard::_prime_splits(set
<pair
<spg_t
,epoch_t
>> *pgids
)
10222 dout(10) << *pgids
<< dendl
;
10223 auto p
= pgids
->begin();
10224 while (p
!= pgids
->end()) {
10225 unsigned shard_index
= p
->first
.hash_to_shard(osd
->num_shards
);
10226 if (shard_index
== shard_id
) {
10227 auto r
= pg_slots
.emplace(p
->first
, nullptr);
10229 dout(10) << "priming slot " << p
->first
<< " e" << p
->second
<< dendl
;
10230 r
.first
->second
= make_unique
<OSDShardPGSlot
>();
10231 r
.first
->second
->waiting_for_split
.insert(p
->second
);
10234 ceph_assert(q
!= pg_slots
.end());
10235 dout(10) << "priming (existing) slot " << p
->first
<< " e" << p
->second
10237 q
->second
->waiting_for_split
.insert(p
->second
);
10239 p
= pgids
->erase(p
);
10246 void OSDShard::prime_merges(const OSDMapRef
& as_of_osdmap
,
10247 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
10249 std::lock_guard
l(shard_lock
);
10250 dout(20) << __func__
<< " checking shard " << shard_id
10251 << " for remaining merge pgs " << merge_pgs
<< dendl
;
10252 auto p
= merge_pgs
->begin();
10253 while (p
!= merge_pgs
->end()) {
10254 spg_t pgid
= p
->first
;
10255 epoch_t epoch
= p
->second
;
10256 unsigned shard_index
= pgid
.hash_to_shard(osd
->num_shards
);
10257 if (shard_index
!= shard_id
) {
10261 OSDShardPGSlot
*slot
;
10262 auto r
= pg_slots
.emplace(pgid
, nullptr);
10264 r
.first
->second
= make_unique
<OSDShardPGSlot
>();
10266 slot
= r
.first
->second
.get();
10269 dout(20) << __func__
<< " have merge participant pg " << pgid
10270 << " " << slot
->pg
<< dendl
;
10271 } else if (!slot
->waiting_for_split
.empty() &&
10272 *slot
->waiting_for_split
.begin() < epoch
) {
10273 dout(20) << __func__
<< " pending split on merge participant pg " << pgid
10274 << " " << slot
->waiting_for_split
<< dendl
;
10276 dout(20) << __func__
<< " creating empty merge participant " << pgid
10277 << " for merge in " << epoch
<< dendl
;
10278 // leave history zeroed; PG::merge_from() will fill it in.
10279 pg_history_t history
;
10280 PGCreateInfo
cinfo(pgid
, epoch
- 1,
10281 history
, PastIntervals(), false);
10282 PGRef pg
= osd
->handle_pg_create_info(shard_osdmap
, &cinfo
);
10283 _attach_pg(r
.first
->second
.get(), pg
.get());
10284 _wake_pg_slot(pgid
, slot
);
10287 // mark slot for merge
10288 dout(20) << __func__
<< " marking merge participant " << pgid
<< dendl
;
10289 slot
->waiting_for_merge_epoch
= epoch
;
10290 p
= merge_pgs
->erase(p
);
10294 void OSDShard::register_and_wake_split_child(PG
*pg
)
10298 std::lock_guard
l(shard_lock
);
10299 dout(10) << pg
->pg_id
<< " " << pg
<< dendl
;
10300 auto p
= pg_slots
.find(pg
->pg_id
);
10301 ceph_assert(p
!= pg_slots
.end());
10302 auto *slot
= p
->second
.get();
10303 dout(20) << pg
->pg_id
<< " waiting_for_split " << slot
->waiting_for_split
10305 ceph_assert(!slot
->pg
);
10306 ceph_assert(!slot
->waiting_for_split
.empty());
10307 _attach_pg(slot
, pg
);
10309 epoch
= pg
->get_osdmap_epoch();
10310 ceph_assert(slot
->waiting_for_split
.count(epoch
));
10311 slot
->waiting_for_split
.erase(epoch
);
10312 if (slot
->waiting_for_split
.empty()) {
10313 _wake_pg_slot(pg
->pg_id
, slot
);
10315 dout(10) << __func__
<< " still waiting for split on "
10316 << slot
->waiting_for_split
<< dendl
;
10320 // kick child to ensure it pulls up to the latest osdmap
10321 osd
->enqueue_peering_evt(
10324 std::make_shared
<PGPeeringEvent
>(
10329 std::lock_guard l
{sdata_wait_lock
};
10330 sdata_cond
.notify_one();
10333 void OSDShard::unprime_split_children(spg_t parent
, unsigned old_pg_num
)
10335 std::lock_guard
l(shard_lock
);
10336 vector
<spg_t
> to_delete
;
10337 for (auto& i
: pg_slots
) {
10338 if (i
.first
!= parent
&&
10339 i
.first
.get_ancestor(old_pg_num
) == parent
) {
10340 dout(10) << __func__
<< " parent " << parent
<< " clearing " << i
.first
10342 _wake_pg_slot(i
.first
, i
.second
.get());
10343 to_delete
.push_back(i
.first
);
10346 for (auto pgid
: to_delete
) {
10347 pg_slots
.erase(pgid
);
10351 OSDShard::OSDShard(
10358 shard_name(string("OSDShard.") + stringify(id
)),
10359 sdata_wait_lock_name(shard_name
+ "::sdata_wait_lock"),
10360 sdata_wait_lock
{make_mutex(sdata_wait_lock_name
)},
10361 osdmap_lock
{make_mutex(shard_name
+ "::osdmap_lock")},
10362 shard_lock_name(shard_name
+ "::shard_lock"),
10363 shard_lock
{make_mutex(shard_lock_name
)},
10364 scheduler(ceph::osd::scheduler::make_scheduler(cct
)),
10365 context_queue(sdata_wait_lock
, sdata_cond
)
10367 dout(0) << "using op scheduler " << *scheduler
<< dendl
;
10371 // =============================================================
10373 #undef dout_context
10374 #define dout_context osd->cct
10376 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10378 void OSD::ShardedOpWQ::_add_slot_waiter(
10380 OSDShardPGSlot
*slot
,
10381 OpSchedulerItem
&& qi
)
10383 if (qi
.is_peering()) {
10384 dout(20) << __func__
<< " " << pgid
10385 << " peering, item epoch is "
10386 << qi
.get_map_epoch()
10387 << ", will wait on " << qi
<< dendl
;
10388 slot
->waiting_peering
[qi
.get_map_epoch()].push_back(std::move(qi
));
10390 dout(20) << __func__
<< " " << pgid
10391 << " item epoch is "
10392 << qi
.get_map_epoch()
10393 << ", will wait on " << qi
<< dendl
;
10394 slot
->waiting
.push_back(std::move(qi
));
10399 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
// OSD::ShardedOpWQ::_process: per-worker-thread entry point of the sharded
// op work queue.  Maps thread_index onto a shard, waits on the shard's
// condition variable when there is no work, dequeues one OpSchedulerItem,
// resolves (or creates) the PG slot identified by the item's ordering
// token, and runs the item.  Oncommit contexts drained from the shard's
// context_queue are completed via handle_oncommits() on every exit path.
//
// NOTE(review): this copy of the file is a mangled extraction -- upstream
// line numbers are embedded in the text and several physical lines
// (closing braces, return statements, some guards such as the
// `if (r.second)` before the slot allocation) are missing.  Do not edit
// the logic from this copy; recover it from upstream Ceph src/osd/OSD.cc.
10401 void OSD::ShardedOpWQ::_process(uint32_t thread_index
, heartbeat_handle_d
*hb
)
10403 uint32_t shard_index
= thread_index
% osd
->num_shards
;
10404 auto& sdata
= osd
->shards
[shard_index
];
10405 ceph_assert(sdata
);
10407 // If all threads of shards do oncommits, there is a out-of-order
10408 // problem. So we choose the thread which has the smallest
10409 // thread_index(thread_index < num_shards) of shard to do oncommit
10411 bool is_smallest_thread_index
= thread_index
< osd
->num_shards
;
// shard_lock is held around all scheduler and pg_slots accesses below.
10414 sdata
->shard_lock
.lock();
10415 if (sdata
->scheduler
->empty() &&
10416 (!is_smallest_thread_index
|| sdata
->context_queue
.empty())) {
// Queue looks empty: clear the heartbeat timeout while idle and block on
// sdata_cond until _enqueue()/_enqueue_front() notifies us.
10417 std::unique_lock wait_lock
{sdata
->sdata_wait_lock
};
10418 if (is_smallest_thread_index
&& !sdata
->context_queue
.empty()) {
10419 // we raced with a context_queue addition, don't wait
10420 wait_lock
.unlock();
10421 } else if (!sdata
->stop_waiting
) {
10422 dout(20) << __func__
<< " empty q, waiting" << dendl
;
10423 osd
->cct
->get_heartbeat_map()->clear_timeout(hb
);
10424 sdata
->shard_lock
.unlock();
10425 sdata
->sdata_cond
.wait(wait_lock
);
10426 wait_lock
.unlock();
10427 sdata
->shard_lock
.lock();
// Re-check after waking: if still nothing to do, drop the lock.
// NOTE(review): the `return;` and closing brace that upstream has after
// this unlock are not visible in this listing.
10428 if (sdata
->scheduler
->empty() &&
10429 !(is_smallest_thread_index
&& !sdata
->context_queue
.empty())) {
10430 sdata
->shard_lock
.unlock();
10433 // found a work item; reapply default wq timeouts
10434 osd
->cct
->get_heartbeat_map()->reset_timeout(hb
,
10435 timeout_interval
, suicide_interval
);
10437 dout(20) << __func__
<< " need return immediately" << dendl
;
10438 wait_lock
.unlock();
10439 sdata
->shard_lock
.unlock();
// Only the lowest-indexed thread of each shard drains oncommit contexts,
// to keep their completion ordered (see the comment near the top).
10444 list
<Context
*> oncommits
;
10445 if (is_smallest_thread_index
) {
10446 sdata
->context_queue
.move_to(oncommits
);
// Nothing to dequeue: drop the lock and just run any collected oncommits
// (unless the OSD is shutting down, in which case they are discarded).
10449 if (sdata
->scheduler
->empty()) {
10450 if (osd
->is_stopping()) {
10451 sdata
->shard_lock
.unlock();
10452 for (auto c
: oncommits
) {
10453 dout(10) << __func__
<< " discarding in-flight oncommit " << c
<< dendl
;
10456 return; // OSD shutdown, discard.
10458 sdata
->shard_lock
.unlock();
10459 handle_oncommits(oncommits
);
// Dequeue exactly one item from the shard's scheduler.
10463 OpSchedulerItem item
= sdata
->scheduler
->dequeue();
10464 if (osd
->is_stopping()) {
10465 sdata
->shard_lock
.unlock();
10466 for (auto c
: oncommits
) {
10467 dout(10) << __func__
<< " discarding in-flight oncommit " << c
<< dendl
;
10470 return; // OSD shutdown, discard.
// Find or insert the pg_slots entry for this item's ordering token.
// NOTE(review): upstream guards the slot allocation with `if (r.second)`
// (embedded line 10475); that guard line is not visible in this listing.
10473 const auto token
= item
.get_ordering_token();
10474 auto r
= sdata
->pg_slots
.emplace(token
, nullptr);
10476 r
.first
->second
= make_unique
<OSDShardPGSlot
>();
10478 OSDShardPGSlot
*slot
= r
.first
->second
.get();
10479 dout(20) << __func__
<< " " << token
10480 << (r
.second
? " (new)" : "")
10481 << " to_process " << slot
->to_process
10482 << " waiting " << slot
->waiting
10483 << " waiting_peering " << slot
->waiting_peering
// Stage the item on the slot's to_process queue before dropping the lock.
10485 slot
->to_process
.push_back(std::move(item
));
10486 dout(20) << __func__
<< " " << slot
->to_process
.back()
10487 << " queued" << dendl
;
10490 PGRef pg
= slot
->pg
;
10492 // lock pg (if we have it)
// NOTE(review): the pg->lock() call that the comment above refers to is
// not visible in this listing; confirm against upstream.
10494 // note the requeue seq now...
10495 uint64_t requeue_seq
= slot
->requeue_seq
;
10496 ++slot
->num_running
;
// shard_lock is dropped and retaken around the (testing-only) dispatch
// delay injection; the slot must be revalidated afterwards.
10498 sdata
->shard_lock
.unlock();
10499 osd
->service
.maybe_inject_dispatch_delay();
10501 osd
->service
.maybe_inject_dispatch_delay();
10502 sdata
->shard_lock
.lock();
// Revalidate: while the lock was dropped the slot may have been removed,
// emptied (by _wake_pg_slot/consume_map), requeued, or re-attached.
10504 auto q
= sdata
->pg_slots
.find(token
);
10505 if (q
== sdata
->pg_slots
.end()) {
10506 // this can happen if we race with pg removal.
10507 dout(20) << __func__
<< " slot " << token
<< " no longer there" << dendl
;
10509 sdata
->shard_lock
.unlock();
10510 handle_oncommits(oncommits
);
10513 slot
= q
->second
.get();
10514 --slot
->num_running
;
10516 if (slot
->to_process
.empty()) {
10517 // raced with _wake_pg_slot or consume_map
10518 dout(20) << __func__
<< " " << token
10519 << " nothing queued" << dendl
;
10521 sdata
->shard_lock
.unlock();
10522 handle_oncommits(oncommits
);
10525 if (requeue_seq
!= slot
->requeue_seq
) {
10526 dout(20) << __func__
<< " " << token
10527 << " requeue_seq " << slot
->requeue_seq
<< " > our "
10528 << requeue_seq
<< ", we raced with _wake_pg_slot"
10531 sdata
->shard_lock
.unlock();
10532 handle_oncommits(oncommits
);
10535 if (slot
->pg
!= pg
) {
10536 // this can happen if we race with pg removal.
10537 dout(20) << __func__
<< " slot " << token
<< " no longer attached to "
10544 dout(20) << __func__
<< " " << token
10545 << " to_process " << slot
->to_process
10546 << " waiting " << slot
->waiting
10547 << " waiting_peering " << slot
->waiting_peering
<< dendl
;
// tp_handle wraps the heartbeat handle and wq timeouts; it is passed
// into the item's run() below so long-running work can reset timeouts.
10549 ThreadPool::TPHandle
tp_handle(osd
->cct
, hb
, timeout_interval
,
// Take the front item off the slot for execution.
10553 auto qi
= std::move(slot
->to_process
.front());
10554 slot
->to_process
.pop_front();
10555 dout(20) << __func__
<< " " << qi
<< " pg " << pg
<< dendl
;
10556 set
<pair
<spg_t
,epoch_t
>> new_children
;
// No-pg dispatch chain: wait for split/newer map, run pg-less peering
// events, create the pg from a PGCreateInfo, or drop the item.
// NOTE(review): upstream wraps this section in an `if (!pg)` branch; the
// guard line is not visible in this listing -- confirm against upstream.
10560 // should this pg shard exist on this osd in this (or a later) epoch?
10561 osdmap
= sdata
->shard_osdmap
;
10562 const PGCreateInfo
*create_info
= qi
.creates_pg();
10563 if (!slot
->waiting_for_split
.empty()) {
10564 dout(20) << __func__
<< " " << token
10565 << " splitting " << slot
->waiting_for_split
<< dendl
;
10566 _add_slot_waiter(token
, slot
, std::move(qi
));
10567 } else if (qi
.get_map_epoch() > osdmap
->get_epoch()) {
10568 dout(20) << __func__
<< " " << token
10569 << " map " << qi
.get_map_epoch() << " > "
10570 << osdmap
->get_epoch() << dendl
;
10571 _add_slot_waiter(token
, slot
, std::move(qi
));
10572 } else if (qi
.is_peering()) {
10573 if (!qi
.peering_requires_pg()) {
10574 // for pg-less events, we run them under the ordering lock, since
10575 // we don't have the pg lock to keep them ordered.
10576 qi
.run(osd
, sdata
, pg
, tp_handle
);
10577 } else if (osdmap
->is_up_acting_osd_shard(token
, osd
->whoami
)) {
10579 if (create_info
->by_mon
&&
10580 osdmap
->get_pg_acting_primary(token
.pgid
) != osd
->whoami
) {
10581 dout(20) << __func__
<< " " << token
10582 << " no pg, no longer primary, ignoring mon create on "
10585 dout(20) << __func__
<< " " << token
10586 << " no pg, should create on " << qi
<< dendl
;
10587 pg
= osd
->handle_pg_create_info(osdmap
, create_info
);
10589 // we created the pg! drop out and continue "normally"!
10590 sdata
->_attach_pg(slot
, pg
.get());
10591 sdata
->_wake_pg_slot(token
, slot
);
10593 // identify split children between create epoch and shard epoch.
10594 osd
->service
.identify_splits_and_merges(
10595 pg
->get_osdmap(), osdmap
, pg
->pg_id
, &new_children
, nullptr);
10596 sdata
->_prime_splits(&new_children
);
10597 // distribute remaining split children to other shards below!
10600 dout(20) << __func__
<< " ignored create on " << qi
<< dendl
;
10603 dout(20) << __func__
<< " " << token
10604 << " no pg, peering, !create, discarding " << qi
<< dendl
;
10607 dout(20) << __func__
<< " " << token
10608 << " no pg, peering, doesn't map here e" << osdmap
->get_epoch()
10609 << ", discarding " << qi
10612 } else if (osdmap
->is_up_acting_osd_shard(token
, osd
->whoami
)) {
10613 dout(20) << __func__
<< " " << token
10614 << " no pg, should exist e" << osdmap
->get_epoch()
10615 << ", will wait on " << qi
<< dendl
;
10616 _add_slot_waiter(token
, slot
, std::move(qi
));
10618 dout(20) << __func__
<< " " << token
10619 << " no pg, shouldn't exist e" << osdmap
->get_epoch()
10620 << ", dropping " << qi
<< dendl
;
10621 // share map with client?
10622 if (std::optional
<OpRequestRef
> _op
= qi
.maybe_get_op()) {
10623 osd
->service
.maybe_share_map((*_op
)->get_req()->get_connection().get(),
10624 sdata
->shard_osdmap
,
10625 (*_op
)->sent_epoch
);
// When an item is dropped, give back any recovery pushes it reserved.
10627 unsigned pushes_to_free
= qi
.get_reserved_pushes();
10628 if (pushes_to_free
> 0) {
10629 sdata
->shard_lock
.unlock();
10630 osd
->service
.release_reserved_pushes(pushes_to_free
);
10631 handle_oncommits(oncommits
);
10635 sdata
->shard_lock
.unlock();
10636 handle_oncommits(oncommits
);
// We have a pg: a peering event whose map epoch is ahead of the shard's
// osdmap must wait on the slot until the newer map is consumed.
10639 if (qi
.is_peering()) {
10640 OSDMapRef osdmap
= sdata
->shard_osdmap
;
10641 if (qi
.get_map_epoch() > osdmap
->get_epoch()) {
10642 _add_slot_waiter(token
, slot
, std::move(qi
));
10643 sdata
->shard_lock
.unlock();
10645 handle_oncommits(oncommits
);
10649 sdata
->shard_lock
.unlock();
// Distribute split children discovered during pg creation to all shards.
10651 if (!new_children
.empty()) {
10652 for (auto shard
: osd
->shards
) {
10653 shard
->prime_splits(osdmap
, &new_children
);
10655 ceph_assert(new_children
.empty());
10658 // osd_opwq_process marks the point at which an operation has been dequeued
10659 // and will begin to be handled by a worker thread.
// lttng tracepoint bracketing the item's execution: start...
10663 if (std::optional
<OpRequestRef
> _op
= qi
.maybe_get_op()) {
10664 reqid
= (*_op
)->get_reqid();
10667 tracepoint(osd
, opwq_process_start
, reqid
.name
._type
,
10668 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
10671 lgeneric_subdout(osd
->cct
, osd
, 30) << "dequeue status: ";
10672 Formatter
*f
= Formatter::create("json");
10673 f
->open_object_section("q");
10675 f
->close_section();
// Execute the work item (op, peering event, snap trim, scrub, ...).
10680 qi
.run(osd
, sdata
, pg
, tp_handle
);
// ...and finish tracepoint after run() returns.
10685 if (std::optional
<OpRequestRef
> _op
= qi
.maybe_get_op()) {
10686 reqid
= (*_op
)->get_reqid();
10689 tracepoint(osd
, opwq_process_finish
, reqid
.name
._type
,
10690 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
// Finally complete the oncommit contexts collected earlier.
10693 handle_oncommits(oncommits
);
// OSD::ShardedOpWQ::_enqueue: queue a new work item.  The target shard is
// chosen by hashing the item's ordering token, the item is enqueued on
// that shard's scheduler under shard_lock, and one worker blocked in
// _process() is woken via sdata_cond (under sdata_wait_lock).
//
// NOTE(review): mangled listing -- the declaration of `empty` and the
// `if (empty)` guard that upstream places around the notify are not
// visible here; confirm against upstream Ceph src/osd/OSD.cc.
10696 void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem
&& item
) {
10697 uint32_t shard_index
=
10698 item
.get_ordering_token().hash_to_shard(osd
->shards
.size());
10700 dout(20) << __func__
<< " " << item
<< dendl
;
10702 OSDShard
* sdata
= osd
->shards
[shard_index
];
// NOTE(review): sibling functions use ceph_assert(sdata); the plain
// assert(NULL != sdata) here is inconsistent and compiled out under
// NDEBUG -- worth aligning when editing from the real source.
10703 assert (NULL
!= sdata
);
// Record whether the scheduler was empty before inserting, then enqueue,
// both under shard_lock.
10707 std::lock_guard l
{sdata
->shard_lock
};
10708 empty
= sdata
->scheduler
->empty();
10709 sdata
->scheduler
->enqueue(std::move(item
));
// Wake one worker waiting in _process() on sdata_cond.
10713 std::lock_guard l
{sdata
->sdata_wait_lock
};
10714 sdata
->sdata_cond
.notify_one();
// OSD::ShardedOpWQ::_enqueue_front: requeue an item at the FRONT of its
// shard's scheduler (used for retries/requeues that must keep their
// original ordering).  If the item's pg slot already has staged items in
// to_process (a race with _process(), see inline comment), the requeued
// item is swapped in ahead of them before the front-enqueue.
//
// NOTE(review): mangled listing -- the `} else {` between the two dout
// branches and the surrounding braces are not visible here; confirm the
// control flow against upstream Ceph src/osd/OSD.cc.
10718 void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem
&& item
)
10720 auto shard_index
= item
.get_ordering_token().hash_to_shard(osd
->shards
.size());
10721 auto& sdata
= osd
->shards
[shard_index
];
10722 ceph_assert(sdata
);
10723 sdata
->shard_lock
.lock();
10724 auto p
= sdata
->pg_slots
.find(item
.get_ordering_token());
10725 if (p
!= sdata
->pg_slots
.end() &&
10726 !p
->second
->to_process
.empty()) {
10727 // we may be racing with _process, which has dequeued a new item
10728 // from scheduler, put it on to_process, and is now busy taking the
10729 // pg lock. ensure this old requeued item is ordered before any
10730 // such newer item in to_process.
10731 p
->second
->to_process
.push_front(std::move(item
));
// Swap: the old item goes to the front of to_process; the newest staged
// item is taken back out and becomes the one we enqueue_front below.
10732 item
= std::move(p
->second
->to_process
.back());
10733 p
->second
->to_process
.pop_back();
10734 dout(20) << __func__
10735 << " " << p
->second
->to_process
.front()
10736 << " shuffled w/ " << item
<< dendl
;
10738 dout(20) << __func__
<< " " << item
<< dendl
;
// Front-enqueue (either the original item or the shuffled-out newest one)
// and wake a waiting worker.
10740 sdata
->scheduler
->enqueue_front(std::move(item
));
10741 sdata
->shard_lock
.unlock();
10742 std::lock_guard l
{sdata
->sdata_wait_lock
};
10743 sdata
->sdata_cond
.notify_one();
10747 namespace osd_cmds
{
// heap: handler for the `ceph tell osd.N heap <heapcmd> [<value>]` admin
// command.  Extracts the subcommand (and optional value) from cmdmap and
// forwards them to the tcmalloc heap profiler; fails with -EOPNOTSUPP
// when the binary is not built against tcmalloc.
//
// NOTE(review): mangled listing -- the tail of the signature (upstream
// takes a std::ostream& os as the last parameter), the declarations of
// `cmd`/`val`, and the return statements after the error paths are not
// visible here; confirm against upstream Ceph src/osd/OSD.cc.
10749 int heap(CephContext
& cct
, const cmdmap_t
& cmdmap
, Formatter
& f
,
// Heap profiling is a tcmalloc feature; bail out early otherwise.
10752 if (!ceph_using_tcmalloc()) {
10753 os
<< "could not issue heap profiler command -- not using tcmalloc!";
10754 return -EOPNOTSUPP
;
10758 if (!cmd_getval(cmdmap
, "heapcmd", cmd
)) {
10759 os
<< "unable to get value for command \"" << cmd
<< "\"";
// Split the subcommand into tokens, append the optional "value" argument,
// and hand the vector to the tcmalloc heap profiler glue.
10763 std::vector
<std::string
> cmd_vec
;
10764 get_str_vec(cmd
, cmd_vec
);
10767 if (cmd_getval(cmdmap
, "value", val
)) {
10768 cmd_vec
.push_back(val
);
10771 ceph_heap_profiler_handle_command(cmd_vec
, os
);
10776 }} // namespace ceph::osd_cmds