1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2017 OVH
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
27 #include <boost/scoped_ptr.hpp>
28 #include <boost/range/adaptor/reversed.hpp>
30 #ifdef HAVE_SYS_PARAM_H
31 #include <sys/param.h>
34 #ifdef HAVE_SYS_MOUNT_H
35 #include <sys/mount.h>
40 #include "include/types.h"
41 #include "include/compat.h"
42 #include "include/random.h"
47 #include "osdc/Objecter.h"
49 #include "common/errno.h"
50 #include "common/ceph_argparse.h"
51 #include "common/ceph_releases.h"
52 #include "common/ceph_time.h"
53 #include "common/version.h"
54 #include "common/pick_address.h"
55 #include "common/blkdev.h"
56 #include "common/numa.h"
58 #include "os/ObjectStore.h"
60 #include "os/FuseStore.h"
63 #include "PrimaryLogPG.h"
65 #include "msg/Messenger.h"
66 #include "msg/Message.h"
68 #include "mon/MonClient.h"
70 #include "messages/MLog.h"
72 #include "messages/MGenericMessage.h"
73 #include "messages/MOSDPing.h"
74 #include "messages/MOSDFailure.h"
75 #include "messages/MOSDMarkMeDown.h"
76 #include "messages/MOSDMarkMeDead.h"
77 #include "messages/MOSDFull.h"
78 #include "messages/MOSDOp.h"
79 #include "messages/MOSDOpReply.h"
80 #include "messages/MOSDBackoff.h"
81 #include "messages/MOSDBeacon.h"
82 #include "messages/MOSDRepOp.h"
83 #include "messages/MOSDRepOpReply.h"
84 #include "messages/MOSDBoot.h"
85 #include "messages/MOSDPGTemp.h"
86 #include "messages/MOSDPGReadyToMerge.h"
88 #include "messages/MOSDMap.h"
89 #include "messages/MMonGetOSDMap.h"
90 #include "messages/MOSDPGNotify.h"
91 #include "messages/MOSDPGNotify2.h"
92 #include "messages/MOSDPGQuery.h"
93 #include "messages/MOSDPGQuery2.h"
94 #include "messages/MOSDPGLog.h"
95 #include "messages/MOSDPGRemove.h"
96 #include "messages/MOSDPGInfo.h"
97 #include "messages/MOSDPGInfo2.h"
98 #include "messages/MOSDPGCreate.h"
99 #include "messages/MOSDPGCreate2.h"
100 #include "messages/MOSDPGScan.h"
101 #include "messages/MBackfillReserve.h"
102 #include "messages/MRecoveryReserve.h"
103 #include "messages/MOSDForceRecovery.h"
104 #include "messages/MOSDECSubOpWrite.h"
105 #include "messages/MOSDECSubOpWriteReply.h"
106 #include "messages/MOSDECSubOpRead.h"
107 #include "messages/MOSDECSubOpReadReply.h"
108 #include "messages/MOSDPGCreated.h"
109 #include "messages/MOSDPGUpdateLogMissing.h"
110 #include "messages/MOSDPGUpdateLogMissingReply.h"
112 #include "messages/MOSDPeeringOp.h"
114 #include "messages/MOSDAlive.h"
116 #include "messages/MOSDScrub.h"
117 #include "messages/MOSDScrub2.h"
118 #include "messages/MOSDRepScrub.h"
120 #include "messages/MCommand.h"
121 #include "messages/MCommandReply.h"
123 #include "messages/MPGStats.h"
124 #include "messages/MPGStatsAck.h"
126 #include "messages/MWatchNotify.h"
127 #include "messages/MOSDPGPush.h"
128 #include "messages/MOSDPGPushReply.h"
129 #include "messages/MOSDPGPull.h"
131 #include "messages/MMonGetPurgedSnaps.h"
132 #include "messages/MMonGetPurgedSnapsReply.h"
134 #include "common/perf_counters.h"
135 #include "common/Timer.h"
136 #include "common/LogClient.h"
137 #include "common/AsyncReserver.h"
138 #include "common/HeartbeatMap.h"
139 #include "common/admin_socket.h"
140 #include "common/ceph_context.h"
142 #include "global/signal_handler.h"
143 #include "global/pidfile.h"
145 #include "include/color.h"
146 #include "perfglue/cpu_profiler.h"
147 #include "perfglue/heap_profiler.h"
149 #include "osd/OpRequest.h"
151 #include "auth/AuthAuthorizeHandler.h"
152 #include "auth/RotatingKeyRing.h"
154 #include "objclass/objclass.h"
156 #include "common/cmdparse.h"
157 #include "include/str_list.h"
158 #include "include/util.h"
160 #include "include/ceph_assert.h"
161 #include "common/config.h"
162 #include "common/EventTrace.h"
164 #include "json_spirit/json_spirit_reader.h"
165 #include "json_spirit/json_spirit_writer.h"
168 #define TRACEPOINT_DEFINE
169 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
170 #include "tracing/osd.h"
171 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
172 #undef TRACEPOINT_DEFINE
174 #define tracepoint(...)
177 #define dout_context cct
178 #define dout_subsys ceph_subsys_osd
180 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
182 using namespace ceph::osd::scheduler
;
183 using TOPNSPC::common::cmd_getval
;
185 static ostream
& _prefix(std::ostream
* _dout
, int whoami
, epoch_t epoch
) {
186 return *_dout
<< "osd." << whoami
<< " " << epoch
<< " ";
189 //Initial features in new superblock.
190 //Features here are also automatically upgraded
191 CompatSet
OSD::get_osd_initial_compat_set() {
192 CompatSet::FeatureSet ceph_osd_feature_compat
;
193 CompatSet::FeatureSet ceph_osd_feature_ro_compat
;
194 CompatSet::FeatureSet ceph_osd_feature_incompat
;
195 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE
);
196 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO
);
197 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC
);
198 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC
);
199 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES
);
200 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL
);
201 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO
);
202 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO
);
203 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG
);
204 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER
);
205 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS
);
206 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA
);
207 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING
);
208 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO
);
209 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES
);
210 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2
);
211 return CompatSet(ceph_osd_feature_compat
, ceph_osd_feature_ro_compat
,
212 ceph_osd_feature_incompat
);
215 //Features are added here that this OSD supports.
216 CompatSet
OSD::get_osd_compat_set() {
217 CompatSet compat
= get_osd_initial_compat_set();
218 //Any features here can be set in code, but not in initial superblock
219 compat
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
);
223 OSDService::OSDService(OSD
*osd
) :
226 whoami(osd
->whoami
), store(osd
->store
),
227 log_client(osd
->log_client
), clog(osd
->clog
),
228 pg_recovery_stats(osd
->pg_recovery_stats
),
229 cluster_messenger(osd
->cluster_messenger
),
230 client_messenger(osd
->client_messenger
),
232 recoverystate_perf(osd
->recoverystate_perf
),
234 osd_max_object_size(cct
->_conf
, "osd_max_object_size"),
235 osd_skip_data_digest(cct
->_conf
, "osd_skip_data_digest"),
236 publish_lock
{ceph::make_mutex("OSDService::publish_lock")},
237 pre_publish_lock
{ceph::make_mutex("OSDService::pre_publish_lock")},
241 agent_valid_iterator(false),
243 flush_mode_high_count(0),
246 agent_stop_flag(false),
247 agent_timer(osd
->client_messenger
->cct
, agent_timer_lock
),
248 last_recalibrate(ceph_clock_now()),
249 promote_max_objects(0),
250 promote_max_bytes(0),
251 objecter(make_unique
<Objecter
>(osd
->client_messenger
->cct
,
252 osd
->objecter_messenger
,
253 osd
->monc
, nullptr, 0, 0)),
254 m_objecter_finishers(cct
->_conf
->osd_objecter_finishers
),
255 watch_timer(osd
->client_messenger
->cct
, watch_lock
),
257 recovery_request_timer(cct
, recovery_request_lock
, false),
258 sleep_timer(cct
, sleep_lock
, false),
259 reserver_finisher(cct
),
260 local_reserver(cct
, &reserver_finisher
, cct
->_conf
->osd_max_backfills
,
261 cct
->_conf
->osd_min_recovery_priority
),
262 remote_reserver(cct
, &reserver_finisher
, cct
->_conf
->osd_max_backfills
,
263 cct
->_conf
->osd_min_recovery_priority
),
264 snap_reserver(cct
, &reserver_finisher
,
265 cct
->_conf
->osd_max_trimming_pgs
),
266 recovery_ops_active(0),
267 recovery_ops_reserved(0),
268 recovery_paused(false),
269 map_cache(cct
, cct
->_conf
->osd_map_cache_size
),
270 map_bl_cache(cct
->_conf
->osd_map_cache_size
),
271 map_bl_inc_cache(cct
->_conf
->osd_map_cache_size
),
273 cur_ratio(0), physical_ratio(0),
274 boot_epoch(0), up_epoch(0), bind_epoch(0)
278 for (int i
= 0; i
< m_objecter_finishers
; i
++) {
280 str
<< "objecter-finisher-" << i
;
281 auto fin
= make_unique
<Finisher
>(osd
->client_messenger
->cct
, str
.str(), "finisher");
282 objecter_finishers
.push_back(std::move(fin
));
287 void OSDService::add_pgid(spg_t pgid
, PG
*pg
){
288 std::lock_guard
l(pgid_lock
);
289 if (!pgid_tracker
.count(pgid
)) {
292 pgid_tracker
[pgid
]++;
294 void OSDService::remove_pgid(spg_t pgid
, PG
*pg
)
296 std::lock_guard
l(pgid_lock
);
297 ceph_assert(pgid_tracker
.count(pgid
));
298 ceph_assert(pgid_tracker
[pgid
] > 0);
299 pgid_tracker
[pgid
]--;
300 if (pgid_tracker
[pgid
] == 0) {
301 pgid_tracker
.erase(pgid
);
302 live_pgs
.erase(pgid
);
305 void OSDService::dump_live_pgids()
307 std::lock_guard
l(pgid_lock
);
308 derr
<< "live pgids:" << dendl
;
309 for (map
<spg_t
, int>::const_iterator i
= pgid_tracker
.cbegin();
310 i
!= pgid_tracker
.cend();
312 derr
<< "\t" << *i
<< dendl
;
313 live_pgs
[i
->first
]->dump_live_ids();
319 ceph::signedspan
OSDService::get_mnow()
321 return ceph::mono_clock::now() - osd
->startup_time
;
324 void OSDService::identify_splits_and_merges(
328 set
<pair
<spg_t
,epoch_t
>> *split_children
,
329 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
331 if (!old_map
->have_pg_pool(pgid
.pool())) {
334 int old_pgnum
= old_map
->get_pg_num(pgid
.pool());
335 auto p
= osd
->pg_num_history
.pg_nums
.find(pgid
.pool());
336 if (p
== osd
->pg_num_history
.pg_nums
.end()) {
339 dout(20) << __func__
<< " " << pgid
<< " e" << old_map
->get_epoch()
340 << " to e" << new_map
->get_epoch()
341 << " pg_nums " << p
->second
<< dendl
;
343 queue
.push_back(pgid
);
345 while (!queue
.empty()) {
346 auto cur
= queue
.front();
349 unsigned pgnum
= old_pgnum
;
350 for (auto q
= p
->second
.lower_bound(old_map
->get_epoch());
351 q
!= p
->second
.end() &&
352 q
->first
<= new_map
->get_epoch();
354 if (pgnum
< q
->second
) {
356 if (cur
.ps() < pgnum
) {
358 if (cur
.is_split(pgnum
, q
->second
, &children
)) {
359 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
360 << " pg_num " << pgnum
<< " -> " << q
->second
361 << " children " << children
<< dendl
;
362 for (auto i
: children
) {
363 split_children
->insert(make_pair(i
, q
->first
));
368 } else if (cur
.ps() < q
->second
) {
369 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
370 << " pg_num " << pgnum
<< " -> " << q
->second
371 << " is a child" << dendl
;
372 // normally we'd capture this from the parent, but it's
373 // possible the parent doesn't exist yet (it will be
374 // fabricated to allow an intervening merge). note this PG
375 // as a split child here to be sure we catch it.
376 split_children
->insert(make_pair(cur
, q
->first
));
378 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
379 << " pg_num " << pgnum
<< " -> " << q
->second
380 << " is post-split, skipping" << dendl
;
382 } else if (merge_pgs
) {
384 if (cur
.ps() >= q
->second
) {
385 if (cur
.ps() < pgnum
) {
387 if (cur
.is_merge_source(pgnum
, q
->second
, &parent
)) {
389 parent
.is_split(q
->second
, pgnum
, &children
);
390 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
391 << " pg_num " << pgnum
<< " -> " << q
->second
392 << " is merge source, target " << parent
393 << ", source(s) " << children
<< dendl
;
394 merge_pgs
->insert(make_pair(parent
, q
->first
));
395 if (!did
.count(parent
)) {
396 // queue (and re-scan) parent in case it might not exist yet
397 // and there are some future splits pending on it
398 queue
.push_back(parent
);
400 for (auto c
: children
) {
401 merge_pgs
->insert(make_pair(c
, q
->first
));
407 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
408 << " pg_num " << pgnum
<< " -> " << q
->second
409 << " is beyond old pgnum, skipping" << dendl
;
413 if (cur
.is_split(q
->second
, pgnum
, &children
)) {
414 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
415 << " pg_num " << pgnum
<< " -> " << q
->second
416 << " is merge target, source " << children
<< dendl
;
417 for (auto c
: children
) {
418 merge_pgs
->insert(make_pair(c
, q
->first
));
422 merge_pgs
->insert(make_pair(cur
, q
->first
));
431 void OSDService::need_heartbeat_peer_update()
433 osd
->need_heartbeat_peer_update();
436 HeartbeatStampsRef
OSDService::get_hb_stamps(unsigned peer
)
438 std::lock_guard
l(hb_stamp_lock
);
439 if (peer
>= hb_stamps
.size()) {
440 hb_stamps
.resize(peer
+ 1);
442 if (!hb_stamps
[peer
]) {
443 hb_stamps
[peer
] = ceph::make_ref
<HeartbeatStamps
>(peer
);
445 return hb_stamps
[peer
];
448 void OSDService::queue_renew_lease(epoch_t epoch
, spg_t spgid
)
450 osd
->enqueue_peering_evt(
453 std::make_shared
<PGPeeringEvent
>(
458 void OSDService::start_shutdown()
461 std::lock_guard
l(agent_timer_lock
);
462 agent_timer
.shutdown();
466 std::lock_guard
l(sleep_lock
);
467 sleep_timer
.shutdown();
471 std::lock_guard
l(recovery_request_lock
);
472 recovery_request_timer
.shutdown();
476 void OSDService::shutdown_reserver()
478 reserver_finisher
.wait_for_empty();
479 reserver_finisher
.stop();
482 void OSDService::shutdown()
484 mono_timer
.suspend();
487 std::lock_guard
l(watch_lock
);
488 watch_timer
.shutdown();
491 objecter
->shutdown();
492 for (auto& f
: objecter_finishers
) {
497 publish_map(OSDMapRef());
498 next_osdmap
= OSDMapRef();
501 void OSDService::init()
503 reserver_finisher
.start();
504 for (auto& f
: objecter_finishers
) {
507 objecter
->set_client_incarnation(0);
509 // deprioritize objecter in daemonperf output
510 objecter
->get_logger()->set_prio_adjust(-3);
516 agent_thread
.create("osd_srv_agent");
518 if (cct
->_conf
->osd_recovery_delay_start
)
519 defer_recovery(cct
->_conf
->osd_recovery_delay_start
);
522 void OSDService::final_init()
524 objecter
->start(osdmap
.get());
527 void OSDService::activate_map()
529 // wake/unwake the tiering agent
530 std::lock_guard l
{agent_lock
};
532 !osdmap
->test_flag(CEPH_OSDMAP_NOTIERAGENT
) &&
534 agent_cond
.notify_all();
537 void OSDService::request_osdmap_update(epoch_t e
)
539 osd
->osdmap_subscribe(e
, false);
543 class AgentTimeoutCB
: public Context
{
546 explicit AgentTimeoutCB(PGRef _pg
) : pg(_pg
) {}
547 void finish(int) override
{
548 pg
->agent_choose_mode_restart();
552 void OSDService::agent_entry()
554 dout(10) << __func__
<< " start" << dendl
;
555 std::unique_lock agent_locker
{agent_lock
};
557 while (!agent_stop_flag
) {
558 if (agent_queue
.empty()) {
559 dout(20) << __func__
<< " empty queue" << dendl
;
560 agent_cond
.wait(agent_locker
);
563 uint64_t level
= agent_queue
.rbegin()->first
;
564 set
<PGRef
>& top
= agent_queue
.rbegin()->second
;
566 << " tiers " << agent_queue
.size()
567 << ", top is " << level
568 << " with pgs " << top
.size()
569 << ", ops " << agent_ops
<< "/"
570 << cct
->_conf
->osd_agent_max_ops
571 << (agent_active
? " active" : " NOT ACTIVE")
573 dout(20) << __func__
<< " oids " << agent_oids
<< dendl
;
574 int max
= cct
->_conf
->osd_agent_max_ops
- agent_ops
;
575 int agent_flush_quota
= max
;
576 if (!flush_mode_high_count
)
577 agent_flush_quota
= cct
->_conf
->osd_agent_max_low_ops
- agent_ops
;
578 if (agent_flush_quota
<= 0 || top
.empty() || !agent_active
) {
579 agent_cond
.wait(agent_locker
);
583 if (!agent_valid_iterator
|| agent_queue_pos
== top
.end()) {
584 agent_queue_pos
= top
.begin();
585 agent_valid_iterator
= true;
587 PGRef pg
= *agent_queue_pos
;
588 dout(10) << "high_count " << flush_mode_high_count
589 << " agent_ops " << agent_ops
590 << " flush_quota " << agent_flush_quota
<< dendl
;
591 agent_locker
.unlock();
592 if (!pg
->agent_work(max
, agent_flush_quota
)) {
593 dout(10) << __func__
<< " " << pg
->pg_id
594 << " no agent_work, delay for " << cct
->_conf
->osd_agent_delay_time
595 << " seconds" << dendl
;
597 osd
->logger
->inc(l_osd_tier_delay
);
598 // Queue a timer to call agent_choose_mode for this pg in 5 seconds
599 std::lock_guard timer_locker
{agent_timer_lock
};
600 Context
*cb
= new AgentTimeoutCB(pg
);
601 agent_timer
.add_event_after(cct
->_conf
->osd_agent_delay_time
, cb
);
605 dout(10) << __func__
<< " finish" << dendl
;
608 void OSDService::agent_stop()
611 std::lock_guard
l(agent_lock
);
613 // By this time all ops should be cancelled
614 ceph_assert(agent_ops
== 0);
615 // By this time all PGs are shutdown and dequeued
616 if (!agent_queue
.empty()) {
617 set
<PGRef
>& top
= agent_queue
.rbegin()->second
;
618 derr
<< "agent queue not empty, for example " << (*top
.begin())->get_pgid() << dendl
;
619 ceph_abort_msg("agent queue not empty");
622 agent_stop_flag
= true;
623 agent_cond
.notify_all();
628 // -------------------------------------
630 void OSDService::promote_throttle_recalibrate()
632 utime_t now
= ceph_clock_now();
633 double dur
= now
- last_recalibrate
;
634 last_recalibrate
= now
;
635 unsigned prob
= promote_probability_millis
;
637 uint64_t target_obj_sec
= cct
->_conf
->osd_tier_promote_max_objects_sec
;
638 uint64_t target_bytes_sec
= cct
->_conf
->osd_tier_promote_max_bytes_sec
;
640 unsigned min_prob
= 1;
642 uint64_t attempts
, obj
, bytes
;
643 promote_counter
.sample_and_attenuate(&attempts
, &obj
, &bytes
);
644 dout(10) << __func__
<< " " << attempts
<< " attempts, promoted "
645 << obj
<< " objects and " << byte_u_t(bytes
) << "; target "
646 << target_obj_sec
<< " obj/sec or "
647 << byte_u_t(target_bytes_sec
) << "/sec"
650 // calculate what the probability *should* be, given the targets
652 if (attempts
&& dur
> 0) {
653 uint64_t avg_size
= 1;
655 avg_size
= std::max
<uint64_t>(bytes
/ obj
, 1);
656 unsigned po
= (double)target_obj_sec
* dur
* 1000.0 / (double)attempts
;
657 unsigned pb
= (double)target_bytes_sec
/ (double)avg_size
* dur
* 1000.0
659 dout(20) << __func__
<< " po " << po
<< " pb " << pb
<< " avg_size "
660 << avg_size
<< dendl
;
661 if (target_obj_sec
&& target_bytes_sec
)
662 new_prob
= std::min(po
, pb
);
663 else if (target_obj_sec
)
665 else if (target_bytes_sec
)
672 dout(20) << __func__
<< " new_prob " << new_prob
<< dendl
;
674 // correct for persistent skew between target rate and actual rate, adjust
677 if (attempts
&& obj
) {
678 actual
= obj
* 1000 / attempts
;
679 ratio
= (double)actual
/ (double)prob
;
680 new_prob
= (double)new_prob
/ ratio
;
682 new_prob
= std::max(new_prob
, min_prob
);
683 new_prob
= std::min(new_prob
, 1000u);
686 prob
= (prob
+ new_prob
) / 2;
687 prob
= std::max(prob
, min_prob
);
688 prob
= std::min(prob
, 1000u);
689 dout(10) << __func__
<< " actual " << actual
690 << ", actual/prob ratio " << ratio
691 << ", adjusted new_prob " << new_prob
692 << ", prob " << promote_probability_millis
<< " -> " << prob
694 promote_probability_millis
= prob
;
696 // set hard limits for this interval to mitigate stampedes
697 promote_max_objects
= target_obj_sec
* osd
->OSD_TICK_INTERVAL
* 2;
698 promote_max_bytes
= target_bytes_sec
* osd
->OSD_TICK_INTERVAL
* 2;
701 // -------------------------------------
703 float OSDService::get_failsafe_full_ratio()
705 float full_ratio
= cct
->_conf
->osd_failsafe_full_ratio
;
706 if (full_ratio
> 1.0) full_ratio
/= 100.0;
710 OSDService::s_names
OSDService::recalc_full_state(float ratio
, float pratio
, string
&inject
)
712 // The OSDMap ratios take precendence. So if the failsafe is .95 and
713 // the admin sets the cluster full to .96, the failsafe moves up to .96
714 // too. (Not that having failsafe == full is ideal, but it's better than
715 // dropping writes before the clusters appears full.)
716 OSDMapRef osdmap
= get_osdmap();
717 if (!osdmap
|| osdmap
->get_epoch() == 0) {
720 float nearfull_ratio
= osdmap
->get_nearfull_ratio();
721 float backfillfull_ratio
= std::max(osdmap
->get_backfillfull_ratio(), nearfull_ratio
);
722 float full_ratio
= std::max(osdmap
->get_full_ratio(), backfillfull_ratio
);
723 float failsafe_ratio
= std::max(get_failsafe_full_ratio(), full_ratio
);
725 if (osdmap
->require_osd_release
< ceph_release_t::luminous
) {
726 // use the failsafe for nearfull and full; the mon isn't using the
727 // flags anyway because we're mid-upgrade.
728 full_ratio
= failsafe_ratio
;
729 backfillfull_ratio
= failsafe_ratio
;
730 nearfull_ratio
= failsafe_ratio
;
731 } else if (full_ratio
<= 0 ||
732 backfillfull_ratio
<= 0 ||
733 nearfull_ratio
<= 0) {
734 derr
<< __func__
<< " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl
;
735 // use failsafe flag. ick. the monitor did something wrong or the user
736 // did something stupid.
737 full_ratio
= failsafe_ratio
;
738 backfillfull_ratio
= failsafe_ratio
;
739 nearfull_ratio
= failsafe_ratio
;
742 if (injectfull_state
> NONE
&& injectfull
) {
743 inject
= "(Injected)";
744 return injectfull_state
;
745 } else if (pratio
> failsafe_ratio
) {
747 } else if (ratio
> full_ratio
) {
749 } else if (ratio
> backfillfull_ratio
) {
751 } else if (pratio
> nearfull_ratio
) {
757 void OSDService::check_full_status(float ratio
, float pratio
)
759 std::lock_guard
l(full_status_lock
);
762 physical_ratio
= pratio
;
766 new_state
= recalc_full_state(ratio
, pratio
, inject
);
768 dout(20) << __func__
<< " cur ratio " << ratio
769 << ", physical ratio " << pratio
770 << ", new state " << get_full_state_name(new_state
)
775 if (cur_state
!= new_state
) {
776 dout(10) << __func__
<< " " << get_full_state_name(cur_state
)
777 << " -> " << get_full_state_name(new_state
) << dendl
;
778 if (new_state
== FAILSAFE
) {
779 clog
->error() << "full status failsafe engaged, dropping updates, now "
780 << (int)roundf(ratio
* 100) << "% full";
781 } else if (cur_state
== FAILSAFE
) {
782 clog
->error() << "full status failsafe disengaged, no longer dropping "
783 << "updates, now " << (int)roundf(ratio
* 100) << "% full";
785 cur_state
= new_state
;
789 bool OSDService::need_fullness_update()
791 OSDMapRef osdmap
= get_osdmap();
793 if (osdmap
->exists(whoami
)) {
794 if (osdmap
->get_state(whoami
) & CEPH_OSD_FULL
) {
796 } else if (osdmap
->get_state(whoami
) & CEPH_OSD_BACKFILLFULL
) {
798 } else if (osdmap
->get_state(whoami
) & CEPH_OSD_NEARFULL
) {
805 else if (is_backfillfull())
807 else if (is_nearfull())
812 bool OSDService::_check_inject_full(DoutPrefixProvider
*dpp
, s_names type
) const
814 if (injectfull
&& injectfull_state
>= type
) {
815 // injectfull is either a count of the number of times to return failsafe full
816 // or if -1 then always return full
819 ldpp_dout(dpp
, 10) << __func__
<< " Injected " << get_full_state_name(type
) << " OSD ("
820 << (injectfull
< 0 ? "set" : std::to_string(injectfull
)) << ")"
827 bool OSDService::_check_full(DoutPrefixProvider
*dpp
, s_names type
) const
829 std::lock_guard
l(full_status_lock
);
831 if (_check_inject_full(dpp
, type
))
834 if (cur_state
>= type
)
835 ldpp_dout(dpp
, 10) << __func__
<< " current usage is " << cur_ratio
836 << " physical " << physical_ratio
<< dendl
;
838 return cur_state
>= type
;
841 bool OSDService::_tentative_full(DoutPrefixProvider
*dpp
, s_names type
, uint64_t adjust_used
, osd_stat_t adjusted_stat
)
843 ldpp_dout(dpp
, 20) << __func__
<< " type " << get_full_state_name(type
) << " adjust_used " << (adjust_used
>> 10) << "KiB" << dendl
;
845 std::lock_guard
l(full_status_lock
);
846 if (_check_inject_full(dpp
, type
)) {
852 float ratio
= compute_adjusted_ratio(adjusted_stat
, &pratio
, adjust_used
);
855 s_names tentative_state
= recalc_full_state(ratio
, pratio
, notused
);
857 if (tentative_state
>= type
)
858 ldpp_dout(dpp
, 10) << __func__
<< " tentative usage is " << ratio
<< dendl
;
860 return tentative_state
>= type
;
863 bool OSDService::check_failsafe_full(DoutPrefixProvider
*dpp
) const
865 return _check_full(dpp
, FAILSAFE
);
868 bool OSDService::check_full(DoutPrefixProvider
*dpp
) const
870 return _check_full(dpp
, FULL
);
873 bool OSDService::tentative_backfill_full(DoutPrefixProvider
*dpp
, uint64_t adjust_used
, osd_stat_t stats
)
875 return _tentative_full(dpp
, BACKFILLFULL
, adjust_used
, stats
);
878 bool OSDService::check_backfill_full(DoutPrefixProvider
*dpp
) const
880 return _check_full(dpp
, BACKFILLFULL
);
883 bool OSDService::check_nearfull(DoutPrefixProvider
*dpp
) const
885 return _check_full(dpp
, NEARFULL
);
888 bool OSDService::is_failsafe_full() const
890 std::lock_guard
l(full_status_lock
);
891 return cur_state
== FAILSAFE
;
894 bool OSDService::is_full() const
896 std::lock_guard
l(full_status_lock
);
897 return cur_state
>= FULL
;
900 bool OSDService::is_backfillfull() const
902 std::lock_guard
l(full_status_lock
);
903 return cur_state
>= BACKFILLFULL
;
906 bool OSDService::is_nearfull() const
908 std::lock_guard
l(full_status_lock
);
909 return cur_state
>= NEARFULL
;
912 void OSDService::set_injectfull(s_names type
, int64_t count
)
914 std::lock_guard
l(full_status_lock
);
915 injectfull_state
= type
;
919 void OSDService::set_statfs(const struct store_statfs_t
&stbuf
,
920 osd_alert_list_t
& alerts
)
922 uint64_t bytes
= stbuf
.total
;
923 uint64_t avail
= stbuf
.available
;
924 uint64_t used
= stbuf
.get_used_raw();
926 // For testing fake statfs values so it doesn't matter if all
927 // OSDs are using the same partition.
928 if (cct
->_conf
->fake_statfs_for_testing
) {
929 uint64_t total_num_bytes
= 0;
933 total_num_bytes
+= p
->get_stats_num_bytes();
935 bytes
= cct
->_conf
->fake_statfs_for_testing
;
936 if (total_num_bytes
< bytes
)
937 avail
= bytes
- total_num_bytes
;
940 dout(0) << __func__
<< " fake total " << cct
->_conf
->fake_statfs_for_testing
941 << " adjust available " << avail
943 used
= bytes
- avail
;
946 osd
->logger
->set(l_osd_stat_bytes
, bytes
);
947 osd
->logger
->set(l_osd_stat_bytes_used
, used
);
948 osd
->logger
->set(l_osd_stat_bytes_avail
, avail
);
950 std::lock_guard
l(stat_lock
);
951 osd_stat
.statfs
= stbuf
;
952 osd_stat
.os_alerts
.clear();
953 osd_stat
.os_alerts
[whoami
].swap(alerts
);
954 if (cct
->_conf
->fake_statfs_for_testing
) {
955 osd_stat
.statfs
.total
= bytes
;
956 osd_stat
.statfs
.available
= avail
;
957 // For testing don't want used to go negative, so clear reserved
958 osd_stat
.statfs
.internally_reserved
= 0;
962 osd_stat_t
OSDService::set_osd_stat(vector
<int>& hb_peers
,
965 utime_t now
= ceph_clock_now();
966 auto stale_time
= g_conf().get_val
<int64_t>("osd_mon_heartbeat_stat_stale");
967 std::lock_guard
l(stat_lock
);
968 osd_stat
.hb_peers
.swap(hb_peers
);
969 osd
->op_tracker
.get_age_ms_histogram(&osd_stat
.op_queue_age_hist
);
970 osd_stat
.num_pgs
= num_pgs
;
971 // Clean entries that aren't updated
972 // This is called often enough that we can just remove 1 at a time
973 for (auto i
: osd_stat
.hb_pingtime
) {
974 if (i
.second
.last_update
== 0)
976 if (stale_time
&& now
.sec() - i
.second
.last_update
> stale_time
) {
977 dout(20) << __func__
<< " time out heartbeat for osd " << i
.first
978 << " last_update " << i
.second
.last_update
<< dendl
;
979 osd_stat
.hb_pingtime
.erase(i
.first
);
986 void OSDService::inc_osd_stat_repaired()
988 std::lock_guard
l(stat_lock
);
989 osd_stat
.num_shards_repaired
++;
993 float OSDService::compute_adjusted_ratio(osd_stat_t new_stat
, float *pratio
,
994 uint64_t adjust_used
)
997 ((float)new_stat
.statfs
.get_used()) / ((float)new_stat
.statfs
.total
);
1000 dout(20) << __func__
<< " Before kb_used() " << new_stat
.statfs
.kb_used() << dendl
;
1001 if (new_stat
.statfs
.available
> adjust_used
)
1002 new_stat
.statfs
.available
-= adjust_used
;
1004 new_stat
.statfs
.available
= 0;
1005 dout(20) << __func__
<< " After kb_used() " << new_stat
.statfs
.kb_used() << dendl
;
1008 // Check all pgs and adjust kb_used to include all pending backfill data
1009 int backfill_adjusted
= 0;
1011 osd
->_get_pgs(&pgs
);
1012 for (auto p
: pgs
) {
1013 backfill_adjusted
+= p
->pg_stat_adjust(&new_stat
);
1015 if (backfill_adjusted
) {
1016 dout(20) << __func__
<< " backfill adjusted " << new_stat
<< dendl
;
1018 return ((float)new_stat
.statfs
.get_used()) / ((float)new_stat
.statfs
.total
);
1021 void OSDService::send_message_osd_cluster(int peer
, Message
*m
, epoch_t from_epoch
)
1023 OSDMapRef next_map
= get_nextmap_reserved();
1024 // service map is always newer/newest
1025 ceph_assert(from_epoch
<= next_map
->get_epoch());
1027 if (next_map
->is_down(peer
) ||
1028 next_map
->get_info(peer
).up_from
> from_epoch
) {
1030 release_map(next_map
);
1033 ConnectionRef peer_con
;
1034 if (peer
== whoami
) {
1035 peer_con
= osd
->cluster_messenger
->get_loopback_connection();
1037 peer_con
= osd
->cluster_messenger
->connect_to_osd(
1038 next_map
->get_cluster_addrs(peer
), false, true);
1040 maybe_share_map(peer_con
.get(), next_map
);
1041 peer_con
->send_message(m
);
1042 release_map(next_map
);
1045 void OSDService::send_message_osd_cluster(std::vector
<std::pair
<int, Message
*>>& messages
, epoch_t from_epoch
)
1047 OSDMapRef next_map
= get_nextmap_reserved();
1048 // service map is always newer/newest
1049 ceph_assert(from_epoch
<= next_map
->get_epoch());
1051 for (auto& iter
: messages
) {
1052 if (next_map
->is_down(iter
.first
) ||
1053 next_map
->get_info(iter
.first
).up_from
> from_epoch
) {
1057 ConnectionRef peer_con
;
1058 if (iter
.first
== whoami
) {
1059 peer_con
= osd
->cluster_messenger
->get_loopback_connection();
1061 peer_con
= osd
->cluster_messenger
->connect_to_osd(
1062 next_map
->get_cluster_addrs(iter
.first
), false, true);
1064 maybe_share_map(peer_con
.get(), next_map
);
1065 peer_con
->send_message(iter
.second
);
1067 release_map(next_map
);
1069 ConnectionRef
OSDService::get_con_osd_cluster(int peer
, epoch_t from_epoch
)
1071 OSDMapRef next_map
= get_nextmap_reserved();
1072 // service map is always newer/newest
1073 ceph_assert(from_epoch
<= next_map
->get_epoch());
1075 if (next_map
->is_down(peer
) ||
1076 next_map
->get_info(peer
).up_from
> from_epoch
) {
1077 release_map(next_map
);
1081 if (peer
== whoami
) {
1082 con
= osd
->cluster_messenger
->get_loopback_connection();
1084 con
= osd
->cluster_messenger
->connect_to_osd(
1085 next_map
->get_cluster_addrs(peer
), false, true);
1087 release_map(next_map
);
1091 pair
<ConnectionRef
,ConnectionRef
> OSDService::get_con_osd_hb(int peer
, epoch_t from_epoch
)
1093 OSDMapRef next_map
= get_nextmap_reserved();
1094 // service map is always newer/newest
1095 ceph_assert(from_epoch
<= next_map
->get_epoch());
1097 pair
<ConnectionRef
,ConnectionRef
> ret
;
1098 if (next_map
->is_down(peer
) ||
1099 next_map
->get_info(peer
).up_from
> from_epoch
) {
1100 release_map(next_map
);
1103 ret
.first
= osd
->hb_back_client_messenger
->connect_to_osd(
1104 next_map
->get_hb_back_addrs(peer
));
1105 ret
.second
= osd
->hb_front_client_messenger
->connect_to_osd(
1106 next_map
->get_hb_front_addrs(peer
));
1107 release_map(next_map
);
1111 entity_name_t
OSDService::get_cluster_msgr_name() const
1113 return cluster_messenger
->get_myname();
1116 void OSDService::queue_want_pg_temp(pg_t pgid
,
1117 const vector
<int>& want
,
1120 std::lock_guard
l(pg_temp_lock
);
1121 auto p
= pg_temp_pending
.find(pgid
);
1122 if (p
== pg_temp_pending
.end() ||
1123 p
->second
.acting
!= want
||
1125 pg_temp_wanted
[pgid
] = {want
, forced
};
1129 void OSDService::remove_want_pg_temp(pg_t pgid
)
1131 std::lock_guard
l(pg_temp_lock
);
1132 pg_temp_wanted
.erase(pgid
);
1133 pg_temp_pending
.erase(pgid
);
1136 void OSDService::_sent_pg_temp()
1138 #ifdef HAVE_STDLIB_MAP_SPLICING
1139 pg_temp_pending
.merge(pg_temp_wanted
);
1141 pg_temp_pending
.insert(make_move_iterator(begin(pg_temp_wanted
)),
1142 make_move_iterator(end(pg_temp_wanted
)));
1144 pg_temp_wanted
.clear();
1147 void OSDService::requeue_pg_temp()
1149 std::lock_guard
l(pg_temp_lock
);
1150 // wanted overrides pending. note that remove_want_pg_temp
1151 // clears the item out of both.
1152 unsigned old_wanted
= pg_temp_wanted
.size();
1153 unsigned old_pending
= pg_temp_pending
.size();
1155 pg_temp_wanted
.swap(pg_temp_pending
);
1156 dout(10) << __func__
<< " " << old_wanted
<< " + " << old_pending
<< " -> "
1157 << pg_temp_wanted
.size() << dendl
;
1160 std::ostream
& operator<<(std::ostream
& out
,
1161 const OSDService::pg_temp_t
& pg_temp
)
1163 out
<< pg_temp
.acting
;
1164 if (pg_temp
.forced
) {
1170 void OSDService::send_pg_temp()
1172 std::lock_guard
l(pg_temp_lock
);
1173 if (pg_temp_wanted
.empty())
1175 dout(10) << "send_pg_temp " << pg_temp_wanted
<< dendl
;
1176 MOSDPGTemp
*ms
[2] = {nullptr, nullptr};
1177 for (auto& [pgid
, pg_temp
] : pg_temp_wanted
) {
1178 auto& m
= ms
[pg_temp
.forced
];
1180 m
= new MOSDPGTemp(osdmap
->get_epoch());
1181 m
->forced
= pg_temp
.forced
;
1183 m
->pg_temp
.emplace(pgid
, pg_temp
.acting
);
1187 monc
->send_mon_message(m
);
1193 void OSDService::send_pg_created(pg_t pgid
)
1195 std::lock_guard
l(pg_created_lock
);
1196 dout(20) << __func__
<< dendl
;
1197 auto o
= get_osdmap();
1198 if (o
->require_osd_release
>= ceph_release_t::luminous
) {
1199 pg_created
.insert(pgid
);
1200 monc
->send_mon_message(new MOSDPGCreated(pgid
));
1204 void OSDService::send_pg_created()
1206 std::lock_guard
l(pg_created_lock
);
1207 dout(20) << __func__
<< dendl
;
1208 auto o
= get_osdmap();
1209 if (o
->require_osd_release
>= ceph_release_t::luminous
) {
1210 for (auto pgid
: pg_created
) {
1211 monc
->send_mon_message(new MOSDPGCreated(pgid
));
1216 void OSDService::prune_pg_created()
1218 std::lock_guard
l(pg_created_lock
);
1219 dout(20) << __func__
<< dendl
;
1220 auto o
= get_osdmap();
1221 auto i
= pg_created
.begin();
1222 while (i
!= pg_created
.end()) {
1223 auto p
= o
->get_pg_pool(i
->pool());
1224 if (!p
|| !p
->has_flag(pg_pool_t::FLAG_CREATING
)) {
1225 dout(20) << __func__
<< " pruning " << *i
<< dendl
;
1226 i
= pg_created
.erase(i
);
1228 dout(20) << __func__
<< " keeping " << *i
<< dendl
;
1235 // --------------------------------------
1238 bool OSDService::can_inc_scrubs()
1240 bool can_inc
= false;
1241 std::lock_guard
l(sched_scrub_lock
);
1243 if (scrubs_local
+ scrubs_remote
< cct
->_conf
->osd_max_scrubs
) {
1244 dout(20) << __func__
<< " == true " << scrubs_local
<< " local + " << scrubs_remote
1245 << " remote < max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1248 dout(20) << __func__
<< " == false " << scrubs_local
<< " local + " << scrubs_remote
1249 << " remote >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1255 bool OSDService::inc_scrubs_local()
1257 bool result
= false;
1258 std::lock_guard l
{sched_scrub_lock
};
1259 if (scrubs_local
+ scrubs_remote
< cct
->_conf
->osd_max_scrubs
) {
1260 dout(20) << __func__
<< " " << scrubs_local
<< " -> " << (scrubs_local
+1)
1261 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", remote " << scrubs_remote
<< ")" << dendl
;
1265 dout(20) << __func__
<< " " << scrubs_local
<< " local + " << scrubs_remote
<< " remote >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1270 void OSDService::dec_scrubs_local()
1272 std::lock_guard l
{sched_scrub_lock
};
1273 dout(20) << __func__
<< " " << scrubs_local
<< " -> " << (scrubs_local
-1)
1274 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", remote " << scrubs_remote
<< ")" << dendl
;
1276 ceph_assert(scrubs_local
>= 0);
1279 bool OSDService::inc_scrubs_remote()
1281 bool result
= false;
1282 std::lock_guard l
{sched_scrub_lock
};
1283 if (scrubs_local
+ scrubs_remote
< cct
->_conf
->osd_max_scrubs
) {
1284 dout(20) << __func__
<< " " << scrubs_remote
<< " -> " << (scrubs_remote
+1)
1285 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", local " << scrubs_local
<< ")" << dendl
;
1289 dout(20) << __func__
<< " " << scrubs_local
<< " local + " << scrubs_remote
<< " remote >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1294 void OSDService::dec_scrubs_remote()
1296 std::lock_guard l
{sched_scrub_lock
};
1297 dout(20) << __func__
<< " " << scrubs_remote
<< " -> " << (scrubs_remote
-1)
1298 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", local " << scrubs_local
<< ")" << dendl
;
1300 ceph_assert(scrubs_remote
>= 0);
1303 void OSDService::dump_scrub_reservations(Formatter
*f
)
1305 std::lock_guard l
{sched_scrub_lock
};
1306 f
->dump_int("scrubs_local", scrubs_local
);
1307 f
->dump_int("scrubs_remote", scrubs_remote
);
1308 f
->dump_int("osd_max_scrubs", cct
->_conf
->osd_max_scrubs
);
1311 void OSDService::retrieve_epochs(epoch_t
*_boot_epoch
, epoch_t
*_up_epoch
,
1312 epoch_t
*_bind_epoch
) const
1314 std::lock_guard
l(epoch_lock
);
1316 *_boot_epoch
= boot_epoch
;
1318 *_up_epoch
= up_epoch
;
1320 *_bind_epoch
= bind_epoch
;
1323 void OSDService::set_epochs(const epoch_t
*_boot_epoch
, const epoch_t
*_up_epoch
,
1324 const epoch_t
*_bind_epoch
)
1326 std::lock_guard
l(epoch_lock
);
1328 ceph_assert(*_boot_epoch
== 0 || *_boot_epoch
>= boot_epoch
);
1329 boot_epoch
= *_boot_epoch
;
1332 ceph_assert(*_up_epoch
== 0 || *_up_epoch
>= up_epoch
);
1333 up_epoch
= *_up_epoch
;
1336 ceph_assert(*_bind_epoch
== 0 || *_bind_epoch
>= bind_epoch
);
1337 bind_epoch
= *_bind_epoch
;
1341 bool OSDService::prepare_to_stop()
1343 std::unique_lock
l(is_stopping_lock
);
1344 if (get_state() != NOT_STOPPING
)
1347 OSDMapRef osdmap
= get_osdmap();
1348 if (osdmap
&& osdmap
->is_up(whoami
)) {
1349 dout(0) << __func__
<< " telling mon we are shutting down" << dendl
;
1350 set_state(PREPARING_TO_STOP
);
1351 monc
->send_mon_message(
1355 osdmap
->get_addrs(whoami
),
1356 osdmap
->get_epoch(),
1359 const auto timeout
= ceph::make_timespan(cct
->_conf
->osd_mon_shutdown_timeout
);
1360 is_stopping_cond
.wait_for(l
, timeout
,
1361 [this] { return get_state() == STOPPING
; });
1363 dout(0) << __func__
<< " starting shutdown" << dendl
;
1364 set_state(STOPPING
);
1368 void OSDService::got_stop_ack()
1370 std::scoped_lock
l(is_stopping_lock
);
1371 if (get_state() == PREPARING_TO_STOP
) {
1372 dout(0) << __func__
<< " starting shutdown" << dendl
;
1373 set_state(STOPPING
);
1374 is_stopping_cond
.notify_all();
1376 dout(10) << __func__
<< " ignoring msg" << dendl
;
1380 MOSDMap
*OSDService::build_incremental_map_msg(epoch_t since
, epoch_t to
,
1381 OSDSuperblock
& sblock
)
1383 MOSDMap
*m
= new MOSDMap(monc
->get_fsid(),
1384 osdmap
->get_encoding_features());
1385 m
->oldest_map
= max_oldest_map
;
1386 m
->newest_map
= sblock
.newest_map
;
1388 int max
= cct
->_conf
->osd_map_message_max
;
1389 ssize_t max_bytes
= cct
->_conf
->osd_map_message_max_bytes
;
1391 if (since
< m
->oldest_map
) {
1392 // we don't have the next map the target wants, so start with a
1395 dout(10) << __func__
<< " oldest map " << max_oldest_map
<< " > since "
1396 << since
<< ", starting with full map" << dendl
;
1397 since
= m
->oldest_map
;
1398 if (!get_map_bl(since
, bl
)) {
1399 derr
<< __func__
<< " missing full map " << since
<< dendl
;
1403 max_bytes
-= bl
.length();
1404 m
->maps
[since
].claim(bl
);
1406 for (epoch_t e
= since
+ 1; e
<= to
; ++e
) {
1408 if (get_inc_map_bl(e
, bl
)) {
1409 m
->incremental_maps
[e
].claim(bl
);
1411 dout(10) << __func__
<< " missing incremental map " << e
<< dendl
;
1412 if (!get_map_bl(e
, bl
)) {
1413 derr
<< __func__
<< " also missing full map " << e
<< dendl
;
1416 m
->maps
[e
].claim(bl
);
1419 max_bytes
-= bl
.length();
1420 if (max
<= 0 || max_bytes
<= 0) {
1427 if (!m
->maps
.empty() ||
1428 !m
->incremental_maps
.empty()) {
1429 // send what we have so far
1434 if (get_inc_map_bl(m
->newest_map
, bl
)) {
1435 m
->incremental_maps
[m
->newest_map
].claim(bl
);
1437 derr
<< __func__
<< " unable to load latest map " << m
->newest_map
<< dendl
;
1438 if (!get_map_bl(m
->newest_map
, bl
)) {
1439 derr
<< __func__
<< " unable to load latest full map " << m
->newest_map
1443 m
->maps
[m
->newest_map
].claim(bl
);
1448 void OSDService::send_map(MOSDMap
*m
, Connection
*con
)
1450 con
->send_message(m
);
1453 void OSDService::send_incremental_map(epoch_t since
, Connection
*con
,
1454 const OSDMapRef
& osdmap
)
1456 epoch_t to
= osdmap
->get_epoch();
1457 dout(10) << "send_incremental_map " << since
<< " -> " << to
1458 << " to " << con
<< " " << con
->get_peer_addr() << dendl
;
1462 OSDSuperblock
sblock(get_superblock());
1463 if (since
< sblock
.oldest_map
) {
1464 // just send latest full map
1465 MOSDMap
*m
= new MOSDMap(monc
->get_fsid(),
1466 osdmap
->get_encoding_features());
1467 m
->oldest_map
= max_oldest_map
;
1468 m
->newest_map
= sblock
.newest_map
;
1469 get_map_bl(to
, m
->maps
[to
]);
1474 if (to
> since
&& (int64_t)(to
- since
) > cct
->_conf
->osd_map_share_max_epochs
) {
1475 dout(10) << " " << (to
- since
) << " > max " << cct
->_conf
->osd_map_share_max_epochs
1476 << ", only sending most recent" << dendl
;
1477 since
= to
- cct
->_conf
->osd_map_share_max_epochs
;
1480 m
= build_incremental_map_msg(since
, to
, sblock
);
1485 bool OSDService::_get_map_bl(epoch_t e
, bufferlist
& bl
)
1487 bool found
= map_bl_cache
.lookup(e
, &bl
);
1490 logger
->inc(l_osd_map_bl_cache_hit
);
1494 logger
->inc(l_osd_map_bl_cache_miss
);
1495 found
= store
->read(meta_ch
,
1496 OSD::get_osdmap_pobject_name(e
), 0, 0, bl
,
1497 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
) >= 0;
1504 bool OSDService::get_inc_map_bl(epoch_t e
, bufferlist
& bl
)
1506 std::lock_guard
l(map_cache_lock
);
1507 bool found
= map_bl_inc_cache
.lookup(e
, &bl
);
1510 logger
->inc(l_osd_map_bl_cache_hit
);
1514 logger
->inc(l_osd_map_bl_cache_miss
);
1515 found
= store
->read(meta_ch
,
1516 OSD::get_inc_osdmap_pobject_name(e
), 0, 0, bl
,
1517 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
) >= 0;
1519 _add_map_inc_bl(e
, bl
);
1524 void OSDService::_add_map_bl(epoch_t e
, bufferlist
& bl
)
1526 dout(10) << "add_map_bl " << e
<< " " << bl
.length() << " bytes" << dendl
;
1527 // cache a contiguous buffer
1528 if (bl
.get_num_buffers() > 1) {
1531 bl
.try_assign_to_mempool(mempool::mempool_osd_mapbl
);
1532 map_bl_cache
.add(e
, bl
);
1535 void OSDService::_add_map_inc_bl(epoch_t e
, bufferlist
& bl
)
1537 dout(10) << "add_map_inc_bl " << e
<< " " << bl
.length() << " bytes" << dendl
;
1538 // cache a contiguous buffer
1539 if (bl
.get_num_buffers() > 1) {
1542 bl
.try_assign_to_mempool(mempool::mempool_osd_mapbl
);
1543 map_bl_inc_cache
.add(e
, bl
);
1546 OSDMapRef
OSDService::_add_map(OSDMap
*o
)
1548 epoch_t e
= o
->get_epoch();
1550 if (cct
->_conf
->osd_map_dedup
) {
1551 // Dedup against an existing map at a nearby epoch
1552 OSDMapRef for_dedup
= map_cache
.lower_bound(e
);
1554 OSDMap::dedup(for_dedup
.get(), o
);
1558 OSDMapRef l
= map_cache
.add(e
, o
, &existed
);
1565 OSDMapRef
OSDService::try_get_map(epoch_t epoch
)
1567 std::lock_guard
l(map_cache_lock
);
1568 OSDMapRef retval
= map_cache
.lookup(epoch
);
1570 dout(30) << "get_map " << epoch
<< " -cached" << dendl
;
1572 logger
->inc(l_osd_map_cache_hit
);
1577 logger
->inc(l_osd_map_cache_miss
);
1578 epoch_t lb
= map_cache
.cached_key_lower_bound();
1580 dout(30) << "get_map " << epoch
<< " - miss, below lower bound" << dendl
;
1581 logger
->inc(l_osd_map_cache_miss_low
);
1582 logger
->inc(l_osd_map_cache_miss_low_avg
, lb
- epoch
);
1586 OSDMap
*map
= new OSDMap
;
1588 dout(20) << "get_map " << epoch
<< " - loading and decoding " << map
<< dendl
;
1590 if (!_get_map_bl(epoch
, bl
) || bl
.length() == 0) {
1591 derr
<< "failed to load OSD map for epoch " << epoch
<< ", got " << bl
.length() << " bytes" << dendl
;
1597 dout(20) << "get_map " << epoch
<< " - return initial " << map
<< dendl
;
1599 return _add_map(map
);
1605 void OSDService::reply_op_error(OpRequestRef op
, int err
)
1607 reply_op_error(op
, err
, eversion_t(), 0, {});
1610 void OSDService::reply_op_error(OpRequestRef op
, int err
, eversion_t v
,
1612 vector
<pg_log_op_return_item_t
> op_returns
)
1614 auto m
= op
->get_req
<MOSDOp
>();
1615 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1617 flags
= m
->get_flags() & (CEPH_OSD_FLAG_ACK
|CEPH_OSD_FLAG_ONDISK
);
1619 MOSDOpReply
*reply
= new MOSDOpReply(m
, err
, osdmap
->get_epoch(), flags
,
1620 !m
->has_flag(CEPH_OSD_FLAG_RETURNVEC
));
1621 reply
->set_reply_versions(v
, uv
);
1622 reply
->set_op_returns(op_returns
);
1623 m
->get_connection()->send_message(reply
);
1626 void OSDService::handle_misdirected_op(PG
*pg
, OpRequestRef op
)
1628 if (!cct
->_conf
->osd_debug_misdirected_ops
) {
1632 auto m
= op
->get_req
<MOSDOp
>();
1633 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1635 ceph_assert(m
->get_map_epoch() >= pg
->get_history().same_primary_since
);
1637 if (pg
->is_ec_pg()) {
1639 * OSD recomputes op target based on current OSDMap. With an EC pg, we
1640 * can get this result:
1641 * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
1642 * [CRUSH_ITEM_NONE, 2, 3]/3
1643 * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
1645 * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
1647 * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
1650 * We can't compute the op target based on the sending map epoch due to
1651 * splitting. The simplest thing is to detect such cases here and drop
1652 * them without an error (the client will resend anyway).
1654 ceph_assert(m
->get_map_epoch() <= superblock
.newest_map
);
1655 OSDMapRef opmap
= try_get_map(m
->get_map_epoch());
1657 dout(7) << __func__
<< ": " << *pg
<< " no longer have map for "
1658 << m
->get_map_epoch() << ", dropping" << dendl
;
1661 pg_t _pgid
= m
->get_raw_pg();
1663 if ((m
->get_flags() & CEPH_OSD_FLAG_PGOP
) == 0)
1664 _pgid
= opmap
->raw_pg_to_pg(_pgid
);
1665 if (opmap
->get_primary_shard(_pgid
, &pgid
) &&
1666 pgid
.shard
!= pg
->pg_id
.shard
) {
1667 dout(7) << __func__
<< ": " << *pg
<< " primary changed since "
1668 << m
->get_map_epoch() << ", dropping" << dendl
;
1673 dout(7) << *pg
<< " misdirected op in " << m
->get_map_epoch() << dendl
;
1674 clog
->warn() << m
->get_source_inst() << " misdirected " << m
->get_reqid()
1675 << " pg " << m
->get_raw_pg()
1676 << " to osd." << whoami
1677 << " not " << pg
->get_acting()
1678 << " in e" << m
->get_map_epoch() << "/" << osdmap
->get_epoch();
1681 void OSDService::enqueue_back(OpSchedulerItem
&& qi
)
1683 osd
->op_shardedwq
.queue(std::move(qi
));
1686 void OSDService::enqueue_front(OpSchedulerItem
&& qi
)
1688 osd
->op_shardedwq
.queue_front(std::move(qi
));
1691 void OSDService::queue_recovery_context(
1693 GenContext
<ThreadPool::TPHandle
&> *c
)
1695 epoch_t e
= get_osdmap_epoch();
1698 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1699 new PGRecoveryContext(pg
->get_pgid(), c
, e
)),
1700 cct
->_conf
->osd_recovery_cost
,
1701 cct
->_conf
->osd_recovery_priority
,
1707 void OSDService::queue_for_snap_trim(PG
*pg
)
1709 dout(10) << "queueing " << *pg
<< " for snaptrim" << dendl
;
1712 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1713 new PGSnapTrim(pg
->get_pgid(), pg
->get_osdmap_epoch())),
1714 cct
->_conf
->osd_snap_trim_cost
,
1715 cct
->_conf
->osd_snap_trim_priority
,
1718 pg
->get_osdmap_epoch()));
1721 void OSDService::queue_for_scrub(PG
*pg
, bool with_high_priority
)
1723 unsigned scrub_queue_priority
= pg
->scrubber
.priority
;
1724 if (with_high_priority
&& scrub_queue_priority
< cct
->_conf
->osd_client_op_priority
) {
1725 scrub_queue_priority
= cct
->_conf
->osd_client_op_priority
;
1727 const auto epoch
= pg
->get_osdmap_epoch();
1730 unique_ptr
<OpSchedulerItem::OpQueueable
>(new PGScrub(pg
->get_pgid(), epoch
)),
1731 cct
->_conf
->osd_scrub_cost
,
1732 scrub_queue_priority
,
1738 void OSDService::queue_for_pg_delete(spg_t pgid
, epoch_t e
)
1740 dout(10) << __func__
<< " on " << pgid
<< " e " << e
<< dendl
;
1743 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1744 new PGDelete(pgid
, e
)),
1745 cct
->_conf
->osd_pg_delete_cost
,
1746 cct
->_conf
->osd_pg_delete_priority
,
1752 bool OSDService::try_finish_pg_delete(PG
*pg
, unsigned old_pg_num
)
1754 return osd
->try_finish_pg_delete(pg
, old_pg_num
);
1759 void OSDService::set_ready_to_merge_source(PG
*pg
, eversion_t version
)
1761 std::lock_guard
l(merge_lock
);
1762 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1763 ready_to_merge_source
[pg
->pg_id
.pgid
] = version
;
1764 assert(not_ready_to_merge_source
.count(pg
->pg_id
.pgid
) == 0);
1765 _send_ready_to_merge();
1768 void OSDService::set_ready_to_merge_target(PG
*pg
,
1770 epoch_t last_epoch_started
,
1771 epoch_t last_epoch_clean
)
1773 std::lock_guard
l(merge_lock
);
1774 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1775 ready_to_merge_target
.insert(make_pair(pg
->pg_id
.pgid
,
1778 last_epoch_clean
)));
1779 assert(not_ready_to_merge_target
.count(pg
->pg_id
.pgid
) == 0);
1780 _send_ready_to_merge();
1783 void OSDService::set_not_ready_to_merge_source(pg_t source
)
1785 std::lock_guard
l(merge_lock
);
1786 dout(10) << __func__
<< " " << source
<< dendl
;
1787 not_ready_to_merge_source
.insert(source
);
1788 assert(ready_to_merge_source
.count(source
) == 0);
1789 _send_ready_to_merge();
1792 void OSDService::set_not_ready_to_merge_target(pg_t target
, pg_t source
)
1794 std::lock_guard
l(merge_lock
);
1795 dout(10) << __func__
<< " " << target
<< " source " << source
<< dendl
;
1796 not_ready_to_merge_target
[target
] = source
;
1797 assert(ready_to_merge_target
.count(target
) == 0);
1798 _send_ready_to_merge();
1801 void OSDService::send_ready_to_merge()
1803 std::lock_guard
l(merge_lock
);
1804 _send_ready_to_merge();
1807 void OSDService::_send_ready_to_merge()
1809 dout(20) << __func__
1810 << " ready_to_merge_source " << ready_to_merge_source
1811 << " not_ready_to_merge_source " << not_ready_to_merge_source
1812 << " ready_to_merge_target " << ready_to_merge_target
1813 << " not_ready_to_merge_target " << not_ready_to_merge_target
1814 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1816 for (auto src
: not_ready_to_merge_source
) {
1817 if (sent_ready_to_merge_source
.count(src
) == 0) {
1818 monc
->send_mon_message(new MOSDPGReadyToMerge(
1822 osdmap
->get_epoch()));
1823 sent_ready_to_merge_source
.insert(src
);
1826 for (auto p
: not_ready_to_merge_target
) {
1827 if (sent_ready_to_merge_source
.count(p
.second
) == 0) {
1828 monc
->send_mon_message(new MOSDPGReadyToMerge(
1832 osdmap
->get_epoch()));
1833 sent_ready_to_merge_source
.insert(p
.second
);
1836 for (auto src
: ready_to_merge_source
) {
1837 if (not_ready_to_merge_source
.count(src
.first
) ||
1838 not_ready_to_merge_target
.count(src
.first
.get_parent())) {
1841 auto p
= ready_to_merge_target
.find(src
.first
.get_parent());
1842 if (p
!= ready_to_merge_target
.end() &&
1843 sent_ready_to_merge_source
.count(src
.first
) == 0) {
1844 monc
->send_mon_message(new MOSDPGReadyToMerge(
1845 src
.first
, // source pgid
1846 src
.second
, // src version
1847 std::get
<0>(p
->second
), // target version
1848 std::get
<1>(p
->second
), // PG's last_epoch_started
1849 std::get
<2>(p
->second
), // PG's last_epoch_clean
1851 osdmap
->get_epoch()));
1852 sent_ready_to_merge_source
.insert(src
.first
);
1857 void OSDService::clear_ready_to_merge(PG
*pg
)
1859 std::lock_guard
l(merge_lock
);
1860 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1861 ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
1862 ready_to_merge_target
.erase(pg
->pg_id
.pgid
);
1863 not_ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
1864 not_ready_to_merge_target
.erase(pg
->pg_id
.pgid
);
1865 sent_ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
1868 void OSDService::clear_sent_ready_to_merge()
1870 std::lock_guard
l(merge_lock
);
1871 sent_ready_to_merge_source
.clear();
1874 void OSDService::prune_sent_ready_to_merge(const OSDMapRef
& osdmap
)
1876 std::lock_guard
l(merge_lock
);
1877 auto i
= sent_ready_to_merge_source
.begin();
1878 while (i
!= sent_ready_to_merge_source
.end()) {
1879 if (!osdmap
->pg_exists(*i
)) {
1880 dout(10) << __func__
<< " " << *i
<< dendl
;
1881 i
= sent_ready_to_merge_source
.erase(i
);
1890 void OSDService::_queue_for_recovery(
1891 std::pair
<epoch_t
, PGRef
> p
,
1892 uint64_t reserved_pushes
)
1894 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock
));
1897 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1899 p
.second
->get_pgid(), p
.first
, reserved_pushes
)),
1900 cct
->_conf
->osd_recovery_cost
,
1901 cct
->_conf
->osd_recovery_priority
,
1907 // ====================================================================
1911 #define dout_prefix *_dout
1913 // Commands shared between OSD's console and admin console:
1915 namespace osd_cmds
{
1917 int heap(CephContext
& cct
, const cmdmap_t
& cmdmap
, Formatter
& f
, std::ostream
& os
);
1919 }} // namespace ceph::osd_cmds
1921 int OSD::mkfs(CephContext
*cct
, ObjectStore
*store
, uuid_d fsid
, int whoami
, string osdspec_affinity
)
1927 ObjectStore::CollectionHandle ch
;
1929 // if we are fed a uuid for this osd, use it.
1930 store
->set_fsid(cct
->_conf
->osd_uuid
);
1932 ret
= store
->mkfs();
1934 derr
<< "OSD::mkfs: ObjectStore::mkfs failed with error "
1935 << cpp_strerror(ret
) << dendl
;
1939 store
->set_cache_shards(1); // doesn't matter for mkfs!
1941 ret
= store
->mount();
1943 derr
<< "OSD::mkfs: couldn't mount ObjectStore: error "
1944 << cpp_strerror(ret
) << dendl
;
1948 ch
= store
->open_collection(coll_t::meta());
1950 ret
= store
->read(ch
, OSD_SUPERBLOCK_GOBJECT
, 0, 0, sbbl
);
1952 derr
<< "OSD::mkfs: have meta collection but no superblock" << dendl
;
1955 /* if we already have superblock, check content of superblock */
1956 dout(0) << " have superblock" << dendl
;
1957 auto p
= sbbl
.cbegin();
1959 if (whoami
!= sb
.whoami
) {
1960 derr
<< "provided osd id " << whoami
<< " != superblock's " << sb
.whoami
1965 if (fsid
!= sb
.cluster_fsid
) {
1966 derr
<< "provided cluster fsid " << fsid
1967 << " != superblock's " << sb
.cluster_fsid
<< dendl
;
1972 // create superblock
1973 sb
.cluster_fsid
= fsid
;
1974 sb
.osd_fsid
= store
->get_fsid();
1976 sb
.compat_features
= get_osd_initial_compat_set();
1981 ObjectStore::CollectionHandle ch
= store
->create_new_collection(
1983 ObjectStore::Transaction t
;
1984 t
.create_collection(coll_t::meta(), 0);
1985 t
.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT
, 0, bl
.length(), bl
);
1986 ret
= store
->queue_transaction(ch
, std::move(t
));
1988 derr
<< "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
1989 << "queue_transaction returned " << cpp_strerror(ret
) << dendl
;
1994 ret
= write_meta(cct
, store
, sb
.cluster_fsid
, sb
.osd_fsid
, whoami
, osdspec_affinity
);
1996 derr
<< "OSD::mkfs: failed to write fsid file: error "
1997 << cpp_strerror(ret
) << dendl
;
2011 int OSD::write_meta(CephContext
*cct
, ObjectStore
*store
, uuid_d
& cluster_fsid
, uuid_d
& osd_fsid
, int whoami
, string
& osdspec_affinity
)
2016 snprintf(val
, sizeof(val
), "%s", CEPH_OSD_ONDISK_MAGIC
);
2017 r
= store
->write_meta("magic", val
);
2021 snprintf(val
, sizeof(val
), "%d", whoami
);
2022 r
= store
->write_meta("whoami", val
);
2026 cluster_fsid
.print(val
);
2027 r
= store
->write_meta("ceph_fsid", val
);
2031 string key
= cct
->_conf
.get_val
<string
>("key");
2033 r
= store
->write_meta("osd_key", key
);
2037 string keyfile
= cct
->_conf
.get_val
<string
>("keyfile");
2038 if (!keyfile
.empty()) {
2041 r
= keybl
.read_file(keyfile
.c_str(), &err
);
2043 derr
<< __func__
<< " failed to read keyfile " << keyfile
<< ": "
2044 << err
<< ": " << cpp_strerror(r
) << dendl
;
2047 r
= store
->write_meta("osd_key", keybl
.to_str());
2052 if (!osdspec_affinity
.empty()) {
2053 r
= store
->write_meta("osdspec_affinity", osdspec_affinity
.c_str());
2058 r
= store
->write_meta("ready", "ready");
2065 int OSD::peek_meta(ObjectStore
*store
,
2067 uuid_d
*cluster_fsid
,
2070 ceph_release_t
*require_osd_release
)
2074 int r
= store
->read_meta("magic", &val
);
2079 r
= store
->read_meta("whoami", &val
);
2082 *whoami
= atoi(val
.c_str());
2084 r
= store
->read_meta("ceph_fsid", &val
);
2087 r
= cluster_fsid
->parse(val
.c_str());
2091 r
= store
->read_meta("fsid", &val
);
2093 *osd_fsid
= uuid_d();
2095 r
= osd_fsid
->parse(val
.c_str());
2100 r
= store
->read_meta("require_osd_release", &val
);
2102 *require_osd_release
= ceph_release_from_name(val
);
2110 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2114 OSD::OSD(CephContext
*cct_
, ObjectStore
*store_
,
2116 Messenger
*internal_messenger
,
2117 Messenger
*external_messenger
,
2118 Messenger
*hb_client_front
,
2119 Messenger
*hb_client_back
,
2120 Messenger
*hb_front_serverm
,
2121 Messenger
*hb_back_serverm
,
2122 Messenger
*osdc_messenger
,
2124 const std::string
&dev
, const std::string
&jdev
) :
2126 tick_timer(cct
, osd_lock
),
2127 tick_timer_without_osd_lock(cct
, tick_timer_lock
),
2128 gss_ktfile_client(cct
->_conf
.get_val
<std::string
>("gss_ktab_client_file")),
2129 cluster_messenger(internal_messenger
),
2130 client_messenger(external_messenger
),
2131 objecter_messenger(osdc_messenger
),
2133 mgrc(cct_
, client_messenger
, &mc
->monmap
),
2135 recoverystate_perf(NULL
),
2137 log_client(cct
, client_messenger
, &mc
->monmap
, LogClient::NO_FLAGS
),
2138 clog(log_client
.create_channel()),
2140 dev_path(dev
), journal_path(jdev
),
2141 store_is_rotational(store
->is_rotational()),
2142 trace_endpoint("0.0.0.0", 0, "osd"),
2144 m_osd_pg_epoch_max_lag_factor(cct
->_conf
.get_val
<double>(
2145 "osd_pg_epoch_max_lag_factor")),
2146 osd_compat(get_osd_compat_set()),
2147 osd_op_tp(cct
, "OSD::osd_op_tp", "tp_osd_tp",
2148 get_num_op_threads()),
2149 heartbeat_stop(false),
2150 heartbeat_need_update(true),
2151 hb_front_client_messenger(hb_client_front
),
2152 hb_back_client_messenger(hb_client_back
),
2153 hb_front_server_messenger(hb_front_serverm
),
2154 hb_back_server_messenger(hb_back_serverm
),
2156 heartbeat_thread(this),
2157 heartbeat_dispatcher(this),
2158 op_tracker(cct
, cct
->_conf
->osd_enable_op_tracker
,
2159 cct
->_conf
->osd_num_op_tracker_shard
),
2160 test_ops_hook(NULL
),
2163 cct
->_conf
->osd_op_thread_timeout
,
2164 cct
->_conf
->osd_op_thread_suicide_timeout
,
2166 last_pg_create_epoch(0),
2169 requested_full_first(0),
2170 requested_full_last(0),
2174 if (!gss_ktfile_client
.empty()) {
2175 // Assert we can export environment variable
2177 The default client keytab is used, if it is present and readable,
2178 to automatically obtain initial credentials for GSSAPI client
2179 applications. The principal name of the first entry in the client
2180 keytab is used by default when obtaining initial credentials.
2181 1. The KRB5_CLIENT_KTNAME environment variable.
2182 2. The default_client_keytab_name profile variable in [libdefaults].
2183 3. The hardcoded default, DEFCKTNAME.
2185 const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
2186 gss_ktfile_client
.c_str(), 1));
2187 ceph_assert(set_result
== 0);
2190 monc
->set_messenger(client_messenger
);
2191 op_tracker
.set_complaint_and_threshold(cct
->_conf
->osd_op_complaint_time
,
2192 cct
->_conf
->osd_op_log_threshold
);
2193 op_tracker
.set_history_size_and_duration(cct
->_conf
->osd_op_history_size
,
2194 cct
->_conf
->osd_op_history_duration
);
2195 op_tracker
.set_history_slow_op_size_and_threshold(cct
->_conf
->osd_op_history_slow_op_size
,
2196 cct
->_conf
->osd_op_history_slow_op_threshold
);
2197 ObjectCleanRegions::set_max_num_intervals(cct
->_conf
->osd_object_clean_region_max_num_intervals
);
2199 std::stringstream ss
;
2200 ss
<< "osd." << whoami
;
2201 trace_endpoint
.copy_name(ss
.str());
2204 // initialize shards
2205 num_shards
= get_num_op_shards();
2206 for (uint32_t i
= 0; i
< num_shards
; i
++) {
2207 OSDShard
*one_shard
= new OSDShard(
2211 shards
.push_back(one_shard
);
2217 while (!shards
.empty()) {
2218 delete shards
.back();
2221 cct
->get_perfcounters_collection()->remove(recoverystate_perf
);
2222 cct
->get_perfcounters_collection()->remove(logger
);
2223 delete recoverystate_perf
;
2228 double OSD::get_tick_interval() const
2230 // vary +/- 5% to avoid scrub scheduling livelocks
2231 constexpr auto delta
= 0.05;
2232 return (OSD_TICK_INTERVAL
*
2233 ceph::util::generate_random_number(1.0 - delta
, 1.0 + delta
));
2236 void OSD::handle_signal(int signum
)
2238 ceph_assert(signum
== SIGINT
|| signum
== SIGTERM
);
2239 derr
<< "*** Got signal " << sig_str(signum
) << " ***" << dendl
;
2245 std::lock_guard
lock(osd_lock
);
2249 if (store
->test_mount_in_use()) {
2250 derr
<< "OSD::pre_init: object store '" << dev_path
<< "' is "
2251 << "currently in use. (Is ceph-osd already running?)" << dendl
;
2255 cct
->_conf
.add_observer(this);
2259 int OSD::set_numa_affinity()
2261 // storage numa node
2262 int store_node
= -1;
2263 store
->get_numa_node(&store_node
, nullptr, nullptr);
2264 if (store_node
>= 0) {
2265 dout(1) << __func__
<< " storage numa node " << store_node
<< dendl
;
2268 // check network numa node(s)
2269 int front_node
= -1, back_node
= -1;
2270 string front_iface
= pick_iface(
2272 client_messenger
->get_myaddrs().front().get_sockaddr_storage());
2273 string back_iface
= pick_iface(
2275 cluster_messenger
->get_myaddrs().front().get_sockaddr_storage());
2276 int r
= get_iface_numa_node(front_iface
, &front_node
);
2277 if (r
>= 0 && front_node
>= 0) {
2278 dout(1) << __func__
<< " public network " << front_iface
<< " numa node "
2279 << front_node
<< dendl
;
2280 r
= get_iface_numa_node(back_iface
, &back_node
);
2281 if (r
>= 0 && back_node
>= 0) {
2282 dout(1) << __func__
<< " cluster network " << back_iface
<< " numa node "
2283 << back_node
<< dendl
;
2284 if (front_node
== back_node
&&
2285 front_node
== store_node
) {
2286 dout(1) << " objectstore and network numa nodes all match" << dendl
;
2287 if (g_conf().get_val
<bool>("osd_numa_auto_affinity")) {
2288 numa_node
= front_node
;
2290 } else if (front_node
!= back_node
) {
2291 dout(1) << __func__
<< " public and cluster network numa nodes do not match"
2294 dout(1) << __func__
<< " objectstore and network numa nodes do not match"
2297 } else if (back_node
== -2) {
2298 dout(1) << __func__
<< " cluster network " << back_iface
2299 << " ports numa nodes do not match" << dendl
;
2301 derr
<< __func__
<< " unable to identify cluster interface '" << back_iface
2302 << "' numa node: " << cpp_strerror(r
) << dendl
;
2304 } else if (front_node
== -2) {
2305 dout(1) << __func__
<< " public network " << front_iface
2306 << " ports numa nodes do not match" << dendl
;
2308 derr
<< __func__
<< " unable to identify public interface '" << front_iface
2309 << "' numa node: " << cpp_strerror(r
) << dendl
;
2311 if (int node
= g_conf().get_val
<int64_t>("osd_numa_node"); node
>= 0) {
2312 // this takes precedence over the automagic logic above
2315 if (numa_node
>= 0) {
2316 int r
= get_numa_node_cpu_set(numa_node
, &numa_cpu_set_size
, &numa_cpu_set
);
2318 dout(1) << __func__
<< " unable to determine numa node " << numa_node
2319 << " CPUs" << dendl
;
2322 dout(1) << __func__
<< " setting numa affinity to node " << numa_node
2324 << cpu_set_to_str_list(numa_cpu_set_size
, &numa_cpu_set
)
2326 r
= set_cpu_affinity_all_threads(numa_cpu_set_size
, &numa_cpu_set
);
2329 derr
<< __func__
<< " failed to set numa affinity: " << cpp_strerror(r
)
2335 dout(1) << __func__
<< " not setting numa affinity" << dendl
;
// Admin-socket hook that routes "ceph daemon osd.N <command>" requests to the
// owning OSD.  NOTE(review): this chunk is a line-mangled extraction with the
// original file's line numbers embedded in the text and some lines elided
// (e.g. the call_async() signature header); code left byte-identical,
// comments only.
2342 class OSDSocketHook
: public AdminSocketHook
{
// Borrowed (non-owning) pointer to the OSD this hook serves.
2345 explicit OSDSocketHook(OSD
*o
) : osd(o
) {}
// Synchronous entry point: deliberately aborts — OSD asok commands must use
// the async path below.
2346 int call(std::string_view prefix
, const cmdmap_t
& cmdmap
,
2349 bufferlist
& out
) override
{
2350 ceph_abort("should use async hook");
// Async entry point: forwards to OSD::asok_command(); completion is reported
// via the on_finish callback.
2353 std::string_view prefix
,
2354 const cmdmap_t
& cmdmap
,
2356 const bufferlist
& inbl
,
2357 std::function
<void(int,const std::string
&,bufferlist
&)> on_finish
) override
{
2359 osd
->asok_command(prefix
, cmdmap
, f
, inbl
, on_finish
);
// Malformed command arguments raise bad_cmd_get; report EINVAL through the
// callback rather than letting the exception escape the admin socket thread.
2360 } catch (const TOPNSPC::common::bad_cmd_get
& e
) {
2362 on_finish(-EINVAL
, e
.what(), empty
);
// Returns the ids of every pool that currently has a PG mapped to this OSD.
// NOTE(review): the extraction elides lines here — the call that populates
// 'pgids' (presumably _get_pgids(&pgids)) and the final 'return pools;' are
// not visible; confirm against the full source.
2367 std::set
<int64_t> OSD::get_mapped_pools()
2369 std::set
<int64_t> pools
;
2370 std::vector
<spg_t
> pgids
;
// Collapse per-PG ids down to their owning pool id (set dedupes).
2372 for (const auto &pgid
: pgids
) {
2373 pools
.insert(pgid
.pool());
// Central dispatcher for OSD admin-socket ("asok") commands.  Routes PG-level
// commands to the target PG's do_command(), and handles all OSD-level
// commands inline; output goes either through the Formatter or via 'outbl',
// and completion is always signalled through on_finish(ret, msg, out).
// NOTE(review): this extraction is line-mangled and elides many lines
// (the Formatter *f parameter line, 'int ret', braces, early-exit gotos);
// code left byte-identical, comments only.
2378 void OSD::asok_command(
2379 std::string_view prefix
, const cmdmap_t
& cmdmap
,
2381 const bufferlist
& inbl
,
2382 std::function
<void(int,const std::string
&,bufferlist
&)> on_finish
)
2385 stringstream ss
; // stderr error message stream
2386 bufferlist outbl
; // if empty at end, we'll dump formatter as output
2388 // --- PG commands are routed here to PG::do_command ---
2389 if (prefix
== "pg" ||
2390 prefix
== "query" ||
2391 prefix
== "mark_unfound_lost" ||
2392 prefix
== "list_unfound" ||
2393 prefix
== "scrub" ||
2394 prefix
== "deep_scrub"
// Parse and validate the target pgid, then forward to the primary PG.
2398 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
2399 ss
<< "no pgid specified";
2403 if (!pgid
.parse(pgidstr
.c_str())) {
2404 ss
<< "couldn't parse pgid '" << pgidstr
<< "'";
2410 if (get_osdmap()->get_primary_shard(pgid
, &pcand
) &&
2411 (pg
= _lookup_lock_pg(pcand
))) {
2412 if (pg
->is_primary()) {
2413 cmdmap_t new_cmdmap
= cmdmap
;
2415 pg
->do_command(prefix
, new_cmdmap
, inbl
, on_finish
);
2417 return; // the pg handler calls on_finish directly
2418 } catch (const TOPNSPC::common::bad_cmd_get
& e
) {
2425 ss
<< "not primary for pgid " << pgid
;
2426 // do not reply; they will get newer maps and realize they
2433 ss
<< "i don't have pgid " << pgid
;
2438 // --- OSD commands follow ---
2440 else if (prefix
== "status") {
2441 lock_guard
l(osd_lock
);
2442 f
->open_object_section("status");
2443 f
->dump_stream("cluster_fsid") << superblock
.cluster_fsid
;
2444 f
->dump_stream("osd_fsid") << superblock
.osd_fsid
;
2445 f
->dump_unsigned("whoami", superblock
.whoami
);
2446 f
->dump_string("state", get_state_name(get_state()));
2447 f
->dump_unsigned("oldest_map", superblock
.oldest_map
);
2448 f
->dump_unsigned("newest_map", superblock
.newest_map
);
2449 f
->dump_unsigned("num_pgs", num_pgs
);
2451 } else if (prefix
== "flush_journal") {
2452 store
->flush_journal();
2453 } else if (prefix
== "dump_ops_in_flight" ||
2455 prefix
== "dump_blocked_ops" ||
2456 prefix
== "dump_historic_ops" ||
2457 prefix
== "dump_historic_ops_by_duration" ||
2458 prefix
== "dump_historic_slow_ops") {
2460 const string error_str
= "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2461 even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2462 will start to track new ops received afterwards.";
// Optional space-separated filters restrict which tracked ops are dumped.
2464 set
<string
> filters
;
2465 vector
<string
> filter_str
;
2466 if (cmd_getval(cmdmap
, "filterstr", filter_str
)) {
2467 copy(filter_str
.begin(), filter_str
.end(),
2468 inserter(filters
, filters
.end()));
2471 if (prefix
== "dump_ops_in_flight" ||
2473 if (!op_tracker
.dump_ops_in_flight(f
, false, filters
)) {
2479 if (prefix
== "dump_blocked_ops") {
2480 if (!op_tracker
.dump_ops_in_flight(f
, true, filters
)) {
2486 if (prefix
== "dump_historic_ops") {
2487 if (!op_tracker
.dump_historic_ops(f
, false, filters
)) {
2493 if (prefix
== "dump_historic_ops_by_duration") {
2494 if (!op_tracker
.dump_historic_ops(f
, true, filters
)) {
2500 if (prefix
== "dump_historic_slow_ops") {
2501 if (!op_tracker
.dump_historic_slow_ops(f
, filters
)) {
2507 } else if (prefix
== "dump_op_pq_state") {
2508 f
->open_object_section("pq");
2509 op_shardedwq
.dump(f
);
2511 } else if (prefix
== "dump_blacklist") {
2512 list
<pair
<entity_addr_t
,utime_t
> > bl
;
2513 OSDMapRef curmap
= service
.get_osdmap();
2515 f
->open_array_section("blacklist");
2516 curmap
->get_blacklist(&bl
);
2517 for (list
<pair
<entity_addr_t
,utime_t
> >::iterator it
= bl
.begin();
2518 it
!= bl
.end(); ++it
) {
2519 f
->open_object_section("entry");
2520 f
->open_object_section("entity_addr_t");
2522 f
->close_section(); //entity_addr_t
2523 it
->second
.localtime(f
->dump_stream("expire_time"));
2524 f
->close_section(); //entry
2526 f
->close_section(); //blacklist
2527 } else if (prefix
== "dump_watchers") {
2528 list
<obj_watch_item_t
> watchers
;
// Gather watchers from every PG, splicing to avoid per-item copies.
2532 for (auto& pg
: pgs
) {
2533 list
<obj_watch_item_t
> pg_watchers
;
2534 pg
->get_watchers(&pg_watchers
);
2535 watchers
.splice(watchers
.end(), pg_watchers
);
2538 f
->open_array_section("watchers");
2539 for (list
<obj_watch_item_t
>::iterator it
= watchers
.begin();
2540 it
!= watchers
.end(); ++it
) {
2542 f
->open_object_section("watch");
2544 f
->dump_string("namespace", it
->obj
.nspace
);
2545 f
->dump_string("object", it
->obj
.oid
.name
);
2547 f
->open_object_section("entity_name");
2548 it
->wi
.name
.dump(f
);
2549 f
->close_section(); //entity_name_t
2551 f
->dump_unsigned("cookie", it
->wi
.cookie
);
2552 f
->dump_unsigned("timeout", it
->wi
.timeout_seconds
);
2554 f
->open_object_section("entity_addr_t");
2555 it
->wi
.addr
.dump(f
);
2556 f
->close_section(); //entity_addr_t
2558 f
->close_section(); //watch
2561 f
->close_section(); //watchers
2562 } else if (prefix
== "dump_recovery_reservations") {
2563 f
->open_object_section("reservations");
2564 f
->open_object_section("local_reservations");
2565 service
.local_reserver
.dump(f
);
2567 f
->open_object_section("remote_reservations");
2568 service
.remote_reserver
.dump(f
);
2571 } else if (prefix
== "dump_scrub_reservations") {
2572 f
->open_object_section("scrub_reservations");
2573 service
.dump_scrub_reservations(f
);
2575 } else if (prefix
== "get_latest_osdmap") {
2576 get_latest_osdmap();
2577 } else if (prefix
== "set_heap_property") {
2581 bool success
= false;
2582 if (!cmd_getval(cmdmap
, "property", property
)) {
2583 error
= "unable to get property";
2585 } else if (!cmd_getval(cmdmap
, "value", value
)) {
2586 error
= "unable to get value";
2588 } else if (value
< 0) {
2589 error
= "negative value not allowed";
2591 } else if (!ceph_heap_set_numeric_property(property
.c_str(), (size_t)value
)) {
2592 error
= "invalid property";
2597 f
->open_object_section("result");
2598 f
->dump_string("error", error
);
2599 f
->dump_bool("success", success
);
2601 } else if (prefix
== "get_heap_property") {
2605 bool success
= false;
2606 if (!cmd_getval(cmdmap
, "property", property
)) {
2607 error
= "unable to get property";
2609 } else if (!ceph_heap_get_numeric_property(property
.c_str(), &value
)) {
2610 error
= "invalid property";
2615 f
->open_object_section("result");
2616 f
->dump_string("error", error
);
2617 f
->dump_bool("success", success
);
2618 f
->dump_int("value", value
);
2620 } else if (prefix
== "dump_objectstore_kv_stats") {
2621 store
->get_db_statistics(f
);
2622 } else if (prefix
== "dump_scrubs") {
2623 service
.dumps_scrub(f
);
2624 } else if (prefix
== "calc_objectstore_db_histogram") {
2625 store
->generate_db_histogram(f
);
2626 } else if (prefix
== "flush_store_cache") {
2627 store
->flush_cache(&ss
);
2628 } else if (prefix
== "dump_pgstate_history") {
2629 f
->open_object_section("pgstate_history");
2630 f
->open_array_section("pgs");
2633 for (auto& pg
: pgs
) {
2634 f
->open_object_section("pg");
2635 f
->dump_stream("pg") << pg
->pg_id
;
2636 f
->dump_string("currently", pg
->get_current_state());
2637 pg
->dump_pgstate_history(f
);
2642 } else if (prefix
== "compact") {
2643 dout(1) << "triggering manual compaction" << dendl
;
2644 auto start
= ceph::coarse_mono_clock::now();
2646 auto end
= ceph::coarse_mono_clock::now();
2647 double duration
= std::chrono::duration
<double>(end
-start
).count();
2648 dout(1) << "finished manual compaction in "
2650 << " seconds" << dendl
;
2651 f
->open_object_section("compact_result");
2652 f
->dump_float("elapsed_time", duration
);
2654 } else if (prefix
== "get_mapped_pools") {
2655 f
->open_array_section("mapped_pools");
2656 set
<int64_t> poollist
= get_mapped_pools();
2657 for (auto pool
: poollist
) {
2658 f
->dump_int("pool_id", pool
);
2661 } else if (prefix
== "smart") {
2663 cmd_getval(cmdmap
, "devid", devid
);
2665 probe_smart(devid
, out
);
2666 outbl
.append(out
.str());
2667 } else if (prefix
== "list_devices") {
2668 set
<string
> devnames
;
2669 store
->get_devices(&devnames
);
2670 f
->open_array_section("list_devices");
2671 for (auto dev
: devnames
) {
2672 if (dev
.find("dm-") == 0) {
2676 f
->open_object_section("device");
2677 f
->dump_string("device", "/dev/" + dev
);
2678 f
->dump_string("device_id", get_device_id(dev
, &err
));
2682 } else if (prefix
== "send_beacon") {
2683 lock_guard
l(osd_lock
);
2685 send_beacon(ceph::coarse_mono_clock::now());
2689 else if (prefix
== "cluster_log") {
2691 cmd_getval(cmdmap
, "message", msg
);
2694 ss
<< "ignoring empty log message";
// Re-join the message words into a single log line.
2697 string message
= msg
.front();
2698 for (vector
<string
>::iterator a
= ++msg
.begin(); a
!= msg
.end(); ++a
)
2699 message
+= " " + *a
;
2701 cmd_getval(cmdmap
, "level", lvl
);
2702 clog_type level
= string_to_clog_type(lvl
);
2705 ss
<< "unknown level '" << lvl
<< "'";
2708 clog
->do_log(level
, message
);
// --- bench: synthetic write workload against the meta collection, with
// sanity caps on block size and byte count so the OSD doesn't stall. ---
2711 else if (prefix
== "bench") {
2714 int64_t osize
, onum
;
2715 // default count 1G, size 4MB
2716 cmd_getval(cmdmap
, "count", count
, (int64_t)1 << 30);
2717 cmd_getval(cmdmap
, "size", bsize
, (int64_t)4 << 20);
2718 cmd_getval(cmdmap
, "object_size", osize
, (int64_t)0);
2719 cmd_getval(cmdmap
, "object_num", onum
, (int64_t)0);
2721 uint32_t duration
= cct
->_conf
->osd_bench_duration
;
2723 if (bsize
> (int64_t) cct
->_conf
->osd_bench_max_block_size
) {
2724 // let us limit the block size because the next checks rely on it
2725 // having a sane value. If we allow any block size to be set things
2726 // can still go sideways.
2727 ss
<< "block 'size' values are capped at "
2728 << byte_u_t(cct
->_conf
->osd_bench_max_block_size
) << ". If you wish to use"
2729 << " a higher value, please adjust 'osd_bench_max_block_size'";
2732 } else if (bsize
< (int64_t) (1 << 20)) {
2733 // entering the realm of small block sizes.
2734 // limit the count to a sane value, assuming a configurable amount of
2735 // IOPS and duration, so that the OSD doesn't get hung up on this,
2736 // preventing timeouts from going off
2738 bsize
* duration
* cct
->_conf
->osd_bench_small_size_max_iops
;
2739 if (count
> max_count
) {
2740 ss
<< "'count' values greater than " << max_count
2741 << " for a block size of " << byte_u_t(bsize
) << ", assuming "
2742 << cct
->_conf
->osd_bench_small_size_max_iops
<< " IOPS,"
2743 << " for " << duration
<< " seconds,"
2744 << " can cause ill effects on osd. "
2745 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
2746 << " value if you wish to use a higher 'count'.";
2751 // 1MB block sizes are big enough so that we get more stuff done.
2752 // However, to avoid the osd from getting hung on this and having
2753 // timers being triggered, we are going to limit the count assuming
2754 // a configurable throughput and duration.
2755 // NOTE: max_count is the total amount of bytes that we believe we
2756 // will be able to write during 'duration' for the given
2757 // throughput. The block size hardly impacts this unless it's
2758 // way too big. Given we already check how big the block size
2759 // is, it's safe to assume everything will check out.
2761 cct
->_conf
->osd_bench_large_size_max_throughput
* duration
;
2762 if (count
> max_count
) {
2763 ss
<< "'count' values greater than " << max_count
2764 << " for a block size of " << byte_u_t(bsize
) << ", assuming "
2765 << byte_u_t(cct
->_conf
->osd_bench_large_size_max_throughput
) << "/s,"
2766 << " for " << duration
<< " seconds,"
2767 << " can cause ill effects on osd. "
2768 << " Please adjust 'osd_bench_large_size_max_throughput'"
2769 << " with a higher value if you wish to use a higher 'count'.";
2775 if (osize
&& bsize
> osize
)
2778 dout(1) << " bench count " << count
2779 << " bsize " << byte_u_t(bsize
) << dendl
;
2781 ObjectStore::Transaction cleanupt
;
// Pre-create the target objects when object_size/object_num are given, and
// queue their removal in 'cleanupt' for after the run.
2783 if (osize
&& onum
) {
2785 bufferptr
bp(osize
);
2787 bl
.push_back(std::move(bp
));
2788 bl
.rebuild_page_aligned();
2789 for (int i
=0; i
<onum
; ++i
) {
2791 snprintf(nm
, sizeof(nm
), "disk_bw_test_%d", i
);
2793 hobject_t
soid(sobject_t(oid
, 0));
2794 ObjectStore::Transaction t
;
2795 t
.write(coll_t(), ghobject_t(soid
), 0, osize
, bl
);
2796 store
->queue_transaction(service
.meta_ch
, std::move(t
), NULL
);
2797 cleanupt
.remove(coll_t(), ghobject_t(soid
));
2802 bufferptr
bp(bsize
);
2804 bl
.push_back(std::move(bp
));
2805 bl
.rebuild_page_aligned();
2809 if (!service
.meta_ch
->flush_commit(&waiter
)) {
// Timed write loop: either hit random offsets of the pre-created objects or
// write one fresh object per position.
2814 utime_t start
= ceph_clock_now();
2815 for (int64_t pos
= 0; pos
< count
; pos
+= bsize
) {
2817 unsigned offset
= 0;
2818 if (onum
&& osize
) {
2819 snprintf(nm
, sizeof(nm
), "disk_bw_test_%d", (int)(rand() % onum
));
2820 offset
= rand() % (osize
/ bsize
) * bsize
;
2822 snprintf(nm
, sizeof(nm
), "disk_bw_test_%lld", (long long)pos
);
2825 hobject_t
soid(sobject_t(oid
, 0));
2826 ObjectStore::Transaction t
;
2827 t
.write(coll_t::meta(), ghobject_t(soid
), offset
, bsize
, bl
);
2828 store
->queue_transaction(service
.meta_ch
, std::move(t
), NULL
);
2829 if (!onum
|| !osize
)
2830 cleanupt
.remove(coll_t::meta(), ghobject_t(soid
));
2835 if (!service
.meta_ch
->flush_commit(&waiter
)) {
2839 utime_t end
= ceph_clock_now();
// Remove the benchmark objects and wait for the cleanup to commit.
2842 store
->queue_transaction(service
.meta_ch
, std::move(cleanupt
), NULL
);
2845 if (!service
.meta_ch
->flush_commit(&waiter
)) {
2850 double elapsed
= end
- start
;
2851 double rate
= count
/ elapsed
;
2852 double iops
= rate
/ bsize
;
2853 f
->open_object_section("osd_bench_results");
2854 f
->dump_int("bytes_written", count
);
2855 f
->dump_int("blocksize", bsize
);
2856 f
->dump_float("elapsed_sec", elapsed
);
2857 f
->dump_float("bytes_per_sec", rate
);
2858 f
->dump_float("iops", iops
);
2862 else if (prefix
== "flush_pg_stats") {
2863 mgrc
.send_pgstats();
2864 f
->dump_unsigned("stat_seq", service
.get_osd_stat_seq());
2867 else if (prefix
== "heap") {
2868 ret
= ceph::osd_cmds::heap(*cct
, cmdmap
, *f
, ss
);
2871 else if (prefix
== "debug dump_missing") {
2872 f
->open_array_section("pgs");
2875 for (auto& pg
: pgs
) {
2876 string s
= stringify(pg
->pg_id
);
2877 f
->open_array_section(s
.c_str());
2879 pg
->dump_missing(f
);
2886 else if (prefix
== "debug kick_recovery_wq") {
2888 cmd_getval(cmdmap
, "delay", delay
);
2891 ret
= cct
->_conf
.set_val("osd_recovery_delay_start", oss
.str().c_str());
2893 ss
<< "kick_recovery_wq: error setting "
2894 << "osd_recovery_delay_start to '" << delay
<< "': error "
2898 cct
->_conf
.apply_changes(nullptr);
2899 ss
<< "kicking recovery queue. set osd_recovery_delay_start "
2900 << "to " << cct
->_conf
->osd_recovery_delay_start
;
2903 else if (prefix
== "cpu_profiler") {
2906 cmd_getval(cmdmap
, "arg", arg
);
2907 vector
<string
> argvec
;
2908 get_str_vec(arg
, argvec
);
2909 cpu_profiler_handle_command(argvec
, ds
);
2910 outbl
.append(ds
.str());
2913 else if (prefix
== "dump_pg_recovery_stats") {
2914 lock_guard
l(osd_lock
);
2915 pg_recovery_stats
.dump_formatted(f
);
2918 else if (prefix
== "reset_pg_recovery_stats") {
2919 lock_guard
l(osd_lock
);
2920 pg_recovery_stats
.reset();
2923 else if (prefix
== "perf histogram dump") {
2925 std::string counter
;
2926 cmd_getval(cmdmap
, "logger", logger
);
2927 cmd_getval(cmdmap
, "counter", counter
);
2928 cct
->get_perfcounters_collection()->dump_formatted_histograms(
2929 f
, false, logger
, counter
);
2932 else if (prefix
== "cache drop") {
2933 lock_guard
l(osd_lock
);
2934 dout(20) << "clearing all caches" << dendl
;
2935 // Clear the objectstore's cache - onode and buffer for Bluestore,
2936 // system's pagecache for Filestore
2937 ret
= store
->flush_cache(&ss
);
2939 ss
<< "Error flushing objectstore cache: " << cpp_strerror(ret
);
2942 // Clear the objectcontext cache (per PG)
2945 for (auto& pg
: pgs
) {
2950 else if (prefix
== "cache status") {
2951 lock_guard
l(osd_lock
);
2952 int obj_ctx_count
= 0;
2955 for (auto& pg
: pgs
) {
2956 obj_ctx_count
+= pg
->get_cache_obj_count();
2958 f
->open_object_section("cache_status");
2959 f
->dump_int("object_ctx", obj_ctx_count
);
2960 store
->dump_cache_stats(f
);
2964 else if (prefix
== "scrub_purged_snaps") {
2965 lock_guard
l(osd_lock
);
2966 scrub_purged_snaps();
// --- dump_osd_network: report heartbeat ping times exceeding a threshold
// (microseconds internally; threshold argument is in milliseconds). ---
2969 else if (prefix
== "dump_osd_network") {
2970 lock_guard
l(osd_lock
);
2972 if (!(cmd_getval(cmdmap
, "value", value
))) {
2973 // Convert milliseconds to microseconds
2974 value
= static_cast<double>(g_conf().get_val
<double>(
2975 "mon_warn_on_slow_ping_time")) * 1000;
2977 double ratio
= g_conf().get_val
<double>("mon_warn_on_slow_ping_ratio");
2978 value
= g_conf().get_val
<int64_t>("osd_heartbeat_grace");
2979 value
*= 1000000 * ratio
; // Seconds of grace to microseconds at ratio
2982 // Convert user input to microseconds
2985 if (value
< 0) value
= 0;
// Local value type holding one peer/interface sample; ordered by pingtime so
// the set sorts entries for the report.
2987 struct osd_ping_time_t
{
2991 std::array
<uint32_t,3> times
;
2992 std::array
<uint32_t,3> min
;
2993 std::array
<uint32_t,3> max
;
2995 uint32_t last_update
;
2997 bool operator<(const osd_ping_time_t
& rhs
) const {
2998 if (pingtime
< rhs
.pingtime
)
3000 if (pingtime
> rhs
.pingtime
)
3010 set
<osd_ping_time_t
> sorted
;
3011 // Get pingtimes under lock and not on the stack
// NOTE(review): heap-allocated map; the matching delete is not visible in
// this extraction — confirm it is freed after the dump to avoid a leak.
3012 map
<int, osd_stat_t::Interfaces
> *pingtimes
= new map
<int, osd_stat_t::Interfaces
>;
3013 service
.get_hb_pingtime(pingtimes
);
// Collect back- and front-interface samples that exceed the threshold.
3014 for (auto j
: *pingtimes
) {
3015 if (j
.second
.last_update
== 0)
3017 osd_ping_time_t item
;
3018 item
.pingtime
= std::max(j
.second
.back_pingtime
[0], j
.second
.back_pingtime
[1]);
3019 item
.pingtime
= std::max(item
.pingtime
, j
.second
.back_pingtime
[2]);
3020 if (item
.pingtime
>= value
) {
3022 item
.times
[0] = j
.second
.back_pingtime
[0];
3023 item
.times
[1] = j
.second
.back_pingtime
[1];
3024 item
.times
[2] = j
.second
.back_pingtime
[2];
3025 item
.min
[0] = j
.second
.back_min
[0];
3026 item
.min
[1] = j
.second
.back_min
[1];
3027 item
.min
[2] = j
.second
.back_min
[2];
3028 item
.max
[0] = j
.second
.back_max
[0];
3029 item
.max
[1] = j
.second
.back_max
[1];
3030 item
.max
[2] = j
.second
.back_max
[2];
3031 item
.last
= j
.second
.back_last
;
3033 item
.last_update
= j
.second
.last_update
;
3034 sorted
.emplace(item
);
3036 if (j
.second
.front_last
== 0)
3038 item
.pingtime
= std::max(j
.second
.front_pingtime
[0], j
.second
.front_pingtime
[1]);
3039 item
.pingtime
= std::max(item
.pingtime
, j
.second
.front_pingtime
[2]);
3040 if (item
.pingtime
>= value
) {
3042 item
.times
[0] = j
.second
.front_pingtime
[0];
3043 item
.times
[1] = j
.second
.front_pingtime
[1];
3044 item
.times
[2] = j
.second
.front_pingtime
[2];
3045 item
.min
[0] = j
.second
.front_min
[0];
3046 item
.min
[1] = j
.second
.front_min
[1];
3047 item
.min
[2] = j
.second
.front_min
[2];
3048 item
.max
[0] = j
.second
.front_max
[0];
3049 item
.max
[1] = j
.second
.front_max
[1];
3050 item
.max
[2] = j
.second
.front_max
[2];
3051 item
.last
= j
.second
.front_last
;
3052 item
.last_update
= j
.second
.last_update
;
3054 sorted
.emplace(item
);
3059 // Network ping times (1min 5min 15min)
3060 f
->open_object_section("network_ping_times");
3061 f
->dump_int("threshold", value
/ 1000);
3062 f
->open_array_section("entries");
// Emit entries worst-first (the set sorts ascending, so iterate reversed).
3063 for (auto &sitem
: boost::adaptors::reverse(sorted
)) {
3064 ceph_assert(sitem
.pingtime
>= value
);
3065 f
->open_object_section("entry");
3067 const time_t lu(sitem
.last_update
);
3069 string
lustr(ctime_r(&lu
, buffer
));
3070 lustr
.pop_back(); // Remove trailing \n
3071 auto stale
= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_stale");
3072 f
->dump_string("last update", lustr
);
3073 f
->dump_bool("stale", ceph_clock_now().sec() - sitem
.last_update
> stale
);
3074 f
->dump_int("from osd", whoami
);
3075 f
->dump_int("to osd", sitem
.to
);
3076 f
->dump_string("interface", (sitem
.back
? "back" : "front"));
3077 f
->open_object_section("average");
3078 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.times
[0],3).c_str());
3079 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.times
[1],3).c_str());
3080 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.times
[2],3).c_str());
3081 f
->close_section(); // average
3082 f
->open_object_section("min");
// NOTE(review): the "min" section below dumps sitem.max — this looks like a
// copy/paste bug and should presumably read sitem.min; verify upstream.
3083 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.max
[0],3).c_str());
3084 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.max
[1],3).c_str());
3085 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.max
[2],3).c_str());
3086 f
->close_section(); // min
3087 f
->open_object_section("max");
3088 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.max
[0],3).c_str());
3089 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.max
[1],3).c_str());
3090 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.max
[2],3).c_str());
3091 f
->close_section(); // max
3092 f
->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem
.last
,3).c_str());
3093 f
->close_section(); // entry
3095 f
->close_section(); // entries
3096 f
->close_section(); // network_ping_times
3098 ceph_abort_msg("broken asok registration");
// Single completion point: report status, any stderr text, and the payload.
3102 on_finish(ret
, ss
.str(), outbl
);
// Admin-socket hook for test/debug operations; forwards the command to the
// test_ops() member declared at the bottom.  NOTE(review): line-mangled
// extraction with elided lines (store member, try block, error path); code
// left byte-identical, comments only.
3105 class TestOpsSocketHook
: public AdminSocketHook
{
// Borrowed pointers to the service and objectstore the test ops act on.
3106 OSDService
*service
;
3109 TestOpsSocketHook(OSDService
*s
, ObjectStore
*st
) : service(s
), store(st
) {}
// Synchronous call path: dispatches to test_ops and captures its output.
3110 int call(std::string_view command
, const cmdmap_t
& cmdmap
,
3112 std::ostream
& errss
,
3113 bufferlist
& out
) override
{
3117 test_ops(service
, store
, command
, cmdmap
, outss
);
// Malformed arguments raise bad_cmd_get; handled below.
3119 } catch (const TOPNSPC::common::bad_cmd_get
& e
) {
// Worker that implements the individual test operations (defined elsewhere).
3125 void test_ops(OSDService
*service
, ObjectStore
*store
,
3126 std::string_view command
, const cmdmap_t
& cmdmap
, ostream
&ss
);
// Timer completion for the periodic OSD tick (the variant that runs with
// osd_lock).  NOTE(review): the finish() body is elided by the extraction
// (presumably osd->tick()); code left byte-identical, comments only.
3130 class OSD::C_Tick
: public Context
{
// Borrowed pointer to the OSD to tick.
3133 explicit C_Tick(OSD
*o
) : osd(o
) {}
3134 void finish(int r
) override
{
// Timer completion for the periodic OSD tick that must NOT take osd_lock;
// simply forwards to OSD::tick_without_osd_lock().
3139 class OSD::C_Tick_WithoutOSDLock
: public Context
{
// Borrowed pointer to the OSD to tick.
3142 explicit C_Tick_WithoutOSDLock(OSD
*o
) : osd(o
) {}
3143 void finish(int r
) override
{
3144 osd
->tick_without_osd_lock();
// Mounts or unmounts the objectstore FUSE bridge at $osd_data/fuse, driven
// by the osd_objectstore_fuse option and the 'stop' flag (shutdown).
// Returns 0 on success, negative errno on failure.  NOTE(review): several
// lines (the HAVE_LIBFUSE #ifdef, fuse_store stop/delete sequence, early
// returns) are elided by the extraction; code left byte-identical.
3148 int OSD::enable_disable_fuse(bool stop
)
3152 string mntpath
= cct
->_conf
->osd_data
+ "/fuse";
// Teardown path: bridge is running but should be stopped (shutdown requested
// or the option was turned off); remove the mountpoint afterwards.
3153 if (fuse_store
&& (stop
|| !cct
->_conf
->osd_objectstore_fuse
)) {
3154 dout(1) << __func__
<< " disabling" << dendl
;
3158 r
= ::rmdir(mntpath
.c_str());
3161 derr
<< __func__
<< " failed to rmdir " << mntpath
<< ": "
3162 << cpp_strerror(r
) << dendl
;
// Bring-up path: option enabled but no bridge yet — create the mountpoint
// (0700; EEXIST tolerated below) and start a FuseStore over 'store'.
3167 if (!fuse_store
&& cct
->_conf
->osd_objectstore_fuse
) {
3168 dout(1) << __func__
<< " enabling" << dendl
;
3169 r
= ::mkdir(mntpath
.c_str(), 0700);
3172 if (r
< 0 && r
!= -EEXIST
) {
3173 derr
<< __func__
<< " unable to create " << mntpath
<< ": "
3174 << cpp_strerror(r
) << dendl
;
// Ownership of the new FuseStore is held in the fuse_store member.
3177 fuse_store
= new FuseStore(store
, mntpath
);
3178 r
= fuse_store
->start();
3180 derr
<< __func__
<< " unable to start fuse: " << cpp_strerror(r
) << dendl
;
3186 #endif // HAVE_LIBFUSE
3190 size_t OSD::get_num_cache_shards()
3192 return cct
->_conf
.get_val
<Option::size_t>("osd_num_cache_shards");
3195 int OSD::get_num_op_shards()
3197 if (cct
->_conf
->osd_op_num_shards
)
3198 return cct
->_conf
->osd_op_num_shards
;
3199 if (store_is_rotational
)
3200 return cct
->_conf
->osd_op_num_shards_hdd
;
3202 return cct
->_conf
->osd_op_num_shards_ssd
;
3205 int OSD::get_num_op_threads()
3207 if (cct
->_conf
->osd_op_num_threads_per_shard
)
3208 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard
;
3209 if (store_is_rotational
)
3210 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard_hdd
;
3212 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard_ssd
;
3215 float OSD::get_osd_recovery_sleep()
3217 if (cct
->_conf
->osd_recovery_sleep
)
3218 return cct
->_conf
->osd_recovery_sleep
;
3219 if (!store_is_rotational
&& !journal_is_rotational
)
3220 return cct
->_conf
->osd_recovery_sleep_ssd
;
3221 else if (store_is_rotational
&& !journal_is_rotational
)
3222 return cct
->_conf
.get_val
<double>("osd_recovery_sleep_hybrid");
3224 return cct
->_conf
->osd_recovery_sleep_hdd
;
3227 float OSD::get_osd_delete_sleep()
3229 float osd_delete_sleep
= cct
->_conf
.get_val
<double>("osd_delete_sleep");
3230 if (osd_delete_sleep
> 0)
3231 return osd_delete_sleep
;
3232 if (!store_is_rotational
&& !journal_is_rotational
)
3233 return cct
->_conf
.get_val
<double>("osd_delete_sleep_ssd");
3234 if (store_is_rotational
&& !journal_is_rotational
)
3235 return cct
->_conf
.get_val
<double>("osd_delete_sleep_hybrid");
3236 return cct
->_conf
.get_val
<double>("osd_delete_sleep_hdd");
3239 int OSD::get_recovery_max_active()
3241 if (cct
->_conf
->osd_recovery_max_active
)
3242 return cct
->_conf
->osd_recovery_max_active
;
3243 if (store_is_rotational
)
3244 return cct
->_conf
->osd_recovery_max_active_hdd
;
3246 return cct
->_conf
->osd_recovery_max_active_ssd
;
3249 float OSD::get_osd_snap_trim_sleep()
3251 float osd_snap_trim_sleep
= cct
->_conf
.get_val
<double>("osd_snap_trim_sleep");
3252 if (osd_snap_trim_sleep
> 0)
3253 return osd_snap_trim_sleep
;
3254 if (!store_is_rotational
&& !journal_is_rotational
)
3255 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_ssd");
3256 if (store_is_rotational
&& !journal_is_rotational
)
3257 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_hybrid");
3258 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_hdd");
3264 CompatSet initial
, diff
;
3265 std::lock_guard
lock(osd_lock
);
3270 tick_timer_without_osd_lock
.init();
3271 service
.recovery_request_timer
.init();
3272 service
.sleep_timer
.init();
3274 boot_finisher
.start();
3278 store
->read_meta("require_osd_release", &val
);
3279 last_require_osd_release
= ceph_release_from_name(val
);
3283 dout(2) << "init " << dev_path
3284 << " (looks like " << (store_is_rotational
? "hdd" : "ssd") << ")"
3286 dout(2) << "journal " << journal_path
<< dendl
;
3287 ceph_assert(store
); // call pre_init() first!
3289 store
->set_cache_shards(get_num_cache_shards());
3291 int r
= store
->mount();
3293 derr
<< "OSD:init: unable to mount object store" << dendl
;
3296 journal_is_rotational
= store
->is_journal_rotational();
3297 dout(2) << "journal looks like " << (journal_is_rotational
? "hdd" : "ssd")
3300 enable_disable_fuse(false);
3302 dout(2) << "boot" << dendl
;
3304 service
.meta_ch
= store
->open_collection(coll_t::meta());
3306 // initialize the daily loadavg with current 15min loadavg
3308 if (getloadavg(loadavgs
, 3) == 3) {
3309 daily_loadavg
= loadavgs
[2];
3311 derr
<< "OSD::init() : couldn't read loadavgs\n" << dendl
;
3312 daily_loadavg
= 1.0;
3315 int rotating_auth_attempts
= 0;
3316 auto rotating_auth_timeout
=
3317 g_conf().get_val
<int64_t>("rotating_keys_bootstrap_timeout");
3319 // sanity check long object name handling
3322 l
.oid
.name
= string(cct
->_conf
->osd_max_object_name_len
, 'n');
3323 l
.set_key(string(cct
->_conf
->osd_max_object_name_len
, 'k'));
3324 l
.nspace
= string(cct
->_conf
->osd_max_object_namespace_len
, 's');
3325 r
= store
->validate_hobject_key(l
);
3327 derr
<< "backend (" << store
->get_type() << ") is unable to support max "
3328 << "object name[space] len" << dendl
;
3329 derr
<< " osd max object name len = "
3330 << cct
->_conf
->osd_max_object_name_len
<< dendl
;
3331 derr
<< " osd max object namespace len = "
3332 << cct
->_conf
->osd_max_object_namespace_len
<< dendl
;
3333 derr
<< cpp_strerror(r
) << dendl
;
3334 if (cct
->_conf
->osd_check_max_object_name_len_on_startup
) {
3337 derr
<< "osd_check_max_object_name_len_on_startup = false, starting anyway"
3340 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl
;
3345 r
= read_superblock();
3347 derr
<< "OSD::init() : unable to read osd superblock" << dendl
;
3352 if (osd_compat
.compare(superblock
.compat_features
) < 0) {
3353 derr
<< "The disk uses features unsupported by the executable." << dendl
;
3354 derr
<< " ondisk features " << superblock
.compat_features
<< dendl
;
3355 derr
<< " daemon features " << osd_compat
<< dendl
;
3357 if (osd_compat
.writeable(superblock
.compat_features
)) {
3358 CompatSet diff
= osd_compat
.unsupported(superblock
.compat_features
);
3359 derr
<< "it is still writeable, though. Missing features: " << diff
<< dendl
;
3364 CompatSet diff
= osd_compat
.unsupported(superblock
.compat_features
);
3365 derr
<< "Cannot write to disk! Missing features: " << diff
<< dendl
;
3371 assert_warn(whoami
== superblock
.whoami
);
3372 if (whoami
!= superblock
.whoami
) {
3373 derr
<< "OSD::init: superblock says osd"
3374 << superblock
.whoami
<< " but I am osd." << whoami
<< dendl
;
3379 startup_time
= ceph::mono_clock::now();
3381 // load up "current" osdmap
3382 assert_warn(!get_osdmap());
3384 derr
<< "OSD::init: unable to read current osdmap" << dendl
;
3388 osdmap
= get_map(superblock
.current_epoch
);
3391 // make sure we don't have legacy pgs deleting
3394 int r
= store
->list_collections(ls
);
3395 ceph_assert(r
>= 0);
3398 if (c
.is_pg(&pgid
) &&
3399 !osdmap
->have_pg_pool(pgid
.pool())) {
3400 ghobject_t oid
= make_final_pool_info_oid(pgid
.pool());
3401 if (!store
->exists(service
.meta_ch
, oid
)) {
3402 derr
<< __func__
<< " missing pg_pool_t for deleted pool "
3403 << pgid
.pool() << " for pg " << pgid
3404 << "; please downgrade to luminous and allow "
3405 << "pg deletion to complete before upgrading" << dendl
;
3412 initial
= get_osd_initial_compat_set();
3413 diff
= superblock
.compat_features
.unsupported(initial
);
3414 if (superblock
.compat_features
.merge(initial
)) {
3415 // Are we adding SNAPMAPPER2?
3416 if (diff
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2
)) {
3417 dout(1) << __func__
<< " upgrade snap_mapper (first start as octopus)"
3419 auto ch
= service
.meta_ch
;
3420 auto hoid
= make_snapmapper_oid();
3421 unsigned max
= cct
->_conf
->osd_target_transaction_size
;
3422 r
= SnapMapper::convert_legacy(cct
, store
, ch
, hoid
, max
);
3426 // We need to persist the new compat_set before we
3428 dout(5) << "Upgrading superblock adding: " << diff
<< dendl
;
3429 ObjectStore::Transaction t
;
3430 write_superblock(t
);
3431 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3436 // make sure snap mapper object exists
3437 if (!store
->exists(service
.meta_ch
, OSD::make_snapmapper_oid())) {
3438 dout(10) << "init creating/touching snapmapper object" << dendl
;
3439 ObjectStore::Transaction t
;
3440 t
.touch(coll_t::meta(), OSD::make_snapmapper_oid());
3441 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3445 if (!store
->exists(service
.meta_ch
, OSD::make_purged_snaps_oid())) {
3446 dout(10) << "init creating/touching purged_snaps object" << dendl
;
3447 ObjectStore::Transaction t
;
3448 t
.touch(coll_t::meta(), OSD::make_purged_snaps_oid());
3449 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3454 if (cct
->_conf
->osd_open_classes_on_start
) {
3455 int r
= ClassHandler::get_instance().open_all_classes();
3457 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r
) << dendl
;
3460 check_osdmap_features();
3462 create_recoverystate_perf();
3465 epoch_t bind_epoch
= osdmap
->get_epoch();
3466 service
.set_epochs(NULL
, NULL
, &bind_epoch
);
3469 clear_temp_objects();
3471 // initialize osdmap references in sharded wq
3472 for (auto& shard
: shards
) {
3473 std::lock_guard
l(shard
->osdmap_lock
);
3474 shard
->shard_osdmap
= osdmap
;
3477 // load up pgs (as they previously existed)
3480 dout(2) << "superblock: I am osd." << superblock
.whoami
<< dendl
;
3486 struct store_statfs_t stbuf
;
3487 osd_alert_list_t alerts
;
3488 int r
= store
->statfs(&stbuf
, &alerts
);
3489 ceph_assert(r
== 0);
3490 service
.set_statfs(stbuf
, alerts
);
3493 // client_messenger auth_client is already set up by monc.
3494 for (auto m
: { cluster_messenger
,
3496 hb_front_client_messenger
,
3497 hb_back_client_messenger
,
3498 hb_front_server_messenger
,
3499 hb_back_server_messenger
} ) {
3500 m
->set_auth_client(monc
);
3502 for (auto m
: { client_messenger
,
3504 hb_front_server_messenger
,
3505 hb_back_server_messenger
}) {
3506 m
->set_auth_server(monc
);
3508 monc
->set_handle_authentication_dispatcher(this);
3510 monc
->set_want_keys(CEPH_ENTITY_TYPE_MON
| CEPH_ENTITY_TYPE_OSD
3511 | CEPH_ENTITY_TYPE_MGR
);
3516 mgrc
.set_pgstats_cb([this](){ return collect_pg_stats(); });
3517 mgrc
.set_perf_metric_query_cb(
3518 [this](const ConfigPayload
&config_payload
) {
3519 set_perf_queries(config_payload
);
3522 return get_perf_reports();
3526 // tell monc about log_client so it will know about mon session resets
3527 monc
->set_log_client(&log_client
);
3528 update_log_config();
3531 client_messenger
->add_dispatcher_tail(&mgrc
);
3532 client_messenger
->add_dispatcher_tail(this);
3533 cluster_messenger
->add_dispatcher_head(this);
3535 hb_front_client_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3536 hb_back_client_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3537 hb_front_server_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3538 hb_back_server_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3540 objecter_messenger
->add_dispatcher_head(service
.objecter
.get());
3543 service
.publish_map(osdmap
);
3544 service
.publish_superblock(superblock
);
3545 service
.max_oldest_map
= superblock
.oldest_map
;
3547 for (auto& shard
: shards
) {
3548 // put PGs in a temporary set because we may modify pg_slots
3549 // unordered_map below.
3551 for (auto& i
: shard
->pg_slots
) {
3552 PGRef pg
= i
.second
->pg
;
3558 for (auto pg
: pgs
) {
3559 std::scoped_lock l
{*pg
};
3560 set
<pair
<spg_t
,epoch_t
>> new_children
;
3561 set
<pair
<spg_t
,epoch_t
>> merge_pgs
;
3562 service
.identify_splits_and_merges(pg
->get_osdmap(), osdmap
, pg
->pg_id
,
3563 &new_children
, &merge_pgs
);
3564 if (!new_children
.empty()) {
3565 for (auto shard
: shards
) {
3566 shard
->prime_splits(osdmap
, &new_children
);
3568 assert(new_children
.empty());
3570 if (!merge_pgs
.empty()) {
3571 for (auto shard
: shards
) {
3572 shard
->prime_merges(osdmap
, &merge_pgs
);
3574 assert(merge_pgs
.empty());
3581 // start the heartbeat
3582 heartbeat_thread
.create("osd_srv_heartbt");
3585 tick_timer
.add_event_after(get_tick_interval(),
3588 std::lock_guard
l(tick_timer_lock
);
3589 tick_timer_without_osd_lock
.add_event_after(get_tick_interval(),
3590 new C_Tick_WithoutOSDLock(this));
3595 r
= monc
->authenticate();
3597 derr
<< __func__
<< " authentication failed: " << cpp_strerror(r
)
3602 while (monc
->wait_auth_rotating(rotating_auth_timeout
) < 0) {
3603 derr
<< "unable to obtain rotating service keys; retrying" << dendl
;
3604 ++rotating_auth_attempts
;
3605 if (rotating_auth_attempts
> g_conf()->max_rotating_auth_attempts
) {
3606 derr
<< __func__
<< " wait_auth_rotating timed out" << dendl
;
3611 r
= update_crush_device_class();
3613 derr
<< __func__
<< " unable to update_crush_device_class: "
3614 << cpp_strerror(r
) << dendl
;
3618 r
= update_crush_location();
3620 derr
<< __func__
<< " unable to update_crush_location: "
3621 << cpp_strerror(r
) << dendl
;
3629 // start objecter *after* we have authenticated, so that we don't ignore
3630 // the OSDMaps it requests.
3631 service
.final_init();
3635 dout(10) << "ensuring pgs have consumed prior maps" << dendl
;
3638 dout(0) << "done with init, starting boot process" << dendl
;
3640 // subscribe to any pg creations
3641 monc
->sub_want("osd_pg_creates", last_pg_create_epoch
, 0);
3643 // MgrClient needs this (it doesn't have MonClient reference itself)
3644 monc
->sub_want("mgrmap", 0, 0);
3646 // we don't need to ask for an osdmap here; objecter will
3647 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
3656 enable_disable_fuse(true);
3663 void OSD::final_init()
3665 AdminSocket
*admin_socket
= cct
->get_admin_socket();
3666 asok_hook
= new OSDSocketHook(this);
3667 int r
= admin_socket
->register_command("status", asok_hook
,
3668 "high-level status of OSD");
3669 ceph_assert(r
== 0);
3670 r
= admin_socket
->register_command("flush_journal",
3672 "flush the journal to permanent store");
3673 ceph_assert(r
== 0);
3674 r
= admin_socket
->register_command("dump_ops_in_flight " \
3675 "name=filterstr,type=CephString,n=N,req=false",
3677 "show the ops currently in flight");
3678 ceph_assert(r
== 0);
3679 r
= admin_socket
->register_command("ops " \
3680 "name=filterstr,type=CephString,n=N,req=false",
3682 "show the ops currently in flight");
3683 ceph_assert(r
== 0);
3684 r
= admin_socket
->register_command("dump_blocked_ops " \
3685 "name=filterstr,type=CephString,n=N,req=false",
3687 "show the blocked ops currently in flight");
3688 ceph_assert(r
== 0);
3689 r
= admin_socket
->register_command("dump_historic_ops " \
3690 "name=filterstr,type=CephString,n=N,req=false",
3693 ceph_assert(r
== 0);
3694 r
= admin_socket
->register_command("dump_historic_slow_ops " \
3695 "name=filterstr,type=CephString,n=N,req=false",
3697 "show slowest recent ops");
3698 ceph_assert(r
== 0);
3699 r
= admin_socket
->register_command("dump_historic_ops_by_duration " \
3700 "name=filterstr,type=CephString,n=N,req=false",
3702 "show slowest recent ops, sorted by duration");
3703 ceph_assert(r
== 0);
3704 r
= admin_socket
->register_command("dump_op_pq_state",
3706 "dump op priority queue state");
3707 ceph_assert(r
== 0);
3708 r
= admin_socket
->register_command("dump_blacklist",
3710 "dump blacklisted clients and times");
3711 ceph_assert(r
== 0);
3712 r
= admin_socket
->register_command("dump_watchers",
3714 "show clients which have active watches,"
3715 " and on which objects");
3716 ceph_assert(r
== 0);
3717 r
= admin_socket
->register_command("dump_recovery_reservations",
3719 "show recovery reservations");
3720 ceph_assert(r
== 0);
3721 r
= admin_socket
->register_command("dump_scrub_reservations",
3723 "show scrub reservations");
3724 ceph_assert(r
== 0);
3725 r
= admin_socket
->register_command("get_latest_osdmap",
3727 "force osd to update the latest map from "
3729 ceph_assert(r
== 0);
3731 r
= admin_socket
->register_command("set_heap_property " \
3732 "name=property,type=CephString " \
3733 "name=value,type=CephInt",
3735 "update malloc extension heap property");
3736 ceph_assert(r
== 0);
3738 r
= admin_socket
->register_command("get_heap_property " \
3739 "name=property,type=CephString",
3741 "get malloc extension heap property");
3742 ceph_assert(r
== 0);
3744 r
= admin_socket
->register_command("dump_objectstore_kv_stats",
3746 "print statistics of kvdb which used by bluestore");
3747 ceph_assert(r
== 0);
3749 r
= admin_socket
->register_command("dump_scrubs",
3751 "print scheduled scrubs");
3752 ceph_assert(r
== 0);
3754 r
= admin_socket
->register_command("calc_objectstore_db_histogram",
3756 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
3757 ceph_assert(r
== 0);
3759 r
= admin_socket
->register_command("flush_store_cache",
3761 "Flush bluestore internal cache");
3762 ceph_assert(r
== 0);
3763 r
= admin_socket
->register_command("dump_pgstate_history",
3765 "show recent state history");
3766 ceph_assert(r
== 0);
3768 r
= admin_socket
->register_command("compact",
3770 "Commpact object store's omap."
3771 " WARNING: Compaction probably slows your requests");
3772 ceph_assert(r
== 0);
3774 r
= admin_socket
->register_command("get_mapped_pools",
3776 "dump pools whose PG(s) are mapped to this OSD.");
3778 ceph_assert(r
== 0);
3780 r
= admin_socket
->register_command("smart name=devid,type=CephString,req=false",
3782 "probe OSD devices for SMART data.");
3784 ceph_assert(r
== 0);
3786 r
= admin_socket
->register_command("list_devices",
3788 "list OSD devices.");
3789 r
= admin_socket
->register_command("send_beacon",
3791 "send OSD beacon to mon immediately");
3793 r
= admin_socket
->register_command(
3794 "dump_osd_network name=value,type=CephInt,req=false", asok_hook
,
3795 "Dump osd heartbeat network ping times");
3796 ceph_assert(r
== 0);
3798 test_ops_hook
= new TestOpsSocketHook(&(this->service
), this->store
);
3799 // Note: pools are CephString instead of CephPoolname because
3800 // these commands traditionally support both pool names and numbers
3801 r
= admin_socket
->register_command(
3803 "name=pool,type=CephString " \
3804 "name=objname,type=CephObjectname " \
3805 "name=key,type=CephString "\
3806 "name=val,type=CephString",
3809 ceph_assert(r
== 0);
3810 r
= admin_socket
->register_command(
3812 "name=pool,type=CephString " \
3813 "name=objname,type=CephObjectname " \
3814 "name=key,type=CephString",
3817 ceph_assert(r
== 0);
3818 r
= admin_socket
->register_command(
3820 "name=pool,type=CephString " \
3821 "name=objname,type=CephObjectname " \
3822 "name=header,type=CephString",
3825 ceph_assert(r
== 0);
3827 r
= admin_socket
->register_command(
3829 "name=pool,type=CephString " \
3830 "name=objname,type=CephObjectname",
3832 "output entire object map");
3833 ceph_assert(r
== 0);
3835 r
= admin_socket
->register_command(
3837 "name=pool,type=CephString " \
3838 "name=objname,type=CephObjectname " \
3839 "name=len,type=CephInt",
3841 "truncate object to length");
3842 ceph_assert(r
== 0);
3844 r
= admin_socket
->register_command(
3846 "name=pool,type=CephString " \
3847 "name=objname,type=CephObjectname " \
3848 "name=shardid,type=CephInt,req=false,range=0|255",
3850 "inject data error to an object");
3851 ceph_assert(r
== 0);
3853 r
= admin_socket
->register_command(
3855 "name=pool,type=CephString " \
3856 "name=objname,type=CephObjectname " \
3857 "name=shardid,type=CephInt,req=false,range=0|255",
3859 "inject metadata error to an object");
3860 ceph_assert(r
== 0);
3861 r
= admin_socket
->register_command(
3862 "set_recovery_delay " \
3863 "name=utime,type=CephInt,req=false",
3865 "Delay osd recovery by specified seconds");
3866 ceph_assert(r
== 0);
3867 r
= admin_socket
->register_command(
3869 "name=type,type=CephString,req=false " \
3870 "name=count,type=CephInt,req=false ",
3872 "Inject a full disk (optional count times)");
3873 ceph_assert(r
== 0);
3874 r
= admin_socket
->register_command(
3876 "name=count,type=CephInt,req=false " \
3877 "name=size,type=CephInt,req=false " \
3878 "name=object_size,type=CephInt,req=false " \
3879 "name=object_num,type=CephInt,req=false ",
3881 "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
3882 "(default count=1G default size=4MB). Results in log.");
3883 ceph_assert(r
== 0);
3884 r
= admin_socket
->register_command(
3886 "name=level,type=CephChoices,strings=error,warning,info,debug " \
3887 "name=message,type=CephString,n=N",
3889 "log a message to the cluster log");
3890 ceph_assert(r
== 0);
3891 r
= admin_socket
->register_command(
3895 ceph_assert(r
== 0);
3896 r
= admin_socket
->register_command(
3898 "name=heapcmd,type=CephChoices,strings=" \
3899 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
3900 "name=value,type=CephString,req=false",
3902 "show heap usage info (available only if compiled with tcmalloc)");
3903 ceph_assert(r
== 0);
3904 r
= admin_socket
->register_command(
3905 "debug dump_missing " \
3906 "name=filename,type=CephFilepath",
3908 "dump missing objects to a named file");
3909 ceph_assert(r
== 0);
3910 r
= admin_socket
->register_command(
3911 "debug kick_recovery_wq " \
3912 "name=delay,type=CephInt,range=0",
3914 "set osd_recovery_delay_start to <val>");
3915 ceph_assert(r
== 0);
3916 r
= admin_socket
->register_command(
3918 "name=arg,type=CephChoices,strings=status|flush",
3920 "run cpu profiling on daemon");
3921 ceph_assert(r
== 0);
3922 r
= admin_socket
->register_command(
3923 "dump_pg_recovery_stats",
3925 "dump pg recovery statistics");
3926 ceph_assert(r
== 0);
3927 r
= admin_socket
->register_command(
3928 "reset_pg_recovery_stats",
3930 "reset pg recovery statistics");
3931 ceph_assert(r
== 0);
3932 r
= admin_socket
->register_command(
3935 "Drop all OSD caches");
3936 ceph_assert(r
== 0);
3937 r
= admin_socket
->register_command(
3940 "Get OSD caches statistics");
3941 ceph_assert(r
== 0);
3942 r
= admin_socket
->register_command(
3943 "scrub_purged_snaps",
3945 "Scrub purged_snaps vs snapmapper index");
3946 ceph_assert(r
== 0);
3948 // -- pg commands --
3949 // old form: ceph pg <pgid> command ...
3950 r
= admin_socket
->register_command(
3952 "name=pgid,type=CephPgid " \
3953 "name=cmd,type=CephChoices,strings=query",
3956 ceph_assert(r
== 0);
3957 r
= admin_socket
->register_command(
3959 "name=pgid,type=CephPgid " \
3960 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
3961 "name=mulcmd,type=CephChoices,strings=revert|delete",
3964 ceph_assert(r
== 0);
3965 r
= admin_socket
->register_command(
3967 "name=pgid,type=CephPgid " \
3968 "name=cmd,type=CephChoices,strings=list_unfound " \
3969 "name=offset,type=CephString,req=false",
3972 ceph_assert(r
== 0);
3973 r
= admin_socket
->register_command(
3975 "name=pgid,type=CephPgid " \
3976 "name=cmd,type=CephChoices,strings=scrub " \
3977 "name=time,type=CephInt,req=false",
3980 ceph_assert(r
== 0);
3981 r
= admin_socket
->register_command(
3983 "name=pgid,type=CephPgid " \
3984 "name=cmd,type=CephChoices,strings=deep_scrub " \
3985 "name=time,type=CephInt,req=false",
3988 ceph_assert(r
== 0);
3989 // new form: tell <pgid> <cmd> for both cli and rest
3990 r
= admin_socket
->register_command(
3993 "show details of a specific pg");
3994 ceph_assert(r
== 0);
3995 r
= admin_socket
->register_command(
3996 "mark_unfound_lost " \
3997 "name=pgid,type=CephPgid,req=false " \
3998 "name=mulcmd,type=CephChoices,strings=revert|delete",
4000 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available");
4001 ceph_assert(r
== 0);
4002 r
= admin_socket
->register_command(
4004 "name=pgid,type=CephPgid,req=false " \
4005 "name=offset,type=CephString,req=false",
4007 "list unfound objects on this pg, perhaps starting at an offset given in JSON");
4008 ceph_assert(r
== 0);
4009 r
= admin_socket
->register_command(
4011 "name=pgid,type=CephPgid,req=false " \
4012 "name=time,type=CephInt,req=false",
4014 "Trigger a scheduled scrub ");
4015 ceph_assert(r
== 0);
4016 r
= admin_socket
->register_command(
4018 "name=pgid,type=CephPgid,req=false " \
4019 "name=time,type=CephInt,req=false",
4021 "Trigger a scheduled deep scrub ");
4022 ceph_assert(r
== 0);
4025 void OSD::create_logger()
4027 dout(10) << "create_logger" << dendl
;
4029 logger
= build_osd_logger(cct
);
4030 cct
->get_perfcounters_collection()->add(logger
);
4033 void OSD::create_recoverystate_perf()
4035 dout(10) << "create_recoverystate_perf" << dendl
;
4037 recoverystate_perf
= build_recoverystate_perf(cct
);
4038 cct
->get_perfcounters_collection()->add(recoverystate_perf
);
4043 if (cct
->_conf
->osd_fast_shutdown
) {
4044 derr
<< "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl
;
4049 if (!service
.prepare_to_stop())
4050 return 0; // already shutting down
4052 if (is_stopping()) {
4056 dout(0) << "shutdown" << dendl
;
4058 set_state(STATE_STOPPING
);
4061 if (cct
->_conf
.get_val
<bool>("osd_debug_shutdown")) {
4062 cct
->_conf
.set_val("debug_osd", "100");
4063 cct
->_conf
.set_val("debug_journal", "100");
4064 cct
->_conf
.set_val("debug_filestore", "100");
4065 cct
->_conf
.set_val("debug_bluestore", "100");
4066 cct
->_conf
.set_val("debug_ms", "100");
4067 cct
->_conf
.apply_changes(nullptr);
4070 // stop MgrClient earlier as it's more like an internal consumer of OSD
4073 service
.start_shutdown();
4075 // stop sending work to pgs. this just prevents any new work in _process
4076 // from racing with on_shutdown and potentially entering the pg after.
4077 op_shardedwq
.drain();
4083 for (auto pg
: pgs
) {
4088 // drain op queue again (in case PGs requeued something)
4089 op_shardedwq
.drain();
4091 finished
.clear(); // zap waiters (bleh, this is messy)
4092 waiting_for_osdmap
.clear();
4095 // unregister commands
4096 cct
->get_admin_socket()->unregister_commands(asok_hook
);
4100 cct
->get_admin_socket()->unregister_commands(test_ops_hook
);
4101 delete test_ops_hook
;
4102 test_ops_hook
= NULL
;
4107 std::lock_guard l
{heartbeat_lock
};
4108 heartbeat_stop
= true;
4109 heartbeat_cond
.notify_all();
4110 heartbeat_peers
.clear();
4112 heartbeat_thread
.join();
4114 hb_back_server_messenger
->mark_down_all();
4115 hb_front_server_messenger
->mark_down_all();
4116 hb_front_client_messenger
->mark_down_all();
4117 hb_back_client_messenger
->mark_down_all();
4121 dout(10) << "op sharded tp stopped" << dendl
;
4123 dout(10) << "stopping agent" << dendl
;
4124 service
.agent_stop();
4126 boot_finisher
.wait_for_empty();
4130 boot_finisher
.stop();
4131 reset_heartbeat_peers(true);
4133 tick_timer
.shutdown();
4136 std::lock_guard
l(tick_timer_lock
);
4137 tick_timer_without_osd_lock
.shutdown();
4140 // note unmount epoch
4141 dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl
;
4142 superblock
.mounted
= service
.get_boot_epoch();
4143 superblock
.clean_thru
= get_osdmap_epoch();
4144 ObjectStore::Transaction t
;
4145 write_superblock(t
);
4146 int r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
4148 derr
<< "OSD::shutdown: error writing superblock: "
4149 << cpp_strerror(r
) << dendl
;
4153 service
.shutdown_reserver();
4156 #ifdef PG_DEBUG_REFS
4157 service
.dump_live_pgids();
4161 _get_pgs(&pgs
, true);
4165 for (auto& pg
: pgs
) {
4166 if (pg
->is_deleted()) {
4169 dout(20) << " kicking pg " << pg
<< dendl
;
4171 if (pg
->get_num_ref() != 1) {
4172 derr
<< "pgid " << pg
->get_pgid() << " has ref count of "
4173 << pg
->get_num_ref() << dendl
;
4174 #ifdef PG_DEBUG_REFS
4175 pg
->dump_live_ids();
4177 if (cct
->_conf
->osd_shutdown_pgref_assert
) {
4185 #ifdef PG_DEBUG_REFS
4186 service
.dump_live_pgids();
4190 cct
->_conf
.remove_observer(this);
4193 service
.meta_ch
.reset();
4195 dout(10) << "syncing store" << dendl
;
4196 enable_disable_fuse(true);
4198 if (cct
->_conf
->osd_journal_flush_on_shutdown
) {
4199 dout(10) << "flushing journal" << dendl
;
4200 store
->flush_journal();
4206 std::unique_lock l
{map_lock
};
4207 set_osdmap(OSDMapRef());
4209 for (auto s
: shards
) {
4210 std::lock_guard
l(s
->osdmap_lock
);
4211 s
->shard_osdmap
= OSDMapRef();
4215 std::lock_guard
lock(osd_lock
);
4219 dout(10) << "Store synced" << dendl
;
4221 op_tracker
.on_shutdown();
4223 ClassHandler::get_instance().shutdown();
4224 client_messenger
->shutdown();
4225 cluster_messenger
->shutdown();
4226 hb_front_client_messenger
->shutdown();
4227 hb_back_client_messenger
->shutdown();
4228 objecter_messenger
->shutdown();
4229 hb_front_server_messenger
->shutdown();
4230 hb_back_server_messenger
->shutdown();
4235 int OSD::mon_cmd_maybe_osd_create(string
&cmd
)
4237 bool created
= false;
4239 dout(10) << __func__
<< " cmd: " << cmd
<< dendl
;
4240 vector
<string
> vcmd
{cmd
};
4244 monc
->start_mon_command(vcmd
, inbl
, NULL
, &outs
, &w
);
4247 if (r
== -ENOENT
&& !created
) {
4248 string newcmd
= "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami
)
4249 + ", \"uuid\": \"" + stringify(superblock
.osd_fsid
) + "\"}";
4250 vector
<string
> vnewcmd
{newcmd
};
4254 monc
->start_mon_command(vnewcmd
, inbl
, NULL
, &outs
, &w
);
4257 derr
<< __func__
<< " fail: osd does not exist and created failed: "
4258 << cpp_strerror(r
) << dendl
;
4264 derr
<< __func__
<< " fail: '" << outs
<< "': " << cpp_strerror(r
) << dendl
;
4273 int OSD::update_crush_location()
4275 if (!cct
->_conf
->osd_crush_update_on_start
) {
4276 dout(10) << __func__
<< " osd_crush_update_on_start = false" << dendl
;
4281 if (cct
->_conf
->osd_crush_initial_weight
>= 0) {
4282 snprintf(weight
, sizeof(weight
), "%.4lf", cct
->_conf
->osd_crush_initial_weight
);
4284 struct store_statfs_t st
;
4285 osd_alert_list_t alerts
;
4286 int r
= store
->statfs(&st
, &alerts
);
4288 derr
<< "statfs: " << cpp_strerror(r
) << dendl
;
4291 snprintf(weight
, sizeof(weight
), "%.4lf",
4294 double(1ull << 40 /* TB */)));
4297 dout(10) << __func__
<< " crush location is " << cct
->crush_location
<< dendl
;
4300 string("{\"prefix\": \"osd crush create-or-move\", ") +
4301 string("\"id\": ") + stringify(whoami
) + ", " +
4302 string("\"weight\":") + weight
+ ", " +
4303 string("\"args\": [") + stringify(cct
->crush_location
) + "]}";
4304 return mon_cmd_maybe_osd_create(cmd
);
4307 int OSD::update_crush_device_class()
4309 if (!cct
->_conf
->osd_class_update_on_start
) {
4310 dout(10) << __func__
<< " osd_class_update_on_start = false" << dendl
;
4314 string device_class
;
4315 int r
= store
->read_meta("crush_device_class", &device_class
);
4316 if (r
< 0 || device_class
.empty()) {
4317 device_class
= store
->get_default_device_class();
4320 if (device_class
.empty()) {
4321 dout(20) << __func__
<< " no device class stored locally" << dendl
;
4326 string("{\"prefix\": \"osd crush set-device-class\", ") +
4327 string("\"class\": \"") + device_class
+ string("\", ") +
4328 string("\"ids\": [\"") + stringify(whoami
) + string("\"]}");
4330 r
= mon_cmd_maybe_osd_create(cmd
);
4332 // good, already bound to a device-class
4339 void OSD::write_superblock(ObjectStore::Transaction
& t
)
4341 dout(10) << "write_superblock " << superblock
<< dendl
;
4343 //hack: at minimum it's using the baseline feature set
4344 if (!superblock
.compat_features
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE
))
4345 superblock
.compat_features
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE
);
4348 encode(superblock
, bl
);
4349 t
.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT
, 0, bl
.length(), bl
);
4352 int OSD::read_superblock()
4355 int r
= store
->read(service
.meta_ch
, OSD_SUPERBLOCK_GOBJECT
, 0, 0, bl
);
4359 auto p
= bl
.cbegin();
4360 decode(superblock
, p
);
4362 dout(10) << "read_superblock " << superblock
<< dendl
;
4367 void OSD::clear_temp_objects()
4369 dout(10) << __func__
<< dendl
;
4371 store
->list_collections(ls
);
4372 for (vector
<coll_t
>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
4374 if (!p
->is_pg(&pgid
))
4377 // list temp objects
4378 dout(20) << " clearing temps in " << *p
<< " pgid " << pgid
<< dendl
;
4380 vector
<ghobject_t
> temps
;
4383 vector
<ghobject_t
> objects
;
4384 auto ch
= store
->open_collection(*p
);
4386 store
->collection_list(ch
, next
, ghobject_t::get_max(),
4387 store
->get_ideal_list_max(),
4389 if (objects
.empty())
4391 vector
<ghobject_t
>::iterator q
;
4392 for (q
= objects
.begin(); q
!= objects
.end(); ++q
) {
4393 // Hammer set pool for temps to -1, so check for clean-up
4394 if (q
->hobj
.is_temp() || (q
->hobj
.pool
== -1)) {
4395 temps
.push_back(*q
);
4400 // If we saw a non-temp object and hit the break above we can
4401 // break out of the while loop too.
4402 if (q
!= objects
.end())
4405 if (!temps
.empty()) {
4406 ObjectStore::Transaction t
;
4408 for (vector
<ghobject_t
>::iterator q
= temps
.begin(); q
!= temps
.end(); ++q
) {
4409 dout(20) << " removing " << *p
<< " object " << *q
<< dendl
;
4411 if (++removed
> cct
->_conf
->osd_target_transaction_size
) {
4412 store
->queue_transaction(service
.meta_ch
, std::move(t
));
4413 t
= ObjectStore::Transaction();
4418 store
->queue_transaction(service
.meta_ch
, std::move(t
));
4424 void OSD::recursive_remove_collection(CephContext
* cct
,
4425 ObjectStore
*store
, spg_t pgid
,
4431 make_snapmapper_oid());
4433 ObjectStore::CollectionHandle ch
= store
->open_collection(tmp
);
4434 ObjectStore::Transaction t
;
4435 SnapMapper
mapper(cct
, &driver
, 0, 0, 0, pgid
.shard
);
4438 int max
= cct
->_conf
->osd_target_transaction_size
;
4439 vector
<ghobject_t
> objects
;
4440 objects
.reserve(max
);
4443 store
->collection_list(ch
, next
, ghobject_t::get_max(),
4444 max
, &objects
, &next
);
4445 generic_dout(10) << __func__
<< " " << objects
<< dendl
;
4446 if (objects
.empty())
4448 for (auto& p
: objects
) {
4449 OSDriver::OSTransaction
_t(driver
.get_transaction(&t
));
4450 int r
= mapper
.remove_oid(p
.hobj
, &_t
);
4451 if (r
!= 0 && r
!= -ENOENT
)
4455 int r
= store
->queue_transaction(ch
, std::move(t
));
4456 ceph_assert(r
== 0);
4457 t
= ObjectStore::Transaction();
4459 t
.remove_collection(tmp
);
4460 int r
= store
->queue_transaction(ch
, std::move(t
));
4461 ceph_assert(r
== 0);
4464 if (!ch
->flush_commit(&waiter
)) {
4470 // ======================================================
4474 OSDMapRef createmap
,
4477 dout(10) << __func__
<< " " << pgid
<< dendl
;
4479 map
<string
,string
> ec_profile
;
4481 if (createmap
->have_pg_pool(pgid
.pool())) {
4482 pi
= *createmap
->get_pg_pool(pgid
.pool());
4483 name
= createmap
->get_pool_name(pgid
.pool());
4484 if (pi
.is_erasure()) {
4485 ec_profile
= createmap
->get_erasure_code_profile(pi
.erasure_code_profile
);
4488 // pool was deleted; grab final pg_pool_t off disk.
4489 ghobject_t oid
= make_final_pool_info_oid(pgid
.pool());
4491 int r
= store
->read(service
.meta_ch
, oid
, 0, 0, bl
);
4493 derr
<< __func__
<< " missing pool " << pgid
.pool() << " tombstone"
4497 ceph_assert(r
>= 0);
4498 auto p
= bl
.cbegin();
4501 if (p
.end()) { // dev release v13.0.2 did not include ec_profile
4502 derr
<< __func__
<< " missing ec_profile from pool " << pgid
.pool()
4503 << " tombstone" << dendl
;
4506 decode(ec_profile
, p
);
4508 PGPool
pool(cct
, createmap
, pgid
.pool(), pi
, name
);
4510 if (pi
.type
== pg_pool_t::TYPE_REPLICATED
||
4511 pi
.type
== pg_pool_t::TYPE_ERASURE
)
4512 pg
= new PrimaryLogPG(&service
, createmap
, pool
, ec_profile
, pgid
);
4518 void OSD::_get_pgs(vector
<PGRef
> *v
, bool clear_too
)
4521 v
->reserve(get_num_pgs());
4522 for (auto& s
: shards
) {
4523 std::lock_guard
l(s
->shard_lock
);
4524 for (auto& j
: s
->pg_slots
) {
4526 !j
.second
->pg
->is_deleted()) {
4527 v
->push_back(j
.second
->pg
);
4529 s
->_detach_pg(j
.second
.get());
4536 void OSD::_get_pgids(vector
<spg_t
> *v
)
4539 v
->reserve(get_num_pgs());
4540 for (auto& s
: shards
) {
4541 std::lock_guard
l(s
->shard_lock
);
4542 for (auto& j
: s
->pg_slots
) {
4544 !j
.second
->pg
->is_deleted()) {
4545 v
->push_back(j
.first
);
4551 void OSD::register_pg(PGRef pg
)
4553 spg_t pgid
= pg
->get_pgid();
4554 uint32_t shard_index
= pgid
.hash_to_shard(num_shards
);
4555 auto sdata
= shards
[shard_index
];
4556 std::lock_guard
l(sdata
->shard_lock
);
4557 auto r
= sdata
->pg_slots
.emplace(pgid
, make_unique
<OSDShardPGSlot
>());
4558 ceph_assert(r
.second
);
4559 auto *slot
= r
.first
->second
.get();
4560 dout(20) << __func__
<< " " << pgid
<< " " << pg
<< dendl
;
4561 sdata
->_attach_pg(slot
, pg
.get());
4564 bool OSD::try_finish_pg_delete(PG
*pg
, unsigned old_pg_num
)
4566 auto sdata
= pg
->osd_shard
;
4569 std::lock_guard
l(sdata
->shard_lock
);
4570 auto p
= sdata
->pg_slots
.find(pg
->pg_id
);
4571 if (p
== sdata
->pg_slots
.end() ||
4573 dout(20) << __func__
<< " " << pg
->pg_id
<< " not found" << dendl
;
4576 if (p
->second
->waiting_for_merge_epoch
) {
4577 dout(20) << __func__
<< " " << pg
->pg_id
<< " waiting for merge" << dendl
;
4580 dout(20) << __func__
<< " " << pg
->pg_id
<< " " << pg
<< dendl
;
4581 sdata
->_detach_pg(p
->second
.get());
4584 for (auto shard
: shards
) {
4585 shard
->unprime_split_children(pg
->pg_id
, old_pg_num
);
4588 // update pg count now since we might not get an osdmap any time soon.
4589 if (pg
->is_primary())
4590 service
.logger
->dec(l_osd_pg_primary
);
4591 else if (pg
->is_nonprimary())
4592 service
.logger
->dec(l_osd_pg_replica
); // misnomver
4594 service
.logger
->dec(l_osd_pg_stray
);
4599 PGRef
OSD::_lookup_pg(spg_t pgid
)
4601 uint32_t shard_index
= pgid
.hash_to_shard(num_shards
);
4602 auto sdata
= shards
[shard_index
];
4603 std::lock_guard
l(sdata
->shard_lock
);
4604 auto p
= sdata
->pg_slots
.find(pgid
);
4605 if (p
== sdata
->pg_slots
.end()) {
4608 return p
->second
->pg
;
4611 PGRef
OSD::_lookup_lock_pg(spg_t pgid
)
4613 PGRef pg
= _lookup_pg(pgid
);
4618 if (!pg
->is_deleted()) {
4625 PGRef
OSD::lookup_lock_pg(spg_t pgid
)
4627 return _lookup_lock_pg(pgid
);
4630 void OSD::load_pgs()
4632 ceph_assert(ceph_mutex_is_locked(osd_lock
));
4633 dout(0) << "load_pgs" << dendl
;
4636 auto pghist
= make_pg_num_history_oid();
4638 int r
= store
->read(service
.meta_ch
, pghist
, 0, 0, bl
, 0);
4639 if (r
>= 0 && bl
.length() > 0) {
4640 auto p
= bl
.cbegin();
4641 decode(pg_num_history
, p
);
4643 dout(20) << __func__
<< " pg_num_history " << pg_num_history
<< dendl
;
4647 int r
= store
->list_collections(ls
);
4649 derr
<< "failed to list pgs: " << cpp_strerror(-r
) << dendl
;
4653 for (vector
<coll_t
>::iterator it
= ls
.begin();
4657 if (it
->is_temp(&pgid
) ||
4658 (it
->is_pg(&pgid
) && PG::_has_removal_flag(store
, pgid
))) {
4659 dout(10) << "load_pgs " << *it
4660 << " removing, legacy or flagged for removal pg" << dendl
;
4661 recursive_remove_collection(cct
, store
, pgid
, *it
);
4665 if (!it
->is_pg(&pgid
)) {
4666 dout(10) << "load_pgs ignoring unrecognized " << *it
<< dendl
;
4670 dout(10) << "pgid " << pgid
<< " coll " << coll_t(pgid
) << dendl
;
4671 epoch_t map_epoch
= 0;
4672 int r
= PG::peek_map_epoch(store
, pgid
, &map_epoch
);
4674 derr
<< __func__
<< " unable to peek at " << pgid
<< " metadata, skipping"
4680 if (map_epoch
> 0) {
4681 OSDMapRef pgosdmap
= service
.try_get_map(map_epoch
);
4683 if (!get_osdmap()->have_pg_pool(pgid
.pool())) {
4684 derr
<< __func__
<< ": could not find map for epoch " << map_epoch
4685 << " on pg " << pgid
<< ", but the pool is not present in the "
4686 << "current map, so this is probably a result of bug 10617. "
4687 << "Skipping the pg for now, you can use ceph-objectstore-tool "
4688 << "to clean it up later." << dendl
;
4691 derr
<< __func__
<< ": have pgid " << pgid
<< " at epoch "
4692 << map_epoch
<< ", but missing map. Crashing."
4694 ceph_abort_msg("Missing map in load_pgs");
4697 pg
= _make_pg(pgosdmap
, pgid
);
4699 pg
= _make_pg(get_osdmap(), pgid
);
4702 recursive_remove_collection(cct
, store
, pgid
, *it
);
4706 // there can be no waiters here, so we don't call _wake_pg_slot
4709 pg
->ch
= store
->open_collection(pg
->coll
);
4711 // read pg state, log
4712 pg
->read_state(store
);
4715 dout(10) << "load_pgs " << *it
<< " deleting dne" << dendl
;
4718 recursive_remove_collection(cct
, store
, pgid
, *it
);
4722 uint32_t shard_index
= pgid
.hash_to_shard(shards
.size());
4723 assert(NULL
!= shards
[shard_index
]);
4724 store
->set_collection_commit_queue(pg
->coll
, &(shards
[shard_index
]->context_queue
));
4727 pg
->reg_next_scrub();
4729 dout(10) << __func__
<< " loaded " << *pg
<< dendl
;
4735 dout(0) << __func__
<< " opened " << num
<< " pgs" << dendl
;
4739 PGRef
OSD::handle_pg_create_info(const OSDMapRef
& osdmap
,
4740 const PGCreateInfo
*info
)
4742 spg_t pgid
= info
->pgid
;
4744 if (maybe_wait_for_max_pg(osdmap
, pgid
, info
->by_mon
)) {
4745 dout(10) << __func__
<< " hit max pg, dropping" << dendl
;
4749 PeeringCtx rctx
= create_context();
4751 OSDMapRef startmap
= get_map(info
->epoch
);
4754 int64_t pool_id
= pgid
.pgid
.pool();
4755 const pg_pool_t
*pool
= osdmap
->get_pg_pool(pool_id
);
4757 dout(10) << __func__
<< " ignoring " << pgid
<< ", pool dne" << dendl
;
4760 if (osdmap
->require_osd_release
>= ceph_release_t::nautilus
&&
4761 !pool
->has_flag(pg_pool_t::FLAG_CREATING
)) {
4762 // this ensures we do not process old creating messages after the
4763 // pool's initial pgs have been created (and pg are subsequently
4764 // allowed to split or merge).
4765 dout(20) << __func__
<< " dropping " << pgid
4766 << "create, pool does not have CREATING flag set" << dendl
;
4771 int up_primary
, acting_primary
;
4772 vector
<int> up
, acting
;
4773 startmap
->pg_to_up_acting_osds(
4774 pgid
.pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
4776 const pg_pool_t
* pp
= startmap
->get_pg_pool(pgid
.pool());
4777 if (pp
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
) &&
4778 store
->get_type() != "bluestore") {
4779 clog
->warn() << "pg " << pgid
4780 << " is at risk of silent data corruption: "
4781 << "the pool allows ec overwrites but is not stored in "
4782 << "bluestore, so deep scrubbing will not detect bitrot";
4784 create_pg_collection(
4785 rctx
.transaction
, pgid
, pgid
.get_split_bits(pp
->get_pg_num()));
4786 init_pg_ondisk(rctx
.transaction
, pgid
, pp
);
4788 int role
= startmap
->calc_pg_role(pg_shard_t(whoami
, pgid
.shard
), acting
);
4790 PGRef pg
= _make_pg(startmap
, pgid
);
4791 pg
->ch
= store
->create_new_collection(pg
->coll
);
4794 uint32_t shard_index
= pgid
.hash_to_shard(shards
.size());
4795 assert(NULL
!= shards
[shard_index
]);
4796 store
->set_collection_commit_queue(pg
->coll
, &(shards
[shard_index
]->context_queue
));
4801 // we are holding the shard lock
4802 ceph_assert(!pg
->is_deleted());
4811 info
->past_intervals
,
4815 pg
->init_collection_pool_opts();
4817 if (pg
->is_primary()) {
4818 std::lock_guard locker
{m_perf_queries_lock
};
4819 pg
->set_dynamic_perf_stats_queries(m_perf_queries
);
4822 pg
->handle_initialize(rctx
);
4823 pg
->handle_activate_map(rctx
);
4825 dispatch_context(rctx
, pg
.get(), osdmap
, nullptr);
4827 dout(10) << __func__
<< " new pg " << *pg
<< dendl
;
4831 bool OSD::maybe_wait_for_max_pg(const OSDMapRef
& osdmap
,
4835 const auto max_pgs_per_osd
=
4836 (cct
->_conf
.get_val
<uint64_t>("mon_max_pg_per_osd") *
4837 cct
->_conf
.get_val
<double>("osd_max_pg_per_osd_hard_ratio"));
4839 if (num_pgs
< max_pgs_per_osd
) {
4843 std::lock_guard
l(pending_creates_lock
);
4844 if (is_mon_create
) {
4845 pending_creates_from_mon
++;
4847 bool is_primary
= osdmap
->get_pg_acting_role(pgid
, whoami
) == 0;
4848 pending_creates_from_osd
.emplace(pgid
, is_primary
);
4850 dout(1) << __func__
<< " withhold creation of pg " << pgid
4851 << ": " << num_pgs
<< " >= "<< max_pgs_per_osd
<< dendl
;
4855 // to re-trigger a peering, we have to twiddle the pg mapping a little bit,
4856 // see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
4857 // to up set if pg_temp is empty. so an empty pg_temp won't work.
4858 static vector
<int32_t> twiddle(const vector
<int>& acting
) {
4859 if (acting
.size() > 1) {
4862 vector
<int32_t> twiddled(acting
.begin(), acting
.end());
4863 twiddled
.push_back(-1);
4868 void OSD::resume_creating_pg()
4870 bool do_sub_pg_creates
= false;
4871 bool have_pending_creates
= false;
4873 const auto max_pgs_per_osd
=
4874 (cct
->_conf
.get_val
<uint64_t>("mon_max_pg_per_osd") *
4875 cct
->_conf
.get_val
<double>("osd_max_pg_per_osd_hard_ratio"));
4876 if (max_pgs_per_osd
<= num_pgs
) {
4877 // this could happen if admin decreases this setting before a PG is removed
4880 unsigned spare_pgs
= max_pgs_per_osd
- num_pgs
;
4881 std::lock_guard
l(pending_creates_lock
);
4882 if (pending_creates_from_mon
> 0) {
4883 dout(20) << __func__
<< " pending_creates_from_mon "
4884 << pending_creates_from_mon
<< dendl
;
4885 do_sub_pg_creates
= true;
4886 if (pending_creates_from_mon
>= spare_pgs
) {
4887 spare_pgs
= pending_creates_from_mon
= 0;
4889 spare_pgs
-= pending_creates_from_mon
;
4890 pending_creates_from_mon
= 0;
4893 auto pg
= pending_creates_from_osd
.cbegin();
4894 while (spare_pgs
> 0 && pg
!= pending_creates_from_osd
.cend()) {
4895 dout(20) << __func__
<< " pg " << pg
->first
<< dendl
;
4897 get_osdmap()->pg_to_up_acting_osds(pg
->first
.pgid
, nullptr, nullptr, &acting
, nullptr);
4898 service
.queue_want_pg_temp(pg
->first
.pgid
, twiddle(acting
), true);
4899 pg
= pending_creates_from_osd
.erase(pg
);
4900 do_sub_pg_creates
= true;
4903 have_pending_creates
= (pending_creates_from_mon
> 0 ||
4904 !pending_creates_from_osd
.empty());
4907 bool do_renew_subs
= false;
4908 if (do_sub_pg_creates
) {
4909 if (monc
->sub_want("osd_pg_creates", last_pg_create_epoch
, 0)) {
4910 dout(4) << __func__
<< ": resolicit pg creates from mon since "
4911 << last_pg_create_epoch
<< dendl
;
4912 do_renew_subs
= true;
4915 version_t start
= get_osdmap_epoch() + 1;
4916 if (have_pending_creates
) {
4917 // don't miss any new osdmap deleting PGs
4918 if (monc
->sub_want("osdmap", start
, 0)) {
4919 dout(4) << __func__
<< ": resolicit osdmap from mon since "
4921 do_renew_subs
= true;
4923 } else if (do_sub_pg_creates
) {
4924 // no need to subscribe the osdmap continuously anymore
4925 // once the pgtemp and/or mon_subscribe(pg_creates) is sent
4926 if (monc
->sub_want_increment("osdmap", start
, CEPH_SUBSCRIBE_ONETIME
)) {
4927 dout(4) << __func__
<< ": re-subscribe osdmap(onetime) since "
4929 do_renew_subs
= true;
4933 if (do_renew_subs
) {
4937 service
.send_pg_temp();
4940 void OSD::build_initial_pg_history(
4943 utime_t created_stamp
,
4947 dout(10) << __func__
<< " " << pgid
<< " created " << created
<< dendl
;
4948 *h
= pg_history_t(created
, created_stamp
);
4950 OSDMapRef lastmap
= service
.get_map(created
);
4951 int up_primary
, acting_primary
;
4952 vector
<int> up
, acting
;
4953 lastmap
->pg_to_up_acting_osds(
4954 pgid
.pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
4956 ostringstream debug
;
4957 for (epoch_t e
= created
+ 1; e
<= get_osdmap_epoch(); ++e
) {
4958 OSDMapRef osdmap
= service
.get_map(e
);
4959 int new_up_primary
, new_acting_primary
;
4960 vector
<int> new_up
, new_acting
;
4961 osdmap
->pg_to_up_acting_osds(
4962 pgid
.pgid
, &new_up
, &new_up_primary
, &new_acting
, &new_acting_primary
);
4964 // this is a bit imprecise, but sufficient?
4965 struct min_size_predicate_t
: public IsPGRecoverablePredicate
{
4966 const pg_pool_t
*pi
;
4967 bool operator()(const set
<pg_shard_t
> &have
) const {
4968 return have
.size() >= pi
->min_size
;
4970 explicit min_size_predicate_t(const pg_pool_t
*i
) : pi(i
) {}
4971 } min_size_predicate(osdmap
->get_pg_pool(pgid
.pgid
.pool()));
4973 bool new_interval
= PastIntervals::check_new_interval(
4980 h
->same_interval_since
,
4981 h
->last_epoch_clean
,
4989 h
->same_interval_since
= e
;
4991 h
->same_up_since
= e
;
4993 if (acting_primary
!= new_acting_primary
) {
4994 h
->same_primary_since
= e
;
4996 if (pgid
.pgid
.is_split(lastmap
->get_pg_num(pgid
.pgid
.pool()),
4997 osdmap
->get_pg_num(pgid
.pgid
.pool()),
4999 h
->last_epoch_split
= e
;
5002 acting
= new_acting
;
5003 up_primary
= new_up_primary
;
5004 acting_primary
= new_acting_primary
;
5008 dout(20) << __func__
<< " " << debug
.str() << dendl
;
5009 dout(10) << __func__
<< " " << *h
<< " " << *pi
5010 << " [" << (pi
->empty() ? pair
<epoch_t
,epoch_t
>(0,0) :
5011 pi
->get_bounds()) << ")"
5015 void OSD::_add_heartbeat_peer(int p
)
5021 map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.find(p
);
5022 if (i
== heartbeat_peers
.end()) {
5023 pair
<ConnectionRef
,ConnectionRef
> cons
= service
.get_con_osd_hb(p
, get_osdmap_epoch());
5026 assert(cons
.second
);
5028 hi
= &heartbeat_peers
[p
];
5031 auto stamps
= service
.get_hb_stamps(p
);
5033 auto sb
= ceph::make_ref
<Session
>(cct
, cons
.first
.get());
5035 sb
->stamps
= stamps
;
5036 hi
->hb_interval_start
= ceph_clock_now();
5037 hi
->con_back
= cons
.first
.get();
5038 hi
->con_back
->set_priv(sb
);
5040 auto sf
= ceph::make_ref
<Session
>(cct
, cons
.second
.get());
5042 sf
->stamps
= stamps
;
5043 hi
->con_front
= cons
.second
.get();
5044 hi
->con_front
->set_priv(sf
);
5046 dout(10) << "_add_heartbeat_peer: new peer osd." << p
5047 << " " << hi
->con_back
->get_peer_addr()
5048 << " " << hi
->con_front
->get_peer_addr()
5053 hi
->epoch
= get_osdmap_epoch();
5056 void OSD::_remove_heartbeat_peer(int n
)
5058 map
<int,HeartbeatInfo
>::iterator q
= heartbeat_peers
.find(n
);
5059 ceph_assert(q
!= heartbeat_peers
.end());
5060 dout(20) << " removing heartbeat peer osd." << n
5061 << " " << q
->second
.con_back
->get_peer_addr()
5062 << " " << (q
->second
.con_front
? q
->second
.con_front
->get_peer_addr() : entity_addr_t())
5064 q
->second
.clear_mark_down();
5065 heartbeat_peers
.erase(q
);
5068 void OSD::need_heartbeat_peer_update()
5072 dout(20) << "need_heartbeat_peer_update" << dendl
;
5073 heartbeat_set_peers_need_update();
5076 void OSD::maybe_update_heartbeat_peers()
5078 ceph_assert(ceph_mutex_is_locked(osd_lock
));
5080 if (is_waiting_for_healthy() || is_active()) {
5081 utime_t now
= ceph_clock_now();
5082 if (last_heartbeat_resample
== utime_t()) {
5083 last_heartbeat_resample
= now
;
5084 heartbeat_set_peers_need_update();
5085 } else if (!heartbeat_peers_need_update()) {
5086 utime_t dur
= now
- last_heartbeat_resample
;
5087 if (dur
> cct
->_conf
->osd_heartbeat_grace
) {
5088 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur
<< " seconds" << dendl
;
5089 heartbeat_set_peers_need_update();
5090 last_heartbeat_resample
= now
;
5091 // automatically clean up any stale heartbeat peers
5092 // if we are unhealthy, then clean all
5093 reset_heartbeat_peers(is_waiting_for_healthy());
5098 if (!heartbeat_peers_need_update())
5100 heartbeat_clear_peers_need_update();
5102 std::lock_guard
l(heartbeat_lock
);
5104 dout(10) << "maybe_update_heartbeat_peers updating" << dendl
;
5107 // build heartbeat from set
5111 for (auto& pg
: pgs
) {
5112 pg
->with_heartbeat_peers([&](int peer
) {
5113 if (get_osdmap()->is_up(peer
)) {
5114 _add_heartbeat_peer(peer
);
5120 // include next and previous up osds to ensure we have a fully-connected set
5121 set
<int> want
, extras
;
5122 const int next
= get_osdmap()->get_next_up_osd_after(whoami
);
5125 int prev
= get_osdmap()->get_previous_up_osd_before(whoami
);
5126 if (prev
>= 0 && prev
!= next
)
5129 // make sure we have at least **min_down** osds coming from different
5130 // subtree level (e.g., hosts) for fast failure detection.
5131 auto min_down
= cct
->_conf
.get_val
<uint64_t>("mon_osd_min_down_reporters");
5132 auto subtree
= cct
->_conf
.get_val
<string
>("mon_osd_reporter_subtree_level");
5133 auto limit
= std::max(min_down
, (uint64_t)cct
->_conf
->osd_heartbeat_min_peers
);
5134 get_osdmap()->get_random_up_osds_by_subtree(
5135 whoami
, subtree
, limit
, want
, &want
);
5137 for (set
<int>::iterator p
= want
.begin(); p
!= want
.end(); ++p
) {
5138 dout(10) << " adding neighbor peer osd." << *p
<< dendl
;
5140 _add_heartbeat_peer(*p
);
5143 // remove down peers; enumerate extras
5144 map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5145 while (p
!= heartbeat_peers
.end()) {
5146 if (!get_osdmap()->is_up(p
->first
)) {
5149 _remove_heartbeat_peer(o
);
5152 if (p
->second
.epoch
< get_osdmap_epoch()) {
5153 extras
.insert(p
->first
);
5159 for (int n
= next
; n
>= 0; ) {
5160 if ((int)heartbeat_peers
.size() >= cct
->_conf
->osd_heartbeat_min_peers
)
5162 if (!extras
.count(n
) && !want
.count(n
) && n
!= whoami
) {
5163 dout(10) << " adding random peer osd." << n
<< dendl
;
5165 _add_heartbeat_peer(n
);
5167 n
= get_osdmap()->get_next_up_osd_after(n
);
5169 break; // came full circle; stop
5173 for (set
<int>::iterator p
= extras
.begin();
5174 (int)heartbeat_peers
.size() > cct
->_conf
->osd_heartbeat_min_peers
&& p
!= extras
.end();
5178 _remove_heartbeat_peer(*p
);
5181 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers
.size() << " peers, extras " << extras
<< dendl
;
5183 // clean up stale failure pending
5184 for (auto it
= failure_pending
.begin(); it
!= failure_pending
.end();) {
5185 if (heartbeat_peers
.count(it
->first
) == 0) {
5186 send_still_alive(get_osdmap_epoch(), it
->first
, it
->second
.second
);
5187 failure_pending
.erase(it
++);
5194 void OSD::reset_heartbeat_peers(bool all
)
5196 ceph_assert(ceph_mutex_is_locked(osd_lock
));
5197 dout(10) << "reset_heartbeat_peers" << dendl
;
5198 utime_t stale
= ceph_clock_now();
5199 stale
-= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_stale");
5200 std::lock_guard
l(heartbeat_lock
);
5201 for (auto it
= heartbeat_peers
.begin(); it
!= heartbeat_peers
.end();) {
5202 HeartbeatInfo
& hi
= it
->second
;
5203 if (all
|| hi
.is_stale(stale
)) {
5204 hi
.clear_mark_down();
5205 // stop sending failure_report to mon too
5206 failure_queue
.erase(it
->first
);
5207 heartbeat_peers
.erase(it
++);
5214 void OSD::handle_osd_ping(MOSDPing
*m
)
5216 if (superblock
.cluster_fsid
!= m
->fsid
) {
5217 dout(20) << "handle_osd_ping from " << m
->get_source_inst()
5218 << " bad fsid " << m
->fsid
<< " != " << superblock
.cluster_fsid
5224 int from
= m
->get_source().num();
5226 heartbeat_lock
.lock();
5227 if (is_stopping()) {
5228 heartbeat_lock
.unlock();
5233 utime_t now
= ceph_clock_now();
5234 auto mnow
= service
.get_mnow();
5235 ConnectionRef
con(m
->get_connection());
5236 OSDMapRef curmap
= service
.get_osdmap();
5238 heartbeat_lock
.unlock();
5243 auto sref
= con
->get_priv();
5244 Session
*s
= static_cast<Session
*>(sref
.get());
5246 heartbeat_lock
.unlock();
5252 s
->stamps
= service
.get_hb_stamps(from
);
5257 case MOSDPing::PING
:
5259 if (cct
->_conf
->osd_debug_drop_ping_probability
> 0) {
5260 auto heartbeat_drop
= debug_heartbeat_drops_remaining
.find(from
);
5261 if (heartbeat_drop
!= debug_heartbeat_drops_remaining
.end()) {
5262 if (heartbeat_drop
->second
== 0) {
5263 debug_heartbeat_drops_remaining
.erase(heartbeat_drop
);
5265 --heartbeat_drop
->second
;
5266 dout(5) << "Dropping heartbeat from " << from
5267 << ", " << heartbeat_drop
->second
5268 << " remaining to drop" << dendl
;
5271 } else if (cct
->_conf
->osd_debug_drop_ping_probability
>
5272 ((((double)(rand()%100))/100.0))) {
5274 debug_heartbeat_drops_remaining
.insert(std::make_pair(from
,
5275 cct
->_conf
->osd_debug_drop_ping_duration
)).first
;
5276 dout(5) << "Dropping heartbeat from " << from
5277 << ", " << heartbeat_drop
->second
5278 << " remaining to drop" << dendl
;
5283 ceph::signedspan sender_delta_ub
{};
5284 s
->stamps
->got_ping(
5290 dout(20) << __func__
<< " new stamps " << *s
->stamps
<< dendl
;
5292 if (!cct
->get_heartbeat_map()->is_healthy()) {
5293 dout(10) << "internal heartbeat not healthy, dropping ping request"
5298 Message
*r
= new MOSDPing(monc
->get_fsid(),
5299 curmap
->get_epoch(),
5300 MOSDPing::PING_REPLY
,
5304 service
.get_up_epoch(),
5305 cct
->_conf
->osd_heartbeat_min_size
,
5307 con
->send_message(r
);
5309 if (curmap
->is_up(from
)) {
5311 ConnectionRef cluster_con
= service
.get_con_osd_cluster(
5312 from
, curmap
->get_epoch());
5314 service
.maybe_share_map(cluster_con
.get(), curmap
, m
->map_epoch
);
5317 } else if (!curmap
->exists(from
) ||
5318 curmap
->get_down_at(from
) > m
->map_epoch
) {
5319 // tell them they have died
5320 Message
*r
= new MOSDPing(monc
->get_fsid(),
5321 curmap
->get_epoch(),
5326 service
.get_up_epoch(),
5327 cct
->_conf
->osd_heartbeat_min_size
);
5328 con
->send_message(r
);
5333 case MOSDPing::PING_REPLY
:
5335 map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.find(from
);
5336 if (i
!= heartbeat_peers
.end()) {
5337 auto acked
= i
->second
.ping_history
.find(m
->ping_stamp
);
5338 if (acked
!= i
->second
.ping_history
.end()) {
5339 int &unacknowledged
= acked
->second
.second
;
5340 if (con
== i
->second
.con_back
) {
5341 dout(25) << "handle_osd_ping got reply from osd." << from
5342 << " first_tx " << i
->second
.first_tx
5343 << " last_tx " << i
->second
.last_tx
5344 << " last_rx_back " << i
->second
.last_rx_back
5346 << " last_rx_front " << i
->second
.last_rx_front
5348 i
->second
.last_rx_back
= now
;
5349 ceph_assert(unacknowledged
> 0);
5351 // if there is no front con, set both stamps.
5352 if (i
->second
.con_front
== NULL
) {
5353 i
->second
.last_rx_front
= now
;
5354 ceph_assert(unacknowledged
> 0);
5357 } else if (con
== i
->second
.con_front
) {
5358 dout(25) << "handle_osd_ping got reply from osd." << from
5359 << " first_tx " << i
->second
.first_tx
5360 << " last_tx " << i
->second
.last_tx
5361 << " last_rx_back " << i
->second
.last_rx_back
5362 << " last_rx_front " << i
->second
.last_rx_front
5365 i
->second
.last_rx_front
= now
;
5366 ceph_assert(unacknowledged
> 0);
5370 if (unacknowledged
== 0) {
5371 // succeeded in getting all replies
5372 dout(25) << "handle_osd_ping got all replies from osd." << from
5373 << " , erase pending ping(sent at " << m
->ping_stamp
<< ")"
5374 << " and older pending ping(s)"
5377 #define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
5378 ++i
->second
.hb_average_count
;
5379 uint32_t back_pingtime
= ROUND_S_TO_USEC(i
->second
.last_rx_back
- m
->ping_stamp
);
5380 i
->second
.hb_total_back
+= back_pingtime
;
5381 if (back_pingtime
< i
->second
.hb_min_back
)
5382 i
->second
.hb_min_back
= back_pingtime
;
5383 if (back_pingtime
> i
->second
.hb_max_back
)
5384 i
->second
.hb_max_back
= back_pingtime
;
5385 uint32_t front_pingtime
= ROUND_S_TO_USEC(i
->second
.last_rx_front
- m
->ping_stamp
);
5386 i
->second
.hb_total_front
+= front_pingtime
;
5387 if (front_pingtime
< i
->second
.hb_min_front
)
5388 i
->second
.hb_min_front
= front_pingtime
;
5389 if (front_pingtime
> i
->second
.hb_max_front
)
5390 i
->second
.hb_max_front
= front_pingtime
;
5392 ceph_assert(i
->second
.hb_interval_start
!= utime_t());
5393 if (i
->second
.hb_interval_start
== utime_t())
5394 i
->second
.hb_interval_start
= now
;
5395 int64_t hb_avg_time_period
= 60;
5396 if (cct
->_conf
.get_val
<int64_t>("debug_heartbeat_testing_span")) {
5397 hb_avg_time_period
= cct
->_conf
.get_val
<int64_t>("debug_heartbeat_testing_span");
5399 if (now
- i
->second
.hb_interval_start
>= utime_t(hb_avg_time_period
, 0)) {
5400 uint32_t back_avg
= i
->second
.hb_total_back
/ i
->second
.hb_average_count
;
5401 uint32_t back_min
= i
->second
.hb_min_back
;
5402 uint32_t back_max
= i
->second
.hb_max_back
;
5403 uint32_t front_avg
= i
->second
.hb_total_front
/ i
->second
.hb_average_count
;
5404 uint32_t front_min
= i
->second
.hb_min_front
;
5405 uint32_t front_max
= i
->second
.hb_max_front
;
5407 // Reset for new interval
5408 i
->second
.hb_average_count
= 0;
5409 i
->second
.hb_interval_start
= now
;
5410 i
->second
.hb_total_back
= i
->second
.hb_max_back
= 0;
5411 i
->second
.hb_min_back
= UINT_MAX
;
5412 i
->second
.hb_total_front
= i
->second
.hb_max_front
= 0;
5413 i
->second
.hb_min_front
= UINT_MAX
;
5415 // Record per osd interace ping times
5416 // Based on osd_heartbeat_interval ignoring that it is randomly short than this interval
5417 if (i
->second
.hb_back_pingtime
.size() == 0) {
5418 ceph_assert(i
->second
.hb_front_pingtime
.size() == 0);
5419 for (unsigned k
= 0 ; k
< hb_vector_size
; ++k
) {
5420 i
->second
.hb_back_pingtime
.push_back(back_avg
);
5421 i
->second
.hb_back_min
.push_back(back_min
);
5422 i
->second
.hb_back_max
.push_back(back_max
);
5423 i
->second
.hb_front_pingtime
.push_back(front_avg
);
5424 i
->second
.hb_front_min
.push_back(front_min
);
5425 i
->second
.hb_front_max
.push_back(front_max
);
5426 ++i
->second
.hb_index
;
5429 int index
= i
->second
.hb_index
& (hb_vector_size
- 1);
5430 i
->second
.hb_back_pingtime
[index
] = back_avg
;
5431 i
->second
.hb_back_min
[index
] = back_min
;
5432 i
->second
.hb_back_max
[index
] = back_max
;
5433 i
->second
.hb_front_pingtime
[index
] = front_avg
;
5434 i
->second
.hb_front_min
[index
] = front_min
;
5435 i
->second
.hb_front_max
[index
] = front_max
;
5436 ++i
->second
.hb_index
;
5440 std::lock_guard
l(service
.stat_lock
);
5441 service
.osd_stat
.hb_pingtime
[from
].last_update
= now
.sec();
5442 service
.osd_stat
.hb_pingtime
[from
].back_last
= back_pingtime
;
5445 uint32_t min
= UINT_MAX
;
5449 uint32_t size
= (uint32_t)i
->second
.hb_back_pingtime
.size();
5450 for (int32_t k
= size
- 1 ; k
>= 0; --k
) {
5452 int index
= (i
->second
.hb_index
+ k
) % size
;
5453 total
+= i
->second
.hb_back_pingtime
[index
];
5454 if (i
->second
.hb_back_min
[index
] < min
)
5455 min
= i
->second
.hb_back_min
[index
];
5456 if (i
->second
.hb_back_max
[index
] > max
)
5457 max
= i
->second
.hb_back_max
[index
];
5458 if (count
== 1 || count
== 5 || count
== 15) {
5459 service
.osd_stat
.hb_pingtime
[from
].back_pingtime
[which
] = total
/ count
;
5460 service
.osd_stat
.hb_pingtime
[from
].back_min
[which
] = min
;
5461 service
.osd_stat
.hb_pingtime
[from
].back_max
[which
] = max
;
5468 if (i
->second
.con_front
!= NULL
) {
5469 service
.osd_stat
.hb_pingtime
[from
].front_last
= front_pingtime
;
5476 for (int32_t k
= size
- 1 ; k
>= 0; --k
) {
5478 int index
= (i
->second
.hb_index
+ k
) % size
;
5479 total
+= i
->second
.hb_front_pingtime
[index
];
5480 if (i
->second
.hb_front_min
[index
] < min
)
5481 min
= i
->second
.hb_front_min
[index
];
5482 if (i
->second
.hb_front_max
[index
] > max
)
5483 max
= i
->second
.hb_front_max
[index
];
5484 if (count
== 1 || count
== 5 || count
== 15) {
5485 service
.osd_stat
.hb_pingtime
[from
].front_pingtime
[which
] = total
/ count
;
5486 service
.osd_stat
.hb_pingtime
[from
].front_min
[which
] = min
;
5487 service
.osd_stat
.hb_pingtime
[from
].front_max
[which
] = max
;
5496 std::lock_guard
l(service
.stat_lock
);
5497 service
.osd_stat
.hb_pingtime
[from
].back_last
= back_pingtime
;
5498 if (i
->second
.con_front
!= NULL
)
5499 service
.osd_stat
.hb_pingtime
[from
].front_last
= front_pingtime
;
5501 i
->second
.ping_history
.erase(i
->second
.ping_history
.begin(), ++acked
);
5504 if (i
->second
.is_healthy(now
)) {
5505 // Cancel false reports
5506 auto failure_queue_entry
= failure_queue
.find(from
);
5507 if (failure_queue_entry
!= failure_queue
.end()) {
5508 dout(10) << "handle_osd_ping canceling queued "
5509 << "failure report for osd." << from
<< dendl
;
5510 failure_queue
.erase(failure_queue_entry
);
5513 auto failure_pending_entry
= failure_pending
.find(from
);
5514 if (failure_pending_entry
!= failure_pending
.end()) {
5515 dout(10) << "handle_osd_ping canceling in-flight "
5516 << "failure report for osd." << from
<< dendl
;
5517 send_still_alive(curmap
->get_epoch(),
5519 failure_pending_entry
->second
.second
);
5520 failure_pending
.erase(failure_pending_entry
);
5524 // old replies, deprecated by newly sent pings.
5525 dout(10) << "handle_osd_ping no pending ping(sent at " << m
->ping_stamp
5526 << ") is found, treat as covered by newly sent pings "
5533 curmap
->is_up(from
)) {
5535 ConnectionRef cluster_con
= service
.get_con_osd_cluster(
5536 from
, curmap
->get_epoch());
5538 service
.maybe_share_map(cluster_con
.get(), curmap
, m
->map_epoch
);
5543 s
->stamps
->got_ping_reply(
5547 dout(20) << __func__
<< " new stamps " << *s
->stamps
<< dendl
;
5551 case MOSDPing::YOU_DIED
:
5552 dout(10) << "handle_osd_ping " << m
->get_source_inst()
5553 << " says i am down in " << m
->map_epoch
<< dendl
;
5554 osdmap_subscribe(curmap
->get_epoch()+1, false);
5558 heartbeat_lock
.unlock();
5562 void OSD::heartbeat_entry()
5564 std::unique_lock
l(heartbeat_lock
);
5567 while (!heartbeat_stop
) {
5571 if (cct
->_conf
.get_val
<bool>("debug_disable_randomized_ping")) {
5572 wait
= (float)cct
->_conf
->osd_heartbeat_interval
;
5574 wait
= .5 + ((float)(rand() % 10)/10.0) * (float)cct
->_conf
->osd_heartbeat_interval
;
5576 auto w
= ceph::make_timespan(wait
);
5577 dout(30) << "heartbeat_entry sleeping for " << wait
<< dendl
;
5578 heartbeat_cond
.wait_for(l
, w
);
5581 dout(30) << "heartbeat_entry woke up" << dendl
;
5585 void OSD::heartbeat_check()
5587 ceph_assert(ceph_mutex_is_locked(heartbeat_lock
));
5588 utime_t now
= ceph_clock_now();
5590 // check for incoming heartbeats (move me elsewhere?)
5591 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5592 p
!= heartbeat_peers
.end();
5595 if (p
->second
.first_tx
== utime_t()) {
5596 dout(25) << "heartbeat_check we haven't sent ping to osd." << p
->first
5597 << " yet, skipping" << dendl
;
5601 dout(25) << "heartbeat_check osd." << p
->first
5602 << " first_tx " << p
->second
.first_tx
5603 << " last_tx " << p
->second
.last_tx
5604 << " last_rx_back " << p
->second
.last_rx_back
5605 << " last_rx_front " << p
->second
.last_rx_front
5607 if (p
->second
.is_unhealthy(now
)) {
5608 utime_t oldest_deadline
= p
->second
.ping_history
.begin()->second
.first
;
5609 if (p
->second
.last_rx_back
== utime_t() ||
5610 p
->second
.last_rx_front
== utime_t()) {
5611 derr
<< "heartbeat_check: no reply from "
5612 << p
->second
.con_front
->get_peer_addr().get_sockaddr()
5613 << " osd." << p
->first
5614 << " ever on either front or back, first ping sent "
5615 << p
->second
.first_tx
5616 << " (oldest deadline " << oldest_deadline
<< ")"
5619 failure_queue
[p
->first
] = p
->second
.first_tx
;
5621 derr
<< "heartbeat_check: no reply from "
5622 << p
->second
.con_front
->get_peer_addr().get_sockaddr()
5623 << " osd." << p
->first
<< " since back " << p
->second
.last_rx_back
5624 << " front " << p
->second
.last_rx_front
5625 << " (oldest deadline " << oldest_deadline
<< ")"
5628 failure_queue
[p
->first
] = std::min(p
->second
.last_rx_back
, p
->second
.last_rx_front
);
5634 void OSD::heartbeat()
5636 ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock
));
5637 dout(30) << "heartbeat" << dendl
;
5641 int hb_interval
= cct
->_conf
->osd_heartbeat_interval
;
5642 int n_samples
= 86400;
5643 if (hb_interval
> 1) {
5644 n_samples
/= hb_interval
;
5649 if (getloadavg(loadavgs
, 1) == 1) {
5650 logger
->set(l_osd_loadavg
, 100 * loadavgs
[0]);
5651 daily_loadavg
= (daily_loadavg
* (n_samples
- 1) + loadavgs
[0]) / n_samples
;
5652 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg
<< dendl
;
5655 dout(30) << "heartbeat checking stats" << dendl
;
5657 // refresh peer list and osd stats
5658 vector
<int> hb_peers
;
5659 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5660 p
!= heartbeat_peers
.end();
5662 hb_peers
.push_back(p
->first
);
5664 auto new_stat
= service
.set_osd_stat(hb_peers
, get_num_pgs());
5665 dout(5) << __func__
<< " " << new_stat
<< dendl
;
5666 ceph_assert(new_stat
.statfs
.total
);
5669 float ratio
= service
.compute_adjusted_ratio(new_stat
, &pratio
);
5671 service
.check_full_status(ratio
, pratio
);
5673 utime_t now
= ceph_clock_now();
5674 auto mnow
= service
.get_mnow();
5675 utime_t deadline
= now
;
5676 deadline
+= cct
->_conf
->osd_heartbeat_grace
;
5679 for (map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.begin();
5680 i
!= heartbeat_peers
.end();
5682 int peer
= i
->first
;
5683 dout(30) << "heartbeat sending ping to osd." << peer
<< dendl
;
5685 i
->second
.last_tx
= now
;
5686 if (i
->second
.first_tx
== utime_t())
5687 i
->second
.first_tx
= now
;
5688 i
->second
.ping_history
[now
] = make_pair(deadline
,
5689 HeartbeatInfo::HEARTBEAT_MAX_CONN
);
5690 if (i
->second
.hb_interval_start
== utime_t())
5691 i
->second
.hb_interval_start
= now
;
5693 Session
*s
= static_cast<Session
*>(i
->second
.con_back
->get_priv().get());
5694 std::optional
<ceph::signedspan
> delta_ub
;
5695 s
->stamps
->sent_ping(&delta_ub
);
5697 i
->second
.con_back
->send_message(
5698 new MOSDPing(monc
->get_fsid(),
5699 service
.get_osdmap_epoch(),
5704 service
.get_up_epoch(),
5705 cct
->_conf
->osd_heartbeat_min_size
,
5708 if (i
->second
.con_front
)
5709 i
->second
.con_front
->send_message(
5710 new MOSDPing(monc
->get_fsid(),
5711 service
.get_osdmap_epoch(),
5716 service
.get_up_epoch(),
5717 cct
->_conf
->osd_heartbeat_min_size
,
5721 logger
->set(l_osd_hb_to
, heartbeat_peers
.size());
5723 // hmm.. am i all alone?
5724 dout(30) << "heartbeat lonely?" << dendl
;
5725 if (heartbeat_peers
.empty()) {
5726 if (now
- last_mon_heartbeat
> cct
->_conf
->osd_mon_heartbeat_interval
&& is_active()) {
5727 last_mon_heartbeat
= now
;
5728 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl
;
5729 osdmap_subscribe(get_osdmap_epoch() + 1, false);
5733 dout(30) << "heartbeat done" << dendl
;
5736 bool OSD::heartbeat_reset(Connection
*con
)
5738 std::lock_guard
l(heartbeat_lock
);
5739 auto s
= con
->get_priv();
5740 dout(20) << __func__
<< " con " << con
<< " s " << s
.get() << dendl
;
5741 con
->set_priv(nullptr);
5743 if (is_stopping()) {
5746 auto session
= static_cast<Session
*>(s
.get());
5747 auto p
= heartbeat_peers
.find(session
->peer
);
5748 if (p
!= heartbeat_peers
.end() &&
5749 (p
->second
.con_back
== con
||
5750 p
->second
.con_front
== con
)) {
5751 dout(10) << "heartbeat_reset failed hb con " << con
<< " for osd." << p
->second
.peer
5752 << ", reopening" << dendl
;
5753 p
->second
.clear_mark_down(con
);
5754 pair
<ConnectionRef
,ConnectionRef
> newcon
= service
.get_con_osd_hb(p
->second
.peer
, p
->second
.epoch
);
5756 p
->second
.con_back
= newcon
.first
.get();
5757 p
->second
.con_back
->set_priv(s
);
5758 if (newcon
.second
) {
5759 p
->second
.con_front
= newcon
.second
.get();
5760 p
->second
.con_front
->set_priv(s
);
5762 p
->second
.ping_history
.clear();
5764 dout(10) << "heartbeat_reset failed hb con " << con
<< " for osd." << p
->second
.peer
5765 << ", raced with osdmap update, closing out peer" << dendl
;
5766 heartbeat_peers
.erase(p
);
5769 dout(10) << "heartbeat_reset closing (old) failed hb con " << con
<< dendl
;
5777 // =========================================
5781 ceph_assert(ceph_mutex_is_locked(osd_lock
));
5782 dout(10) << "tick" << dendl
;
5784 utime_t now
= ceph_clock_now();
5785 // throw out any obsolete markdown log
5786 utime_t grace
= utime_t(cct
->_conf
->osd_max_markdown_period
, 0);
5787 while (!osd_markdown_log
.empty() &&
5788 osd_markdown_log
.front() + grace
< now
)
5789 osd_markdown_log
.pop_front();
5791 if (is_active() || is_waiting_for_healthy()) {
5792 maybe_update_heartbeat_peers();
5795 if (is_waiting_for_healthy()) {
5799 if (is_waiting_for_healthy() || is_booting()) {
5800 std::lock_guard
l(heartbeat_lock
);
5801 if (now
- last_mon_heartbeat
> cct
->_conf
->osd_mon_heartbeat_interval
) {
5802 last_mon_heartbeat
= now
;
5803 dout(1) << __func__
<< " checking mon for new map" << dendl
;
5804 osdmap_subscribe(get_osdmap_epoch() + 1, false);
5810 // scrub purged_snaps every deep scrub interval
5812 const utime_t last
= superblock
.last_purged_snaps_scrub
;
5813 utime_t next
= last
;
5814 next
+= cct
->_conf
->osd_scrub_min_interval
;
5816 // use a seed that is stable for each scrub interval, but varies
5817 // by OSD to avoid any herds.
5818 rng
.seed(whoami
+ superblock
.last_purged_snaps_scrub
.sec());
5819 double r
= (rng() % 1024) / 1024;
5821 cct
->_conf
->osd_scrub_min_interval
*
5822 cct
->_conf
->osd_scrub_interval_randomize_ratio
* r
;
5823 if (next
< ceph_clock_now()) {
5824 dout(20) << __func__
<< " last_purged_snaps_scrub " << last
5825 << " next " << next
<< " ... now" << dendl
;
5826 scrub_purged_snaps();
5828 dout(20) << __func__
<< " last_purged_snaps_scrub " << last
5829 << " next " << next
<< dendl
;
5833 tick_timer
.add_event_after(get_tick_interval(), new C_Tick(this));
// Timer callback run WITHOUT holding osd_lock: refreshes CRC perf counters and
// store statfs, gates the periodic monitor report on osd_mon_report_interval,
// checks whether any shard is waiting on a newer osdmap, evaluates the beacon
// interval, and re-arms tick_timer_without_osd_lock.
// NOTE(review): this chunk is garbled by extraction -- upstream line numbers
// are embedded in the text and punctuation-only lines (braces / else /
// return) are missing, so exact control-flow boundaries cannot be confirmed
// from this view.
5836 void OSD::tick_without_osd_lock()
// Caller (the timer) must hold tick_timer_lock.
5838 ceph_assert(ceph_mutex_is_locked(tick_timer_lock
));
5839 dout(10) << "tick_without_osd_lock" << dendl
;
// Publish buffer-CRC cache hit/miss counters gathered since the last tick.
5841 logger
->set(l_osd_cached_crc
, buffer::get_cached_crc());
5842 logger
->set(l_osd_cached_crc_adjusted
, buffer::get_cached_crc_adjusted());
5843 logger
->set(l_osd_missed_crc
, buffer::get_missed_crc());
5845 // refresh osd stats
5846 struct store_statfs_t stbuf
;
5847 osd_alert_list_t alerts
;
5848 int r
= store
->statfs(&stbuf
, &alerts
);
5849 ceph_assert(r
== 0);
5850 service
.set_statfs(stbuf
, alerts
);
5852 // osd_lock is not being held, which means the OSD state
5853 // might change when doing the monitor report
5854 if (is_active() || is_waiting_for_healthy()) {
5856 std::lock_guard l
{heartbeat_lock
};
// mon_report_lock taken under map_lock (shared) for the report below.
5859 map_lock
.lock_shared();
5860 std::lock_guard
l(mon_report_lock
);
5863 utime_t now
= ceph_clock_now();
// Report to the monitor when fullness changed or the report interval elapsed.
5864 if (service
.need_fullness_update() ||
5865 now
- last_mon_report
> cct
->_conf
->osd_mon_report_interval
) {
5866 last_mon_report
= now
;
5870 map_lock
.unlock_shared();
// Ask for a newer map if any shard queue is blocked waiting for one.
5872 epoch_t max_waiting_epoch
= 0;
5873 for (auto s
: shards
) {
5874 max_waiting_epoch
= std::max(max_waiting_epoch
,
5875 s
->get_max_waiting_epoch());
5877 if (max_waiting_epoch
> get_osdmap()->get_epoch()) {
5878 dout(20) << __func__
<< " max_waiting_epoch " << max_waiting_epoch
5879 << ", requesting new map" << dendl
;
5880 osdmap_subscribe(superblock
.newest_map
+ 1, false);
5885 if (!scrub_random_backoff()) {
5888 service
.promote_throttle_recalibrate();
5889 resume_creating_pg();
5890 bool need_send_beacon
= false;
5891 const auto now
= ceph::coarse_mono_clock::now();
5893 // borrow lec lock to protect last_sent_beacon from changing
5894 std::lock_guard l
{min_last_epoch_clean_lock
};
5895 const auto elapsed
= now
- last_sent_beacon
;
// Send a beacon only once per osd_beacon_report_interval seconds.
5896 if (chrono::duration_cast
<chrono::seconds
>(elapsed
).count() >
5897 cct
->_conf
->osd_beacon_report_interval
) {
5898 need_send_beacon
= true;
5901 if (need_send_beacon
) {
5906 mgrc
.update_daemon_health(get_health_metrics());
5907 service
.kick_recovery_queue();
// Re-arm this tick so it keeps firing periodically.
5908 tick_timer_without_osd_lock
.add_event_after(get_tick_interval(),
5909 new C_Tick_WithoutOSDLock(this));
// Admin-socket debug/test command handler. Parses one of the commands listed
// below out of `cmdmap`, resolves pool + object to a PG, and applies the
// requested omap/truncate/error-injection operation directly through the
// ObjectStore, writing a human-readable result to `ss`.
// NOTE(review): garbled by extraction -- upstream line numbers are embedded
// and brace/return-only lines are missing; per-command error paths cannot be
// fully reconstructed from this view.
5913 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5914 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
5915 // setomapheader <pool-id> [namespace/]<obj-name> <header>
5916 // getomap <pool> [namespace/]<obj-name>
5917 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
5918 // injectmdataerr [namespace/]<obj-name> [shardid]
5919 // injectdataerr [namespace/]<obj-name> [shardid]
5921 // set_recovery_delay [utime]
5922 void TestOpsSocketHook::test_ops(OSDService
*service
, ObjectStore
*store
,
5923 std::string_view command
,
5924 const cmdmap_t
& cmdmap
, ostream
&ss
)
5927 //Support changing the omap on a single osd by using the Admin Socket to
5928 //directly request the osd make a change.
5929 if (command
== "setomapval" || command
== "rmomapkey" ||
5930 command
== "setomapheader" || command
== "getomap" ||
5931 command
== "truncobj" || command
== "injectmdataerr" ||
5932 command
== "injectdataerr"
5936 OSDMapRef curmap
= service
->get_osdmap();
// Resolve the pool first by name, then fall back to a numeric pool id.
5941 cmd_getval(cmdmap
, "pool", poolstr
);
5942 pool
= curmap
->lookup_pg_pool_name(poolstr
);
5943 //If we can't find it by name then maybe id specified
5944 if (pool
< 0 && isdigit(poolstr
[0]))
5945 pool
= atoll(poolstr
.c_str());
5947 ss
<< "Invalid pool '" << poolstr
<< "''";
// An objname of the form "ns/name" carries an explicit namespace prefix.
5951 string objname
, nspace
;
5952 cmd_getval(cmdmap
, "objname", objname
);
5953 std::size_t found
= objname
.find_first_of('/');
5954 if (found
!= string::npos
) {
5955 nspace
= objname
.substr(0, found
);
5956 objname
= objname
.substr(found
+1);
5958 object_locator_t
oloc(pool
, nspace
);
5959 r
= curmap
->object_locator_to_pg(object_t(objname
), oloc
, rawpg
);
5962 ss
<< "Invalid namespace/objname";
// Optional shard id (for EC pools); defaults to NO_SHARD.
5967 cmd_getval(cmdmap
, "shardid", shardid
, int64_t(shard_id_t::NO_SHARD
));
5968 hobject_t
obj(object_t(objname
), string(""), CEPH_NOSNAP
, rawpg
.ps(), pool
, nspace
);
5969 ghobject_t
gobj(obj
, ghobject_t::NO_GEN
, shard_id_t(uint8_t(shardid
)));
5970 spg_t
pgid(curmap
->raw_pg_to_pg(rawpg
), shard_id_t(shardid
));
// On EC pools only the error-injection commands are permitted.
5971 if (curmap
->pg_is_ec(rawpg
)) {
5972 if ((command
!= "injectdataerr") && (command
!= "injectmdataerr")) {
5973 ss
<< "Must not call on ec pool, except injectdataerr or injectmdataerr";
5978 ObjectStore::Transaction t
;
5980 if (command
== "setomapval") {
5981 map
<string
, bufferlist
> newattrs
;
5984 cmd_getval(cmdmap
, "key", key
);
5985 cmd_getval(cmdmap
, "val", valstr
);
5988 newattrs
[key
] = val
;
5989 t
.omap_setkeys(coll_t(pgid
), ghobject_t(obj
), newattrs
);
5990 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
5992 ss
<< "error=" << r
;
5995 } else if (command
== "rmomapkey") {
5997 cmd_getval(cmdmap
, "key", key
);
5999 t
.omap_rmkey(coll_t(pgid
), ghobject_t(obj
), key
);
6000 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6002 ss
<< "error=" << r
;
6005 } else if (command
== "setomapheader") {
6006 bufferlist newheader
;
6009 cmd_getval(cmdmap
, "header", headerstr
);
6010 newheader
.append(headerstr
);
6011 t
.omap_setheader(coll_t(pgid
), ghobject_t(obj
), newheader
);
6012 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6014 ss
<< "error=" << r
;
6017 } else if (command
== "getomap") {
6018 //Debug: Output entire omap
6020 map
<string
, bufferlist
> keyvals
;
6021 auto ch
= store
->open_collection(coll_t(pgid
));
6023 ss
<< "unable to open collection for " << pgid
;
6026 r
= store
->omap_get(ch
, ghobject_t(obj
), &hdrbl
, &keyvals
);
6028 ss
<< "header=" << string(hdrbl
.c_str(), hdrbl
.length());
6029 for (map
<string
, bufferlist
>::iterator it
= keyvals
.begin();
6030 it
!= keyvals
.end(); ++it
)
6031 ss
<< " key=" << (*it
).first
<< " val="
6032 << string((*it
).second
.c_str(), (*it
).second
.length());
6034 ss
<< "error=" << r
;
6037 } else if (command
== "truncobj") {
6039 cmd_getval(cmdmap
, "len", trunclen
);
6040 t
.truncate(coll_t(pgid
), ghobject_t(obj
), trunclen
);
6041 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6043 ss
<< "error=" << r
;
// Error injection marks the object so later reads report corruption.
6046 } else if (command
== "injectdataerr") {
6047 store
->inject_data_error(gobj
);
6049 } else if (command
== "injectmdataerr") {
6050 store
->inject_mdata_error(gobj
);
// set_recovery_delay tweaks osd_recovery_delay_start at runtime.
6055 if (command
== "set_recovery_delay") {
6057 cmd_getval(cmdmap
, "utime", delay
, (int64_t)0);
6060 int r
= service
->cct
->_conf
.set_val("osd_recovery_delay_start",
6063 ss
<< "set_recovery_delay: error setting "
6064 << "osd_recovery_delay_start to '" << delay
<< "': error "
6068 service
->cct
->_conf
.apply_changes(nullptr);
6069 ss
<< "set_recovery_delay: set osd_recovery_delay_start "
6070 << "to " << service
->cct
->_conf
->osd_recovery_delay_start
;
// injectfull simulates a (near)full/backfillfull/failsafe state for `count` ops.
6073 if (command
== "injectfull") {
6076 OSDService::s_names state
;
6077 cmd_getval(cmdmap
, "type", type
, string("full"));
6078 cmd_getval(cmdmap
, "count", count
, (int64_t)-1);
6079 if (type
== "none" || count
== 0) {
6083 state
= service
->get_full_state(type
);
6084 if (state
== OSDService::s_names::INVALID
) {
6085 ss
<< "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
6088 service
->set_injectfull(state
, count
);
// Fallthrough: unrecognized command name.
6091 ss
<< "Internal error - command=" << command
;
6094 // =========================================
// Messenger callback: a connection we initiated was (re)established. For a
// monitor connection this resends state that lives only in the mon session
// (boot message, pg_temp, ready-to-merge, beacon, full-map requests).
// NOTE(review): garbled by extraction; brace-only lines are missing.
6096 void OSD::ms_handle_connect(Connection
*con
)
6098 dout(10) << __func__
<< " con " << con
<< dendl
;
6099 if (con
->get_peer_type() == CEPH_ENTITY_TYPE_MON
) {
6100 std::lock_guard
l(osd_lock
);
6103 dout(10) << __func__
<< " on mon" << dendl
;
// A new mon session: re-send boot while booting, else refresh reports.
6107 } else if (is_booting()) {
6108 _send_boot(); // resend boot message
6110 map_lock
.lock_shared();
6111 std::lock_guard
l2(mon_report_lock
);
6113 utime_t now
= ceph_clock_now();
6114 last_mon_report
= now
;
6116 // resend everything, it's a new session
6119 service
.requeue_pg_temp();
6120 service
.clear_sent_ready_to_merge();
6121 service
.send_pg_temp();
6122 service
.send_ready_to_merge();
6123 service
.send_pg_created();
6127 map_lock
.unlock_shared();
6129 send_beacon(ceph::coarse_mono_clock::now());
6133 // full map requests may happen while active or pre-boot
6134 if (requested_full_first
) {
6135 rerequest_full_maps();
// Fast-dispatch callback for outgoing connections: attaches a new Session to
// any non-mon/non-mgr connection that lacks one. Outgoing connections can
// only be to other OSDs (we never connect to clients), hence the assert.
// NOTE(review): garbled by extraction; brace-only lines are missing.
6140 void OSD::ms_handle_fast_connect(Connection
*con
)
6142 if (con
->get_peer_type() != CEPH_ENTITY_TYPE_MON
&&
6143 con
->get_peer_type() != CEPH_ENTITY_TYPE_MGR
) {
// Only create a Session if the connection does not already carry one.
6144 if (auto s
= ceph::ref_cast
<Session
>(con
->get_priv()); !s
) {
6145 s
= ceph::make_ref
<Session
>(cct
, con
);
6147 dout(10) << " new session (outgoing) " << s
<< " con=" << s
->con
6148 << " addr=" << s
->con
->get_peer_addr() << dendl
;
6149 // we don't connect to clients
6150 ceph_assert(con
->get_peer_type() == CEPH_ENTITY_TYPE_OSD
);
6151 s
->entity_name
.set_type(CEPH_ENTITY_TYPE_OSD
);
// Fast-dispatch callback for incoming connections: mirror of
// ms_handle_fast_connect for the accept side. May race with connect on the
// same peer, hence the log note. Peer must be another OSD.
// NOTE(review): garbled by extraction; brace-only lines are missing.
6156 void OSD::ms_handle_fast_accept(Connection
*con
)
6158 if (con
->get_peer_type() != CEPH_ENTITY_TYPE_MON
&&
6159 con
->get_peer_type() != CEPH_ENTITY_TYPE_MGR
) {
6160 if (auto s
= ceph::ref_cast
<Session
>(con
->get_priv()); !s
) {
6161 s
= ceph::make_ref
<Session
>(cct
, con
)
6163 dout(10) << "new session (incoming)" << s
<< " con=" << con
6164 << " addr=" << con
->get_peer_addr()
6165 << " must have raced with connect" << dendl
;
6166 ceph_assert(con
->get_peer_type() == CEPH_ENTITY_TYPE_OSD
);
6167 s
->entity_name
.set_type(CEPH_ENTITY_TYPE_OSD
);
// Messenger callback: the peer reset the connection. Tears down the
// Session<->Connection linkage (breaking the ref cycle first) and then runs
// the session reset cleanup (backoffs etc.).
// NOTE(review): garbled by extraction; brace/return-only lines are missing.
6172 bool OSD::ms_handle_reset(Connection
*con
)
6174 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
6175 dout(2) << "ms_handle_reset con " << con
<< " session " << session
.get() << dendl
;
6178 session
->wstate
.reset(con
);
6179 session
->con
->set_priv(nullptr);
6180 session
->con
.reset(); // break con <-> session ref cycle
6181 // note that we break session->con *before* the session_handle_reset
6182 // cleanup below. this avoids a race between us and
6183 // PG::add_backoff, Session::check_backoff, etc.
6184 session_handle_reset(session
);
// Messenger callback: the peer actively refused the connection. When
// osd_fast_fail_on_connection_refused is enabled and the peer is an up OSD,
// immediately report it failed to the monitor instead of waiting for the
// heartbeat grace period to expire.
// NOTE(review): garbled by extraction; brace/return-only lines are missing.
6188 bool OSD::ms_handle_refused(Connection
*con
)
6190 if (!cct
->_conf
->osd_fast_fail_on_connection_refused
)
6193 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
6194 dout(2) << "ms_handle_refused con " << con
<< " session " << session
.get() << dendl
;
6197 int type
= con
->get_peer_type();
6198 // handle only OSD failures here
6199 if (monc
&& (type
== CEPH_ENTITY_TYPE_OSD
)) {
6200 OSDMapRef osdmap
= get_osdmap();
// Match the refused address against any of the peer OSD's channels.
6202 int id
= osdmap
->identify_osd_on_all_channels(con
->get_peer_addr());
6203 if (id
>= 0 && osdmap
->is_up(id
)) {
6204 // I'm cheating mon heartbeat grace logic, because we know it's not going
6205 // to respawn alone. +1 so we won't hit any boundary case.
6206 monc
->send_mon_message(
6210 osdmap
->get_addrs(id
),
6211 cct
->_conf
->osd_heartbeat_grace
+ 1,
6212 osdmap
->get_epoch(),
6213 MOSDFailure::FLAG_IMMEDIATE
| MOSDFailure::FLAG_FAILED
// Completion context for MonClient::get_version("osdmap"): carries the
// oldest/newest osdmap epochs the mon reported back into
// OSD::_got_mon_epochs() when the request finishes.
// NOTE(review): garbled by extraction; brace-only lines are missing.
6221 struct C_OSD_GetVersion
: public Context
{
6223 uint64_t oldest
, newest
;
6224 explicit C_OSD_GetVersion(OSD
*o
) : osd(o
), oldest(0), newest(0) {}
6225 void finish(int r
) override
{
6227 osd
->_got_mon_epochs(oldest
, newest
);
// Begin the boot sequence: if the internal heartbeat says we are unhealthy,
// defer and wait; otherwise enter PREBOOT and ask the monitor which osdmap
// epochs it has (the C_OSD_GetVersion completion continues the boot).
// NOTE(review): garbled by extraction; brace/return-only lines are missing.
6231 void OSD::start_boot()
6233 if (!_is_healthy()) {
6234 // if we are not healthy, do not mark ourselves up (yet)
6235 dout(1) << "not healthy; waiting to boot" << dendl
;
6236 if (!is_waiting_for_healthy())
6237 start_waiting_for_healthy();
6238 // send pings sooner rather than later
6242 dout(1) << __func__
<< dendl
;
6243 set_state(STATE_PREBOOT
);
6244 dout(10) << "start_boot - have maps " << superblock
.oldest_map
6245 << ".." << superblock
.newest_map
<< dendl
;
// Query the mon for its oldest/newest osdmap epochs; _got_mon_epochs()
// runs when the reply arrives.
6246 C_OSD_GetVersion
*c
= new C_OSD_GetVersion(this);
6247 monc
->get_version("osdmap", &c
->newest
, &c
->oldest
, c
);
// Completion of the get_version("osdmap") query issued by start_boot():
// takes osd_lock and continues the boot sequence in _preboot().
6250 void OSD::_got_mon_epochs(epoch_t oldest
, epoch_t newest
)
6252 std::lock_guard
l(osd_lock
);
6254 _preboot(oldest
, newest
);
// PREBOOT decision point: given the monitor's oldest/newest osdmap epochs,
// decide whether we can send MOSDBoot yet. Blocks on: no initial map,
// destroyed/noup flags, missing SORTBITWISE, pre-luminous require_osd_release,
// stale fullness state, or an incomplete purged-snaps record (octopus+).
// If our map is recent enough, drain peering work and boot; otherwise
// subscribe for the maps we are missing.
// NOTE(review): garbled by extraction -- several statements and all
// brace/return-only lines are missing, so each branch's exact body cannot be
// confirmed from this view.
6258 void OSD::_preboot(epoch_t oldest
, epoch_t newest
)
6260 ceph_assert(is_preboot());
6261 dout(10) << __func__
<< " _preboot mon has osdmaps "
6262 << oldest
<< ".." << newest
<< dendl
;
6264 // ensure our local fullness awareness is accurate
6266 std::lock_guard
l(heartbeat_lock
);
6270 const auto& monmap
= monc
->monmap
;
6271 const auto osdmap
= get_osdmap();
6272 // if our map within recent history, try to add ourselves to the osdmap.
6273 if (osdmap
->get_epoch() == 0) {
6274 derr
<< "waiting for initial osdmap" << dendl
;
6275 } else if (osdmap
->is_destroyed(whoami
)) {
6276 derr
<< "osdmap says I am destroyed" << dendl
;
6277 // provide a small margin so we don't livelock seeing if we
6278 // un-destroyed ourselves.
6279 if (osdmap
->get_epoch() > newest
- 1) {
6282 } else if (osdmap
->is_noup(whoami
)) {
6283 derr
<< "osdmap NOUP flag is set, waiting for it to clear" << dendl
;
6284 } else if (!osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
6285 derr
<< "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
6287 } else if (osdmap
->require_osd_release
< ceph_release_t::luminous
) {
6288 derr
<< "osdmap require_osd_release < luminous; please upgrade to luminous"
6290 } else if (service
.need_fullness_update()) {
6291 derr
<< "osdmap fullness state needs update" << dendl
;
// Octopus+ mons track purged snaps; fetch any range we have not recorded.
6293 } else if (monmap
.min_mon_release
>= ceph_release_t::octopus
&&
6294 superblock
.purged_snaps_last
< superblock
.current_epoch
) {
6295 dout(10) << __func__
<< " purged_snaps_last " << superblock
.purged_snaps_last
6296 << " < newest_map " << superblock
.current_epoch
<< dendl
;
6297 _get_purged_snaps();
6298 } else if (osdmap
->get_epoch() >= oldest
- 1 &&
6299 osdmap
->get_epoch() + cct
->_conf
->osd_map_message_max
> newest
) {
6301 // wait for pgs to fully catch up in a different thread, since
6302 // this thread might be required for splitting and merging PGs to
6304 boot_finisher
.queue(
6307 std::unique_lock
l(osd_lock
);
6309 dout(10) << __func__
<< " waiting for peering work to drain"
6312 for (auto shard
: shards
) {
6313 shard
->wait_min_pg_epoch(get_osdmap_epoch());
6324 // get all the latest maps
6325 if (osdmap
->get_epoch() + 1 >= oldest
)
6326 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
6328 osdmap_subscribe(oldest
- 1, true);
// Ask the monitor for the purged-snaps ranges between the epoch we last
// recorded (superblock.purged_snaps_last) and our current epoch; the reply is
// handled by handle_get_purged_snaps_reply().
6331 void OSD::_get_purged_snaps()
6333 // NOTE: this is a naive, stateless implementation. it may send multiple
6334 // overlapping requests to the mon, which will be somewhat inefficient, but
6335 // it should be reliable.
6336 dout(10) << __func__
<< " purged_snaps_last " << superblock
.purged_snaps_last
6337 << ", newest_map " << superblock
.current_epoch
<< dendl
;
6338 MMonGetPurgedSnaps
*m
= new MMonGetPurgedSnaps(
6339 superblock
.purged_snaps_last
+ 1,
6340 superblock
.current_epoch
+ 1);
6341 monc
->send_mon_message(m
);
// Handle the mon's reply to _get_purged_snaps(): persist the returned purged
// snap ranges via SnapMapper, advance superblock.purged_snaps_last, and keep
// requesting until we have caught up to the current epoch (then boot
// continues via _preboot on a later pass).
// NOTE(review): garbled by extraction; brace/return-only lines and the
// stale-reply early-out body are missing from this view.
6344 void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply
*m
)
6346 dout(10) << __func__
<< " " << *m
<< dendl
;
6347 ObjectStore::Transaction t
;
// Ignore replies that arrive after preboot or that are older than what we
// have already recorded.
6348 if (!is_preboot() ||
6349 m
->last
< superblock
.purged_snaps_last
) {
6352 SnapMapper::record_purged_snaps(cct
, store
, service
.meta_ch
,
6353 make_purged_snaps_oid(), &t
,
6355 superblock
.purged_snaps_last
= m
->last
;
6356 write_superblock(t
);
6357 store
->queue_transaction(
6360 service
.publish_superblock(superblock
);
// Still behind? Request the next range.
6361 if (m
->last
< superblock
.current_epoch
) {
6362 _get_purged_snaps();
// If our fullness state changed since the last report, tell the monitor the
// strongest currently-applicable state (full > backfillfull > nearfull) via
// an MOSDFull message.
// NOTE(review): garbled by extraction; brace/return-only lines are missing.
6370 void OSD::send_full_update()
6372 if (!service
.need_fullness_update())
6375 if (service
.is_full()) {
6376 state
= CEPH_OSD_FULL
;
6377 } else if (service
.is_backfillfull()) {
6378 state
= CEPH_OSD_BACKFILLFULL
;
6379 } else if (service
.is_nearfull()) {
6380 state
= CEPH_OSD_NEARFULL
;
6383 OSDMap::calc_state_set(state
, s
);
6384 dout(10) << __func__
<< " want state " << s
<< dendl
;
6385 monc
->send_mon_message(new MOSDFull(get_osdmap_epoch(), state
));
// Enter WAITING_FOR_HEALTHY: reset the heartbeat resample timestamp and
// subscribe to osdmap updates so we notice if peers really are dead.
6388 void OSD::start_waiting_for_healthy()
6390 dout(1) << "start_waiting_for_healthy" << dendl
;
6391 set_state(STATE_WAITING_FOR_HEALTHY
);
6392 last_heartbeat_resample
= utime_t();
6394 // subscribe to osdmap updates, in case our peers really are known to be dead
6395 osdmap_subscribe(get_osdmap_epoch() + 1, false);
// Health gate used by start_boot(): false if the internal heartbeat map is
// unhealthy; while waiting-for-healthy, additionally require that at least
// osd_heartbeat_min_healthy_ratio of our heartbeat peers respond.
// NOTE(review): garbled by extraction; brace/return-only lines and some loop
// statements are missing from this view.
6398 bool OSD::_is_healthy()
6400 if (!cct
->get_heartbeat_map()->is_healthy()) {
6401 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl
;
6405 if (is_waiting_for_healthy()) {
6406 utime_t now
= ceph_clock_now();
// With no markdown history there is nothing to be cautious about.
6407 if (osd_markdown_log
.empty()) {
6408 dout(5) << __func__
<< " force returning true since last markdown"
6409 << " was " << cct
->_conf
->osd_max_markdown_period
6410 << "s ago" << dendl
;
// Count healthy heartbeat peers under heartbeat_lock.
6413 std::lock_guard
l(heartbeat_lock
);
6414 int num
= 0, up
= 0;
6415 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
6416 p
!= heartbeat_peers
.end();
6418 if (p
->second
.is_healthy(now
))
6422 if ((float)up
< (float)num
* cct
->_conf
->osd_heartbeat_min_healthy_ratio
) {
6423 dout(1) << "is_healthy false -- only " << up
<< "/" << num
<< " up peers (less than "
6424 << int(cct
->_conf
->osd_heartbeat_min_healthy_ratio
* 100.0) << "%)" << dendl
;
// Compose and send MOSDBoot to the monitor. First resolves any still-unknown
// messenger addresses (cluster from client, hb_back from cluster, hb_front
// from client), makes sure each loopback connection has a Session, pins NUMA
// affinity, collects metadata, then transitions to STATE_BOOTING.
// NOTE(review): garbled by extraction; brace-only lines and some trailing
// arguments are missing from this view.
6432 void OSD::_send_boot()
6434 dout(10) << "_send_boot" << dendl
;
6435 Connection
*local_connection
=
6436 cluster_messenger
->get_loopback_connection().get();
6437 entity_addrvec_t client_addrs
= client_messenger
->get_myaddrs();
6438 entity_addrvec_t cluster_addrs
= cluster_messenger
->get_myaddrs();
6439 entity_addrvec_t hb_back_addrs
= hb_back_server_messenger
->get_myaddrs();
6440 entity_addrvec_t hb_front_addrs
= hb_front_server_messenger
->get_myaddrs();
6442 dout(20) << " initial client_addrs " << client_addrs
6443 << ", cluster_addrs " << cluster_addrs
6444 << ", hb_back_addrs " << hb_back_addrs
6445 << ", hb_front_addrs " << hb_front_addrs
// Fill in unknown cluster addrs from the client-facing addrs.
6447 if (cluster_messenger
->set_addr_unknowns(client_addrs
)) {
6448 dout(10) << " assuming cluster_addrs match client_addrs "
6449 << client_addrs
<< dendl
;
6450 cluster_addrs
= cluster_messenger
->get_myaddrs();
// Ensure the loopback connection carries a Session before use.
6452 if (auto session
= local_connection
->get_priv(); !session
) {
6453 cluster_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6456 local_connection
= hb_back_server_messenger
->get_loopback_connection().get();
6457 if (hb_back_server_messenger
->set_addr_unknowns(cluster_addrs
)) {
6458 dout(10) << " assuming hb_back_addrs match cluster_addrs "
6459 << cluster_addrs
<< dendl
;
6460 hb_back_addrs
= hb_back_server_messenger
->get_myaddrs();
6462 if (auto session
= local_connection
->get_priv(); !session
) {
6463 hb_back_server_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6466 local_connection
= hb_front_server_messenger
->get_loopback_connection().get();
6467 if (hb_front_server_messenger
->set_addr_unknowns(client_addrs
)) {
6468 dout(10) << " assuming hb_front_addrs match client_addrs "
6469 << client_addrs
<< dendl
;
6470 hb_front_addrs
= hb_front_server_messenger
->get_myaddrs();
6472 if (auto session
= local_connection
->get_priv(); !session
) {
6473 hb_front_server_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6476 // we now know what our front and back addrs will be, and we are
6477 // about to tell the mon what our metadata (including numa bindings)
6478 // are, so now is a good time!
6479 set_numa_affinity();
6481 MOSDBoot
*mboot
= new MOSDBoot(
6482 superblock
, get_osdmap_epoch(), service
.get_boot_epoch(),
6483 hb_back_addrs
, hb_front_addrs
, cluster_addrs
,
6485 dout(10) << " final client_addrs " << client_addrs
6486 << ", cluster_addrs " << cluster_addrs
6487 << ", hb_back_addrs " << hb_back_addrs
6488 << ", hb_front_addrs " << hb_front_addrs
6490 _collect_metadata(&mboot
->metadata
);
6491 monc
->send_mon_message(mboot
);
6492 set_state(STATE_BOOTING
);
// Fill *pm with the key/value metadata reported in MOSDBoot: data/journal
// paths, all four messenger addresses, objectstore type and rotational
// flags, osdspec affinity, network interface and NUMA topology info, and
// per-device metadata from the store.
// NOTE(review): garbled by extraction; brace-only lines, some declarations
// (e.g. the NUMA node variables) and error branches are missing from this
// view.
6495 void OSD::_collect_metadata(map
<string
,string
> *pm
)
6498 (*pm
)["osd_data"] = dev_path
;
6499 if (store
->get_type() == "filestore") {
6500 // not applicable for bluestore
6501 (*pm
)["osd_journal"] = journal_path
;
6503 (*pm
)["front_addr"] = stringify(client_messenger
->get_myaddrs());
6504 (*pm
)["back_addr"] = stringify(cluster_messenger
->get_myaddrs());
6505 (*pm
)["hb_front_addr"] = stringify(hb_front_server_messenger
->get_myaddrs());
6506 (*pm
)["hb_back_addr"] = stringify(hb_back_server_messenger
->get_myaddrs());
6509 (*pm
)["osd_objectstore"] = store
->get_type();
6510 (*pm
)["rotational"] = store_is_rotational
? "1" : "0";
6511 (*pm
)["journal_rotational"] = journal_is_rotational
? "1" : "0";
6512 (*pm
)["default_device_class"] = store
->get_default_device_class();
// osdspec_affinity is best-effort; an error or empty value reports "".
6513 string osdspec_affinity
;
6514 int r
= store
->read_meta("osdspec_affinity", &osdspec_affinity
);
6515 if (r
< 0 || osdspec_affinity
.empty()) {
6516 osdspec_affinity
= "";
6518 (*pm
)["osdspec_affinity"] = osdspec_affinity
;
6519 store
->collect_metadata(pm
);
6521 collect_sys_info(pm
, cct
);
// Resolve which NICs carry the front/back traffic.
6523 (*pm
)["front_iface"] = pick_iface(
6525 client_messenger
->get_myaddrs().front().get_sockaddr_storage());
6526 (*pm
)["back_iface"] = pick_iface(
6528 cluster_messenger
->get_myaddrs().front().get_sockaddr_storage());
// Map each interface to its NUMA node; collect the ones we cannot resolve.
6534 set
<string
> unknown
;
6535 for (auto nm
: { "front_iface", "back_iface" }) {
6536 if (!(*pm
)[nm
].size()) {
6541 int r
= get_iface_numa_node((*pm
)[nm
], &n
);
6543 unknown
.insert((*pm
)[nm
]);
6551 if (unknown
.size()) {
6552 (*pm
)["network_numa_unknown_ifaces"] = stringify(unknown
);
6554 if (!nodes
.empty()) {
6555 (*pm
)["network_numa_nodes"] = stringify(nodes
);
// Only report a single network NUMA node when it is unambiguous.
6557 if (node
>= 0 && nodes
.size() == 1 && unknown
.empty()) {
6558 (*pm
)["network_numa_node"] = stringify(node
);
6562 if (numa_node
>= 0) {
6563 (*pm
)["numa_node"] = stringify(numa_node
);
6564 (*pm
)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size
,
// Per-device metadata (model, serial, ...) from the underlying store devices.
6568 set
<string
> devnames
;
6569 store
->get_devices(&devnames
);
6570 map
<string
,string
> errs
;
6571 get_device_metadata(devnames
, pm
, &errs
);
6572 for (auto& i
: errs
) {
6573 dout(1) << __func__
<< " " << i
.first
<< ": " << i
.second
<< dendl
;
6575 dout(10) << __func__
<< " " << *pm
<< dendl
;
// Record that we want the monitor to advance our up_thru to at least `want`;
// the actual MOSDAlive is sent later by send_alive(). Takes map_lock (shared)
// then mon_report_lock, matching the ordering used elsewhere.
// NOTE(review): garbled by extraction; brace-only lines and the send_alive()
// call site are missing from this view.
6578 void OSD::queue_want_up_thru(epoch_t want
)
6580 std::shared_lock map_locker
{map_lock
};
6581 epoch_t cur
= get_osdmap()->get_up_thru(whoami
);
6582 std::lock_guard
report_locker(mon_report_lock
);
// Only raise the wanted epoch; older requests are already covered.
6583 if (want
> up_thru_wanted
) {
6584 dout(10) << "queue_want_up_thru now " << want
<< " (was " << up_thru_wanted
<< ")"
6585 << ", currently " << cur
6587 up_thru_wanted
= want
;
6590 dout(10) << "queue_want_up_thru want " << want
<< " <= queued " << up_thru_wanted
6591 << ", currently " << cur
// If the queued up_thru_wanted exceeds what the current osdmap records for
// us, send MOSDAlive so the monitor bumps our up_thru. Caller must hold
// mon_report_lock (asserted).
// NOTE(review): garbled by extraction; brace/return-only lines are missing.
6596 void OSD::send_alive()
6598 ceph_assert(ceph_mutex_is_locked(mon_report_lock
));
6599 const auto osdmap
= get_osdmap();
6600 if (!osdmap
->exists(whoami
))
6602 epoch_t up_thru
= osdmap
->get_up_thru(whoami
);
6603 dout(10) << "send_alive up_thru currently " << up_thru
<< " want " << up_thru_wanted
<< dendl
;
6604 if (up_thru_wanted
> up_thru
) {
6605 dout(10) << "send_alive want " << up_thru_wanted
<< dendl
;
6606 monc
->send_mon_message(new MOSDAlive(osdmap
->get_epoch(), up_thru_wanted
));
// Request full (non-incremental) osdmaps [first..last] from the monitor,
// coalescing with any outstanding request window tracked in
// requested_full_first/last. Caller must hold osd_lock (asserted).
// NOTE(review): garbled by extraction; brace/return-only lines are missing.
6610 void OSD::request_full_map(epoch_t first
, epoch_t last
)
6612 dout(10) << __func__
<< " " << first
<< ".." << last
6613 << ", previously requested "
6614 << requested_full_first
<< ".." << requested_full_last
<< dendl
;
6615 ceph_assert(ceph_mutex_is_locked(osd_lock
));
6616 ceph_assert(first
> 0 && last
> 0);
6617 ceph_assert(first
<= last
);
6618 ceph_assert(first
>= requested_full_first
); // we shouldn't ever ask for older maps
// No request outstanding: start a fresh window.
6619 if (requested_full_first
== 0) {
6621 requested_full_first
= first
;
6622 requested_full_last
= last
;
// Entirely inside the outstanding window: nothing new to ask for.
6623 } else if (last
<= requested_full_last
) {
6627 // additional request
6628 first
= requested_full_last
+ 1;
6629 requested_full_last
= last
;
6631 MMonGetOSDMap
*req
= new MMonGetOSDMap
;
6632 req
->request_full(first
, last
);
6633 monc
->send_mon_message(req
);
// Note receipt of full map epoch e against the outstanding
// requested_full_first..requested_full_last window: ignore stale epochs,
// clear the window when it is fully satisfied, otherwise advance its start.
// NOTE(review): garbled by extraction; brace/return-only lines are missing.
6636 void OSD::got_full_map(epoch_t e
)
6638 ceph_assert(requested_full_first
<= requested_full_last
);
6639 ceph_assert(ceph_mutex_is_locked(osd_lock
));
6640 if (requested_full_first
== 0) {
6641 dout(20) << __func__
<< " " << e
<< ", nothing requested" << dendl
;
6644 if (e
< requested_full_first
) {
6645 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6646 << ".." << requested_full_last
6647 << ", ignoring" << dendl
;
6650 if (e
>= requested_full_last
) {
6651 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6652 << ".." << requested_full_last
<< ", resetting" << dendl
;
6653 requested_full_first
= requested_full_last
= 0;
// Partially satisfied: the next epoch we still need is e + 1.
6657 requested_full_first
= e
+ 1;
6659 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6660 << ".." << requested_full_last
6661 << ", still need more" << dendl
;
// Move every in-flight failure report (failure_pending) back onto
// failure_queue so it is re-sent to the (new) monitor session; used after a
// mon reconnect.
// NOTE(review): garbled by extraction; brace-only lines are missing.
6664 void OSD::requeue_failures()
6666 std::lock_guard
l(heartbeat_lock
);
6667 unsigned old_queue
= failure_queue
.size();
6668 unsigned old_pending
= failure_pending
.size();
// failure_pending maps osd -> (failed_since, addrs); requeue the timestamp.
6669 for (auto p
= failure_pending
.begin(); p
!= failure_pending
.end(); ) {
6670 failure_queue
[p
->first
] = p
->second
.first
;
6671 failure_pending
.erase(p
++);
6673 dout(10) << __func__
<< " " << old_queue
<< " + " << old_pending
<< " -> "
6674 << failure_queue
.size() << dendl
;
// Drain failure_queue: send an MOSDFailure to the monitor for each peer OSD
// we believe is down (unless a report is already pending), recording it in
// failure_pending so it can be cancelled or requeued later. Caller must hold
// map_lock and mon_report_lock (asserted); heartbeat_lock is taken here.
// NOTE(review): garbled by extraction; brace-only lines and some MOSDFailure
// constructor arguments are missing from this view.
6677 void OSD::send_failures()
6679 ceph_assert(ceph_mutex_is_locked(map_lock
));
6680 ceph_assert(ceph_mutex_is_locked(mon_report_lock
));
6681 std::lock_guard
l(heartbeat_lock
);
6682 utime_t now
= ceph_clock_now();
6683 const auto osdmap
= get_osdmap();
6684 while (!failure_queue
.empty()) {
6685 int osd
= failure_queue
.begin()->first
;
6686 if (!failure_pending
.count(osd
)) {
// failed_for = seconds since we first observed the failure.
6687 int failed_for
= (int)(double)(now
- failure_queue
.begin()->second
);
6688 monc
->send_mon_message(
6692 osdmap
->get_addrs(osd
),
6694 osdmap
->get_epoch()));
6695 failure_pending
[osd
] = make_pair(failure_queue
.begin()->second
,
6696 osdmap
->get_addrs(osd
));
6698 failure_queue
.erase(osd
);
// Retract an earlier failure report: send MOSDFailure with FLAG_ALIVE for
// `osd` at `addrs`, telling the monitor the peer is still alive.
6702 void OSD::send_still_alive(epoch_t epoch
, int osd
, const entity_addrvec_t
&addrs
)
6704 MOSDFailure
*m
= new MOSDFailure(monc
->get_fsid(), osd
, addrs
, 0, epoch
,
6705 MOSDFailure::FLAG_ALIVE
);
6706 monc
->send_mon_message(m
);
// Cancel every in-flight failure report by sending a FLAG_ALIVE retraction
// (via send_still_alive) for each entry in failure_pending, then clearing it.
6709 void OSD::cancel_pending_failures()
6711 std::lock_guard
l(heartbeat_lock
);
6712 auto it
= failure_pending
.begin();
6713 while (it
!= failure_pending
.end()) {
6714 dout(10) << __func__
<< " canceling in-flight failure report for osd."
6715 << it
->first
<< dendl
;
// it->second.second holds the addrs we reported the failure against.
6716 send_still_alive(get_osdmap_epoch(), it
->first
, it
->second
.second
);
6717 failure_pending
.erase(it
++);
// Send an MOSDBeacon (epoch, min_last_epoch_clean, last purged-snaps scrub
// stamp, clean-pg list) to the monitor, provided the monmap is initialized
// and the mons all support LUMINOUS. Updates last_sent_beacon under
// min_last_epoch_clean_lock.
// NOTE(review): garbled by extraction; brace-only lines are missing.
6721 void OSD::send_beacon(const ceph::coarse_mono_clock::time_point
& now
)
6723 const auto& monmap
= monc
->monmap
;
6724 // send beacon to mon even if we are just connected, and the monmap is not
6725 // initialized yet by then.
6726 if (monmap
.epoch
> 0 &&
6727 monmap
.get_required_features().contains_all(
6728 ceph::features::mon::FEATURE_LUMINOUS
)) {
6729 dout(20) << __func__
<< " sending" << dendl
;
6730 MOSDBeacon
* beacon
= nullptr;
6732 std::lock_guard l
{min_last_epoch_clean_lock
};
6733 beacon
= new MOSDBeacon(get_osdmap_epoch(),
6734 min_last_epoch_clean
,
6735 superblock
.last_purged_snaps_scrub
);
6736 beacon
->pgs
= min_last_epoch_clean_pgs
;
6737 last_sent_beacon
= now
;
6739 monc
->send_mon_message(beacon
);
6741 dout(20) << __func__
<< " not sending" << dendl
;
// Handle a tell/MCommand: reject connections without a session or without
// allow-all caps (-EACCES reply), otherwise hand the command to the admin
// socket's tell-command queue.
// NOTE(review): garbled by extraction; brace/return-only lines are missing,
// including the no-session guard around the first EACCES reply.
6745 void OSD::handle_command(MCommand
*m
)
6747 ConnectionRef con
= m
->get_connection();
6748 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
6750 con
->send_message(new MCommandReply(m
, -EACCES
));
// Only clients with allow-all caps may run tell commands on the OSD.
6754 if (!session
->caps
.allow_all()) {
6755 con
->send_message(new MCommandReply(m
, -EACCES
));
6759 cct
->get_admin_socket()->queue_tell_command(m
);
// RAII helper that inverts lock_guard: releases a held ceph::mutex for the
// guard's scope (non-copyable).
// NOTE(review): garbled by extraction -- the constructor/destructor bodies
// that presumably perform the unlock/relock are missing from this view.
6764 class unlock_guard
{
6767 explicit unlock_guard(ceph::mutex
& mutex
)
6772 unlock_guard(unlock_guard
&) = delete;
// Scrub the SnapMapper's purged-snaps records: run SnapMapper::Scrubber,
// log the result to the cluster log, and for every stray (pool, snap, hash,
// shard) found, requeue a snap retrim on the owning PG (deduplicating by
// (pg, snap)). Finally persist last_purged_snaps_scrub in the superblock and
// send a beacon carrying the new stamp. Caller must hold osd_lock (asserted).
// NOTE(review): garbled by extraction; brace/continue-only lines and the
// scrubber run/unlock calls are missing from this view.
6779 void OSD::scrub_purged_snaps()
6781 dout(10) << __func__
<< dendl
;
6782 ceph_assert(ceph_mutex_is_locked(osd_lock
));
6783 SnapMapper::Scrubber
s(cct
, store
, service
.meta_ch
,
6784 make_snapmapper_oid(),
6785 make_purged_snaps_oid());
6786 clog
->debug() << "purged_snaps scrub starts";
6789 if (s
.stray
.size()) {
6790 clog
->debug() << "purged_snaps scrub found " << s
.stray
.size() << " strays";
6792 clog
->debug() << "purged_snaps scrub ok";
// Deduplicate retrim requests per (pg, snap).
6794 set
<pair
<spg_t
,snapid_t
>> queued
;
6795 for (auto& [pool
, snap
, hash
, shard
] : s
.stray
) {
6796 const pg_pool_t
*pi
= get_osdmap()->get_pg_pool(pool
);
6798 dout(20) << __func__
<< " pool " << pool
<< " dne" << dendl
;
6801 pg_t
pgid(pi
->raw_hash_to_pg(hash
), pool
);
6802 spg_t
spgid(pgid
, shard
);
6803 pair
<spg_t
,snapid_t
> p(spgid
, snap
);
6804 if (queued
.count(p
)) {
6805 dout(20) << __func__
<< " pg " << spgid
<< " snap " << snap
6806 << " already queued" << dendl
;
6809 PGRef pg
= lookup_lock_pg(spgid
);
6811 dout(20) << __func__
<< " pg " << spgid
<< " not found" << dendl
;
6815 dout(10) << __func__
<< " requeue pg " << spgid
<< " " << pg
<< " snap "
6817 pg
->queue_snap_retrim(snap
);
6821 if (is_stopping()) {
// Record the completed scrub time so _preboot/beacons see it.
6824 dout(10) << __func__
<< " done queueing pgs, updating superblock" << dendl
;
6825 ObjectStore::Transaction t
;
6826 superblock
.last_purged_snaps_scrub
= ceph_clock_now();
6827 write_superblock(t
);
6828 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
6829 ceph_assert(tr
== 0);
6831 send_beacon(ceph::coarse_mono_clock::now());
6833 dout(10) << __func__
<< " done" << dendl
;
// Collect SMART metrics for the store's physical devices (all of them, or
// just `only_devid` when non-empty) and write the result to `ss` as
// pretty-printed JSON keyed by device id. Logical dm-* devices and devices
// without a unique id are skipped.
// NOTE(review): garbled by extraction; brace/continue-only lines are missing.
6836 void OSD::probe_smart(const string
& only_devid
, ostream
& ss
)
6838 set
<string
> devnames
;
6839 store
->get_devices(&devnames
);
6840 uint64_t smart_timeout
= cct
->_conf
.get_val
<uint64_t>(
6841 "osd_smart_report_timeout");
6843 // == typedef std::map<std::string, mValue> mObject;
6844 json_spirit::mObject json_map
;
6846 for (auto dev
: devnames
) {
6847 // smartctl works only on physical devices; filter out any logical device
6848 if (dev
.find("dm-") == 0) {
6853 string devid
= get_device_id(dev
, &err
);
6854 if (devid
.size() == 0) {
6855 dout(10) << __func__
<< " no unique id for dev " << dev
<< " ("
6856 << err
<< "), skipping" << dendl
;
6859 if (only_devid
.size() && devid
!= only_devid
) {
6863 json_spirit::mValue smart_json
;
6864 if (block_device_get_metrics(dev
, smart_timeout
,
6866 dout(10) << "block_device_get_metrics failed for /dev/" << dev
<< dendl
;
6869 json_map
[devid
] = smart_json
;
6871 json_spirit::write(json_map
, ss
, json_spirit::pretty_print
);
// Dispatch entry for the heartbeat messengers: routes MOSDPing to
// handle_osd_ping and drops anything else.
// NOTE(review): garbled by extraction; case labels and brace/return-only
// lines are missing from this view.
6874 bool OSD::heartbeat_dispatch(Message
*m
)
6876 dout(30) << "heartbeat_dispatch " << m
<< dendl
;
6877 switch (m
->get_type()) {
6880 dout(10) << "ping from " << m
->get_source_inst() << dendl
;
6885 handle_osd_ping(static_cast<MOSDPing
*>(m
));
6889 dout(0) << "dropping unexpected message " << *m
<< " from " << m
->get_source_inst() << dendl
;
// Slow-path dispatch: acknowledges MSG_OSD_MARK_ME_DOWN stop-acks and bails
// when the OSD is stopping.
// NOTE(review): garbled by extraction -- the remainder of this function
// (after the is_stopping() check) is missing from this view.
6896 bool OSD::ms_dispatch(Message
*m
)
6898 dout(20) << "OSD::ms_dispatch: " << *m
<< dendl
;
6899 if (m
->get_type() == MSG_OSD_MARK_ME_DOWN
) {
6900 service
.got_stop_ack();
6908 if (is_stopping()) {
// Opportunistically share our osdmap with a peer: track (per session, under
// sent_epoch_lock) the highest epoch we believe the peer has, raise it to the
// caller-supplied lower bound, and send incremental maps if our map is newer.
// The lock is dropped around send_incremental_map and re-checked afterwards
// because another thread may have shared a newer epoch concurrently.
// NOTE(review): garbled by extraction; brace/return-only lines (including the
// null-session guard) are missing from this view.
6922 void OSDService::maybe_share_map(
6924 const OSDMapRef
& osdmap
,
6925 epoch_t peer_epoch_lb
)
6927 // NOTE: we assume caller hold something that keeps the Connection itself
6928 // pinned (e.g., an OpRequest's MessageRef).
6929 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
6934 // assume the peer has the newer of the op's sent_epoch and what
6935 // we think we sent them.
6936 session
->sent_epoch_lock
.lock();
6937 if (peer_epoch_lb
> session
->last_sent_epoch
) {
6938 dout(10) << __func__
<< " con " << con
6939 << " " << con
->get_peer_addr()
6940 << " map epoch " << session
->last_sent_epoch
6941 << " -> " << peer_epoch_lb
<< " (as per caller)" << dendl
;
6942 session
->last_sent_epoch
= peer_epoch_lb
;
6944 epoch_t last_sent_epoch
= session
->last_sent_epoch
;
6945 session
->sent_epoch_lock
.unlock();
// Peer already has a map at least as new as ours: nothing to share.
6947 if (osdmap
->get_epoch() <= last_sent_epoch
) {
6951 send_incremental_map(last_sent_epoch
, con
, osdmap
);
6952 last_sent_epoch
= osdmap
->get_epoch();
// Re-take the lock: only advance, never regress, the recorded epoch.
6954 session
->sent_epoch_lock
.lock();
6955 if (session
->last_sent_epoch
< last_sent_epoch
) {
6956 dout(10) << __func__
<< " con " << con
6957 << " " << con
->get_peer_addr()
6958 << " map epoch " << session
->last_sent_epoch
6959 << " -> " << last_sent_epoch
<< " (shared)" << dendl
;
6960 session
->last_sent_epoch
= last_sent_epoch
;
6962 session
->sent_epoch_lock
.unlock();
// Re-dispatch the ops a session queued while waiting for a newer osdmap:
// ops whose min_epoch is now covered are mapped to their spg_t (resolving
// legacy MOSDOp pg ids through the current map) and enqueued; the session is
// (de)registered on the waiting-on-map list depending on whether anything
// remains. Caller must hold session->session_dispatch_lock (asserted).
// NOTE(review): garbled by extraction; brace/break-only lines are missing.
6965 void OSD::dispatch_session_waiting(const ceph::ref_t
<Session
>& session
, OSDMapRef osdmap
)
6967 ceph_assert(ceph_mutex_is_locked(session
->session_dispatch_lock
));
6969 auto i
= session
->waiting_on_map
.begin();
6970 while (i
!= session
->waiting_on_map
.end()) {
6971 OpRequestRef op
= &(*i
);
6972 ceph_assert(ms_can_fast_dispatch(op
->get_req()));
6973 auto m
= op
->get_req
<MOSDFastDispatchOp
>();
// Still too new for our map: stop; remaining ops keep waiting.
6974 if (m
->get_min_epoch() > osdmap
->get_epoch()) {
6977 session
->waiting_on_map
.erase(i
++);
// Legacy MOSDOp carries a raw pg; resolve it to the primary shard.
6981 if (m
->get_type() == CEPH_MSG_OSD_OP
) {
6982 pg_t actual_pgid
= osdmap
->raw_pg_to_pg(
6983 static_cast<const MOSDOp
*>(m
)->get_pg());
6984 if (!osdmap
->get_primary_shard(actual_pgid
, &pgid
)) {
6988 pgid
= m
->get_spg();
6990 enqueue_op(pgid
, std::move(op
), m
->get_map_epoch());
6993 if (session
->waiting_on_map
.empty()) {
6994 clear_session_waiting_on_map(session
);
6996 register_session_waiting_on_map(session
);
7000 void OSD::ms_fast_dispatch(Message
*m
)
7003 if (service
.is_stopping()) {
7009 switch (m
->get_type()) {
7011 dout(10) << "ping from " << m
->get_source() << dendl
;
7014 case MSG_OSD_FORCE_RECOVERY
:
7015 handle_fast_force_recovery(static_cast<MOSDForceRecovery
*>(m
));
7017 case MSG_OSD_SCRUB2
:
7018 handle_fast_scrub(static_cast<MOSDScrub2
*>(m
));
7021 case MSG_OSD_PG_CREATE2
:
7022 return handle_fast_pg_create(static_cast<MOSDPGCreate2
*>(m
));
7023 case MSG_OSD_PG_QUERY
:
7024 return handle_fast_pg_query(static_cast<MOSDPGQuery
*>(m
));
7025 case MSG_OSD_PG_NOTIFY
:
7026 return handle_fast_pg_notify(static_cast<MOSDPGNotify
*>(m
));
7027 case MSG_OSD_PG_INFO
:
7028 return handle_fast_pg_info(static_cast<MOSDPGInfo
*>(m
));
7029 case MSG_OSD_PG_REMOVE
:
7030 return handle_fast_pg_remove(static_cast<MOSDPGRemove
*>(m
));
7032 // these are single-pg messages that handle themselves
7033 case MSG_OSD_PG_LOG
:
7034 case MSG_OSD_PG_TRIM
:
7035 case MSG_OSD_PG_NOTIFY2
:
7036 case MSG_OSD_PG_QUERY2
:
7037 case MSG_OSD_PG_INFO2
:
7038 case MSG_OSD_BACKFILL_RESERVE
:
7039 case MSG_OSD_RECOVERY_RESERVE
:
7040 case MSG_OSD_PG_LEASE
:
7041 case MSG_OSD_PG_LEASE_ACK
:
7043 MOSDPeeringOp
*pm
= static_cast<MOSDPeeringOp
*>(m
);
7044 if (require_osd_peer(pm
)) {
7045 enqueue_peering_evt(
7047 PGPeeringEventRef(pm
->get_event()));
7054 OpRequestRef op
= op_tracker
.create_request
<OpRequest
, Message
*>(m
);
7057 osd_reqid_t reqid
= op
->get_reqid();
7059 tracepoint(osd
, ms_fast_dispatch
, reqid
.name
._type
,
7060 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
7064 op
->osd_trace
.init("osd op", &trace_endpoint
, &m
->trace
);
7066 // note sender epoch, min req's epoch
7067 op
->sent_epoch
= static_cast<MOSDFastDispatchOp
*>(m
)->get_map_epoch();
7068 op
->min_epoch
= static_cast<MOSDFastDispatchOp
*>(m
)->get_min_epoch();
7069 ceph_assert(op
->min_epoch
<= op
->sent_epoch
); // sanity check!
7071 service
.maybe_inject_dispatch_delay();
7073 if (m
->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT
) ||
7074 m
->get_type() != CEPH_MSG_OSD_OP
) {
7075 // queue it directly
7077 static_cast<MOSDFastDispatchOp
*>(m
)->get_spg(),
7079 static_cast<MOSDFastDispatchOp
*>(m
)->get_map_epoch());
7081 // legacy client, and this is an MOSDOp (the *only* fast dispatch
7082 // message that didn't have an explicit spg_t); we need to map
7083 // them to an spg_t while preserving delivery order.
7084 auto priv
= m
->get_connection()->get_priv();
7085 if (auto session
= static_cast<Session
*>(priv
.get()); session
) {
7086 std::lock_guard l
{session
->session_dispatch_lock
};
7088 session
->waiting_on_map
.push_back(*op
);
7089 OSDMapRef nextmap
= service
.get_nextmap_reserved();
7090 dispatch_session_waiting(session
, nextmap
);
7091 service
.release_map(nextmap
);
7094 OID_EVENT_TRACE_WITH_MSG(m
, "MS_FAST_DISPATCH_END", false);
7097 int OSD::ms_handle_authentication(Connection
*con
)
7100 auto s
= ceph::ref_cast
<Session
>(con
->get_priv());
7102 s
= ceph::make_ref
<Session
>(cct
, con
);
7104 s
->entity_name
= con
->get_peer_entity_name();
7105 dout(10) << __func__
<< " new session " << s
<< " con " << s
->con
7106 << " entity " << s
->entity_name
7107 << " addr " << con
->get_peer_addrs() << dendl
;
7109 dout(10) << __func__
<< " existing session " << s
<< " con " << s
->con
7110 << " entity " << s
->entity_name
7111 << " addr " << con
->get_peer_addrs() << dendl
;
7114 AuthCapsInfo
&caps_info
= con
->get_peer_caps_info();
7115 if (caps_info
.allow_all
) {
7116 s
->caps
.set_allow_all();
7117 } else if (caps_info
.caps
.length() > 0) {
7118 bufferlist::const_iterator p
= caps_info
.caps
.cbegin();
7123 catch (buffer::error
& e
) {
7124 dout(10) << __func__
<< " session " << s
<< " " << s
->entity_name
7125 << " failed to decode caps string" << dendl
;
7129 bool success
= s
->caps
.parse(str
);
7131 dout(10) << __func__
<< " session " << s
7132 << " " << s
->entity_name
7133 << " has caps " << s
->caps
<< " '" << str
<< "'" << dendl
;
7136 dout(10) << __func__
<< " session " << s
<< " " << s
->entity_name
7137 << " failed to parse caps '" << str
<< "'" << dendl
;
7145 void OSD::do_waiters()
7147 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7149 dout(10) << "do_waiters -- start" << dendl
;
7150 while (!finished
.empty()) {
7151 OpRequestRef next
= finished
.front();
7152 finished
.pop_front();
7155 dout(10) << "do_waiters -- finish" << dendl
;
7158 void OSD::dispatch_op(OpRequestRef op
)
7160 switch (op
->get_req()->get_type()) {
7162 case MSG_OSD_PG_CREATE
:
7163 handle_pg_create(op
);
7168 void OSD::_dispatch(Message
*m
)
7170 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7171 dout(20) << "_dispatch " << m
<< " " << *m
<< dendl
;
7173 switch (m
->get_type()) {
7174 // -- don't need OSDMap --
7176 // map and replication
7177 case CEPH_MSG_OSD_MAP
:
7178 handle_osd_map(static_cast<MOSDMap
*>(m
));
7180 case MSG_MON_GET_PURGED_SNAPS_REPLY
:
7181 handle_get_purged_snaps_reply(static_cast<MMonGetPurgedSnapsReply
*>(m
));
7186 handle_scrub(static_cast<MOSDScrub
*>(m
));
7190 handle_command(static_cast<MCommand
*>(m
));
7193 // -- need OSDMap --
7195 case MSG_OSD_PG_CREATE
:
7197 OpRequestRef op
= op_tracker
.create_request
<OpRequest
, Message
*>(m
);
7199 op
->osd_trace
.init("osd op", &trace_endpoint
, &m
->trace
);
7200 // no map? starting up?
7201 if (!get_osdmap()) {
7202 dout(7) << "no OSDMap, not booted" << dendl
;
7203 logger
->inc(l_osd_waiting_for_map
);
7204 waiting_for_osdmap
.push_back(op
);
7205 op
->mark_delayed("no osdmap");
7215 // remove me post-nautilus
7216 void OSD::handle_scrub(MOSDScrub
*m
)
7218 dout(10) << "handle_scrub " << *m
<< dendl
;
7219 if (!require_mon_or_mgr_peer(m
)) {
7223 if (m
->fsid
!= monc
->get_fsid()) {
7224 dout(0) << "handle_scrub fsid " << m
->fsid
<< " != " << monc
->get_fsid()
7233 if (!m
->scrub_pgs
.empty()) {
7235 for (auto pgid
: m
->scrub_pgs
) {
7237 if (get_osdmap()->get_primary_shard(pgid
, &pcand
) &&
7238 std::find(spgs
.begin(), spgs
.end(), pcand
) != spgs
.end()) {
7245 for (auto pgid
: spgs
) {
7246 enqueue_peering_evt(
7249 std::make_shared
<PGPeeringEvent
>(
7252 PeeringState::RequestScrub(m
->deep
, m
->repair
))));
7258 void OSD::handle_fast_scrub(MOSDScrub2
*m
)
7260 dout(10) << __func__
<< " " << *m
<< dendl
;
7261 if (!require_mon_or_mgr_peer(m
)) {
7265 if (m
->fsid
!= monc
->get_fsid()) {
7266 dout(0) << __func__
<< " fsid " << m
->fsid
<< " != " << monc
->get_fsid()
7271 for (auto pgid
: m
->scrub_pgs
) {
7272 enqueue_peering_evt(
7275 std::make_shared
<PGPeeringEvent
>(
7278 PeeringState::RequestScrub(m
->deep
, m
->repair
))));
7283 bool OSD::scrub_random_backoff()
7285 bool coin_flip
= (rand() / (double)RAND_MAX
>=
7286 cct
->_conf
->osd_scrub_backoff_ratio
);
7288 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl
;
7294 OSDService::ScrubJob::ScrubJob(CephContext
* cct
,
7295 const spg_t
& pg
, const utime_t
& timestamp
,
7296 double pool_scrub_min_interval
,
7297 double pool_scrub_max_interval
, bool must
)
7300 sched_time(timestamp
),
7303 // if not explicitly requested, postpone the scrub with a random delay
7305 double scrub_min_interval
= pool_scrub_min_interval
> 0 ?
7306 pool_scrub_min_interval
: cct
->_conf
->osd_scrub_min_interval
;
7307 double scrub_max_interval
= pool_scrub_max_interval
> 0 ?
7308 pool_scrub_max_interval
: cct
->_conf
->osd_scrub_max_interval
;
7310 sched_time
+= scrub_min_interval
;
7311 double r
= rand() / (double)RAND_MAX
;
7313 scrub_min_interval
* cct
->_conf
->osd_scrub_interval_randomize_ratio
* r
;
7314 if (scrub_max_interval
== 0) {
7315 deadline
= utime_t();
7317 deadline
+= scrub_max_interval
;
7323 bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob
& rhs
) const {
7324 if (sched_time
< rhs
.sched_time
)
7326 if (sched_time
> rhs
.sched_time
)
7328 return pgid
< rhs
.pgid
;
7331 double OSD::scrub_sleep_time(bool must_scrub
)
7334 return cct
->_conf
->osd_scrub_sleep
;
7336 utime_t now
= ceph_clock_now();
7337 if (scrub_time_permit(now
)) {
7338 return cct
->_conf
->osd_scrub_sleep
;
7340 double normal_sleep
= cct
->_conf
->osd_scrub_sleep
;
7341 double extended_sleep
= cct
->_conf
->osd_scrub_extended_sleep
;
7342 return std::max(extended_sleep
, normal_sleep
);
7345 bool OSD::scrub_time_permit(utime_t now
)
7348 time_t tt
= now
.sec();
7349 localtime_r(&tt
, &bdt
);
7351 bool day_permit
= false;
7352 if (cct
->_conf
->osd_scrub_begin_week_day
< cct
->_conf
->osd_scrub_end_week_day
) {
7353 if (bdt
.tm_wday
>= cct
->_conf
->osd_scrub_begin_week_day
&& bdt
.tm_wday
< cct
->_conf
->osd_scrub_end_week_day
) {
7357 if (bdt
.tm_wday
>= cct
->_conf
->osd_scrub_begin_week_day
|| bdt
.tm_wday
< cct
->_conf
->osd_scrub_end_week_day
) {
7363 dout(20) << __func__
<< " should run between week day " << cct
->_conf
->osd_scrub_begin_week_day
7364 << " - " << cct
->_conf
->osd_scrub_end_week_day
7365 << " now " << bdt
.tm_wday
<< " = no" << dendl
;
7369 bool time_permit
= false;
7370 if (cct
->_conf
->osd_scrub_begin_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7371 if (bdt
.tm_hour
>= cct
->_conf
->osd_scrub_begin_hour
&& bdt
.tm_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7375 if (bdt
.tm_hour
>= cct
->_conf
->osd_scrub_begin_hour
|| bdt
.tm_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7380 dout(20) << __func__
<< " should run between " << cct
->_conf
->osd_scrub_begin_hour
7381 << " - " << cct
->_conf
->osd_scrub_end_hour
7382 << " now " << bdt
.tm_hour
<< " = no" << dendl
;
7384 dout(20) << __func__
<< " should run between " << cct
->_conf
->osd_scrub_begin_hour
7385 << " - " << cct
->_conf
->osd_scrub_end_hour
7386 << " now " << bdt
.tm_hour
<< " = yes" << dendl
;
7391 bool OSD::scrub_load_below_threshold()
7394 if (getloadavg(loadavgs
, 3) != 3) {
7395 dout(10) << __func__
<< " couldn't read loadavgs\n" << dendl
;
7399 // allow scrub if below configured threshold
7400 long cpus
= sysconf(_SC_NPROCESSORS_ONLN
);
7401 double loadavg_per_cpu
= cpus
> 0 ? loadavgs
[0] / cpus
: loadavgs
[0];
7402 if (loadavg_per_cpu
< cct
->_conf
->osd_scrub_load_threshold
) {
7403 dout(20) << __func__
<< " loadavg per cpu " << loadavg_per_cpu
7404 << " < max " << cct
->_conf
->osd_scrub_load_threshold
7405 << " = yes" << dendl
;
7409 // allow scrub if below daily avg and currently decreasing
7410 if (loadavgs
[0] < daily_loadavg
&& loadavgs
[0] < loadavgs
[2]) {
7411 dout(20) << __func__
<< " loadavg " << loadavgs
[0]
7412 << " < daily_loadavg " << daily_loadavg
7413 << " and < 15m avg " << loadavgs
[2]
7414 << " = yes" << dendl
;
7418 dout(20) << __func__
<< " loadavg " << loadavgs
[0]
7419 << " >= max " << cct
->_conf
->osd_scrub_load_threshold
7420 << " and ( >= daily_loadavg " << daily_loadavg
7421 << " or >= 15m avg " << loadavgs
[2]
7422 << ") = no" << dendl
;
7426 void OSD::sched_scrub()
7428 // if not permitted, fail fast
7429 if (!service
.can_inc_scrubs()) {
7432 bool allow_requested_repair_only
= false;
7433 if (service
.is_recovery_active() && !cct
->_conf
->osd_scrub_during_recovery
) {
7434 if (!cct
->_conf
->osd_repair_during_recovery
) {
7435 dout(20) << __func__
<< " not scheduling scrubs due to active recovery" << dendl
;
7438 dout(10) << __func__
7439 << " will only schedule explicitly requested repair due to active recovery"
7441 allow_requested_repair_only
= true;
7444 utime_t now
= ceph_clock_now();
7445 bool time_permit
= scrub_time_permit(now
);
7446 bool load_is_low
= scrub_load_below_threshold();
7447 dout(20) << "sched_scrub load_is_low=" << (int)load_is_low
<< dendl
;
7449 OSDService::ScrubJob scrub
;
7450 if (service
.first_scrub_stamp(&scrub
)) {
7452 dout(30) << "sched_scrub examine " << scrub
.pgid
<< " at " << scrub
.sched_time
<< dendl
;
7454 if (scrub
.sched_time
> now
) {
7455 // save ourselves some effort
7456 dout(10) << "sched_scrub " << scrub
.pgid
<< " scheduled at " << scrub
.sched_time
7457 << " > " << now
<< dendl
;
7461 if ((scrub
.deadline
.is_zero() || scrub
.deadline
>= now
) && !(time_permit
&& load_is_low
)) {
7462 dout(10) << __func__
<< " not scheduling scrub for " << scrub
.pgid
<< " due to "
7463 << (!time_permit
? "time not permit" : "high load") << dendl
;
7467 PGRef pg
= _lookup_lock_pg(scrub
.pgid
);
7470 // This has already started, so go on to the next scrub job
7471 if (pg
->scrubber
.active
) {
7473 dout(30) << __func__
<< ": already in progress pgid " << scrub
.pgid
<< dendl
;
7476 // Skip other kinds of scrubing if only explicitly requested repairing is allowed
7477 if (allow_requested_repair_only
&& !pg
->scrubber
.must_repair
) {
7479 dout(10) << __func__
<< " skip " << scrub
.pgid
7480 << " because repairing is not explicitly requested on it"
7484 // If it is reserving, let it resolve before going to the next scrub job
7485 if (pg
->scrubber
.local_reserved
&& !pg
->scrubber
.active
) {
7487 dout(30) << __func__
<< ": reserve in progress pgid " << scrub
.pgid
<< dendl
;
7490 dout(10) << "sched_scrub scrubbing " << scrub
.pgid
<< " at " << scrub
.sched_time
7491 << (pg
->get_must_scrub() ? ", explicitly requested" :
7492 (load_is_low
? ", load_is_low" : " deadline < now"))
7494 if (pg
->sched_scrub()) {
7499 } while (service
.next_scrub_stamp(scrub
, &scrub
));
7501 dout(20) << "sched_scrub done" << dendl
;
7504 void OSD::resched_all_scrubs()
7506 dout(10) << __func__
<< ": start" << dendl
;
7507 OSDService::ScrubJob scrub
;
7508 if (service
.first_scrub_stamp(&scrub
)) {
7510 dout(20) << __func__
<< ": examine " << scrub
.pgid
<< dendl
;
7512 PGRef pg
= _lookup_lock_pg(scrub
.pgid
);
7515 if (!pg
->scrubber
.must_scrub
&& !pg
->scrubber
.need_auto
) {
7516 dout(20) << __func__
<< ": reschedule " << scrub
.pgid
<< dendl
;
7517 pg
->on_info_history_change();
7520 } while (service
.next_scrub_stamp(scrub
, &scrub
));
7522 dout(10) << __func__
<< ": done" << dendl
;
7525 MPGStats
* OSD::collect_pg_stats()
7527 // This implementation unconditionally sends every is_primary PG's
7528 // stats every time we're called. This has equivalent cost to the
7529 // previous implementation's worst case where all PGs are busy and
7530 // their stats are always enqueued for sending.
7531 std::shared_lock l
{map_lock
};
7533 osd_stat_t cur_stat
= service
.get_osd_stat();
7534 cur_stat
.os_perf_stat
= store
->get_cur_stats();
7536 auto m
= new MPGStats(monc
->get_fsid(), get_osdmap_epoch());
7537 m
->osd_stat
= cur_stat
;
7539 std::lock_guard lec
{min_last_epoch_clean_lock
};
7540 min_last_epoch_clean
= get_osdmap_epoch();
7541 min_last_epoch_clean_pgs
.clear();
7543 std::set
<int64_t> pool_set
;
7546 for (auto& pg
: pgs
) {
7547 auto pool
= pg
->pg_id
.pgid
.pool();
7548 pool_set
.emplace((int64_t)pool
);
7549 if (!pg
->is_primary()) {
7552 pg
->get_pg_stats([&](const pg_stat_t
& s
, epoch_t lec
) {
7553 m
->pg_stat
[pg
->pg_id
.pgid
] = s
;
7554 min_last_epoch_clean
= min(min_last_epoch_clean
, lec
);
7555 min_last_epoch_clean_pgs
.push_back(pg
->pg_id
.pgid
);
7559 bool per_pool_stats
= false;
7560 bool per_pool_omap_stats
= false;
7561 for (auto p
: pool_set
) {
7562 int r
= store
->pool_statfs(p
, &st
, &per_pool_omap_stats
);
7563 if (r
== -ENOTSUP
) {
7567 m
->pool_stat
[p
] = st
;
7568 per_pool_stats
= true;
7572 // indicate whether we are reporting per-pool stats
7573 m
->osd_stat
.num_osds
= 1;
7574 m
->osd_stat
.num_per_pool_osds
= per_pool_stats
? 1 : 0;
7575 m
->osd_stat
.num_per_pool_omap_osds
= per_pool_omap_stats
? 1 : 0;
7580 vector
<DaemonHealthMetric
> OSD::get_health_metrics()
7582 vector
<DaemonHealthMetric
> metrics
;
7584 utime_t oldest_secs
;
7585 const utime_t now
= ceph_clock_now();
7587 too_old
-= cct
->_conf
.get_val
<double>("osd_op_complaint_time");
7589 TrackedOpRef oldest_op
;
7590 auto count_slow_ops
= [&](TrackedOp
& op
) {
7591 if (op
.get_initiated() < too_old
) {
7593 ss
<< "slow request " << op
.get_desc()
7595 << op
.get_initiated()
7597 << op
.state_string();
7598 lgeneric_subdout(cct
,osd
,20) << ss
.str() << dendl
;
7599 clog
->warn() << ss
.str();
7601 if (!oldest_op
|| op
.get_initiated() < oldest_op
->get_initiated()) {
7609 if (op_tracker
.visit_ops_in_flight(&oldest_secs
, count_slow_ops
)) {
7611 derr
<< __func__
<< " reporting " << slow
<< " slow ops, oldest is "
7612 << oldest_op
->get_desc() << dendl
;
7614 metrics
.emplace_back(daemon_metric::SLOW_OPS
, slow
, oldest_secs
);
7616 // no news is not good news.
7617 metrics
.emplace_back(daemon_metric::SLOW_OPS
, 0, 0);
7621 std::lock_guard
l(pending_creates_lock
);
7622 auto n_primaries
= pending_creates_from_mon
;
7623 for (const auto& create
: pending_creates_from_osd
) {
7624 if (create
.second
) {
7628 metrics
.emplace_back(daemon_metric::PENDING_CREATING_PGS
, n_primaries
);
7633 // =====================================================
7636 void OSD::wait_for_new_map(OpRequestRef op
)
7639 if (waiting_for_osdmap
.empty()) {
7640 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7643 logger
->inc(l_osd_waiting_for_map
);
7644 waiting_for_osdmap
.push_back(op
);
7645 op
->mark_delayed("wait for new map");
7650 * assimilate new OSDMap(s). scan pgs, etc.
7653 void OSD::note_down_osd(int peer
)
7655 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7656 cluster_messenger
->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer
));
7658 std::lock_guard l
{heartbeat_lock
};
7659 failure_queue
.erase(peer
);
7660 failure_pending
.erase(peer
);
7661 map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.find(peer
);
7662 if (p
!= heartbeat_peers
.end()) {
7663 p
->second
.clear_mark_down();
7664 heartbeat_peers
.erase(p
);
7668 void OSD::note_up_osd(int peer
)
7670 heartbeat_set_peers_need_update();
7673 struct C_OnMapCommit
: public Context
{
7675 epoch_t first
, last
;
7677 C_OnMapCommit(OSD
*o
, epoch_t f
, epoch_t l
, MOSDMap
*m
)
7678 : osd(o
), first(f
), last(l
), msg(m
) {}
7679 void finish(int r
) override
{
7680 osd
->_committed_osd_maps(first
, last
, msg
);
7685 void OSD::osdmap_subscribe(version_t epoch
, bool force_request
)
7687 std::lock_guard
l(osdmap_subscribe_lock
);
7688 if (latest_subscribed_epoch
>= epoch
&& !force_request
)
7691 latest_subscribed_epoch
= std::max
<uint64_t>(epoch
, latest_subscribed_epoch
);
7693 if (monc
->sub_want_increment("osdmap", epoch
, CEPH_SUBSCRIBE_ONETIME
) ||
7699 void OSD::trim_maps(epoch_t oldest
, int nreceived
, bool skip_maps
)
7701 epoch_t min
= std::min(oldest
, service
.map_cache
.cached_key_lower_bound());
7702 if (min
<= superblock
.oldest_map
)
7706 ObjectStore::Transaction t
;
7707 for (epoch_t e
= superblock
.oldest_map
; e
< min
; ++e
) {
7708 dout(20) << " removing old osdmap epoch " << e
<< dendl
;
7709 t
.remove(coll_t::meta(), get_osdmap_pobject_name(e
));
7710 t
.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e
));
7711 superblock
.oldest_map
= e
+ 1;
7713 if (num
>= cct
->_conf
->osd_target_transaction_size
&& num
>= nreceived
) {
7714 service
.publish_superblock(superblock
);
7715 write_superblock(t
);
7716 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
7717 ceph_assert(tr
== 0);
7720 // skip_maps leaves us with a range of old maps if we fail to remove all
7721 // of them before moving superblock.oldest_map forward to the first map
7722 // in the incoming MOSDMap msg. so we should continue removing them in
7723 // this case, even we could do huge series of delete transactions all at
7730 service
.publish_superblock(superblock
);
7731 write_superblock(t
);
7732 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
7733 ceph_assert(tr
== 0);
7735 // we should not remove the cached maps
7736 ceph_assert(min
<= service
.map_cache
.cached_key_lower_bound());
7739 void OSD::handle_osd_map(MOSDMap
*m
)
7741 // wait for pgs to catch up
7743 // we extend the map cache pins to accomodate pgs slow to consume maps
7744 // for some period, until we hit the max_lag_factor bound, at which point
7745 // we block here to stop injesting more maps than they are able to keep
7747 epoch_t max_lag
= cct
->_conf
->osd_map_cache_size
*
7748 m_osd_pg_epoch_max_lag_factor
;
7749 ceph_assert(max_lag
> 0);
7750 epoch_t osd_min
= 0;
7751 for (auto shard
: shards
) {
7752 epoch_t min
= shard
->get_min_pg_epoch();
7753 if (osd_min
== 0 || min
< osd_min
) {
7757 epoch_t osdmap_epoch
= get_osdmap_epoch();
7759 osdmap_epoch
> max_lag
&&
7760 osdmap_epoch
- max_lag
> osd_min
) {
7761 epoch_t need
= osdmap_epoch
- max_lag
;
7762 dout(10) << __func__
<< " waiting for pgs to catch up (need " << need
7763 << " max_lag " << max_lag
<< ")" << dendl
;
7764 for (auto shard
: shards
) {
7765 epoch_t min
= shard
->get_min_pg_epoch();
7767 dout(10) << __func__
<< " waiting for pgs to consume " << need
7768 << " (shard " << shard
->shard_id
<< " min " << min
7769 << ", map cache is " << cct
->_conf
->osd_map_cache_size
7770 << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
7772 unlock_guard unlock
{osd_lock
};
7773 shard
->wait_min_pg_epoch(need
);
7779 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7780 map
<epoch_t
,OSDMapRef
> added_maps
;
7781 map
<epoch_t
,bufferlist
> added_maps_bl
;
7782 if (m
->fsid
!= monc
->get_fsid()) {
7783 dout(0) << "handle_osd_map fsid " << m
->fsid
<< " != "
7784 << monc
->get_fsid() << dendl
;
7788 if (is_initializing()) {
7789 dout(0) << "ignoring osdmap until we have initialized" << dendl
;
7794 auto session
= ceph::ref_cast
<Session
>(m
->get_connection()->get_priv());
7795 if (session
&& !(session
->entity_name
.is_mon() ||
7796 session
->entity_name
.is_osd())) {
7798 dout(10) << "got osd map from Session " << session
7799 << " which we can't take maps from (not a mon or osd)" << dendl
;
7804 // share with the objecter
7806 service
.objecter
->handle_osd_map(m
);
7808 epoch_t first
= m
->get_first();
7809 epoch_t last
= m
->get_last();
7810 dout(3) << "handle_osd_map epochs [" << first
<< "," << last
<< "], i have "
7811 << superblock
.newest_map
7812 << ", src has [" << m
->oldest_map
<< "," << m
->newest_map
<< "]"
7815 logger
->inc(l_osd_map
);
7816 logger
->inc(l_osd_mape
, last
- first
+ 1);
7817 if (first
<= superblock
.newest_map
)
7818 logger
->inc(l_osd_mape_dup
, superblock
.newest_map
- first
+ 1);
7819 if (service
.max_oldest_map
< m
->oldest_map
) {
7820 service
.max_oldest_map
= m
->oldest_map
;
7821 ceph_assert(service
.max_oldest_map
>= superblock
.oldest_map
);
7824 // make sure there is something new, here, before we bother flushing
7825 // the queues and such
7826 if (last
<= superblock
.newest_map
) {
7827 dout(10) << " no new maps here, dropping" << dendl
;
7833 bool skip_maps
= false;
7834 if (first
> superblock
.newest_map
+ 1) {
7835 dout(10) << "handle_osd_map message skips epochs "
7836 << superblock
.newest_map
+ 1 << ".." << (first
-1) << dendl
;
7837 if (m
->oldest_map
<= superblock
.newest_map
+ 1) {
7838 osdmap_subscribe(superblock
.newest_map
+ 1, false);
7842 // always try to get the full range of maps--as many as we can. this
7843 // 1- is good to have
7844 // 2- is at present the only way to ensure that we get a *full* map as
7846 if (m
->oldest_map
< first
) {
7847 osdmap_subscribe(m
->oldest_map
- 1, true);
7854 ObjectStore::Transaction t
;
7855 uint64_t txn_size
= 0;
7857 map
<epoch_t
,mempool::osdmap::map
<int64_t,snap_interval_set_t
>> purged_snaps
;
7859 // store new maps: queue for disk and put in the osdmap cache
7860 epoch_t start
= std::max(superblock
.newest_map
+ 1, first
);
7861 for (epoch_t e
= start
; e
<= last
; e
++) {
7862 if (txn_size
>= t
.get_num_bytes()) {
7863 derr
<< __func__
<< " transaction size overflowed" << dendl
;
7864 ceph_assert(txn_size
< t
.get_num_bytes());
7866 txn_size
= t
.get_num_bytes();
7867 map
<epoch_t
,bufferlist
>::iterator p
;
7868 p
= m
->maps
.find(e
);
7869 if (p
!= m
->maps
.end()) {
7870 dout(10) << "handle_osd_map got full map for epoch " << e
<< dendl
;
7871 OSDMap
*o
= new OSDMap
;
7872 bufferlist
& bl
= p
->second
;
7876 purged_snaps
[e
] = o
->get_new_purged_snaps();
7878 ghobject_t fulloid
= get_osdmap_pobject_name(e
);
7879 t
.write(coll_t::meta(), fulloid
, 0, bl
.length(), bl
);
7880 added_maps
[e
] = add_map(o
);
7881 added_maps_bl
[e
] = bl
;
7886 p
= m
->incremental_maps
.find(e
);
7887 if (p
!= m
->incremental_maps
.end()) {
7888 dout(10) << "handle_osd_map got inc map for epoch " << e
<< dendl
;
7889 bufferlist
& bl
= p
->second
;
7890 ghobject_t oid
= get_inc_osdmap_pobject_name(e
);
7891 t
.write(coll_t::meta(), oid
, 0, bl
.length(), bl
);
7893 OSDMap
*o
= new OSDMap
;
7896 bool got
= get_map_bl(e
- 1, obl
);
7898 auto p
= added_maps_bl
.find(e
- 1);
7899 ceph_assert(p
!= added_maps_bl
.end());
7905 OSDMap::Incremental inc
;
7906 auto p
= bl
.cbegin();
7909 if (o
->apply_incremental(inc
) < 0) {
7910 derr
<< "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc
.fsid
<< dendl
;
7911 ceph_abort_msg("bad fsid");
7915 o
->encode(fbl
, inc
.encode_features
| CEPH_FEATURE_RESERVED
);
7917 bool injected_failure
= false;
7918 if (cct
->_conf
->osd_inject_bad_map_crc_probability
> 0 &&
7919 (rand() % 10000) < cct
->_conf
->osd_inject_bad_map_crc_probability
*10000.0) {
7920 derr
<< __func__
<< " injecting map crc failure" << dendl
;
7921 injected_failure
= true;
7924 if ((inc
.have_crc
&& o
->get_crc() != inc
.full_crc
) || injected_failure
) {
7925 dout(2) << "got incremental " << e
7926 << " but failed to encode full with correct crc; requesting"
7928 clog
->warn() << "failed to encode map e" << e
<< " with expected crc";
7929 dout(20) << "my encoded map was:\n";
7930 fbl
.hexdump(*_dout
);
7933 request_full_map(e
, last
);
7936 // don't continue committing if we failed to enc the first inc map
7938 dout(10) << __func__
<< " bailing because last < start (" << last
<< "<" << start
<< ")" << dendl
;
7945 purged_snaps
[e
] = o
->get_new_purged_snaps();
7947 ghobject_t fulloid
= get_osdmap_pobject_name(e
);
7948 t
.write(coll_t::meta(), fulloid
, 0, fbl
.length(), fbl
);
7949 added_maps
[e
] = add_map(o
);
7950 added_maps_bl
[e
] = fbl
;
7954 ceph_abort_msg("MOSDMap lied about what maps it had?");
7957 // even if this map isn't from a mon, we may have satisfied our subscription
7958 monc
->sub_got("osdmap", last
);
7960 if (!m
->maps
.empty() && requested_full_first
) {
7961 dout(10) << __func__
<< " still missing full maps " << requested_full_first
7962 << ".." << requested_full_last
<< dendl
;
7963 rerequest_full_maps();
7966 if (superblock
.oldest_map
) {
7967 // make sure we at least keep pace with incoming maps
7968 trim_maps(m
->oldest_map
, last
- first
+ 1, skip_maps
);
7969 pg_num_history
.prune(superblock
.oldest_map
);
7972 if (!superblock
.oldest_map
|| skip_maps
)
7973 superblock
.oldest_map
= first
;
7974 superblock
.newest_map
= last
;
7975 superblock
.current_epoch
= last
;
7977 // note in the superblock that we were clean thru the prior epoch
7978 epoch_t boot_epoch
= service
.get_boot_epoch();
7979 if (boot_epoch
&& boot_epoch
>= superblock
.mounted
) {
7980 superblock
.mounted
= boot_epoch
;
7981 superblock
.clean_thru
= last
;
7984 // check for pg_num changes and deleted pools
7986 for (auto& i
: added_maps
) {
7988 if (!(lastmap
= service
.try_get_map(i
.first
- 1))) {
7989 dout(10) << __func__
<< " can't get previous map " << i
.first
- 1
7990 << " probably first start of this osd" << dendl
;
7994 ceph_assert(lastmap
->get_epoch() + 1 == i
.second
->get_epoch());
7995 for (auto& j
: lastmap
->get_pools()) {
7996 if (!i
.second
->have_pg_pool(j
.first
)) {
7997 pg_num_history
.log_pool_delete(i
.first
, j
.first
);
7998 dout(10) << __func__
<< " recording final pg_pool_t for pool "
7999 << j
.first
<< dendl
;
8000 // this information is needed by _make_pg() if have to restart before
8001 // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
8002 ghobject_t obj
= make_final_pool_info_oid(j
.first
);
8004 encode(j
.second
, bl
, CEPH_FEATURES_ALL
);
8005 string name
= lastmap
->get_pool_name(j
.first
);
8007 map
<string
,string
> profile
;
8008 if (lastmap
->get_pg_pool(j
.first
)->is_erasure()) {
8009 profile
= lastmap
->get_erasure_code_profile(
8010 lastmap
->get_pg_pool(j
.first
)->erasure_code_profile
);
8012 encode(profile
, bl
);
8013 t
.write(coll_t::meta(), obj
, 0, bl
.length(), bl
);
8014 } else if (unsigned new_pg_num
= i
.second
->get_pg_num(j
.first
);
8015 new_pg_num
!= j
.second
.get_pg_num()) {
8016 dout(10) << __func__
<< " recording pool " << j
.first
<< " pg_num "
8017 << j
.second
.get_pg_num() << " -> " << new_pg_num
<< dendl
;
8018 pg_num_history
.log_pg_num_change(i
.first
, j
.first
, new_pg_num
);
8021 for (auto& j
: i
.second
->get_pools()) {
8022 if (!lastmap
->have_pg_pool(j
.first
)) {
8023 dout(10) << __func__
<< " recording new pool " << j
.first
<< " pg_num "
8024 << j
.second
.get_pg_num() << dendl
;
8025 pg_num_history
.log_pg_num_change(i
.first
, j
.first
,
8026 j
.second
.get_pg_num());
8031 pg_num_history
.epoch
= last
;
8034 ::encode(pg_num_history
, bl
);
8035 t
.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl
.length(), bl
);
8036 dout(20) << __func__
<< " pg_num_history " << pg_num_history
<< dendl
;
8039 // record new purged_snaps
8040 if (superblock
.purged_snaps_last
== start
- 1) {
8041 SnapMapper::record_purged_snaps(cct
, store
, service
.meta_ch
,
8042 make_purged_snaps_oid(), &t
,
8044 superblock
.purged_snaps_last
= last
;
8046 dout(10) << __func__
<< " superblock purged_snaps_last is "
8047 << superblock
.purged_snaps_last
8048 << ", not recording new purged_snaps" << dendl
;
8051 // superblock and commit
8052 write_superblock(t
);
8053 t
.register_on_commit(new C_OnMapCommit(this, start
, last
, m
));
8054 store
->queue_transaction(
8057 service
.publish_superblock(superblock
);
8060 void OSD::_committed_osd_maps(epoch_t first
, epoch_t last
, MOSDMap
*m
)
8062 dout(10) << __func__
<< " " << first
<< ".." << last
<< dendl
;
8063 if (is_stopping()) {
8064 dout(10) << __func__
<< " bailing, we are shutting down" << dendl
;
8067 std::lock_guard
l(osd_lock
);
8068 if (is_stopping()) {
8069 dout(10) << __func__
<< " bailing, we are shutting down" << dendl
;
8074 ceph_assert(first
<= last
);
8076 bool do_shutdown
= false;
8077 bool do_restart
= false;
8078 bool network_error
= false;
8079 OSDMapRef osdmap
= get_osdmap();
8081 // advance through the new maps
8082 for (epoch_t cur
= first
; cur
<= last
; cur
++) {
8083 dout(10) << " advance to epoch " << cur
8084 << " (<= last " << last
8085 << " <= newest_map " << superblock
.newest_map
8088 OSDMapRef newmap
= get_map(cur
);
8089 ceph_assert(newmap
); // we just cached it above!
8091 // start blacklisting messages sent to peers that go down.
8092 service
.pre_publish_map(newmap
);
8094 // kill connections to newly down osds
8095 bool waited_for_reservations
= false;
8097 osdmap
= get_osdmap();
8098 osdmap
->get_all_osds(old
);
8099 for (set
<int>::iterator p
= old
.begin(); p
!= old
.end(); ++p
) {
8101 osdmap
->is_up(*p
) && // in old map
8102 newmap
->is_down(*p
)) { // but not the new one
8103 if (!waited_for_reservations
) {
8104 service
.await_reserved_maps();
8105 waited_for_reservations
= true;
8108 } else if (*p
!= whoami
&&
8109 osdmap
->is_down(*p
) &&
8110 newmap
->is_up(*p
)) {
8115 if (osdmap
->is_noup(whoami
) != newmap
->is_noup(whoami
)) {
8116 dout(10) << __func__
<< " NOUP flag changed in " << newmap
->get_epoch()
8119 // this captures the case where we sent the boot message while
8120 // NOUP was being set on the mon and our boot request was
8121 // dropped, and then later it is cleared. it imperfectly
8122 // handles the case where our original boot message was not
8123 // dropped and we restart even though we might have booted, but
8124 // that is harmless (boot will just take slightly longer).
8129 osdmap
= std::move(newmap
);
8133 service
.retrieve_epochs(&boot_epoch
, &up_epoch
, NULL
);
8135 osdmap
->is_up(whoami
) &&
8136 osdmap
->get_addrs(whoami
) == client_messenger
->get_myaddrs()) {
8137 up_epoch
= osdmap
->get_epoch();
8138 dout(10) << "up_epoch is " << up_epoch
<< dendl
;
8140 boot_epoch
= osdmap
->get_epoch();
8141 dout(10) << "boot_epoch is " << boot_epoch
<< dendl
;
8143 service
.set_epochs(&boot_epoch
, &up_epoch
, NULL
);
8147 epoch_t _bind_epoch
= service
.get_bind_epoch();
8148 if (osdmap
->is_up(whoami
) &&
8149 osdmap
->get_addrs(whoami
).legacy_equals(
8150 client_messenger
->get_myaddrs()) &&
8151 _bind_epoch
< osdmap
->get_up_from(whoami
)) {
8154 dout(1) << "state: booting -> active" << dendl
;
8155 set_state(STATE_ACTIVE
);
8158 // set incarnation so that osd_reqid_t's we generate for our
8159 // objecter requests are unique across restarts.
8160 service
.objecter
->set_client_incarnation(osdmap
->get_epoch());
8161 cancel_pending_failures();
8165 if (osdmap
->get_epoch() > 0 &&
8167 if (!osdmap
->exists(whoami
)) {
8168 derr
<< "map says i do not exist. shutting down." << dendl
;
8169 do_shutdown
= true; // don't call shutdown() while we have
8170 // everything paused
8171 } else if (osdmap
->is_stop(whoami
)) {
8172 derr
<< "map says i am stopped by admin. shutting down." << dendl
;
8174 } else if (!osdmap
->is_up(whoami
) ||
8175 !osdmap
->get_addrs(whoami
).legacy_equals(
8176 client_messenger
->get_myaddrs()) ||
8177 !osdmap
->get_cluster_addrs(whoami
).legacy_equals(
8178 cluster_messenger
->get_myaddrs()) ||
8179 !osdmap
->get_hb_back_addrs(whoami
).legacy_equals(
8180 hb_back_server_messenger
->get_myaddrs()) ||
8181 !osdmap
->get_hb_front_addrs(whoami
).legacy_equals(
8182 hb_front_server_messenger
->get_myaddrs())) {
8183 if (!osdmap
->is_up(whoami
)) {
8184 if (service
.is_preparing_to_stop() || service
.is_stopping()) {
8185 service
.got_stop_ack();
8187 clog
->warn() << "Monitor daemon marked osd." << whoami
<< " down, "
8188 "but it is still running";
8189 clog
->debug() << "map e" << osdmap
->get_epoch()
8190 << " wrongly marked me down at e"
8191 << osdmap
->get_down_at(whoami
);
8193 if (monc
->monmap
.min_mon_release
>= ceph_release_t::octopus
) {
8194 // note that this is best-effort...
8195 monc
->send_mon_message(
8199 osdmap
->get_epoch()));
8201 } else if (!osdmap
->get_addrs(whoami
).legacy_equals(
8202 client_messenger
->get_myaddrs())) {
8203 clog
->error() << "map e" << osdmap
->get_epoch()
8204 << " had wrong client addr (" << osdmap
->get_addrs(whoami
)
8205 << " != my " << client_messenger
->get_myaddrs() << ")";
8206 } else if (!osdmap
->get_cluster_addrs(whoami
).legacy_equals(
8207 cluster_messenger
->get_myaddrs())) {
8208 clog
->error() << "map e" << osdmap
->get_epoch()
8209 << " had wrong cluster addr ("
8210 << osdmap
->get_cluster_addrs(whoami
)
8211 << " != my " << cluster_messenger
->get_myaddrs() << ")";
8212 } else if (!osdmap
->get_hb_back_addrs(whoami
).legacy_equals(
8213 hb_back_server_messenger
->get_myaddrs())) {
8214 clog
->error() << "map e" << osdmap
->get_epoch()
8215 << " had wrong heartbeat back addr ("
8216 << osdmap
->get_hb_back_addrs(whoami
)
8217 << " != my " << hb_back_server_messenger
->get_myaddrs()
8219 } else if (!osdmap
->get_hb_front_addrs(whoami
).legacy_equals(
8220 hb_front_server_messenger
->get_myaddrs())) {
8221 clog
->error() << "map e" << osdmap
->get_epoch()
8222 << " had wrong heartbeat front addr ("
8223 << osdmap
->get_hb_front_addrs(whoami
)
8224 << " != my " << hb_front_server_messenger
->get_myaddrs()
8228 if (!service
.is_stopping()) {
8229 epoch_t up_epoch
= 0;
8230 epoch_t bind_epoch
= osdmap
->get_epoch();
8231 service
.set_epochs(NULL
,&up_epoch
, &bind_epoch
);
8235 utime_t now
= ceph_clock_now();
8236 utime_t grace
= utime_t(cct
->_conf
->osd_max_markdown_period
, 0);
8237 osd_markdown_log
.push_back(now
);
8238 if ((int)osd_markdown_log
.size() > cct
->_conf
->osd_max_markdown_count
) {
8239 derr
<< __func__
<< " marked down "
8240 << osd_markdown_log
.size()
8241 << " > osd_max_markdown_count "
8242 << cct
->_conf
->osd_max_markdown_count
8243 << " in last " << grace
<< " seconds, shutting down"
8249 start_waiting_for_healthy();
8251 set
<int> avoid_ports
;
8252 #if defined(__FreeBSD__)
8253 // prevent FreeBSD from grabbing the client_messenger port during
8254 // rebinding. In which case a cluster_meesneger will connect also
8256 client_messenger
->get_myaddrs().get_ports(&avoid_ports
);
8258 cluster_messenger
->get_myaddrs().get_ports(&avoid_ports
);
8260 int r
= cluster_messenger
->rebind(avoid_ports
);
8262 do_shutdown
= true; // FIXME: do_restart?
8263 network_error
= true;
8264 derr
<< __func__
<< " marked down:"
8265 << " rebind cluster_messenger failed" << dendl
;
8268 hb_back_server_messenger
->mark_down_all();
8269 hb_front_server_messenger
->mark_down_all();
8270 hb_front_client_messenger
->mark_down_all();
8271 hb_back_client_messenger
->mark_down_all();
8273 reset_heartbeat_peers(true);
8280 check_osdmap_features();
8285 if (is_active() || is_waiting_for_healthy())
8286 maybe_update_heartbeat_peers();
8293 if (network_error
) {
8294 cancel_pending_failures();
8296 // trigger shutdown in a different thread
8297 dout(0) << __func__
<< " shutdown OSD via async signal" << dendl
;
8298 queue_async_signal(SIGINT
);
8300 else if (m
->newest_map
&& m
->newest_map
> last
) {
8301 dout(10) << " msg say newest map is " << m
->newest_map
8302 << ", requesting more" << dendl
;
8303 osdmap_subscribe(osdmap
->get_epoch()+1, false);
8305 else if (is_preboot()) {
8306 if (m
->get_source().is_mon())
8307 _preboot(m
->oldest_map
, m
->newest_map
);
8311 else if (do_restart
)
8316 void OSD::check_osdmap_features()
8318 // adjust required feature bits?
8320 // we have to be a bit careful here, because we are accessing the
8321 // Policy structures without taking any lock. in particular, only
8322 // modify integer values that can safely be read by a racing CPU.
8323 // since we are only accessing existing Policy structures a their
8324 // current memory location, and setting or clearing bits in integer
8325 // fields, and we are the only writer, this is not a problem.
8327 const auto osdmap
= get_osdmap();
8329 Messenger::Policy p
= client_messenger
->get_default_policy();
8331 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_CLIENT
, &mask
);
8332 if ((p
.features_required
& mask
) != features
) {
8333 dout(0) << "crush map has features " << features
8334 << ", adjusting msgr requires for clients" << dendl
;
8335 p
.features_required
= (p
.features_required
& ~mask
) | features
;
8336 client_messenger
->set_default_policy(p
);
8340 Messenger::Policy p
= client_messenger
->get_policy(entity_name_t::TYPE_MON
);
8342 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_MON
, &mask
);
8343 if ((p
.features_required
& mask
) != features
) {
8344 dout(0) << "crush map has features " << features
8345 << " was " << p
.features_required
8346 << ", adjusting msgr requires for mons" << dendl
;
8347 p
.features_required
= (p
.features_required
& ~mask
) | features
;
8348 client_messenger
->set_policy(entity_name_t::TYPE_MON
, p
);
8352 Messenger::Policy p
= cluster_messenger
->get_policy(entity_name_t::TYPE_OSD
);
8354 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_OSD
, &mask
);
8356 if ((p
.features_required
& mask
) != features
) {
8357 dout(0) << "crush map has features " << features
8358 << ", adjusting msgr requires for osds" << dendl
;
8359 p
.features_required
= (p
.features_required
& ~mask
) | features
;
8360 cluster_messenger
->set_policy(entity_name_t::TYPE_OSD
, p
);
8363 if (!superblock
.compat_features
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
)) {
8364 dout(0) << __func__
<< " enabling on-disk ERASURE CODES compat feature" << dendl
;
8365 superblock
.compat_features
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
);
8366 ObjectStore::Transaction t
;
8367 write_superblock(t
);
8368 int err
= store
->queue_transaction(service
.meta_ch
, std::move(t
), NULL
);
8369 ceph_assert(err
== 0);
8373 if (osdmap
->require_osd_release
< ceph_release_t::nautilus
) {
8374 hb_front_server_messenger
->set_require_authorizer(false);
8375 hb_back_server_messenger
->set_require_authorizer(false);
8377 hb_front_server_messenger
->set_require_authorizer(true);
8378 hb_back_server_messenger
->set_require_authorizer(true);
8381 if (osdmap
->require_osd_release
!= last_require_osd_release
) {
8382 dout(1) << __func__
<< " require_osd_release " << last_require_osd_release
8383 << " -> " << to_string(osdmap
->require_osd_release
) << dendl
;
8384 store
->write_meta("require_osd_release",
8385 stringify((int)osdmap
->require_osd_release
));
8386 last_require_osd_release
= osdmap
->require_osd_release
;
8390 struct C_FinishSplits
: public Context
{
8393 C_FinishSplits(OSD
*osd
, const set
<PGRef
> &in
)
8394 : osd(osd
), pgs(in
) {}
8395 void finish(int r
) override
{
8396 osd
->_finish_splits(pgs
);
8400 void OSD::_finish_splits(set
<PGRef
>& pgs
)
8402 dout(10) << __func__
<< " " << pgs
<< dendl
;
8405 for (set
<PGRef
>::iterator i
= pgs
.begin();
8410 PeeringCtx rctx
= create_context();
8412 dout(10) << __func__
<< " " << *pg
<< dendl
;
8413 epoch_t e
= pg
->get_osdmap_epoch();
8414 pg
->handle_initialize(rctx
);
8415 pg
->queue_null(e
, e
);
8416 dispatch_context(rctx
, pg
, service
.get_osdmap());
8419 unsigned shard_index
= pg
->pg_id
.hash_to_shard(num_shards
);
8420 shards
[shard_index
]->register_and_wake_split_child(pg
);
8424 bool OSD::add_merge_waiter(OSDMapRef nextmap
, spg_t target
, PGRef src
,
8427 std::lock_guard
l(merge_lock
);
8428 auto& p
= merge_waiters
[nextmap
->get_epoch()][target
];
8429 p
[src
->pg_id
] = src
;
8430 dout(10) << __func__
<< " added merge_waiter " << src
->pg_id
8431 << " for " << target
<< ", have " << p
.size() << "/" << need
8433 return p
.size() == need
;
8436 bool OSD::advance_pg(
8439 ThreadPool::TPHandle
&handle
,
8442 if (osd_epoch
<= pg
->get_osdmap_epoch()) {
8445 ceph_assert(pg
->is_locked());
8446 OSDMapRef lastmap
= pg
->get_osdmap();
8447 ceph_assert(lastmap
->get_epoch() < osd_epoch
);
8448 set
<PGRef
> new_pgs
; // any split children
8451 unsigned old_pg_num
= lastmap
->have_pg_pool(pg
->pg_id
.pool()) ?
8452 lastmap
->get_pg_num(pg
->pg_id
.pool()) : 0;
8453 for (epoch_t next_epoch
= pg
->get_osdmap_epoch() + 1;
8454 next_epoch
<= osd_epoch
;
8456 OSDMapRef nextmap
= service
.try_get_map(next_epoch
);
8458 dout(20) << __func__
<< " missing map " << next_epoch
<< dendl
;
8462 unsigned new_pg_num
=
8463 (old_pg_num
&& nextmap
->have_pg_pool(pg
->pg_id
.pool())) ?
8464 nextmap
->get_pg_num(pg
->pg_id
.pool()) : 0;
8465 if (old_pg_num
&& new_pg_num
&& old_pg_num
!= new_pg_num
) {
8467 if (nextmap
->have_pg_pool(pg
->pg_id
.pool())) {
8469 if (pg
->pg_id
.is_merge_source(
8473 // we are merge source
8474 PGRef spg
= pg
; // carry a ref
8475 dout(1) << __func__
<< " " << pg
->pg_id
8476 << " is merge source, target is " << parent
8478 pg
->write_if_dirty(rctx
);
8479 if (!new_pgs
.empty()) {
8480 rctx
.transaction
.register_on_applied(new C_FinishSplits(this,
8484 dispatch_context(rctx
, pg
, pg
->get_osdmap(), &handle
);
8486 // release backoffs explicitly, since the on_shutdown path
8487 // aggressively tears down backoff state.
8488 if (pg
->is_primary()) {
8489 pg
->release_pg_backoffs();
8492 OSDShard
*sdata
= pg
->osd_shard
;
8494 std::lock_guard
l(sdata
->shard_lock
);
8496 sdata
->_detach_pg(pg
->pg_slot
);
8497 // update pg count now since we might not get an osdmap
8499 if (pg
->is_primary())
8500 logger
->dec(l_osd_pg_primary
);
8501 else if (pg
->is_nonprimary())
8502 logger
->dec(l_osd_pg_replica
); // misnomer
8504 logger
->dec(l_osd_pg_stray
);
8509 set
<spg_t
> children
;
8510 parent
.is_split(new_pg_num
, old_pg_num
, &children
);
8511 if (add_merge_waiter(nextmap
, parent
, pg
, children
.size())) {
8512 enqueue_peering_evt(
8515 std::make_shared
<PGPeeringEvent
>(
8516 nextmap
->get_epoch(),
8517 nextmap
->get_epoch(),
8522 } else if (pg
->pg_id
.is_merge_target(old_pg_num
, new_pg_num
)) {
8523 // we are merge target
8524 set
<spg_t
> children
;
8525 pg
->pg_id
.is_split(new_pg_num
, old_pg_num
, &children
);
8526 dout(20) << __func__
<< " " << pg
->pg_id
8527 << " is merge target, sources are " << children
8529 map
<spg_t
,PGRef
> sources
;
8531 std::lock_guard
l(merge_lock
);
8532 auto& s
= merge_waiters
[nextmap
->get_epoch()][pg
->pg_id
];
8533 unsigned need
= children
.size();
8534 dout(20) << __func__
<< " have " << s
.size() << "/"
8536 if (s
.size() == need
) {
8538 merge_waiters
[nextmap
->get_epoch()].erase(pg
->pg_id
);
8539 if (merge_waiters
[nextmap
->get_epoch()].empty()) {
8540 merge_waiters
.erase(nextmap
->get_epoch());
8544 if (!sources
.empty()) {
8545 unsigned new_pg_num
= nextmap
->get_pg_num(pg
->pg_id
.pool());
8546 unsigned split_bits
= pg
->pg_id
.get_split_bits(new_pg_num
);
8547 dout(1) << __func__
<< " merging " << pg
->pg_id
<< dendl
;
8549 sources
, rctx
, split_bits
,
8550 nextmap
->get_pg_pool(
8551 pg
->pg_id
.pool())->last_pg_merge_meta
);
8552 pg
->pg_slot
->waiting_for_merge_epoch
= 0;
8554 dout(20) << __func__
<< " not ready to merge yet" << dendl
;
8555 pg
->write_if_dirty(rctx
);
8556 if (!new_pgs
.empty()) {
8557 rctx
.transaction
.register_on_applied(new C_FinishSplits(this,
8561 dispatch_context(rctx
, pg
, pg
->get_osdmap(), &handle
);
8563 // kick source(s) to get them ready
8564 for (auto& i
: children
) {
8565 dout(20) << __func__
<< " kicking source " << i
<< dendl
;
8566 enqueue_peering_evt(
8569 std::make_shared
<PGPeeringEvent
>(
8570 nextmap
->get_epoch(),
8571 nextmap
->get_epoch(),
8581 vector
<int> newup
, newacting
;
8582 int up_primary
, acting_primary
;
8583 nextmap
->pg_to_up_acting_osds(
8585 &newup
, &up_primary
,
8586 &newacting
, &acting_primary
);
8587 pg
->handle_advance_map(
8588 nextmap
, lastmap
, newup
, up_primary
,
8589 newacting
, acting_primary
, rctx
);
8591 auto oldpool
= lastmap
->get_pools().find(pg
->pg_id
.pool());
8592 auto newpool
= nextmap
->get_pools().find(pg
->pg_id
.pool());
8593 if (oldpool
!= lastmap
->get_pools().end()
8594 && newpool
!= nextmap
->get_pools().end()) {
8595 dout(20) << __func__
8596 << " new pool opts " << newpool
->second
.opts
8597 << " old pool opts " << oldpool
->second
.opts
8600 double old_min_interval
= 0, new_min_interval
= 0;
8601 oldpool
->second
.opts
.get(pool_opts_t::SCRUB_MIN_INTERVAL
, &old_min_interval
);
8602 newpool
->second
.opts
.get(pool_opts_t::SCRUB_MIN_INTERVAL
, &new_min_interval
);
8604 double old_max_interval
= 0, new_max_interval
= 0;
8605 oldpool
->second
.opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &old_max_interval
);
8606 newpool
->second
.opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &new_max_interval
);
8608 // Assume if an interval is change from set to unset or vice versa the actual config
8609 // is different. Keep it simple even if it is possible to call resched_all_scrub()
8611 if (old_min_interval
!= new_min_interval
|| old_max_interval
!= new_max_interval
) {
8612 pg
->on_info_history_change();
8616 if (new_pg_num
&& old_pg_num
!= new_pg_num
) {
8618 set
<spg_t
> children
;
8619 if (pg
->pg_id
.is_split(
8624 pg
, children
, &new_pgs
, lastmap
, nextmap
,
8630 old_pg_num
= new_pg_num
;
8631 handle
.reset_tp_timeout();
8633 pg
->handle_activate_map(rctx
);
8637 if (!new_pgs
.empty()) {
8638 rctx
.transaction
.register_on_applied(new C_FinishSplits(this, new_pgs
));
8643 void OSD::consume_map()
8645 ceph_assert(ceph_mutex_is_locked(osd_lock
));
8646 auto osdmap
= get_osdmap();
8647 dout(7) << "consume_map version " << osdmap
->get_epoch() << dendl
;
8649 /** make sure the cluster is speaking in SORTBITWISE, because we don't
8650 * speak the older sorting version any more. Be careful not to force
8651 * a shutdown if we are merely processing old maps, though.
8653 if (!osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
) && is_active()) {
8654 derr
<< __func__
<< " SORTBITWISE flag is not set" << dendl
;
8658 service
.pre_publish_map(osdmap
);
8659 service
.await_reserved_maps();
8660 service
.publish_map(osdmap
);
8662 // prime splits and merges
8663 set
<pair
<spg_t
,epoch_t
>> newly_split
; // splits, and when
8664 set
<pair
<spg_t
,epoch_t
>> merge_pgs
; // merge participants, and when
8665 for (auto& shard
: shards
) {
8666 shard
->identify_splits_and_merges(osdmap
, &newly_split
, &merge_pgs
);
8668 if (!newly_split
.empty()) {
8669 for (auto& shard
: shards
) {
8670 shard
->prime_splits(osdmap
, &newly_split
);
8672 ceph_assert(newly_split
.empty());
8675 // prune sent_ready_to_merge
8676 service
.prune_sent_ready_to_merge(osdmap
);
8678 // FIXME, maybe: We could race against an incoming peering message
8679 // that instantiates a merge PG after identify_merges() below and
8680 // never set up its peer to complete the merge. An OSD restart
8681 // would clear it up. This is a hard race to resolve,
8682 // extraordinarily rare (we only merge PGs that are stable and
8683 // clean, so it'd have to be an imported PG to an OSD with a
8684 // slightly stale OSDMap...), so I'm ignoring it for now. We plan to
8685 // replace all of this with a seastar-based code soon anyway.
8686 if (!merge_pgs
.empty()) {
8687 // mark the pgs we already have, or create new and empty merge
8688 // participants for those we are missing. do this all under the
8689 // shard lock so we don't have to worry about racing pg creates
8691 for (auto& shard
: shards
) {
8692 shard
->prime_merges(osdmap
, &merge_pgs
);
8694 ceph_assert(merge_pgs
.empty());
8697 service
.prune_pg_created();
8699 unsigned pushes_to_free
= 0;
8700 for (auto& shard
: shards
) {
8701 shard
->consume_map(osdmap
, &pushes_to_free
);
8704 vector
<spg_t
> pgids
;
8707 // count (FIXME, probably during seastar rewrite)
8708 int num_pg_primary
= 0, num_pg_replica
= 0, num_pg_stray
= 0;
8711 for (auto& pg
: pgs
) {
8712 // FIXME (probably during seastar rewrite): this is lockless and
8713 // racy, but we don't want to take pg lock here.
8714 if (pg
->is_primary())
8716 else if (pg
->is_nonprimary())
8717 num_pg_replica
++; // misnomer
8723 // FIXME (as part of seastar rewrite): move to OSDShard
8724 std::lock_guard
l(pending_creates_lock
);
8725 for (auto pg
= pending_creates_from_osd
.begin();
8726 pg
!= pending_creates_from_osd
.end();) {
8727 if (osdmap
->get_pg_acting_role(pg
->first
, whoami
) < 0) {
8728 dout(10) << __func__
<< " pg " << pg
->first
<< " doesn't map here, "
8729 << "discarding pending_create_from_osd" << dendl
;
8730 pg
= pending_creates_from_osd
.erase(pg
);
8737 service
.maybe_inject_dispatch_delay();
8739 dispatch_sessions_waiting_on_map();
8741 service
.maybe_inject_dispatch_delay();
8743 service
.release_reserved_pushes(pushes_to_free
);
8745 // queue null events to push maps down to individual PGs
8746 for (auto pgid
: pgids
) {
8747 enqueue_peering_evt(
8750 std::make_shared
<PGPeeringEvent
>(
8751 osdmap
->get_epoch(),
8752 osdmap
->get_epoch(),
8755 logger
->set(l_osd_pg
, pgids
.size());
8756 logger
->set(l_osd_pg_primary
, num_pg_primary
);
8757 logger
->set(l_osd_pg_replica
, num_pg_replica
);
8758 logger
->set(l_osd_pg_stray
, num_pg_stray
);
8761 void OSD::activate_map()
8763 ceph_assert(ceph_mutex_is_locked(osd_lock
));
8764 auto osdmap
= get_osdmap();
8766 dout(7) << "activate_map version " << osdmap
->get_epoch() << dendl
;
8769 if (osdmap
->test_flag(CEPH_OSDMAP_NORECOVER
)) {
8770 if (!service
.recovery_is_paused()) {
8771 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl
;
8772 service
.pause_recovery();
8775 if (service
.recovery_is_paused()) {
8776 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl
;
8777 service
.unpause_recovery();
8781 service
.activate_map();
8784 take_waiters(waiting_for_osdmap
);
8787 bool OSD::require_mon_peer(const Message
*m
)
8789 if (!m
->get_connection()->peer_is_mon()) {
8790 dout(0) << "require_mon_peer received from non-mon "
8791 << m
->get_connection()->get_peer_addr()
8792 << " " << *m
<< dendl
;
8798 bool OSD::require_mon_or_mgr_peer(const Message
*m
)
8800 if (!m
->get_connection()->peer_is_mon() &&
8801 !m
->get_connection()->peer_is_mgr()) {
8802 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8803 << m
->get_connection()->get_peer_addr()
8804 << " " << *m
<< dendl
;
8810 bool OSD::require_osd_peer(const Message
*m
)
8812 if (!m
->get_connection()->peer_is_osd()) {
8813 dout(0) << "require_osd_peer received from non-osd "
8814 << m
->get_connection()->get_peer_addr()
8815 << " " << *m
<< dendl
;
8821 bool OSD::require_self_aliveness(const Message
*m
, epoch_t epoch
)
8823 epoch_t up_epoch
= service
.get_up_epoch();
8824 if (epoch
< up_epoch
) {
8825 dout(7) << "from pre-up epoch " << epoch
<< " < " << up_epoch
<< dendl
;
8830 dout(7) << "still in boot state, dropping message " << *m
<< dendl
;
8837 bool OSD::require_same_peer_instance(const Message
*m
, const OSDMapRef
& map
,
8838 bool is_fast_dispatch
)
8840 int from
= m
->get_source().num();
8842 if (map
->is_down(from
) ||
8843 (map
->get_cluster_addrs(from
) != m
->get_source_addrs())) {
8844 dout(5) << "from dead osd." << from
<< ", marking down, "
8845 << " msg was " << m
->get_source_inst().addr
8847 << (map
->is_up(from
) ?
8848 map
->get_cluster_addrs(from
) : entity_addrvec_t())
8850 ConnectionRef con
= m
->get_connection();
8852 if (auto s
= ceph::ref_cast
<Session
>(con
->get_priv()); s
) {
8853 if (!is_fast_dispatch
)
8854 s
->session_dispatch_lock
.lock();
8855 clear_session_waiting_on_map(s
);
8856 con
->set_priv(nullptr); // break ref <-> session cycle, if any
8858 if (!is_fast_dispatch
)
8859 s
->session_dispatch_lock
.unlock();
8868 * require that we have same (or newer) map, and that
8869 * the source is the pg primary.
8871 bool OSD::require_same_or_newer_map(OpRequestRef
& op
, epoch_t epoch
,
8872 bool is_fast_dispatch
)
8874 const Message
*m
= op
->get_req();
8875 const auto osdmap
= get_osdmap();
8876 dout(15) << "require_same_or_newer_map " << epoch
8877 << " (i am " << osdmap
->get_epoch() << ") " << m
<< dendl
;
8879 ceph_assert(ceph_mutex_is_locked(osd_lock
));
8881 // do they have a newer map?
8882 if (epoch
> osdmap
->get_epoch()) {
8883 dout(7) << "waiting for newer map epoch " << epoch
8884 << " > my " << osdmap
->get_epoch() << " with " << m
<< dendl
;
8885 wait_for_new_map(op
);
8889 if (!require_self_aliveness(op
->get_req(), epoch
)) {
8893 // ok, our map is same or newer.. do they still exist?
8894 if (m
->get_connection()->get_messenger() == cluster_messenger
&&
8895 !require_same_peer_instance(op
->get_req(), osdmap
, is_fast_dispatch
)) {
8906 // ----------------------------------------
8909 void OSD::split_pgs(
8911 const set
<spg_t
> &childpgids
, set
<PGRef
> *out_pgs
,
8916 unsigned pg_num
= nextmap
->get_pg_num(parent
->pg_id
.pool());
8917 parent
->update_snap_mapper_bits(parent
->get_pgid().get_split_bits(pg_num
));
8919 vector
<object_stat_sum_t
> updated_stats
;
8920 parent
->start_split_stats(childpgids
, &updated_stats
);
8922 vector
<object_stat_sum_t
>::iterator stat_iter
= updated_stats
.begin();
8923 for (set
<spg_t
>::const_iterator i
= childpgids
.begin();
8924 i
!= childpgids
.end();
8926 ceph_assert(stat_iter
!= updated_stats
.end());
8927 dout(10) << __func__
<< " splitting " << *parent
<< " into " << *i
<< dendl
;
8928 PG
* child
= _make_pg(nextmap
, *i
);
8930 out_pgs
->insert(child
);
8931 child
->ch
= store
->create_new_collection(child
->coll
);
8934 uint32_t shard_index
= i
->hash_to_shard(shards
.size());
8935 assert(NULL
!= shards
[shard_index
]);
8936 store
->set_collection_commit_queue(child
->coll
, &(shards
[shard_index
]->context_queue
));
8939 unsigned split_bits
= i
->get_split_bits(pg_num
);
8940 dout(10) << " pg_num is " << pg_num
8941 << ", m_seed " << i
->ps()
8942 << ", split_bits is " << split_bits
<< dendl
;
8943 parent
->split_colls(
8947 &child
->get_pool().info
,
8954 child
->init_collection_pool_opts();
8956 child
->finish_split_stats(*stat_iter
, rctx
.transaction
);
8959 ceph_assert(stat_iter
!= updated_stats
.end());
8960 parent
->finish_split_stats(*stat_iter
, rctx
.transaction
);
8966 void OSD::handle_pg_create(OpRequestRef op
)
8968 // NOTE: this can be removed in P release (mimic is the last version to
8969 // send MOSDPGCreate messages).
8971 auto m
= op
->get_req
<MOSDPGCreate
>();
8972 ceph_assert(m
->get_type() == MSG_OSD_PG_CREATE
);
8974 dout(10) << "handle_pg_create " << *m
<< dendl
;
8976 if (!require_mon_peer(op
->get_req())) {
8980 if (!require_same_or_newer_map(op
, m
->epoch
, false))
8985 const auto osdmap
= get_osdmap();
8986 map
<pg_t
,utime_t
>::const_iterator ci
= m
->ctimes
.begin();
8987 for (map
<pg_t
,pg_create_t
>::const_iterator p
= m
->mkpg
.begin();
8990 ceph_assert(ci
!= m
->ctimes
.end() && ci
->first
== p
->first
);
8991 epoch_t created
= p
->second
.created
;
8992 if (p
->second
.split_bits
) // Skip split pgs
8996 if (!osdmap
->have_pg_pool(on
.pool())) {
8997 dout(20) << "ignoring pg on deleted pool " << on
<< dendl
;
9001 dout(20) << "mkpg " << on
<< " e" << created
<< "@" << ci
->second
<< dendl
;
9004 bool mapped
= osdmap
->get_primary_shard(on
, &pgid
);
9005 ceph_assert(mapped
);
9007 // is it still ours?
9008 vector
<int> up
, acting
;
9009 int up_primary
= -1;
9010 int acting_primary
= -1;
9011 osdmap
->pg_to_up_acting_osds(on
, &up
, &up_primary
, &acting
, &acting_primary
);
9012 int role
= osdmap
->calc_pg_role(pg_shard_t(whoami
, pgid
.shard
), acting
);
9014 if (acting_primary
!= whoami
) {
9015 dout(10) << "mkpg " << on
<< " not acting_primary (" << acting_primary
9016 << "), my role=" << role
<< ", skipping" << dendl
;
9022 pg_history_t history
;
9023 build_initial_pg_history(pgid
, created
, ci
->second
, &history
, &pi
);
9025 // The mon won't resend unless the primary changed, so we ignore
9026 // same_interval_since. We'll pass this history with the current
9027 // epoch as the event.
9028 if (history
.same_primary_since
> m
->epoch
) {
9029 dout(10) << __func__
<< ": got obsolete pg create on pgid "
9030 << pgid
<< " from epoch " << m
->epoch
9031 << ", primary changed in " << history
.same_primary_since
9035 enqueue_peering_evt(
9038 std::make_shared
<PGPeeringEvent
>(
9039 osdmap
->get_epoch(),
9040 osdmap
->get_epoch(),
9045 osdmap
->get_epoch(),
9053 std::lock_guard
l(pending_creates_lock
);
9054 if (pending_creates_from_mon
== 0) {
9055 last_pg_create_epoch
= m
->epoch
;
9059 maybe_update_heartbeat_peers();
9063 // ----------------------------------------
9064 // peering and recovery
9066 PeeringCtx
OSD::create_context()
9068 return PeeringCtx(get_osdmap()->require_osd_release
);
9071 void OSD::dispatch_context(PeeringCtx
&ctx
, PG
*pg
, OSDMapRef curmap
,
9072 ThreadPool::TPHandle
*handle
)
9074 if (!service
.get_osdmap()->is_up(whoami
)) {
9075 dout(20) << __func__
<< " not up in osdmap" << dendl
;
9076 } else if (!is_active()) {
9077 dout(20) << __func__
<< " not active" << dendl
;
9079 for (auto& [osd
, ls
] : ctx
.message_map
) {
9080 if (!curmap
->is_up(osd
)) {
9081 dout(20) << __func__
<< " skipping down osd." << osd
<< dendl
;
9084 ConnectionRef con
= service
.get_con_osd_cluster(
9085 osd
, curmap
->get_epoch());
9087 dout(20) << __func__
<< " skipping osd." << osd
<< " (NULL con)"
9091 service
.maybe_share_map(con
.get(), curmap
);
9093 con
->send_message2(m
);
9098 if ((!ctx
.transaction
.empty() || ctx
.transaction
.has_contexts()) && pg
) {
9099 int tr
= store
->queue_transaction(
9101 std::move(ctx
.transaction
), TrackedOpRef(),
9103 ceph_assert(tr
== 0);
9107 void OSD::handle_fast_pg_create(MOSDPGCreate2
*m
)
9109 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9110 if (!require_mon_peer(m
)) {
9114 for (auto& p
: m
->pgs
) {
9115 spg_t pgid
= p
.first
;
9116 epoch_t created
= p
.second
.first
;
9117 utime_t created_stamp
= p
.second
.second
;
9118 auto q
= m
->pg_extra
.find(pgid
);
9119 if (q
== m
->pg_extra
.end()) {
9120 dout(20) << __func__
<< " " << pgid
<< " e" << created
9121 << "@" << created_stamp
9122 << " (no history or past_intervals)" << dendl
;
9123 // pre-octopus ... no pg history. this can be removed in Q release.
9124 enqueue_peering_evt(
9127 std::make_shared
<PGPeeringEvent
>(
9135 pg_history_t(created
, created_stamp
),
9140 dout(20) << __func__
<< " " << pgid
<< " e" << created
9141 << "@" << created_stamp
9142 << " history " << q
->second
.first
9143 << " pi " << q
->second
.second
<< dendl
;
9144 if (!q
->second
.second
.empty() &&
9145 m
->epoch
< q
->second
.second
.get_bounds().second
) {
9146 clog
->error() << "got pg_create on " << pgid
<< " epoch " << m
->epoch
9147 << " and unmatched past_intervals " << q
->second
.second
9148 << " (history " << q
->second
.first
<< ")";
9150 enqueue_peering_evt(
9153 std::make_shared
<PGPeeringEvent
>(
9170 std::lock_guard
l(pending_creates_lock
);
9171 if (pending_creates_from_mon
== 0) {
9172 last_pg_create_epoch
= m
->epoch
;
9179 void OSD::handle_fast_pg_query(MOSDPGQuery
*m
)
9181 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9182 if (!require_osd_peer(m
)) {
9186 int from
= m
->get_source().num();
9187 for (auto& p
: m
->pg_list
) {
9188 enqueue_peering_evt(
9191 std::make_shared
<PGPeeringEvent
>(
9192 p
.second
.epoch_sent
, p
.second
.epoch_sent
,
9195 pg_shard_t(from
, p
.second
.from
),
9197 p
.second
.epoch_sent
),
9204 void OSD::handle_fast_pg_notify(MOSDPGNotify
* m
)
9206 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9207 if (!require_osd_peer(m
)) {
9211 int from
= m
->get_source().num();
9212 for (auto& p
: m
->get_pg_list()) {
9213 spg_t
pgid(p
.info
.pgid
.pgid
, p
.to
);
9214 enqueue_peering_evt(
9217 std::make_shared
<PGPeeringEvent
>(
9221 pgid
, pg_shard_t(from
, p
.from
),
9223 m
->get_connection()->get_features()),
9236 void OSD::handle_fast_pg_info(MOSDPGInfo
* m
)
9238 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9239 if (!require_osd_peer(m
)) {
9243 int from
= m
->get_source().num();
9244 for (auto& p
: m
->pg_list
) {
9245 enqueue_peering_evt(
9246 spg_t(p
.info
.pgid
.pgid
, p
.to
),
9248 std::make_shared
<PGPeeringEvent
>(
9249 p
.epoch_sent
, p
.query_epoch
,
9251 pg_shard_t(from
, p
.from
),
9259 void OSD::handle_fast_pg_remove(MOSDPGRemove
*m
)
9261 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9262 if (!require_osd_peer(m
)) {
9266 for (auto& pgid
: m
->pg_list
) {
9267 enqueue_peering_evt(
9270 std::make_shared
<PGPeeringEvent
>(
9271 m
->get_epoch(), m
->get_epoch(),
9272 PeeringState::DeleteStart())));
9277 void OSD::handle_fast_force_recovery(MOSDForceRecovery
*m
)
9279 dout(10) << __func__
<< " " << *m
<< dendl
;
9280 if (!require_mon_or_mgr_peer(m
)) {
9284 epoch_t epoch
= get_osdmap_epoch();
9285 for (auto pgid
: m
->forced_pgs
) {
9286 if (m
->options
& OFR_BACKFILL
) {
9287 if (m
->options
& OFR_CANCEL
) {
9288 enqueue_peering_evt(
9291 std::make_shared
<PGPeeringEvent
>(
9293 PeeringState::UnsetForceBackfill())));
9295 enqueue_peering_evt(
9298 std::make_shared
<PGPeeringEvent
>(
9300 PeeringState::SetForceBackfill())));
9302 } else if (m
->options
& OFR_RECOVERY
) {
9303 if (m
->options
& OFR_CANCEL
) {
9304 enqueue_peering_evt(
9307 std::make_shared
<PGPeeringEvent
>(
9309 PeeringState::UnsetForceRecovery())));
9311 enqueue_peering_evt(
9314 std::make_shared
<PGPeeringEvent
>(
9316 PeeringState::SetForceRecovery())));
9323 void OSD::handle_pg_query_nopg(const MQuery
& q
)
9325 spg_t pgid
= q
.pgid
;
9326 dout(10) << __func__
<< " " << pgid
<< dendl
;
9328 OSDMapRef osdmap
= get_osdmap();
9329 if (!osdmap
->have_pg_pool(pgid
.pool()))
9332 dout(10) << " pg " << pgid
<< " dne" << dendl
;
9333 pg_info_t
empty(spg_t(pgid
.pgid
, q
.query
.to
));
9334 ConnectionRef con
= service
.get_con_osd_cluster(q
.from
.osd
, osdmap
->get_epoch());
9337 if (q
.query
.type
== pg_query_t::LOG
||
9338 q
.query
.type
== pg_query_t::FULLLOG
) {
9340 q
.query
.from
, q
.query
.to
,
9341 osdmap
->get_epoch(), empty
,
9342 q
.query
.epoch_sent
);
9344 vector
<pg_notify_t
> ls
;
9347 q
.query
.from
, q
.query
.to
,
9349 osdmap
->get_epoch(),
9352 m
= new MOSDPGNotify(osdmap
->get_epoch(), std::move(ls
));
9354 service
.maybe_share_map(con
.get(), osdmap
);
9355 con
->send_message(m
);
9359 void OSDService::queue_check_readable(spg_t spgid
,
9361 ceph::signedspan delay
)
9363 if (delay
== ceph::signedspan::zero()) {
9364 osd
->enqueue_peering_evt(
9367 std::make_shared
<PGPeeringEvent
>(
9369 PeeringState::CheckReadable())));
9371 mono_timer
.add_event(
9373 [this, spgid
, lpr
]() {
9374 queue_check_readable(spgid
, lpr
);
9380 // =========================================================
9383 void OSDService::_maybe_queue_recovery() {
9384 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock
));
9385 uint64_t available_pushes
;
9386 while (!awaiting_throttle
.empty() &&
9387 _recover_now(&available_pushes
)) {
9388 uint64_t to_start
= std::min(
9390 cct
->_conf
->osd_recovery_max_single_start
);
9391 _queue_for_recovery(awaiting_throttle
.front(), to_start
);
9392 awaiting_throttle
.pop_front();
9393 dout(10) << __func__
<< " starting " << to_start
9394 << ", recovery_ops_reserved " << recovery_ops_reserved
9395 << " -> " << (recovery_ops_reserved
+ to_start
) << dendl
;
9396 recovery_ops_reserved
+= to_start
;
9400 bool OSDService::_recover_now(uint64_t *available_pushes
)
9402 if (available_pushes
)
9403 *available_pushes
= 0;
9405 if (ceph_clock_now() < defer_recovery_until
) {
9406 dout(15) << __func__
<< " defer until " << defer_recovery_until
<< dendl
;
9410 if (recovery_paused
) {
9411 dout(15) << __func__
<< " paused" << dendl
;
9415 uint64_t max
= osd
->get_recovery_max_active();
9416 if (max
<= recovery_ops_active
+ recovery_ops_reserved
) {
9417 dout(15) << __func__
<< " active " << recovery_ops_active
9418 << " + reserved " << recovery_ops_reserved
9419 << " >= max " << max
<< dendl
;
9423 if (available_pushes
)
9424 *available_pushes
= max
- recovery_ops_active
- recovery_ops_reserved
;
9429 unsigned OSDService::get_target_pg_log_entries() const
9431 auto num_pgs
= osd
->get_num_pgs();
9432 auto target
= cct
->_conf
->osd_target_pg_log_entries_per_osd
;
9433 if (num_pgs
> 0 && target
> 0) {
9434 // target an even spread of our budgeted log entries across all
9435 // PGs. note that while we only get to control the entry count
9436 // for primary PGs, we'll normally be responsible for a mix of
9437 // primary and replica PGs (for the same pool(s) even), so this
9439 return std::max
<unsigned>(
9440 std::min
<unsigned>(target
/ num_pgs
,
9441 cct
->_conf
->osd_max_pg_log_entries
),
9442 cct
->_conf
->osd_min_pg_log_entries
);
9444 // fall back to a per-pg value.
9445 return cct
->_conf
->osd_min_pg_log_entries
;
9449 void OSD::do_recovery(
9450 PG
*pg
, epoch_t queued
, uint64_t reserved_pushes
,
9451 ThreadPool::TPHandle
&handle
)
9453 uint64_t started
= 0;
9456 * When the value of osd_recovery_sleep is set greater than zero, recovery
9457 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9458 * recovery event's schedule time. This is done by adding a
9459 * recovery_requeue_callback event, which re-queues the recovery op using
9460 * queue_recovery_after_sleep.
9462 float recovery_sleep
= get_osd_recovery_sleep();
9464 std::lock_guard
l(service
.sleep_lock
);
9465 if (recovery_sleep
> 0 && service
.recovery_needs_sleep
) {
9467 auto recovery_requeue_callback
= new LambdaContext([this, pgref
, queued
, reserved_pushes
](int r
) {
9468 dout(20) << "do_recovery wake up at "
9470 << ", re-queuing recovery" << dendl
;
9471 std::lock_guard
l(service
.sleep_lock
);
9472 service
.recovery_needs_sleep
= false;
9473 service
.queue_recovery_after_sleep(pgref
.get(), queued
, reserved_pushes
);
9476 // This is true for the first recovery op and when the previous recovery op
9477 // has been scheduled in the past. The next recovery op is scheduled after
9478 // completing the sleep from now.
9480 if (auto now
= ceph::real_clock::now();
9481 service
.recovery_schedule_time
< now
) {
9482 service
.recovery_schedule_time
= now
;
9484 service
.recovery_schedule_time
+= ceph::make_timespan(recovery_sleep
);
9485 service
.sleep_timer
.add_event_at(service
.recovery_schedule_time
,
9486 recovery_requeue_callback
);
9487 dout(20) << "Recovery event scheduled at "
9488 << service
.recovery_schedule_time
<< dendl
;
9495 std::lock_guard
l(service
.sleep_lock
);
9496 service
.recovery_needs_sleep
= true;
9499 if (pg
->pg_has_reset_since(queued
)) {
9503 dout(10) << "do_recovery starting " << reserved_pushes
<< " " << *pg
<< dendl
;
9504 #ifdef DEBUG_RECOVERY_OIDS
9505 dout(20) << " active was " << service
.recovery_oids
[pg
->pg_id
] << dendl
;
9508 bool do_unfound
= pg
->start_recovery_ops(reserved_pushes
, handle
, &started
);
9509 dout(10) << "do_recovery started " << started
<< "/" << reserved_pushes
9510 << " on " << *pg
<< dendl
;
9513 PeeringCtx rctx
= create_context();
9514 rctx
.handle
= &handle
;
9515 pg
->find_unfound(queued
, rctx
);
9516 dispatch_context(rctx
, pg
, pg
->get_osdmap());
9521 ceph_assert(started
<= reserved_pushes
);
9522 service
.release_reserved_pushes(reserved_pushes
);
9525 void OSDService::start_recovery_op(PG
*pg
, const hobject_t
& soid
)
9527 std::lock_guard
l(recovery_lock
);
9528 dout(10) << "start_recovery_op " << *pg
<< " " << soid
9529 << " (" << recovery_ops_active
<< "/"
9530 << osd
->get_recovery_max_active() << " rops)"
9532 recovery_ops_active
++;
9534 #ifdef DEBUG_RECOVERY_OIDS
9535 dout(20) << " active was " << recovery_oids
[pg
->pg_id
] << dendl
;
9536 ceph_assert(recovery_oids
[pg
->pg_id
].count(soid
) == 0);
9537 recovery_oids
[pg
->pg_id
].insert(soid
);
9541 void OSDService::finish_recovery_op(PG
*pg
, const hobject_t
& soid
, bool dequeue
)
9543 std::lock_guard
l(recovery_lock
);
9544 dout(10) << "finish_recovery_op " << *pg
<< " " << soid
9545 << " dequeue=" << dequeue
9546 << " (" << recovery_ops_active
<< "/"
9547 << osd
->get_recovery_max_active() << " rops)"
9551 ceph_assert(recovery_ops_active
> 0);
9552 recovery_ops_active
--;
9554 #ifdef DEBUG_RECOVERY_OIDS
9555 dout(20) << " active oids was " << recovery_oids
[pg
->pg_id
] << dendl
;
9556 ceph_assert(recovery_oids
[pg
->pg_id
].count(soid
));
9557 recovery_oids
[pg
->pg_id
].erase(soid
);
9560 _maybe_queue_recovery();
9563 bool OSDService::is_recovery_active()
9565 if (cct
->_conf
->osd_debug_pretend_recovery_active
) {
9568 return local_reserver
.has_reservation() || remote_reserver
.has_reservation();
9571 void OSDService::release_reserved_pushes(uint64_t pushes
)
9573 std::lock_guard
l(recovery_lock
);
9574 dout(10) << __func__
<< "(" << pushes
<< "), recovery_ops_reserved "
9575 << recovery_ops_reserved
<< " -> " << (recovery_ops_reserved
-pushes
)
9577 ceph_assert(recovery_ops_reserved
>= pushes
);
9578 recovery_ops_reserved
-= pushes
;
9579 _maybe_queue_recovery();
9582 // =========================================================
9585 bool OSD::op_is_discardable(const MOSDOp
*op
)
9587 // drop client request if they are not connected and can't get the
9589 if (!op
->get_connection()->is_connected()) {
9595 void OSD::enqueue_op(spg_t pg
, OpRequestRef
&& op
, epoch_t epoch
)
9597 const utime_t stamp
= op
->get_req()->get_recv_stamp();
9598 const utime_t latency
= ceph_clock_now() - stamp
;
9599 const unsigned priority
= op
->get_req()->get_priority();
9600 const int cost
= op
->get_req()->get_cost();
9601 const uint64_t owner
= op
->get_req()->get_source().num();
9603 dout(15) << "enqueue_op " << op
<< " prio " << priority
9605 << " latency " << latency
9606 << " epoch " << epoch
9607 << " " << *(op
->get_req()) << dendl
;
9608 op
->osd_trace
.event("enqueue op");
9609 op
->osd_trace
.keyval("priority", priority
);
9610 op
->osd_trace
.keyval("cost", cost
);
9611 op
->mark_queued_for_pg();
9612 logger
->tinc(l_osd_op_before_queue_op_lat
, latency
);
9615 unique_ptr
<OpSchedulerItem::OpQueueable
>(new PGOpItem(pg
, std::move(op
))),
9616 cost
, priority
, stamp
, owner
, epoch
));
9619 void OSD::enqueue_peering_evt(spg_t pgid
, PGPeeringEventRef evt
)
9621 dout(15) << __func__
<< " " << pgid
<< " " << evt
->get_desc() << dendl
;
9624 unique_ptr
<OpSchedulerItem::OpQueueable
>(new PGPeeringItem(pgid
, evt
)),
9626 cct
->_conf
->osd_peering_op_priority
,
9629 evt
->get_epoch_sent()));
9633 * NOTE: dequeue called in worker thread, with pg lock
9635 void OSD::dequeue_op(
9636 PGRef pg
, OpRequestRef op
,
9637 ThreadPool::TPHandle
&handle
)
9639 const Message
*m
= op
->get_req();
9642 OID_EVENT_TRACE_WITH_MSG(m
, "DEQUEUE_OP_BEGIN", false);
9644 utime_t now
= ceph_clock_now();
9645 op
->set_dequeued_time(now
);
9647 utime_t latency
= now
- m
->get_recv_stamp();
9648 dout(10) << "dequeue_op " << op
<< " prio " << m
->get_priority()
9649 << " cost " << m
->get_cost()
9650 << " latency " << latency
9652 << " pg " << *pg
<< dendl
;
9654 logger
->tinc(l_osd_op_before_dequeue_op_lat
, latency
);
9656 service
.maybe_share_map(m
->get_connection().get(),
9660 if (pg
->is_deleting())
9663 op
->mark_reached_pg();
9664 op
->osd_trace
.event("dequeue_op");
9666 pg
->do_request(op
, handle
);
9669 dout(10) << "dequeue_op " << op
<< " finish" << dendl
;
9670 OID_EVENT_TRACE_WITH_MSG(m
, "DEQUEUE_OP_END", false);
9674 void OSD::dequeue_peering_evt(
9677 PGPeeringEventRef evt
,
9678 ThreadPool::TPHandle
& handle
)
9680 PeeringCtx rctx
= create_context();
9681 auto curmap
= sdata
->get_osdmap();
9682 bool need_up_thru
= false;
9683 epoch_t same_interval_since
= 0;
9685 if (const MQuery
*q
= dynamic_cast<const MQuery
*>(evt
->evt
.get())) {
9686 handle_pg_query_nopg(*q
);
9688 derr
<< __func__
<< " unrecognized pg-less event " << evt
->get_desc() << dendl
;
9691 } else if (advance_pg(curmap
->get_epoch(), pg
, handle
, rctx
)) {
9692 pg
->do_peering_event(evt
, rctx
);
9693 if (pg
->is_deleted()) {
9697 dispatch_context(rctx
, pg
, curmap
, &handle
);
9698 need_up_thru
= pg
->get_need_up_thru();
9699 same_interval_since
= pg
->get_same_interval_since();
9704 queue_want_up_thru(same_interval_since
);
9707 service
.send_pg_temp();
9710 void OSD::dequeue_delete(
9714 ThreadPool::TPHandle
& handle
)
9716 dequeue_peering_evt(
9720 std::make_shared
<PGPeeringEvent
>(
9722 PeeringState::DeleteSome())),
9728 // --------------------------------
9730 const char** OSD::get_tracked_conf_keys() const
9732 static const char* KEYS
[] = {
9733 "osd_max_backfills",
9734 "osd_min_recovery_priority",
9735 "osd_max_trimming_pgs",
9736 "osd_op_complaint_time",
9737 "osd_op_log_threshold",
9738 "osd_op_history_size",
9739 "osd_op_history_duration",
9740 "osd_op_history_slow_op_size",
9741 "osd_op_history_slow_op_threshold",
9742 "osd_enable_op_tracker",
9743 "osd_map_cache_size",
9744 "osd_pg_epoch_max_lag_factor",
9745 "osd_pg_epoch_persisted_max_stale",
9746 // clog & admin clog
9749 "clog_to_syslog_facility",
9750 "clog_to_syslog_level",
9751 "osd_objectstore_fuse",
9753 "clog_to_graylog_host",
9754 "clog_to_graylog_port",
9757 "osd_recovery_delay_start",
9758 "osd_client_message_size_cap",
9759 "osd_client_message_cap",
9760 "osd_heartbeat_min_size",
9761 "osd_heartbeat_interval",
9762 "osd_object_clean_region_max_num_intervals",
9763 "osd_scrub_min_interval",
9764 "osd_scrub_max_interval",
9770 void OSD::handle_conf_change(const ConfigProxy
& conf
,
9771 const std::set
<std::string
> &changed
)
9773 std::lock_guard l
{osd_lock
};
9774 if (changed
.count("osd_max_backfills")) {
9775 service
.local_reserver
.set_max(cct
->_conf
->osd_max_backfills
);
9776 service
.remote_reserver
.set_max(cct
->_conf
->osd_max_backfills
);
9778 if (changed
.count("osd_min_recovery_priority")) {
9779 service
.local_reserver
.set_min_priority(cct
->_conf
->osd_min_recovery_priority
);
9780 service
.remote_reserver
.set_min_priority(cct
->_conf
->osd_min_recovery_priority
);
9782 if (changed
.count("osd_max_trimming_pgs")) {
9783 service
.snap_reserver
.set_max(cct
->_conf
->osd_max_trimming_pgs
);
9785 if (changed
.count("osd_op_complaint_time") ||
9786 changed
.count("osd_op_log_threshold")) {
9787 op_tracker
.set_complaint_and_threshold(cct
->_conf
->osd_op_complaint_time
,
9788 cct
->_conf
->osd_op_log_threshold
);
9790 if (changed
.count("osd_op_history_size") ||
9791 changed
.count("osd_op_history_duration")) {
9792 op_tracker
.set_history_size_and_duration(cct
->_conf
->osd_op_history_size
,
9793 cct
->_conf
->osd_op_history_duration
);
9795 if (changed
.count("osd_op_history_slow_op_size") ||
9796 changed
.count("osd_op_history_slow_op_threshold")) {
9797 op_tracker
.set_history_slow_op_size_and_threshold(cct
->_conf
->osd_op_history_slow_op_size
,
9798 cct
->_conf
->osd_op_history_slow_op_threshold
);
9800 if (changed
.count("osd_enable_op_tracker")) {
9801 op_tracker
.set_tracking(cct
->_conf
->osd_enable_op_tracker
);
9803 if (changed
.count("osd_map_cache_size")) {
9804 service
.map_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
9805 service
.map_bl_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
9806 service
.map_bl_inc_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
9808 if (changed
.count("clog_to_monitors") ||
9809 changed
.count("clog_to_syslog") ||
9810 changed
.count("clog_to_syslog_level") ||
9811 changed
.count("clog_to_syslog_facility") ||
9812 changed
.count("clog_to_graylog") ||
9813 changed
.count("clog_to_graylog_host") ||
9814 changed
.count("clog_to_graylog_port") ||
9815 changed
.count("host") ||
9816 changed
.count("fsid")) {
9817 update_log_config();
9819 if (changed
.count("osd_pg_epoch_max_lag_factor")) {
9820 m_osd_pg_epoch_max_lag_factor
= conf
.get_val
<double>(
9821 "osd_pg_epoch_max_lag_factor");
9825 if (changed
.count("osd_objectstore_fuse")) {
9827 enable_disable_fuse(false);
9832 if (changed
.count("osd_recovery_delay_start")) {
9833 service
.defer_recovery(cct
->_conf
->osd_recovery_delay_start
);
9834 service
.kick_recovery_queue();
9837 if (changed
.count("osd_client_message_cap")) {
9838 uint64_t newval
= cct
->_conf
->osd_client_message_cap
;
9839 Messenger::Policy pol
= client_messenger
->get_policy(entity_name_t::TYPE_CLIENT
);
9840 if (pol
.throttler_messages
&& newval
> 0) {
9841 pol
.throttler_messages
->reset_max(newval
);
9844 if (changed
.count("osd_client_message_size_cap")) {
9845 uint64_t newval
= cct
->_conf
->osd_client_message_size_cap
;
9846 Messenger::Policy pol
= client_messenger
->get_policy(entity_name_t::TYPE_CLIENT
);
9847 if (pol
.throttler_bytes
&& newval
> 0) {
9848 pol
.throttler_bytes
->reset_max(newval
);
9851 if (changed
.count("osd_object_clean_region_max_num_intervals")) {
9852 ObjectCleanRegions::set_max_num_intervals(cct
->_conf
->osd_object_clean_region_max_num_intervals
);
9855 if (changed
.count("osd_scrub_min_interval") ||
9856 changed
.count("osd_scrub_max_interval")) {
9857 resched_all_scrubs();
9858 dout(0) << __func__
<< ": scrub interval change" << dendl
;
9863 void OSD::update_log_config()
9865 map
<string
,string
> log_to_monitors
;
9866 map
<string
,string
> log_to_syslog
;
9867 map
<string
,string
> log_channel
;
9868 map
<string
,string
> log_prio
;
9869 map
<string
,string
> log_to_graylog
;
9870 map
<string
,string
> log_to_graylog_host
;
9871 map
<string
,string
> log_to_graylog_port
;
9875 if (parse_log_client_options(cct
, log_to_monitors
, log_to_syslog
,
9876 log_channel
, log_prio
, log_to_graylog
,
9877 log_to_graylog_host
, log_to_graylog_port
,
9879 clog
->update_config(log_to_monitors
, log_to_syslog
,
9880 log_channel
, log_prio
, log_to_graylog
,
9881 log_to_graylog_host
, log_to_graylog_port
,
9883 derr
<< "log_to_monitors " << log_to_monitors
<< dendl
;
9886 void OSD::check_config()
9888 // some sanity checks
9889 if (cct
->_conf
->osd_map_cache_size
<= (int)cct
->_conf
->osd_pg_epoch_persisted_max_stale
+ 2) {
9890 clog
->warn() << "osd_map_cache_size (" << cct
->_conf
->osd_map_cache_size
<< ")"
9891 << " is not > osd_pg_epoch_persisted_max_stale ("
9892 << cct
->_conf
->osd_pg_epoch_persisted_max_stale
<< ")";
9894 if (cct
->_conf
->osd_object_clean_region_max_num_intervals
< 0) {
9895 clog
->warn() << "osd_object_clean_region_max_num_intervals ("
9896 << cct
->_conf
->osd_object_clean_region_max_num_intervals
9901 // --------------------------------
9903 void OSD::get_latest_osdmap()
9905 dout(10) << __func__
<< " -- start" << dendl
;
9908 service
.objecter
->wait_for_latest_osdmap(&cond
);
9911 dout(10) << __func__
<< " -- finish" << dendl
;
9914 // --------------------------------
9916 void OSD::set_perf_queries(const ConfigPayload
&config_payload
) {
9917 const OSDConfigPayload
&osd_config_payload
= boost::get
<OSDConfigPayload
>(config_payload
);
9918 const std::map
<OSDPerfMetricQuery
, OSDPerfMetricLimits
> &queries
= osd_config_payload
.config
;
9919 dout(10) << "setting " << queries
.size() << " queries" << dendl
;
9921 std::list
<OSDPerfMetricQuery
> supported_queries
;
9922 for (auto &it
: queries
) {
9923 auto &query
= it
.first
;
9924 if (!query
.key_descriptor
.empty()) {
9925 supported_queries
.push_back(query
);
9928 if (supported_queries
.size() < queries
.size()) {
9929 dout(1) << queries
.size() - supported_queries
.size()
9930 << " unsupported queries" << dendl
;
9933 std::lock_guard locker
{m_perf_queries_lock
};
9934 m_perf_queries
= supported_queries
;
9935 m_perf_limits
= queries
;
9937 std::vector
<PGRef
> pgs
;
9939 for (auto& pg
: pgs
) {
9940 std::scoped_lock l
{*pg
};
9941 pg
->set_dynamic_perf_stats_queries(supported_queries
);
9945 MetricPayload
OSD::get_perf_reports() {
9946 OSDMetricPayload payload
;
9947 std::map
<OSDPerfMetricQuery
, OSDPerfMetricReport
> &reports
= payload
.report
;
9949 std::vector
<PGRef
> pgs
;
9951 DynamicPerfStats dps
;
9952 for (auto& pg
: pgs
) {
9953 // m_perf_queries can be modified only in set_perf_queries by mgr client
9954 // request, and it is protected by by mgr client's lock, which is held
9955 // when set_perf_queries/get_perf_reports are called, so we may not hold
9956 // m_perf_queries_lock here.
9957 DynamicPerfStats
pg_dps(m_perf_queries
);
9959 pg
->get_dynamic_perf_stats(&pg_dps
);
9963 dps
.add_to_reports(m_perf_limits
, &reports
);
9964 dout(20) << "reports for " << reports
.size() << " queries" << dendl
;
9969 // =============================================================
9972 #define dout_context cct
9974 #define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
9976 void OSDShard::_attach_pg(OSDShardPGSlot
*slot
, PG
*pg
)
9978 dout(10) << pg
->pg_id
<< " " << pg
<< dendl
;
9980 pg
->osd_shard
= this;
9984 slot
->epoch
= pg
->get_osdmap_epoch();
9985 pg_slots_by_epoch
.insert(*slot
);
9988 void OSDShard::_detach_pg(OSDShardPGSlot
*slot
)
9990 dout(10) << slot
->pg
->pg_id
<< " " << slot
->pg
<< dendl
;
9991 slot
->pg
->osd_shard
= nullptr;
9992 slot
->pg
->pg_slot
= nullptr;
9996 pg_slots_by_epoch
.erase(pg_slots_by_epoch
.iterator_to(*slot
));
9998 if (waiting_for_min_pg_epoch
) {
9999 min_pg_epoch_cond
.notify_all();
10003 void OSDShard::update_pg_epoch(OSDShardPGSlot
*slot
, epoch_t e
)
10005 std::lock_guard
l(shard_lock
);
10006 dout(30) << "min was " << pg_slots_by_epoch
.begin()->epoch
10007 << " on " << pg_slots_by_epoch
.begin()->pg
->pg_id
<< dendl
;
10008 pg_slots_by_epoch
.erase(pg_slots_by_epoch
.iterator_to(*slot
));
10009 dout(20) << slot
->pg
->pg_id
<< " " << slot
->epoch
<< " -> " << e
<< dendl
;
10011 pg_slots_by_epoch
.insert(*slot
);
10012 dout(30) << "min is now " << pg_slots_by_epoch
.begin()->epoch
10013 << " on " << pg_slots_by_epoch
.begin()->pg
->pg_id
<< dendl
;
10014 if (waiting_for_min_pg_epoch
) {
10015 min_pg_epoch_cond
.notify_all();
10019 epoch_t
OSDShard::get_min_pg_epoch()
10021 std::lock_guard
l(shard_lock
);
10022 auto p
= pg_slots_by_epoch
.begin();
10023 if (p
== pg_slots_by_epoch
.end()) {
10029 void OSDShard::wait_min_pg_epoch(epoch_t need
)
10031 std::unique_lock l
{shard_lock
};
10032 ++waiting_for_min_pg_epoch
;
10033 min_pg_epoch_cond
.wait(l
, [need
, this] {
10034 if (pg_slots_by_epoch
.empty()) {
10036 } else if (pg_slots_by_epoch
.begin()->epoch
>= need
) {
10039 dout(10) << need
<< " waiting on "
10040 << pg_slots_by_epoch
.begin()->epoch
<< dendl
;
10044 --waiting_for_min_pg_epoch
;
10047 epoch_t
OSDShard::get_max_waiting_epoch()
10049 std::lock_guard
l(shard_lock
);
10051 for (auto& i
: pg_slots
) {
10052 if (!i
.second
->waiting_peering
.empty()) {
10053 r
= std::max(r
, i
.second
->waiting_peering
.rbegin()->first
);
10059 void OSDShard::consume_map(
10060 const OSDMapRef
& new_osdmap
,
10061 unsigned *pushes_to_free
)
10063 std::lock_guard
l(shard_lock
);
10064 OSDMapRef old_osdmap
;
10066 std::lock_guard
l(osdmap_lock
);
10067 old_osdmap
= std::move(shard_osdmap
);
10068 shard_osdmap
= new_osdmap
;
10070 dout(10) << new_osdmap
->get_epoch()
10071 << " (was " << (old_osdmap
? old_osdmap
->get_epoch() : 0) << ")"
10073 bool queued
= false;
10076 auto p
= pg_slots
.begin();
10077 while (p
!= pg_slots
.end()) {
10078 OSDShardPGSlot
*slot
= p
->second
.get();
10079 const spg_t
& pgid
= p
->first
;
10080 dout(20) << __func__
<< " " << pgid
<< dendl
;
10081 if (!slot
->waiting_for_split
.empty()) {
10082 dout(20) << __func__
<< " " << pgid
10083 << " waiting for split " << slot
->waiting_for_split
<< dendl
;
10087 if (slot
->waiting_for_merge_epoch
> new_osdmap
->get_epoch()) {
10088 dout(20) << __func__
<< " " << pgid
10089 << " waiting for merge by epoch " << slot
->waiting_for_merge_epoch
10094 if (!slot
->waiting_peering
.empty()) {
10095 epoch_t first
= slot
->waiting_peering
.begin()->first
;
10096 if (first
<= new_osdmap
->get_epoch()) {
10097 dout(20) << __func__
<< " " << pgid
10098 << " pending_peering first epoch " << first
10099 << " <= " << new_osdmap
->get_epoch() << ", requeueing" << dendl
;
10100 _wake_pg_slot(pgid
, slot
);
10106 if (!slot
->waiting
.empty()) {
10107 if (new_osdmap
->is_up_acting_osd_shard(pgid
, osd
->get_nodeid())) {
10108 dout(20) << __func__
<< " " << pgid
<< " maps to us, keeping"
10113 while (!slot
->waiting
.empty() &&
10114 slot
->waiting
.front().get_map_epoch() <= new_osdmap
->get_epoch()) {
10115 auto& qi
= slot
->waiting
.front();
10116 dout(20) << __func__
<< " " << pgid
10117 << " waiting item " << qi
10118 << " epoch " << qi
.get_map_epoch()
10119 << " <= " << new_osdmap
->get_epoch()
10121 << (qi
.get_map_epoch() < new_osdmap
->get_epoch() ? "stale" :
10123 << ", dropping" << dendl
;
10124 *pushes_to_free
+= qi
.get_reserved_pushes();
10125 slot
->waiting
.pop_front();
10128 if (slot
->waiting
.empty() &&
10129 slot
->num_running
== 0 &&
10130 slot
->waiting_for_split
.empty() &&
10132 dout(20) << __func__
<< " " << pgid
<< " empty, pruning" << dendl
;
10133 p
= pg_slots
.erase(p
);
10140 std::lock_guard l
{sdata_wait_lock
};
10141 sdata_cond
.notify_one();
10145 void OSDShard::_wake_pg_slot(
10147 OSDShardPGSlot
*slot
)
10149 dout(20) << __func__
<< " " << pgid
10150 << " to_process " << slot
->to_process
10151 << " waiting " << slot
->waiting
10152 << " waiting_peering " << slot
->waiting_peering
<< dendl
;
10153 for (auto i
= slot
->to_process
.rbegin();
10154 i
!= slot
->to_process
.rend();
10156 scheduler
->enqueue_front(std::move(*i
));
10158 slot
->to_process
.clear();
10159 for (auto i
= slot
->waiting
.rbegin();
10160 i
!= slot
->waiting
.rend();
10162 scheduler
->enqueue_front(std::move(*i
));
10164 slot
->waiting
.clear();
10165 for (auto i
= slot
->waiting_peering
.rbegin();
10166 i
!= slot
->waiting_peering
.rend();
10168 // this is overkill; we requeue everything, even if some of these
10169 // items are waiting for maps we don't have yet. FIXME, maybe,
10170 // someday, if we decide this inefficiency matters
10171 for (auto j
= i
->second
.rbegin(); j
!= i
->second
.rend(); ++j
) {
10172 scheduler
->enqueue_front(std::move(*j
));
10175 slot
->waiting_peering
.clear();
10176 ++slot
->requeue_seq
;
10179 void OSDShard::identify_splits_and_merges(
10180 const OSDMapRef
& as_of_osdmap
,
10181 set
<pair
<spg_t
,epoch_t
>> *split_pgs
,
10182 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
10184 std::lock_guard
l(shard_lock
);
10185 if (shard_osdmap
) {
10186 for (auto& i
: pg_slots
) {
10187 const spg_t
& pgid
= i
.first
;
10188 auto *slot
= i
.second
.get();
10190 osd
->service
.identify_splits_and_merges(
10191 shard_osdmap
, as_of_osdmap
, pgid
,
10192 split_pgs
, merge_pgs
);
10193 } else if (!slot
->waiting_for_split
.empty()) {
10194 osd
->service
.identify_splits_and_merges(
10195 shard_osdmap
, as_of_osdmap
, pgid
,
10196 split_pgs
, nullptr);
10198 dout(20) << __func__
<< " slot " << pgid
10199 << " has no pg and waiting_for_split " << dendl
;
10205 void OSDShard::prime_splits(const OSDMapRef
& as_of_osdmap
,
10206 set
<pair
<spg_t
,epoch_t
>> *pgids
)
10208 std::lock_guard
l(shard_lock
);
10209 _prime_splits(pgids
);
10210 if (shard_osdmap
->get_epoch() > as_of_osdmap
->get_epoch()) {
10211 set
<pair
<spg_t
,epoch_t
>> newer_children
;
10212 for (auto i
: *pgids
) {
10213 osd
->service
.identify_splits_and_merges(
10214 as_of_osdmap
, shard_osdmap
, i
.first
,
10215 &newer_children
, nullptr);
10217 newer_children
.insert(pgids
->begin(), pgids
->end());
10218 dout(10) << "as_of_osdmap " << as_of_osdmap
->get_epoch() << " < shard "
10219 << shard_osdmap
->get_epoch() << ", new children " << newer_children
10221 _prime_splits(&newer_children
);
10222 // note: we don't care what is left over here for other shards.
10223 // if this shard is ahead of us and one isn't, e.g., one thread is
10224 // calling into prime_splits via _process (due to a newly created
10225 // pg) and this shard has a newer map due to a racing consume_map,
10226 // then any grandchildren left here will be identified (or were
10227 // identified) when the slower shard's osdmap is advanced.
10228 // _prime_splits() will tolerate the case where the pgid is
10233 void OSDShard::_prime_splits(set
<pair
<spg_t
,epoch_t
>> *pgids
)
10235 dout(10) << *pgids
<< dendl
;
10236 auto p
= pgids
->begin();
10237 while (p
!= pgids
->end()) {
10238 unsigned shard_index
= p
->first
.hash_to_shard(osd
->num_shards
);
10239 if (shard_index
== shard_id
) {
10240 auto r
= pg_slots
.emplace(p
->first
, nullptr);
10242 dout(10) << "priming slot " << p
->first
<< " e" << p
->second
<< dendl
;
10243 r
.first
->second
= make_unique
<OSDShardPGSlot
>();
10244 r
.first
->second
->waiting_for_split
.insert(p
->second
);
10247 ceph_assert(q
!= pg_slots
.end());
10248 dout(10) << "priming (existing) slot " << p
->first
<< " e" << p
->second
10250 q
->second
->waiting_for_split
.insert(p
->second
);
10252 p
= pgids
->erase(p
);
10259 void OSDShard::prime_merges(const OSDMapRef
& as_of_osdmap
,
10260 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
10262 std::lock_guard
l(shard_lock
);
10263 dout(20) << __func__
<< " checking shard " << shard_id
10264 << " for remaining merge pgs " << merge_pgs
<< dendl
;
10265 auto p
= merge_pgs
->begin();
10266 while (p
!= merge_pgs
->end()) {
10267 spg_t pgid
= p
->first
;
10268 epoch_t epoch
= p
->second
;
10269 unsigned shard_index
= pgid
.hash_to_shard(osd
->num_shards
);
10270 if (shard_index
!= shard_id
) {
10274 OSDShardPGSlot
*slot
;
10275 auto r
= pg_slots
.emplace(pgid
, nullptr);
10277 r
.first
->second
= make_unique
<OSDShardPGSlot
>();
10279 slot
= r
.first
->second
.get();
10282 dout(20) << __func__
<< " have merge participant pg " << pgid
10283 << " " << slot
->pg
<< dendl
;
10284 } else if (!slot
->waiting_for_split
.empty() &&
10285 *slot
->waiting_for_split
.begin() < epoch
) {
10286 dout(20) << __func__
<< " pending split on merge participant pg " << pgid
10287 << " " << slot
->waiting_for_split
<< dendl
;
10289 dout(20) << __func__
<< " creating empty merge participant " << pgid
10290 << " for merge in " << epoch
<< dendl
;
10291 // leave history zeroed; PG::merge_from() will fill it in.
10292 pg_history_t history
;
10293 PGCreateInfo
cinfo(pgid
, epoch
- 1,
10294 history
, PastIntervals(), false);
10295 PGRef pg
= osd
->handle_pg_create_info(shard_osdmap
, &cinfo
);
10296 _attach_pg(r
.first
->second
.get(), pg
.get());
10297 _wake_pg_slot(pgid
, slot
);
10300 // mark slot for merge
10301 dout(20) << __func__
<< " marking merge participant " << pgid
<< dendl
;
10302 slot
->waiting_for_merge_epoch
= epoch
;
10303 p
= merge_pgs
->erase(p
);
10307 void OSDShard::register_and_wake_split_child(PG
*pg
)
10311 std::lock_guard
l(shard_lock
);
10312 dout(10) << pg
->pg_id
<< " " << pg
<< dendl
;
10313 auto p
= pg_slots
.find(pg
->pg_id
);
10314 ceph_assert(p
!= pg_slots
.end());
10315 auto *slot
= p
->second
.get();
10316 dout(20) << pg
->pg_id
<< " waiting_for_split " << slot
->waiting_for_split
10318 ceph_assert(!slot
->pg
);
10319 ceph_assert(!slot
->waiting_for_split
.empty());
10320 _attach_pg(slot
, pg
);
10322 epoch
= pg
->get_osdmap_epoch();
10323 ceph_assert(slot
->waiting_for_split
.count(epoch
));
10324 slot
->waiting_for_split
.erase(epoch
);
10325 if (slot
->waiting_for_split
.empty()) {
10326 _wake_pg_slot(pg
->pg_id
, slot
);
10328 dout(10) << __func__
<< " still waiting for split on "
10329 << slot
->waiting_for_split
<< dendl
;
10333 // kick child to ensure it pulls up to the latest osdmap
10334 osd
->enqueue_peering_evt(
10337 std::make_shared
<PGPeeringEvent
>(
10342 std::lock_guard l
{sdata_wait_lock
};
10343 sdata_cond
.notify_one();
10346 void OSDShard::unprime_split_children(spg_t parent
, unsigned old_pg_num
)
10348 std::lock_guard
l(shard_lock
);
10349 vector
<spg_t
> to_delete
;
10350 for (auto& i
: pg_slots
) {
10351 if (i
.first
!= parent
&&
10352 i
.first
.get_ancestor(old_pg_num
) == parent
) {
10353 dout(10) << __func__
<< " parent " << parent
<< " clearing " << i
.first
10355 _wake_pg_slot(i
.first
, i
.second
.get());
10356 to_delete
.push_back(i
.first
);
10359 for (auto pgid
: to_delete
) {
10360 pg_slots
.erase(pgid
);
10364 OSDShard::OSDShard(
10371 shard_name(string("OSDShard.") + stringify(id
)),
10372 sdata_wait_lock_name(shard_name
+ "::sdata_wait_lock"),
10373 sdata_wait_lock
{make_mutex(sdata_wait_lock_name
)},
10374 osdmap_lock
{make_mutex(shard_name
+ "::osdmap_lock")},
10375 shard_lock_name(shard_name
+ "::shard_lock"),
10376 shard_lock
{make_mutex(shard_lock_name
)},
10377 scheduler(ceph::osd::scheduler::make_scheduler(cct
)),
10378 context_queue(sdata_wait_lock
, sdata_cond
)
10380 dout(0) << "using op scheduler " << *scheduler
<< dendl
;
10384 // =============================================================
10386 #undef dout_context
10387 #define dout_context osd->cct
10389 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10391 void OSD::ShardedOpWQ::_add_slot_waiter(
10393 OSDShardPGSlot
*slot
,
10394 OpSchedulerItem
&& qi
)
10396 if (qi
.is_peering()) {
10397 dout(20) << __func__
<< " " << pgid
10398 << " peering, item epoch is "
10399 << qi
.get_map_epoch()
10400 << ", will wait on " << qi
<< dendl
;
10401 slot
->waiting_peering
[qi
.get_map_epoch()].push_back(std::move(qi
));
10403 dout(20) << __func__
<< " " << pgid
10404 << " item epoch is "
10405 << qi
.get_map_epoch()
10406 << ", will wait on " << qi
<< dendl
;
10407 slot
->waiting
.push_back(std::move(qi
));
10412 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
/**
 * Worker-thread entry point for the sharded op queue.
 *
 * Maps this thread onto its shard (thread_index % num_shards), blocks on the
 * shard's condvar while both the scheduler and (for the designated thread)
 * the context queue are empty, then dequeues one OpSchedulerItem, attaches it
 * to its per-pg slot, and runs it — handling the races with pg removal,
 * split, and slot requeue along the way.
 *
 * @param thread_index  index of this worker thread within the pool
 * @param hb            heartbeat handle used to clear/re-arm wq timeouts
 *
 * NOTE(review): this body was reconstructed from a whitespace-mangled
 * extraction; dropped structural lines (braces, returns, the pg->lock()
 * between the two maybe_inject_dispatch_delay() calls, #ifdef WITH_LTTNG
 * guards) were restored to match the visible control flow — verify against
 * upstream before relying on the restored lines.
 */
void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
{
  uint32_t shard_index = thread_index % osd->num_shards;
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);

  // If all threads of shards do oncommits, there is a out-of-order
  // problem.  So we choose the thread which has the smallest
  // thread_index(thread_index < num_shards) of shard to do oncommit
  bool is_smallest_thread_index = thread_index < osd->num_shards;

  sdata->shard_lock.lock();
  if (sdata->scheduler->empty() &&
      (!is_smallest_thread_index || sdata->context_queue.empty())) {
    std::unique_lock wait_lock{sdata->sdata_wait_lock};
    if (is_smallest_thread_index && !sdata->context_queue.empty()) {
      // we raced with a context_queue addition, don't wait
      wait_lock.unlock();
    } else if (!sdata->stop_waiting) {
      dout(20) << __func__ << " empty q, waiting" << dendl;
      // suspend the heartbeat timeout while we block on the condvar
      osd->cct->get_heartbeat_map()->clear_timeout(hb);
      sdata->shard_lock.unlock();
      sdata->sdata_cond.wait(wait_lock);
      wait_lock.unlock();
      sdata->shard_lock.lock();
      // woke up with still nothing to do? bail out
      if (sdata->scheduler->empty() &&
	  !(is_smallest_thread_index && !sdata->context_queue.empty())) {
	sdata->shard_lock.unlock();
	return;
      }
      // found a work item; reapply default wq timeouts
      osd->cct->get_heartbeat_map()->reset_timeout(hb,
	timeout_interval, suicide_interval);
    } else {
      // stop_waiting is set (e.g. draining); do not block
      dout(20) << __func__ << " need return immediately" << dendl;
      wait_lock.unlock();
      sdata->shard_lock.unlock();
      return;
    }
  }

  list<Context *> oncommits;
  if (is_smallest_thread_index) {
    // only the designated thread drains the context queue (see note above)
    sdata->context_queue.move_to(oncommits);
  }

  if (sdata->scheduler->empty()) {
    if (osd->is_stopping()) {
      sdata->shard_lock.unlock();
      for (auto c : oncommits) {
	dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
	delete c;
      }
      return;    // OSD shutdown, discard.
    }
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
    return;
  }

  OpSchedulerItem item = sdata->scheduler->dequeue();
  if (osd->is_stopping()) {
    sdata->shard_lock.unlock();
    for (auto c : oncommits) {
      dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
      delete c;
    }
    return;    // OSD shutdown, discard.
  }

  // find (or create) the ordering slot for this item's pg
  const auto token = item.get_ordering_token();
  auto r = sdata->pg_slots.emplace(token, nullptr);
  if (r.second) {
    r.first->second = make_unique<OSDShardPGSlot>();
  }
  OSDShardPGSlot *slot = r.first->second.get();
  dout(20) << __func__ << " " << token
	   << (r.second ? " (new)" : "")
	   << " to_process " << slot->to_process
	   << " waiting " << slot->waiting
	   << " waiting_peering " << slot->waiting_peering
	   << dendl;
  slot->to_process.push_back(std::move(item));
  dout(20) << __func__ << " " << slot->to_process.back()
	   << " queued" << dendl;

  PGRef pg = slot->pg;

  // lock pg (if we have it)
  if (pg) {
    // note the requeue seq now...
    uint64_t requeue_seq = slot->requeue_seq;
    ++slot->num_running;

    // we must drop shard_lock to take the pg lock (lock ordering);
    // everything about the slot is revalidated below after relocking
    sdata->shard_lock.unlock();
    osd->service.maybe_inject_dispatch_delay();
    pg->lock();
    osd->service.maybe_inject_dispatch_delay();
    sdata->shard_lock.lock();

    auto q = sdata->pg_slots.find(token);
    if (q == sdata->pg_slots.end()) {
      // this can happen if we race with pg removal.
      dout(20) << __func__ << " slot " << token << " no longer there" << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    slot = q->second.get();
    --slot->num_running;

    if (slot->to_process.empty()) {
      // raced with _wake_pg_slot or consume_map
      dout(20) << __func__ << " " << token
	       << " nothing queued" << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    if (requeue_seq != slot->requeue_seq) {
      dout(20) << __func__ << " " << token
	       << " requeue_seq " << slot->requeue_seq << " > our "
	       << requeue_seq << ", we raced with _wake_pg_slot"
	       << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    if (slot->pg != pg) {
      // this can happen if we race with pg removal.
      dout(20) << __func__ << " slot " << token << " no longer attached to "
	       << pg << dendl;
      pg->unlock();
      // NOTE(review): restored line — pick up the slot's current pg (normally
      // null after a detach); confirm against upstream
      pg = slot->pg;
    }
  }

  dout(20) << __func__ << " " << token
	   << " to_process " << slot->to_process
	   << " waiting " << slot->waiting
	   << " waiting_peering " << slot->waiting_peering << dendl;

  ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
				 suicide_interval);

  // take next item
  auto qi = std::move(slot->to_process.front());
  slot->to_process.pop_front();
  dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
  set<pair<spg_t,epoch_t>> new_children;
  OSDMapRef osdmap;

  // no pg attached: decide whether to run pg-less, create the pg, park the
  // item on the slot, or drop it.  a successful create break;s out below.
  while (!pg) {
    // should this pg shard exist on this osd in this (or a later) epoch?
    osdmap = sdata->shard_osdmap;
    const PGCreateInfo *create_info = qi.creates_pg();
    if (!slot->waiting_for_split.empty()) {
      dout(20) << __func__ << " " << token
	       << " splitting " << slot->waiting_for_split << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
      dout(20) << __func__ << " " << token
	       << " map " << qi.get_map_epoch() << " > "
	       << osdmap->get_epoch() << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else if (qi.is_peering()) {
      if (!qi.peering_requires_pg()) {
	// for pg-less events, we run them under the ordering lock, since
	// we don't have the pg lock to keep them ordered.
	qi.run(osd, sdata, pg, tp_handle);
      } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
	if (create_info) {
	  if (create_info->by_mon &&
	      osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
	    dout(20) << __func__ << " " << token
		     << " no pg, no longer primary, ignoring mon create on "
		     << qi << dendl;
	  } else {
	    dout(20) << __func__ << " " << token
		     << " no pg, should create on " << qi << dendl;
	    pg = osd->handle_pg_create_info(osdmap, create_info);
	    if (pg) {
	      // we created the pg! drop out and continue "normally"!
	      sdata->_attach_pg(slot, pg.get());
	      sdata->_wake_pg_slot(token, slot);

	      // identify split children between create epoch and shard epoch.
	      osd->service.identify_splits_and_merges(
		pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
	      sdata->_prime_splits(&new_children);
	      // distribute remaining split children to other shards below!
	      break;
	    }
	    dout(20) << __func__ << " ignored create on " << qi << dendl;
	  }
	} else {
	  dout(20) << __func__ << " " << token
		   << " no pg, peering, !create, discarding " << qi << dendl;
	}
      } else {
	dout(20) << __func__ << " " << token
		 << " no pg, peering, doesn't map here e" << osdmap->get_epoch()
		 << ", discarding " << qi
		 << dendl;
      }
    } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
      dout(20) << __func__ << " " << token
	       << " no pg, should exist e" << osdmap->get_epoch()
	       << ", will wait on " << qi << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else {
      dout(20) << __func__ << " " << token
	       << " no pg, shouldn't exist e" << osdmap->get_epoch()
	       << ", dropping " << qi << dendl;
      // share map with client?
      if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
	osd->service.maybe_share_map((*_op)->get_req()->get_connection().get(),
				     sdata->shard_osdmap,
				     (*_op)->sent_epoch);
      }
      // give back any recovery push reservations the dropped item held
      unsigned pushes_to_free = qi.get_reserved_pushes();
      if (pushes_to_free > 0) {
	sdata->shard_lock.unlock();
	osd->service.release_reserved_pushes(pushes_to_free);
	handle_oncommits(oncommits);
	return;
      }
    }
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
    return;
  }

  // we have a (locked) pg; peering events newer than our shard map still
  // have to wait for the map to catch up
  if (qi.is_peering()) {
    OSDMapRef osdmap = sdata->shard_osdmap;
    if (qi.get_map_epoch() > osdmap->get_epoch()) {
      _add_slot_waiter(token, slot, std::move(qi));
      sdata->shard_lock.unlock();
      pg->unlock();
      handle_oncommits(oncommits);
      return;
    }
  }
  sdata->shard_lock.unlock();

  if (!new_children.empty()) {
    // push split children discovered during pg creation out to the shards
    // that own them
    for (auto shard : osd->shards) {
      shard->prime_splits(osdmap, &new_children);
    }
    ceph_assert(new_children.empty());
  }

  // osd_opwq_process marks the point at which an operation has been dequeued
  // and will begin to be handled by a worker thread.
#ifdef WITH_LTTNG
  osd_reqid_t reqid;
  if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
    reqid = (*_op)->get_reqid();
  }
#endif
  tracepoint(osd, opwq_process_start, reqid.name._type,
	     reqid.name._num, reqid.tid, reqid.inc);

  lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
  Formatter *f = Formatter::create("json");
  f->open_object_section("q");
  dump(f);
  f->close_section();
  f->flush(*_dout);
  delete f;
  *_dout << dendl;

  // run the item; presumably the pg lock is released inside the handler —
  // there is no pg->unlock() here (TODO confirm against upstream)
  qi.run(osd, sdata, pg, tp_handle);

#ifdef WITH_LTTNG
  if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
    reqid = (*_op)->get_reqid();
  }
#endif
  tracepoint(osd, opwq_process_finish, reqid.name._type,
	     reqid.name._num, reqid.tid, reqid.inc);

  handle_oncommits(oncommits);
}
10709 void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem
&& item
) {
10710 uint32_t shard_index
=
10711 item
.get_ordering_token().hash_to_shard(osd
->shards
.size());
10713 dout(20) << __func__
<< " " << item
<< dendl
;
10715 OSDShard
* sdata
= osd
->shards
[shard_index
];
10716 assert (NULL
!= sdata
);
10720 std::lock_guard l
{sdata
->shard_lock
};
10721 empty
= sdata
->scheduler
->empty();
10722 sdata
->scheduler
->enqueue(std::move(item
));
10726 std::lock_guard l
{sdata
->sdata_wait_lock
};
10727 sdata
->sdata_cond
.notify_all();
/**
 * Requeue an item at the FRONT of its shard's scheduler queue, preserving
 * per-pg ordering with respect to items that _process() has already moved
 * onto the slot's to_process list.
 *
 * @param item  the (older, requeued) scheduler item
 *
 * NOTE(review): reconstructed from a whitespace-mangled extraction; the
 * dropped "} else {" and closing braces were restored — verify upstream.
 */
void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
{
  auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);
  sdata->shard_lock.lock();
  auto p = sdata->pg_slots.find(item.get_ordering_token());
  if (p != sdata->pg_slots.end() &&
      !p->second->to_process.empty()) {
    // we may be racing with _process, which has dequeued a new item
    // from scheduler, put it on to_process, and is now busy taking the
    // pg lock. ensure this old requeued item is ordered before any
    // such newer item in to_process.
    //
    // i.e. the old item takes the slot's front position, and the newest
    // item on to_process (its back) is pushed back into the scheduler
    // front instead.
    p->second->to_process.push_front(std::move(item));
    item = std::move(p->second->to_process.back());
    p->second->to_process.pop_back();
    dout(20) << __func__
	     << " " << p->second->to_process.front()
	     << " shuffled w/ " << item << dendl;
  } else {
    dout(20) << __func__ << " " << item << dendl;
  }
  sdata->scheduler->enqueue_front(std::move(item));
  sdata->shard_lock.unlock();
  // wake one worker to pick the item up
  std::lock_guard l{sdata->sdata_wait_lock};
  sdata->sdata_cond.notify_one();
}
10760 namespace osd_cmds
{
/**
 * "heap" admin command: drive the tcmalloc heap profiler.
 *
 * @param cct     ceph context (unused by the visible body)
 * @param cmdmap  parsed command map; reads "heapcmd" and optional "value"
 * @param f       formatter (unused by the visible body)
 * @param os      stream receiving the profiler's (or an error) message
 * @return 0 on success, -EOPNOTSUPP when not built with tcmalloc,
 *         -EINVAL when "heapcmd" is missing
 *
 * NOTE(review): reconstructed from a whitespace-mangled extraction; the
 * dropped declarations/returns (std::string cmd/val, -EINVAL, return 0)
 * and the trailing `std::ostream& os` parameter were restored — verify
 * against upstream.
 */
int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f,
	 std::ostream& os)
{
  if (!ceph_using_tcmalloc()) {
    os << "could not issue heap profiler command -- not using tcmalloc!";
    return -EOPNOTSUPP;
  }

  std::string cmd;
  if (!cmd_getval(cmdmap, "heapcmd", cmd)) {
    // NOTE(review): cmd is still empty here when cmd_getval fails, so the
    // quoted command name in this message is blank — confirm intent
    os << "unable to get value for command \"" << cmd << "\"";
    return -EINVAL;
  }

  std::vector<std::string> cmd_vec;
  get_str_vec(cmd, cmd_vec);

  // optional argument for commands that take a value
  std::string val;
  if (cmd_getval(cmdmap, "value", val)) {
    cmd_vec.push_back(val);
  }

  ceph_heap_profiler_handle_command(cmd_vec, os);

  return 0;
}
10789 }} // namespace ceph::osd_cmds