1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
27 #include <boost/scoped_ptr.hpp>
28 #include <boost/range/adaptor/reversed.hpp>
30 #ifdef HAVE_SYS_PARAM_H
31 #include <sys/param.h>
34 #ifdef HAVE_SYS_MOUNT_H
35 #include <sys/mount.h>
40 #include "include/types.h"
41 #include "include/compat.h"
42 #include "include/random.h"
47 #include "osdc/Objecter.h"
49 #include "common/errno.h"
50 #include "common/ceph_argparse.h"
51 #include "common/ceph_time.h"
52 #include "common/version.h"
53 #include "common/pick_address.h"
54 #include "common/blkdev.h"
55 #include "common/numa.h"
57 #include "os/ObjectStore.h"
59 #include "os/FuseStore.h"
62 #include "PrimaryLogPG.h"
64 #include "msg/Messenger.h"
65 #include "msg/Message.h"
67 #include "mon/MonClient.h"
69 #include "messages/MLog.h"
71 #include "messages/MGenericMessage.h"
72 #include "messages/MOSDPing.h"
73 #include "messages/MOSDFailure.h"
74 #include "messages/MOSDMarkMeDown.h"
75 #include "messages/MOSDFull.h"
76 #include "messages/MOSDOp.h"
77 #include "messages/MOSDOpReply.h"
78 #include "messages/MOSDBackoff.h"
79 #include "messages/MOSDBeacon.h"
80 #include "messages/MOSDRepOp.h"
81 #include "messages/MOSDRepOpReply.h"
82 #include "messages/MOSDBoot.h"
83 #include "messages/MOSDPGTemp.h"
84 #include "messages/MOSDPGReadyToMerge.h"
86 #include "messages/MOSDMap.h"
87 #include "messages/MMonGetOSDMap.h"
88 #include "messages/MOSDPGNotify.h"
89 #include "messages/MOSDPGQuery.h"
90 #include "messages/MOSDPGLog.h"
91 #include "messages/MOSDPGRemove.h"
92 #include "messages/MOSDPGInfo.h"
93 #include "messages/MOSDPGCreate.h"
94 #include "messages/MOSDPGCreate2.h"
95 #include "messages/MOSDPGTrim.h"
96 #include "messages/MOSDPGScan.h"
97 #include "messages/MBackfillReserve.h"
98 #include "messages/MRecoveryReserve.h"
99 #include "messages/MOSDForceRecovery.h"
100 #include "messages/MOSDECSubOpWrite.h"
101 #include "messages/MOSDECSubOpWriteReply.h"
102 #include "messages/MOSDECSubOpRead.h"
103 #include "messages/MOSDECSubOpReadReply.h"
104 #include "messages/MOSDPGCreated.h"
105 #include "messages/MOSDPGUpdateLogMissing.h"
106 #include "messages/MOSDPGUpdateLogMissingReply.h"
108 #include "messages/MOSDPeeringOp.h"
110 #include "messages/MOSDAlive.h"
112 #include "messages/MOSDScrub.h"
113 #include "messages/MOSDScrub2.h"
114 #include "messages/MOSDRepScrub.h"
116 #include "messages/MMonCommand.h"
117 #include "messages/MCommand.h"
118 #include "messages/MCommandReply.h"
120 #include "messages/MPGStats.h"
121 #include "messages/MPGStatsAck.h"
123 #include "messages/MWatchNotify.h"
124 #include "messages/MOSDPGPush.h"
125 #include "messages/MOSDPGPushReply.h"
126 #include "messages/MOSDPGPull.h"
128 #include "common/perf_counters.h"
129 #include "common/Timer.h"
130 #include "common/LogClient.h"
131 #include "common/AsyncReserver.h"
132 #include "common/HeartbeatMap.h"
133 #include "common/admin_socket.h"
134 #include "common/ceph_context.h"
136 #include "global/signal_handler.h"
137 #include "global/pidfile.h"
139 #include "include/color.h"
140 #include "perfglue/cpu_profiler.h"
141 #include "perfglue/heap_profiler.h"
143 #include "osd/OpRequest.h"
145 #include "auth/AuthAuthorizeHandler.h"
146 #include "auth/RotatingKeyRing.h"
148 #include "objclass/objclass.h"
150 #include "common/cmdparse.h"
151 #include "include/str_list.h"
152 #include "include/util.h"
154 #include "include/ceph_assert.h"
155 #include "common/config.h"
156 #include "common/EventTrace.h"
158 #include "json_spirit/json_spirit_reader.h"
159 #include "json_spirit/json_spirit_writer.h"
162 #define TRACEPOINT_DEFINE
163 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
164 #include "tracing/osd.h"
165 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
166 #undef TRACEPOINT_DEFINE
168 #define tracepoint(...)
171 #define dout_context cct
172 #define dout_subsys ceph_subsys_osd
174 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
177 static ostream
& _prefix(std::ostream
* _dout
, int whoami
, epoch_t epoch
) {
178 return *_dout
<< "osd." << whoami
<< " " << epoch
<< " ";
//Initial features in new superblock.
//Features here are also automatically upgraded
CompatSet OSD::get_osd_initial_compat_set() {
  // compat/ro_compat stay empty: a fresh OSD has no optional features.
  CompatSet::FeatureSet ceph_osd_feature_compat;
  CompatSet::FeatureSet ceph_osd_feature_ro_compat;
  CompatSet::FeatureSet ceph_osd_feature_incompat;
  // Every on-disk format capability a newly created superblock is
  // stamped with; an older OSD lacking any of these cannot open it.
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
  return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
		   ceph_osd_feature_incompat);
}
//Features are added here that this OSD supports.
CompatSet OSD::get_osd_compat_set() {
  CompatSet compat = get_osd_initial_compat_set();
  //Any features here can be set in code, but not in initial superblock
  compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
  // NOTE(review): trailing return restored from damaged source.
  return compat;
}
// Construct the shared service object that PGs and worker threads use to
// reach OSD-wide state (messengers, throttles, timers, caches, Objecter).
// Everything here simply caches pointers/config from the owning OSD or
// default-initializes counters; no I/O happens until init().
// NOTE(review): the source for this initializer list was damaged; members
// marked below were restored from context — verify against upstream.
OSDService::OSDService(OSD *osd) :
  osd(osd),                       // NOTE(review): restored — verify
  cct(osd->cct),                  // NOTE(review): restored — verify
  whoami(osd->whoami), store(osd->store),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  logger(osd->logger),            // NOTE(review): restored — verify
  recoverystate_perf(osd->recoverystate_perf),
  monc(osd->monc),                // NOTE(review): restored — verify
  class_handler(osd->class_handler),
  osd_max_object_size(cct->_conf, "osd_max_object_size"),
  osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
  publish_lock{ceph::make_mutex("OSDService::publish_lock")},
  pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
  max_oldest_map(0),              // NOTE(review): restored — verify
  peer_map_epoch_lock("OSDService::peer_map_epoch_lock"),
  sched_scrub_lock("OSDService::sched_scrub_lock"),
  scrubs_pending(0),              // NOTE(review): restored — verify
  scrubs_active(0),               // NOTE(review): restored — verify
  agent_lock("OSDService::agent_lock"),
  agent_valid_iterator(false),
  agent_ops(0),                   // NOTE(review): restored — verify
  flush_mode_high_count(0),
  agent_active(true),             // NOTE(review): restored — verify
  agent_thread(this),             // NOTE(review): restored — verify
  agent_stop_flag(false),
  agent_timer_lock("OSDService::agent_timer_lock"),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  // Objecter handles outbound client-style ops (e.g. copy-from); it gets
  // its own messenger so OSD traffic and client traffic stay separate.
  objecter(new Objecter(osd->client_messenger->cct, osd->objecter_messenger,
			osd->monc, NULL, 0, 0)),
  m_objecter_finishers(cct->_conf->osd_objecter_finishers),
  watch_lock("OSDService::watch_lock"),
  watch_timer(osd->client_messenger->cct, watch_lock),
  next_notif_id(0),               // NOTE(review): restored — verify
  recovery_request_lock("OSDService::recovery_request_lock"),
  recovery_request_timer(cct, recovery_request_lock, false),
  sleep_lock("OSDService::sleep_lock"),
  sleep_timer(cct, sleep_lock, false),
  reserver_finisher(cct),
  // local/remote backfill reservations share the same limits.
  local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		 cct->_conf->osd_min_recovery_priority),
  remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		  cct->_conf->osd_min_recovery_priority),
  pg_temp_lock("OSDService::pg_temp_lock"),
  snap_reserver(cct, &reserver_finisher,
		cct->_conf->osd_max_trimming_pgs),
  recovery_lock("OSDService::recovery_lock"),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  map_cache_lock("OSDService::map_cache_lock"),
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  stat_lock("OSDService::stat_lock"),
  full_status_lock("OSDService::full_status_lock"),
  cur_state(NONE),                // NOTE(review): restored — verify
  cur_ratio(0), physical_ratio(0),
  epoch_lock("OSDService::epoch_lock"),
  boot_epoch(0), up_epoch(0), bind_epoch(0),
  is_stopping_lock("OSDService::is_stopping_lock")
#ifdef PG_DEBUG_REFS
  , pgid_lock("OSDService::pgid_lock")
#endif
{
  // Spin up the configured number of Objecter completion finishers;
  // they are started later in init().
  for (int i = 0; i < m_objecter_finishers; i++) {
    ostringstream str;
    str << "objecter-finisher-" << i;
    Finisher *fin = new Finisher(osd->client_messenger->cct, str.str(), "finisher");
    objecter_finishers.push_back(fin);
  }
}
// Tear down heap-allocated helpers created in the constructor.
// NOTE(review): body restored from damaged source — verify against upstream.
OSDService::~OSDService()
{
  delete objecter;  // NOTE(review): restored — verify
  for (auto f : objecter_finishers) {
    delete f;       // NOTE(review): restored — verify
  }
}
// (PG_DEBUG_REFS) Record one more live reference to pgid; first reference
// also remembers the PG object so dump_live_pgids() can query it.
void OSDService::add_pgid(spg_t pgid, PG *pg) {
  std::lock_guard l(pgid_lock);
  if (!pgid_tracker.count(pgid)) {
    live_pgs[pgid] = pg;  // NOTE(review): restored — remove_pgid erases this entry
  }
  pgid_tracker[pgid]++;
}
313 void OSDService::remove_pgid(spg_t pgid
, PG
*pg
)
315 std::lock_guard
l(pgid_lock
);
316 ceph_assert(pgid_tracker
.count(pgid
));
317 ceph_assert(pgid_tracker
[pgid
] > 0);
318 pgid_tracker
[pgid
]--;
319 if (pgid_tracker
[pgid
] == 0) {
320 pgid_tracker
.erase(pgid
);
321 live_pgs
.erase(pgid
);
324 void OSDService::dump_live_pgids()
326 std::lock_guard
l(pgid_lock
);
327 derr
<< "live pgids:" << dendl
;
328 for (map
<spg_t
, int>::const_iterator i
= pgid_tracker
.cbegin();
329 i
!= pgid_tracker
.cend();
331 derr
<< "\t" << *i
<< dendl
;
332 live_pgs
[i
->first
]->dump_live_ids();
// Walk the recorded pg_num history of pgid's pool between old_map and
// new_map, and report every PG split child (into *split_children) and every
// merge source/target (into *merge_pgs, if non-null) that occurs in that
// window.  Works over a queue so newly discovered parents/children are
// themselves re-scanned.
// NOTE(review): signature and several control-flow lines restored from
// damaged source — verify against upstream.
void OSDService::identify_splits_and_merges(
  OSDMapRef old_map,                           // NOTE(review): restored — verify
  OSDMapRef new_map,                           // NOTE(review): restored — verify
  spg_t pgid,                                  // NOTE(review): restored — verify
  set<pair<spg_t,epoch_t>> *split_children,
  set<pair<spg_t,epoch_t>> *merge_pgs)
{
  if (!old_map->have_pg_pool(pgid.pool())) {
    return;
  }
  int old_pgnum = old_map->get_pg_num(pgid.pool());
  auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
  if (p == osd->pg_num_history.pg_nums.end()) {
    return;  // no recorded pg_num changes for this pool
  }
  dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
	   << " to e" << new_map->get_epoch()
	   << " pg_nums " << p->second << dendl;
  deque<spg_t> queue;   // NOTE(review): restored — verify
  queue.push_back(pgid);
  set<spg_t> did;       // NOTE(review): restored — guards re-queueing below
  while (!queue.empty()) {
    auto cur = queue.front();
    queue.pop_front();
    did.insert(cur);
    unsigned pgnum = old_pgnum;
    // Replay each pg_num change in [old epoch, new epoch].
    for (auto q = p->second.lower_bound(old_map->get_epoch());
	 q != p->second.end() &&
	   q->first <= new_map->get_epoch();
	 ++q) {
      if (pgnum < q->second) {
	// pg_num grew: split?
	if (cur.ps() < pgnum) {
	  set<spg_t> children;
	  if (cur.is_split(pgnum, q->second, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " children " << children << dendl;
	    for (auto i : children) {
	      split_children->insert(make_pair(i, q->first));
	      if (!did.count(i))
		queue.push_back(i);
	    }
	  }
	} else if (cur.ps() < q->second) {
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is a child" << dendl;
	  // normally we'd capture this from the parent, but it's
	  // possible the parent doesn't exist yet (it will be
	  // fabricated to allow an intervening merge). note this PG
	  // as a split child here to be sure we catch it.
	  split_children->insert(make_pair(cur, q->first));
	} else {
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is post-split, skipping" << dendl;
	}
      } else if (merge_pgs) {
	// pg_num shrank: merge?
	if (cur.ps() >= q->second) {
	  if (cur.ps() < pgnum) {
	    spg_t parent;   // NOTE(review): restored — verify
	    if (cur.is_merge_source(pgnum, q->second, &parent)) {
	      set<spg_t> children;   // NOTE(review): restored — verify
	      parent.is_split(q->second, pgnum, &children);
	      dout(20) << __func__ << " " << cur << " e" << q->first
		       << " pg_num " << pgnum << " -> " << q->second
		       << " is merge source, target " << parent
		       << ", source(s) " << children << dendl;
	      merge_pgs->insert(make_pair(parent, q->first));
	      if (!did.count(parent)) {
		// queue (and re-scan) parent in case it might not exist yet
		// and there are some future splits pending on it
		queue.push_back(parent);
	      }
	      for (auto c : children) {
		merge_pgs->insert(make_pair(c, q->first));
		if (!did.count(c))
		  queue.push_back(c);
	      }
	    }
	  } else {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " is beyond old pgnum, skipping" << dendl;
	  }
	} else {
	  // possibly a merge target: its sources are the split children
	  // of cur under the reversed pg_num change.
	  set<spg_t> children;   // NOTE(review): restored — verify
	  if (cur.is_split(q->second, pgnum, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " is merge target, source " << children << dendl;
	    for (auto c : children) {
	      merge_pgs->insert(make_pair(c, q->first));
	      if (!did.count(c))
		queue.push_back(c);
	    }
	    merge_pgs->insert(make_pair(cur, q->first));
	  }
	}
      }
      pgnum = q->second;   // NOTE(review): restored — carries pg_num forward
    }
  }
}
// Forward heartbeat-peer refresh requests to the owning OSD.
void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}
// First phase of shutdown: stop the timers that can schedule new work.
// Each timer is shut down under its own lock.
void OSDService::start_shutdown()
{
  {
    std::lock_guard l(agent_timer_lock);
    agent_timer.shutdown();
  }
  {
    std::lock_guard l(sleep_lock);
    sleep_timer.shutdown();
  }
  {
    std::lock_guard l(recovery_request_lock);
    recovery_request_timer.shutdown();
  }
}
// Drain and stop the finisher backing the backfill/recovery reservers.
void OSDService::shutdown_reserver()
{
  reserver_finisher.wait_for_empty();
  reserver_finisher.stop();
}
// Final shutdown: stop watch/notify timer, the Objecter and its finishers,
// then drop the published OSDMap references.
void OSDService::shutdown()
{
  {
    std::lock_guard l(watch_lock);
    watch_timer.shutdown();
  }
  objecter->shutdown();
  for (auto f : objecter_finishers) {
    // NOTE(review): loop body restored from damaged source — verify.
    f->wait_for_empty();
    f->stop();
  }
  // Release our references to the current/next maps.
  publish_map(OSDMapRef());
  next_osdmap = OSDMapRef();
}
// Start the service threads/finishers; counterpart of shutdown().
void OSDService::init()
{
  reserver_finisher.start();
  for (auto f : objecter_finishers) {
    f->start();  // NOTE(review): restored from damaged source — verify
  }
  objecter->set_client_incarnation(0);

  // deprioritize objecter in daemonperf output
  objecter->get_logger()->set_prio_adjust(-3);

  watch_timer.init();  // NOTE(review): restored — verify
  agent_timer.init();  // NOTE(review): restored — verify

  agent_thread.create("osd_srv_agent");

  if (cct->_conf->osd_recovery_delay_start)
    defer_recovery(cct->_conf->osd_recovery_delay_start);
}
// Called once the initial OSDMap is available: start the Objecter on it.
void OSDService::final_init()
{
  objecter->start(osdmap.get());
}
// React to a newly activated OSDMap.
// NOTE(review): most of this body was lost in the damaged source; the
// agent wake/unwake logic below is restored from context — verify.
void OSDService::activate_map()
{
  // wake/unwake the tiering agent
  agent_lock.Lock();                                  // NOTE(review): restored
  agent_active =
    !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
    osd->is_active();                                 // NOTE(review): restored
  agent_cond.Signal();                                // NOTE(review): restored
  agent_lock.Unlock();                                // NOTE(review): restored
}
// Ask the OSD to subscribe for OSDMaps starting at epoch e (non-continuous).
void OSDService::request_osdmap_update(epoch_t e)
{
  osd->osdmap_subscribe(e, false);
}
// Timer callback: re-evaluate a PG's tiering agent mode after the
// osd_agent_delay_time backoff (see agent_entry below).
class AgentTimeoutCB : public Context {
  PGRef pg;  // NOTE(review): member restored from damaged source — verify
public:
  explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
  void finish(int) override {
    pg->agent_choose_mode_restart();
  }
};
// Main loop of the cache-tiering agent thread.  Repeatedly picks the
// highest-priority tier in agent_queue and asks one of its PGs to do
// flush/evict work, respecting the osd_agent_max(_low)_ops quotas.
// Sleeps on agent_cond when there is nothing to do; a PG that reports no
// work is retried after osd_agent_delay_time via AgentTimeoutCB.
// NOTE(review): agent_lock Lock/Unlock choreography restored from damaged
// source — verify; the lock is dropped around pg->agent_work().
void OSDService::agent_entry()
{
  dout(10) << __func__ << " start" << dendl;
  agent_lock.Lock();                       // NOTE(review): restored

  while (!agent_stop_flag) {
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      agent_cond.Wait(agent_lock);
      continue;                            // NOTE(review): restored
    }
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
    dout(10) << __func__
	     << " tiers " << agent_queue.size()
	     << ", top is " << level
	     << " with pgs " << top.size()
	     << ", ops " << agent_ops << "/"
	     << cct->_conf->osd_agent_max_ops
	     << (agent_active ? " active" : " NOT ACTIVE")
	     << dendl;
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.Wait(agent_lock);
      continue;                            // NOTE(review): restored
    }

    // Round-robin across the PGs of the top tier via agent_queue_pos.
    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    }
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
	     << " agent_ops " << agent_ops
	     << " flush_quota " << agent_flush_quota << dendl;
    agent_lock.Unlock();                   // NOTE(review): restored
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->pg_id
	<< " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
	<< " seconds" << dendl;

      osd->logger->inc(l_osd_tier_delay);
      // Queue a timer to call agent_choose_mode for this pg in 5 seconds
      agent_timer_lock.Lock();
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
      agent_timer_lock.Unlock();
    }
    agent_lock.Lock();                     // NOTE(review): restored
  }
  agent_lock.Unlock();                     // NOTE(review): restored
  dout(10) << __func__ << " finish" << dendl;
}
// Stop the tiering agent thread.  All agent ops must already be cancelled
// and all PGs dequeued; otherwise this aborts.
void OSDService::agent_stop()
{
  {
    std::lock_guard l(agent_lock);

    // By this time all ops should be cancelled
    ceph_assert(agent_ops == 0);
    // By this time all PGs are shutdown and dequeued
    if (!agent_queue.empty()) {
      set<PGRef>& top = agent_queue.rbegin()->second;
      derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl;
      ceph_abort_msg("agent queue not empty");
    }

    agent_stop_flag = true;
    agent_cond.Signal();   // NOTE(review): restored from damaged source — verify
  }
  agent_thread.join();     // NOTE(review): restored from damaged source — verify
}
620 // -------------------------------------
// Periodically recompute promote_probability_millis (probability, in
// 1/1000ths, of promoting an object into the cache tier) so that the
// observed promote rate converges on the configured object/sec and
// bytes/sec targets.  Also sets per-tick hard caps on promotions.
// NOTE(review): a few declarations/branches restored from damaged source —
// verify against upstream.
void OSDService::promote_throttle_recalibrate()
{
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;

  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;

  unsigned min_prob = 1;

  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
	   << obj << " objects and " << byte_u_t(bytes) << "; target "
	   << target_obj_sec << " obj/sec or "
	   << byte_u_t(target_bytes_sec) << "/sec"
	   << dendl;

  // calculate what the probability *should* be, given the targets
  unsigned new_prob;                       // NOTE(review): restored — verify
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    if (obj)                               // NOTE(review): restored — verify
      avg_size = std::max<uint64_t>(bytes / obj, 1);
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
      / (double)attempts;                  // NOTE(review): restored — verify
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
	     << avg_size << dendl;
    if (target_obj_sec && target_bytes_sec)
      new_prob = std::min(po, pb);
    else if (target_obj_sec)
      new_prob = po;                       // NOTE(review): restored — verify
    else if (target_bytes_sec)
      new_prob = pb;                       // NOTE(review): restored — verify
    else
      new_prob = 1000;                     // NOTE(review): restored — verify
  } else {
    new_prob = 1000;                       // NOTE(review): restored — verify
  }
  dout(20) << __func__ << " new_prob " << new_prob << dendl;

  // correct for persistent skew between target rate and actual rate, adjust
  double ratio = 1.0;                      // NOTE(review): restored — verify
  unsigned actual = 0;                     // NOTE(review): restored — verify
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  }
  new_prob = std::max(new_prob, min_prob);
  new_prob = std::min(new_prob, 1000u);

  // adjust: move halfway toward the newly computed probability.
  prob = (prob + new_prob) / 2;
  prob = std::max(prob, min_prob);
  prob = std::min(prob, 1000u);
  dout(10) << __func__ << " actual " << actual
	   << ", actual/prob ratio " << ratio
	   << ", adjusted new_prob " << new_prob
	   << ", prob " << promote_probability_millis << " -> " << prob
	   << dendl;
  promote_probability_millis = prob;

  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
}
693 // -------------------------------------
695 float OSDService::get_failsafe_full_ratio()
697 float full_ratio
= cct
->_conf
->osd_failsafe_full_ratio
;
698 if (full_ratio
> 1.0) full_ratio
/= 100.0;
// Classify the OSD's fullness (NONE..FAILSAFE) from the current usage
// ratio and physical ratio, using the thresholds published in the OSDMap.
// 'inject' is set to a marker string when an injected state is returned.
// NOTE(review): the return statements were lost in the damaged source and
// are restored from context — verify against upstream.
OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
{
  // The OSDMap ratios take precendence. So if the failsafe is .95 and
  // the admin sets the cluster full to .96, the failsafe moves up to .96
  // too. (Not that having failsafe == full is ideal, but it's better than
  // dropping writes before the clusters appears full.)
  OSDMapRef osdmap = get_osdmap();
  if (!osdmap || osdmap->get_epoch() == 0) {
    return NONE;                           // NOTE(review): restored
  }
  float nearfull_ratio = osdmap->get_nearfull_ratio();
  float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
  float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
  float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);

  if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
    // use the failsafe for nearfull and full; the mon isn't using the
    // flags anyway because we're mid-upgrade.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  } else if (full_ratio <= 0 ||
	     backfillfull_ratio <= 0 ||
	     nearfull_ratio <= 0) {
    derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
    // use failsafe flag. ick. the monitor did something wrong or the user
    // did something stupid.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  }

  if (injectfull_state > NONE && injectfull) {
    inject = "(Injected)";
    return injectfull_state;
  } else if (pratio > failsafe_ratio) {
    return FAILSAFE;                       // NOTE(review): restored
  } else if (ratio > full_ratio) {
    return FULL;                           // NOTE(review): restored
  } else if (ratio > backfillfull_ratio) {
    return BACKFILLFULL;                   // NOTE(review): restored
  } else if (ratio > nearfull_ratio) {
    return NEARFULL;                       // NOTE(review): restored
  }
  return NONE;                             // NOTE(review): restored
}
// Update the cached fullness state from fresh usage ratios and log/clog
// transitions (especially into/out of FAILSAFE).
void OSDService::check_full_status(float ratio, float pratio)
{
  std::lock_guard l(full_status_lock);

  cur_ratio = ratio;          // NOTE(review): restored from damaged source
  physical_ratio = pratio;

  string inject;              // NOTE(review): restored — verify
  s_names new_state;          // NOTE(review): restored — verify
  new_state = recalc_full_state(ratio, pratio, inject);

  dout(20) << __func__ << " cur ratio " << ratio
	   << ", physical ratio " << pratio
	   << ", new state " << get_full_state_name(new_state)
	   << " " << inject
	   << dendl;

  // warn on state changes
  if (cur_state != new_state) {
    dout(10) << __func__ << " " << get_full_state_name(cur_state)
	     << " -> " << get_full_state_name(new_state) << dendl;
    if (new_state == FAILSAFE) {
      clog->error() << "full status failsafe engaged, dropping updates, now "
		    << (int)roundf(ratio * 100) << "% full";
    } else if (cur_state == FAILSAFE) {
      clog->error() << "full status failsafe disengaged, no longer dropping "
		    << "updates, now " << (int)roundf(ratio * 100) << "% full";
    }
    cur_state = new_state;
  }
}
// Return true when the fullness flags recorded for us in the OSDMap
// disagree with our locally computed fullness, i.e. the mon needs an update.
// NOTE(review): local-state bookkeeping restored from damaged source — verify.
bool OSDService::need_fullness_update()
{
  OSDMapRef osdmap = get_osdmap();
  s_names cur = NONE;                    // NOTE(review): restored
  if (osdmap->exists(whoami)) {
    if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
      cur = FULL;                        // NOTE(review): restored
    } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
      cur = BACKFILLFULL;                // NOTE(review): restored
    } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
      cur = NEARFULL;                    // NOTE(review): restored
    }
  }
  s_names want = NONE;                   // NOTE(review): restored
  if (is_full())                         // NOTE(review): restored
    want = FULL;                         // NOTE(review): restored
  else if (is_backfillfull())
    want = BACKFILLFULL;                 // NOTE(review): restored
  else if (is_nearfull())
    want = NEARFULL;                     // NOTE(review): restored
  return want != cur;                    // NOTE(review): restored
}
// Debug hook: report an artificially injected fullness of at least 'type'.
// Returns true while the injection is active.
bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
{
  if (injectfull && injectfull_state >= type) {
    // injectfull is either a count of the number of times to return failsafe full
    // or if -1 then always return full
    if (injectfull > 0)
      injectfull--;   // NOTE(review): restored from damaged source — verify
    ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD ("
		       << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"
		       << dendl;
    return true;      // NOTE(review): restored — verify
  }
  return false;       // NOTE(review): restored — verify
}
819 bool OSDService::_check_full(DoutPrefixProvider
*dpp
, s_names type
) const
821 std::lock_guard
l(full_status_lock
);
823 if (_check_inject_full(dpp
, type
))
826 if (cur_state
>= type
)
827 ldpp_dout(dpp
, 10) << __func__
<< " current usage is " << cur_ratio
828 << " physical " << physical_ratio
<< dendl
;
830 return cur_state
>= type
;
// Like _check_full(), but evaluates a *hypothetical* state: the given stats
// with 'adjust_used' additional bytes consumed (e.g. a prospective backfill).
bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
{
  ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
  {
    std::lock_guard l(full_status_lock);
    if (_check_inject_full(dpp, type)) {
      return true;   // NOTE(review): restored from damaged source — verify
    }
  }

  float pratio;      // NOTE(review): restored — verify
  float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);

  string notused;    // NOTE(review): restored — verify
  s_names tentative_state = recalc_full_state(ratio, pratio, notused);

  if (tentative_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;

  return tentative_state >= type;
}
// True if we are at or beyond the failsafe-full threshold.
bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FAILSAFE);
}
// True if we are at or beyond the full threshold.
bool OSDService::check_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FULL);
}
// Would accepting 'adjust_used' more bytes (given 'stats') make us
// backfill-full?
bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
{
  return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
}
// True if we are at or beyond the backfill-full threshold.
bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, BACKFILLFULL);
}
// True if we are at or beyond the nearfull threshold.
bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, NEARFULL);
}
// Exact-state query (no logging, no injection check): FAILSAFE only.
bool OSDService::is_failsafe_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state == FAILSAFE;
}
// Cached-state query: at least FULL.
bool OSDService::is_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= FULL;
}
// Cached-state query: at least BACKFILLFULL.
bool OSDService::is_backfillfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}
// Cached-state query: at least NEARFULL.
bool OSDService::is_nearfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= NEARFULL;
}
// Debug hook: inject fullness state 'type' for 'count' checks (-1 = forever).
void OSDService::set_injectfull(s_names type, int64_t count)
{
  std::lock_guard l(full_status_lock);
  injectfull_state = type;
  injectfull = count;   // NOTE(review): restored from damaged source — verify
}
// Publish fresh store statfs results (and any objectstore alerts) into
// osd_stat and the perf counters.  When fake_statfs_for_testing is set,
// the totals are synthesized so co-located test OSDs don't share numbers.
void OSDService::set_statfs(const struct store_statfs_t &stbuf,
			    osd_alert_list_t& alerts)
{
  uint64_t bytes = stbuf.total;
  uint64_t avail = stbuf.available;
  uint64_t used = stbuf.get_used_raw();

  // For testing fake statfs values so it doesn't matter if all
  // OSDs are using the same partition.
  if (cct->_conf->fake_statfs_for_testing) {
    uint64_t total_num_bytes = 0;
    vector<PGRef> pgs;        // NOTE(review): restored from damaged source
    osd->_get_pgs(&pgs);      // NOTE(review): restored — verify
    for (auto p : pgs) {      // NOTE(review): restored — verify
      total_num_bytes += p->get_stats_num_bytes();
    }
    bytes = cct->_conf->fake_statfs_for_testing;
    if (total_num_bytes < bytes)
      avail = bytes - total_num_bytes;
    else
      avail = 0;              // NOTE(review): restored — verify
    dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
	    << " adjust available " << avail
	    << dendl;
    used = bytes - avail;
  }

  osd->logger->set(l_osd_stat_bytes, bytes);
  osd->logger->set(l_osd_stat_bytes_used, used);
  osd->logger->set(l_osd_stat_bytes_avail, avail);

  std::lock_guard l(stat_lock);
  osd_stat.statfs = stbuf;
  osd_stat.os_alerts.clear();
  osd_stat.os_alerts[whoami].swap(alerts);
  if (cct->_conf->fake_statfs_for_testing) {
    osd_stat.statfs.total = bytes;
    osd_stat.statfs.available = avail;
    // For testing don't want used to go negative, so clear reserved
    osd_stat.statfs.internally_reserved = 0;
  }
}
OSDService::set_osd_stat(vector
<int>& hb_peers
,
957 utime_t now
= ceph_clock_now();
958 auto stale_time
= g_conf().get_val
<int64_t>("osd_mon_heartbeat_stat_stale");
959 std::lock_guard
l(stat_lock
);
960 osd_stat
.hb_peers
.swap(hb_peers
);
961 osd
->op_tracker
.get_age_ms_histogram(&osd_stat
.op_queue_age_hist
);
962 osd_stat
.num_pgs
= num_pgs
;
963 // Clean entries that aren't updated
964 // This is called often enough that we can just remove 1 at a time
965 for (auto i
: osd_stat
.hb_pingtime
) {
966 if (i
.second
.last_update
== 0)
968 if (stale_time
&& now
.sec() - i
.second
.last_update
> stale_time
) {
969 dout(20) << __func__
<< " time out heartbeat for osd " << i
.first
970 << " last_update " << i
.second
.last_update
<< dendl
;
971 osd_stat
.hb_pingtime
.erase(i
.first
);
// Count one more repaired shard in osd_stat (under stat_lock).
void OSDService::inc_osd_stat_repaired()
{
  std::lock_guard l(stat_lock);
  osd_stat.num_shards_repaired++;
}
// Compute the usage ratio after charging 'adjust_used' extra bytes and any
// pending backfill data against the given stats.  *pratio receives the
// unadjusted (physical) ratio.
float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
					 uint64_t adjust_used)
{
  *pratio =   // NOTE(review): assignment restored from damaged source — verify
   ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);

  if (adjust_used) {   // NOTE(review): restored — verify
    dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl;
    if (new_stat.statfs.available > adjust_used)
      new_stat.statfs.available -= adjust_used;
    else
      new_stat.statfs.available = 0;
    dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
  }

  // Check all pgs and adjust kb_used to include all pending backfill data
  int backfill_adjusted = 0;
  vector<PGRef> pgs;   // NOTE(review): restored — verify
  osd->_get_pgs(&pgs);
  for (auto p : pgs) {
    backfill_adjusted += p->pg_stat_adjust(&new_stat);
  }
  if (backfill_adjusted) {
    dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
  }
  return ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);
}
// True if any OSD in 'missing_on' is flagged FULL in the current OSDMap.
bool OSDService::check_osdmap_full(const set<pg_shard_t> &missing_on)
{
  OSDMapRef osdmap = get_osdmap();
  for (auto shard : missing_on) {
    if (osdmap->get_state(shard.osd) & CEPH_OSD_FULL)
      return true;   // NOTE(review): restored from damaged source — verify
  }
  return false;      // NOTE(review): restored — verify
}
// Send message m to 'peer' on the cluster network, but only if the peer is
// still up and hasn't restarted since from_epoch; otherwise the message is
// dropped.  Also shares our map with the peer connection as needed.
void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    m->put();   // NOTE(review): restored from damaged source — avoids leaking m
    release_map(next_map);
    return;     // NOTE(review): restored — verify
  }
  ConnectionRef peer_con = osd->cluster_messenger->connect_to_osd(
    next_map->get_cluster_addrs(peer));
  share_map_peer(peer, peer_con.get(), next_map);
  peer_con->send_message(m);
  release_map(next_map);
}
// Return a cluster-network connection to 'peer', or null if the peer is
// down or has restarted since from_epoch.
ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return NULL;   // NOTE(review): restored from damaged source — verify
  }
  ConnectionRef con = osd->cluster_messenger->connect_to_osd(
    next_map->get_cluster_addrs(peer));
  release_map(next_map);
  return con;      // NOTE(review): restored — verify
}
// Return the (back, front) heartbeat connections to 'peer', or a pair of
// null refs if the peer is down or restarted since from_epoch.
pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  pair<ConnectionRef,ConnectionRef> ret;
  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return ret;   // NOTE(review): restored from damaged source — verify
  }
  ret.first = osd->hb_back_client_messenger->connect_to_osd(
    next_map->get_hb_back_addrs(peer));
  ret.second = osd->hb_front_client_messenger->connect_to_osd(
    next_map->get_hb_front_addrs(peer));
  release_map(next_map);
  return ret;     // NOTE(review): restored — verify
}
// Our entity name on the cluster messenger.
entity_name_t OSDService::get_cluster_msgr_name() const
{
  return cluster_messenger->get_myname();
}
// Queue a pg_temp request for pgid unless an identical request is already
// pending with the mon.  Forced requests are always (re)queued.
// NOTE(review): third parameter and condition line restored from damaged
// source — verify against upstream.
void OSDService::queue_want_pg_temp(pg_t pgid,
				    const vector<int>& want,
				    bool forced)   // NOTE(review): restored
{
  std::lock_guard l(pg_temp_lock);
  auto p = pg_temp_pending.find(pgid);
  if (p == pg_temp_pending.end() ||
      p->second.acting != want ||
      forced) {                            // NOTE(review): restored
    pg_temp_wanted[pgid] = {want, forced};
  }
}
// Drop any wanted or pending pg_temp entry for pgid.
void OSDService::remove_want_pg_temp(pg_t pgid)
{
  std::lock_guard l(pg_temp_lock);
  pg_temp_wanted.erase(pgid);
  pg_temp_pending.erase(pgid);
}
// Move everything in pg_temp_wanted to pg_temp_pending (caller holds
// pg_temp_lock); called after the requests were sent to the mon.
void OSDService::_sent_pg_temp()
{
#ifdef HAVE_STDLIB_MAP_SPLICING
  pg_temp_pending.merge(pg_temp_wanted);
#else
  // Fallback for stdlibs without map::merge: move-insert element by element.
  pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
			 make_move_iterator(end(pg_temp_wanted)));
#endif
  pg_temp_wanted.clear();
}
// Re-mark all previously sent (pending) pg_temp requests as wanted so they
// are resent, e.g. after a mon session reset.
void OSDService::requeue_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  // wanted overrides pending. note that remove_want_pg_temp
  // clears the item out of both.
  unsigned old_wanted = pg_temp_wanted.size();
  unsigned old_pending = pg_temp_pending.size();
  _sent_pg_temp();   // NOTE(review): restored from damaged source — verify
  pg_temp_wanted.swap(pg_temp_pending);
  dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
	   << pg_temp_wanted.size() << dendl;
}
// Pretty-print a pg_temp request: the acting set, plus a marker when forced.
std::ostream& operator<<(std::ostream& out,
			 const OSDService::pg_temp_t& pg_temp)
{
  out << pg_temp.acting;
  if (pg_temp.forced) {
    out << " (forced)";   // NOTE(review): restored from damaged source — verify
  }
  return out;             // NOTE(review): restored — verify
}
// Send all wanted pg_temp mappings to the mon, batching forced and
// non-forced requests into (at most) two MOSDPGTemp messages.
void OSDService::send_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  if (pg_temp_wanted.empty())
    return;   // NOTE(review): restored from damaged source — verify
  dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
  // ms[0]: normal requests, ms[1]: forced requests.
  MOSDPGTemp *ms[2] = {nullptr, nullptr};
  for (auto& [pgid, pg_temp] : pg_temp_wanted) {
    auto& m = ms[pg_temp.forced];
    if (!m) {   // NOTE(review): restored — lazily allocate per-kind message
      m = new MOSDPGTemp(osdmap->get_epoch());
      m->forced = pg_temp.forced;
    }
    m->pg_temp.emplace(pgid, pg_temp.acting);
  }
  for (auto m : ms) {       // NOTE(review): restored — verify
    if (m) {                // NOTE(review): restored — verify
      monc->send_mon_message(m);
    }
  }
  _sent_pg_temp();          // NOTE(review): restored — verify
}
1161 void OSDService::send_pg_created(pg_t pgid
)
1163 std::lock_guard
l(pg_created_lock
);
1164 dout(20) << __func__
<< dendl
;
1165 auto o
= get_osdmap();
1166 if (o
->require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
1167 pg_created
.insert(pgid
);
1168 monc
->send_mon_message(new MOSDPGCreated(pgid
));
1172 void OSDService::send_pg_created()
1174 std::lock_guard
l(pg_created_lock
);
1175 dout(20) << __func__
<< dendl
;
1176 auto o
= get_osdmap();
1177 if (o
->require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
1178 for (auto pgid
: pg_created
) {
1179 monc
->send_mon_message(new MOSDPGCreated(pgid
));
1184 void OSDService::prune_pg_created()
1186 std::lock_guard
l(pg_created_lock
);
1187 dout(20) << __func__
<< dendl
;
1188 auto o
= get_osdmap();
1189 auto i
= pg_created
.begin();
1190 while (i
!= pg_created
.end()) {
1191 auto p
= o
->get_pg_pool(i
->pool());
1192 if (!p
|| !p
->has_flag(pg_pool_t::FLAG_CREATING
)) {
1193 dout(20) << __func__
<< " pruning " << *i
<< dendl
;
1194 i
= pg_created
.erase(i
);
1196 dout(20) << __func__
<< " keeping " << *i
<< dendl
;
1203 // --------------------------------------
1206 epoch_t
OSDService::get_peer_epoch(int peer
)
1208 std::lock_guard
l(peer_map_epoch_lock
);
1209 map
<int,epoch_t
>::iterator p
= peer_map_epoch
.find(peer
);
1210 if (p
== peer_map_epoch
.end())
1215 epoch_t
OSDService::note_peer_epoch(int peer
, epoch_t e
)
1217 std::lock_guard
l(peer_map_epoch_lock
);
1218 map
<int,epoch_t
>::iterator p
= peer_map_epoch
.find(peer
);
1219 if (p
!= peer_map_epoch
.end()) {
1220 if (p
->second
< e
) {
1221 dout(10) << "note_peer_epoch osd." << peer
<< " has " << e
<< dendl
;
1224 dout(30) << "note_peer_epoch osd." << peer
<< " has " << p
->second
<< " >= " << e
<< dendl
;
1228 dout(10) << "note_peer_epoch osd." << peer
<< " now has " << e
<< dendl
;
1229 peer_map_epoch
[peer
] = e
;
1234 void OSDService::forget_peer_epoch(int peer
, epoch_t as_of
)
1236 std::lock_guard
l(peer_map_epoch_lock
);
1237 map
<int,epoch_t
>::iterator p
= peer_map_epoch
.find(peer
);
1238 if (p
!= peer_map_epoch
.end()) {
1239 if (p
->second
<= as_of
) {
1240 dout(10) << "forget_peer_epoch osd." << peer
<< " as_of " << as_of
1241 << " had " << p
->second
<< dendl
;
1242 peer_map_epoch
.erase(p
);
1244 dout(10) << "forget_peer_epoch osd." << peer
<< " as_of " << as_of
1245 << " has " << p
->second
<< " - not forgetting" << dendl
;
1250 bool OSDService::should_share_map(entity_name_t name
, Connection
*con
,
1251 epoch_t epoch
, const OSDMapRef
& osdmap
,
1252 const epoch_t
*sent_epoch_p
)
1254 dout(20) << "should_share_map "
1255 << name
<< " " << con
->get_peer_addr()
1256 << " " << epoch
<< dendl
;
1258 // does client have old map?
1259 if (name
.is_client()) {
1260 bool message_sendmap
= epoch
< osdmap
->get_epoch();
1261 if (message_sendmap
&& sent_epoch_p
) {
1262 dout(20) << "client session last_sent_epoch: "
1264 << " versus osdmap epoch " << osdmap
->get_epoch() << dendl
;
1265 if (*sent_epoch_p
< osdmap
->get_epoch()) {
1267 } // else we don't need to send it out again
1271 if (con
->get_messenger() == osd
->cluster_messenger
&&
1272 con
!= osd
->cluster_messenger
->get_loopback_connection() &&
1273 osdmap
->is_up(name
.num()) &&
1274 (osdmap
->get_cluster_addrs(name
.num()) == con
->get_peer_addrs() ||
1275 osdmap
->get_hb_back_addrs(name
.num()) == con
->get_peer_addrs())) {
1277 epoch_t has
= std::max(get_peer_epoch(name
.num()), epoch
);
1280 if (has
< osdmap
->get_epoch()) {
1281 dout(10) << name
<< " " << con
->get_peer_addr()
1282 << " has old map " << epoch
<< " < "
1283 << osdmap
->get_epoch() << dendl
;
1291 void OSDService::share_map(
1296 epoch_t
*sent_epoch_p
)
1298 dout(20) << "share_map "
1299 << name
<< " " << con
->get_peer_addr()
1300 << " " << epoch
<< dendl
;
1302 if (!osd
->is_active()) {
1303 /*It is safe not to proceed as OSD is not in healthy state*/
1307 bool want_shared
= should_share_map(name
, con
, epoch
,
1308 osdmap
, sent_epoch_p
);
1311 if (name
.is_client()) {
1312 dout(10) << name
<< " has old map " << epoch
1313 << " < " << osdmap
->get_epoch() << dendl
;
1314 // we know the Session is valid or we wouldn't be sending
1316 *sent_epoch_p
= osdmap
->get_epoch();
1318 send_incremental_map(epoch
, con
, osdmap
);
1319 } else if (con
->get_messenger() == osd
->cluster_messenger
&&
1320 osdmap
->is_up(name
.num()) &&
1321 (osdmap
->get_cluster_addrs(name
.num()) == con
->get_peer_addrs() ||
1322 osdmap
->get_hb_back_addrs(name
.num()) == con
->get_peer_addrs())) {
1323 dout(10) << name
<< " " << con
->get_peer_addrs()
1324 << " has old map " << epoch
<< " < "
1325 << osdmap
->get_epoch() << dendl
;
1326 note_peer_epoch(name
.num(), osdmap
->get_epoch());
1327 send_incremental_map(epoch
, con
, osdmap
);
1332 void OSDService::share_map_peer(int peer
, Connection
*con
, OSDMapRef map
)
1338 epoch_t pe
= get_peer_epoch(peer
);
1340 if (pe
< map
->get_epoch()) {
1341 send_incremental_map(pe
, con
, map
);
1342 note_peer_epoch(peer
, map
->get_epoch());
1344 dout(20) << "share_map_peer " << con
<< " already has epoch " << pe
<< dendl
;
1346 dout(20) << "share_map_peer " << con
<< " don't know epoch, doing nothing" << dendl
;
1347 // no idea about peer's epoch.
1348 // ??? send recent ???
1353 bool OSDService::can_inc_scrubs()
1355 bool can_inc
= false;
1356 std::lock_guard
l(sched_scrub_lock
);
1358 if (scrubs_local
+ scrubs_remote
< cct
->_conf
->osd_max_scrubs
) {
1359 dout(20) << __func__
<< " == true " << scrubs_local
<< " local + " << scrubs_remote
1360 << " remote < max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1363 dout(20) << __func__
<< " == false " << scrubs_local
<< " local + " << scrubs_remote
1364 << " remote >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1370 bool OSDService::inc_scrubs_local()
1372 bool result
= false;
1373 std::lock_guard l
{sched_scrub_lock
};
1374 if (scrubs_local
+ scrubs_remote
< cct
->_conf
->osd_max_scrubs
) {
1375 dout(20) << __func__
<< " " << scrubs_local
<< " -> " << (scrubs_local
+1)
1376 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", remote " << scrubs_remote
<< ")" << dendl
;
1380 dout(20) << __func__
<< " " << scrubs_local
<< " local + " << scrubs_remote
<< " remote >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1385 void OSDService::dec_scrubs_local()
1387 std::lock_guard l
{sched_scrub_lock
};
1388 dout(20) << __func__
<< " " << scrubs_local
<< " -> " << (scrubs_local
-1)
1389 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", remote " << scrubs_remote
<< ")" << dendl
;
1391 ceph_assert(scrubs_local
>= 0);
1394 bool OSDService::inc_scrubs_remote()
1396 bool result
= false;
1397 std::lock_guard l
{sched_scrub_lock
};
1398 if (scrubs_local
+ scrubs_remote
< cct
->_conf
->osd_max_scrubs
) {
1399 dout(20) << __func__
<< " " << scrubs_remote
<< " -> " << (scrubs_remote
+1)
1400 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", local " << scrubs_local
<< ")" << dendl
;
1404 dout(20) << __func__
<< " " << scrubs_local
<< " local + " << scrubs_remote
<< " remote >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1409 void OSDService::dec_scrubs_remote()
1411 std::lock_guard l
{sched_scrub_lock
};
1412 dout(20) << __func__
<< " " << scrubs_remote
<< " -> " << (scrubs_remote
-1)
1413 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", local " << scrubs_local
<< ")" << dendl
;
1415 ceph_assert(scrubs_remote
>= 0);
1418 void OSDService::dump_scrub_reservations(Formatter
*f
)
1420 std::lock_guard l
{sched_scrub_lock
};
1421 f
->dump_int("scrubs_local", scrubs_local
);
1422 f
->dump_int("scrubs_remote", scrubs_remote
);
1423 f
->dump_int("osd_max_scrubs", cct
->_conf
->osd_max_scrubs
);
1426 void OSDService::retrieve_epochs(epoch_t
*_boot_epoch
, epoch_t
*_up_epoch
,
1427 epoch_t
*_bind_epoch
) const
1429 std::lock_guard
l(epoch_lock
);
1431 *_boot_epoch
= boot_epoch
;
1433 *_up_epoch
= up_epoch
;
1435 *_bind_epoch
= bind_epoch
;
1438 void OSDService::set_epochs(const epoch_t
*_boot_epoch
, const epoch_t
*_up_epoch
,
1439 const epoch_t
*_bind_epoch
)
1441 std::lock_guard
l(epoch_lock
);
1443 ceph_assert(*_boot_epoch
== 0 || *_boot_epoch
>= boot_epoch
);
1444 boot_epoch
= *_boot_epoch
;
1447 ceph_assert(*_up_epoch
== 0 || *_up_epoch
>= up_epoch
);
1448 up_epoch
= *_up_epoch
;
1451 ceph_assert(*_bind_epoch
== 0 || *_bind_epoch
>= bind_epoch
);
1452 bind_epoch
= *_bind_epoch
;
1456 bool OSDService::prepare_to_stop()
1458 std::lock_guard
l(is_stopping_lock
);
1459 if (get_state() != NOT_STOPPING
)
1462 OSDMapRef osdmap
= get_osdmap();
1463 if (osdmap
&& osdmap
->is_up(whoami
)) {
1464 dout(0) << __func__
<< " telling mon we are shutting down" << dendl
;
1465 set_state(PREPARING_TO_STOP
);
1466 monc
->send_mon_message(
1470 osdmap
->get_addrs(whoami
),
1471 osdmap
->get_epoch(),
1474 utime_t now
= ceph_clock_now();
1476 timeout
.set_from_double(now
+ cct
->_conf
->osd_mon_shutdown_timeout
);
1477 while ((ceph_clock_now() < timeout
) &&
1478 (get_state() != STOPPING
)) {
1479 is_stopping_cond
.WaitUntil(is_stopping_lock
, timeout
);
1482 dout(0) << __func__
<< " starting shutdown" << dendl
;
1483 set_state(STOPPING
);
1487 void OSDService::got_stop_ack()
1489 std::lock_guard
l(is_stopping_lock
);
1490 if (get_state() == PREPARING_TO_STOP
) {
1491 dout(0) << __func__
<< " starting shutdown" << dendl
;
1492 set_state(STOPPING
);
1493 is_stopping_cond
.Signal();
1495 dout(10) << __func__
<< " ignoring msg" << dendl
;
1499 MOSDMap
*OSDService::build_incremental_map_msg(epoch_t since
, epoch_t to
,
1500 OSDSuperblock
& sblock
)
1502 MOSDMap
*m
= new MOSDMap(monc
->get_fsid(),
1503 osdmap
->get_encoding_features());
1504 m
->oldest_map
= max_oldest_map
;
1505 m
->newest_map
= sblock
.newest_map
;
1507 int max
= cct
->_conf
->osd_map_message_max
;
1508 ssize_t max_bytes
= cct
->_conf
->osd_map_message_max_bytes
;
1510 if (since
< m
->oldest_map
) {
1511 // we don't have the next map the target wants, so start with a
1514 dout(10) << __func__
<< " oldest map " << max_oldest_map
<< " > since "
1515 << since
<< ", starting with full map" << dendl
;
1516 since
= m
->oldest_map
;
1517 if (!get_map_bl(since
, bl
)) {
1518 derr
<< __func__
<< " missing full map " << since
<< dendl
;
1522 max_bytes
-= bl
.length();
1523 m
->maps
[since
].claim(bl
);
1525 for (epoch_t e
= since
+ 1; e
<= to
; ++e
) {
1527 if (get_inc_map_bl(e
, bl
)) {
1528 m
->incremental_maps
[e
].claim(bl
);
1530 derr
<< __func__
<< " missing incremental map " << e
<< dendl
;
1531 if (!get_map_bl(e
, bl
)) {
1532 derr
<< __func__
<< " also missing full map " << e
<< dendl
;
1535 m
->maps
[e
].claim(bl
);
1538 max_bytes
-= bl
.length();
1539 if (max
<= 0 || max_bytes
<= 0) {
1546 if (!m
->maps
.empty() ||
1547 !m
->incremental_maps
.empty()) {
1548 // send what we have so far
1553 if (get_inc_map_bl(m
->newest_map
, bl
)) {
1554 m
->incremental_maps
[m
->newest_map
].claim(bl
);
1556 derr
<< __func__
<< " unable to load latest map " << m
->newest_map
<< dendl
;
1557 if (!get_map_bl(m
->newest_map
, bl
)) {
1558 derr
<< __func__
<< " unable to load latest full map " << m
->newest_map
1562 m
->maps
[m
->newest_map
].claim(bl
);
1567 void OSDService::send_map(MOSDMap
*m
, Connection
*con
)
1569 con
->send_message(m
);
1572 void OSDService::send_incremental_map(epoch_t since
, Connection
*con
,
1575 epoch_t to
= osdmap
->get_epoch();
1576 dout(10) << "send_incremental_map " << since
<< " -> " << to
1577 << " to " << con
<< " " << con
->get_peer_addr() << dendl
;
1581 OSDSuperblock
sblock(get_superblock());
1582 if (since
< sblock
.oldest_map
) {
1583 // just send latest full map
1584 MOSDMap
*m
= new MOSDMap(monc
->get_fsid(),
1585 osdmap
->get_encoding_features());
1586 m
->oldest_map
= max_oldest_map
;
1587 m
->newest_map
= sblock
.newest_map
;
1588 get_map_bl(to
, m
->maps
[to
]);
1593 if (to
> since
&& (int64_t)(to
- since
) > cct
->_conf
->osd_map_share_max_epochs
) {
1594 dout(10) << " " << (to
- since
) << " > max " << cct
->_conf
->osd_map_share_max_epochs
1595 << ", only sending most recent" << dendl
;
1596 since
= to
- cct
->_conf
->osd_map_share_max_epochs
;
1599 m
= build_incremental_map_msg(since
, to
, sblock
);
1604 bool OSDService::_get_map_bl(epoch_t e
, bufferlist
& bl
)
1606 bool found
= map_bl_cache
.lookup(e
, &bl
);
1609 logger
->inc(l_osd_map_bl_cache_hit
);
1613 logger
->inc(l_osd_map_bl_cache_miss
);
1614 found
= store
->read(meta_ch
,
1615 OSD::get_osdmap_pobject_name(e
), 0, 0, bl
,
1616 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
) >= 0;
1623 bool OSDService::get_inc_map_bl(epoch_t e
, bufferlist
& bl
)
1625 std::lock_guard
l(map_cache_lock
);
1626 bool found
= map_bl_inc_cache
.lookup(e
, &bl
);
1629 logger
->inc(l_osd_map_bl_cache_hit
);
1633 logger
->inc(l_osd_map_bl_cache_miss
);
1634 found
= store
->read(meta_ch
,
1635 OSD::get_inc_osdmap_pobject_name(e
), 0, 0, bl
,
1636 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
) >= 0;
1638 _add_map_inc_bl(e
, bl
);
1643 void OSDService::_add_map_bl(epoch_t e
, bufferlist
& bl
)
1645 dout(10) << "add_map_bl " << e
<< " " << bl
.length() << " bytes" << dendl
;
1646 // cache a contiguous buffer
1647 if (bl
.get_num_buffers() > 1) {
1650 bl
.try_assign_to_mempool(mempool::mempool_osd_mapbl
);
1651 map_bl_cache
.add(e
, bl
);
1654 void OSDService::_add_map_inc_bl(epoch_t e
, bufferlist
& bl
)
1656 dout(10) << "add_map_inc_bl " << e
<< " " << bl
.length() << " bytes" << dendl
;
1657 // cache a contiguous buffer
1658 if (bl
.get_num_buffers() > 1) {
1661 bl
.try_assign_to_mempool(mempool::mempool_osd_mapbl
);
1662 map_bl_inc_cache
.add(e
, bl
);
1665 int OSDService::get_deleted_pool_pg_num(int64_t pool
)
1667 std::lock_guard
l(map_cache_lock
);
1668 auto p
= deleted_pool_pg_nums
.find(pool
);
1669 if (p
!= deleted_pool_pg_nums
.end()) {
1672 dout(20) << __func__
<< " " << pool
<< " loading" << dendl
;
1673 ghobject_t oid
= OSD::make_final_pool_info_oid(pool
);
1675 int r
= store
->read(meta_ch
, oid
, 0, 0, bl
);
1676 ceph_assert(r
>= 0);
1677 auto blp
= bl
.cbegin();
1680 deleted_pool_pg_nums
[pool
] = pi
.get_pg_num();
1681 dout(20) << __func__
<< " " << pool
<< " got " << pi
.get_pg_num() << dendl
;
1682 return pi
.get_pg_num();
1685 OSDMapRef
OSDService::_add_map(OSDMap
*o
)
1687 epoch_t e
= o
->get_epoch();
1689 if (cct
->_conf
->osd_map_dedup
) {
1690 // Dedup against an existing map at a nearby epoch
1691 OSDMapRef for_dedup
= map_cache
.lower_bound(e
);
1693 OSDMap::dedup(for_dedup
.get(), o
);
1697 OSDMapRef l
= map_cache
.add(e
, o
, &existed
);
1704 OSDMapRef
OSDService::try_get_map(epoch_t epoch
)
1706 std::lock_guard
l(map_cache_lock
);
1707 OSDMapRef retval
= map_cache
.lookup(epoch
);
1709 dout(30) << "get_map " << epoch
<< " -cached" << dendl
;
1711 logger
->inc(l_osd_map_cache_hit
);
1716 logger
->inc(l_osd_map_cache_miss
);
1717 epoch_t lb
= map_cache
.cached_key_lower_bound();
1719 dout(30) << "get_map " << epoch
<< " - miss, below lower bound" << dendl
;
1720 logger
->inc(l_osd_map_cache_miss_low
);
1721 logger
->inc(l_osd_map_cache_miss_low_avg
, lb
- epoch
);
1725 OSDMap
*map
= new OSDMap
;
1727 dout(20) << "get_map " << epoch
<< " - loading and decoding " << map
<< dendl
;
1729 if (!_get_map_bl(epoch
, bl
) || bl
.length() == 0) {
1730 derr
<< "failed to load OSD map for epoch " << epoch
<< ", got " << bl
.length() << " bytes" << dendl
;
1736 dout(20) << "get_map " << epoch
<< " - return initial " << map
<< dendl
;
1738 return _add_map(map
);
1744 void OSDService::reply_op_error(OpRequestRef op
, int err
)
1746 reply_op_error(op
, err
, eversion_t(), 0);
1749 void OSDService::reply_op_error(OpRequestRef op
, int err
, eversion_t v
,
1752 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
1753 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1755 flags
= m
->get_flags() & (CEPH_OSD_FLAG_ACK
|CEPH_OSD_FLAG_ONDISK
);
1757 MOSDOpReply
*reply
= new MOSDOpReply(m
, err
, osdmap
->get_epoch(), flags
, true);
1758 reply
->set_reply_versions(v
, uv
);
1759 m
->get_connection()->send_message(reply
);
1762 void OSDService::handle_misdirected_op(PG
*pg
, OpRequestRef op
)
1764 if (!cct
->_conf
->osd_debug_misdirected_ops
) {
1768 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
1769 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1771 ceph_assert(m
->get_map_epoch() >= pg
->get_history().same_primary_since
);
1773 if (pg
->is_ec_pg()) {
1775 * OSD recomputes op target based on current OSDMap. With an EC pg, we
1776 * can get this result:
1777 * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
1778 * [CRUSH_ITEM_NONE, 2, 3]/3
1779 * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
1781 * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
1783 * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
1786 * We can't compute the op target based on the sending map epoch due to
1787 * splitting. The simplest thing is to detect such cases here and drop
1788 * them without an error (the client will resend anyway).
1790 ceph_assert(m
->get_map_epoch() <= superblock
.newest_map
);
1791 OSDMapRef opmap
= try_get_map(m
->get_map_epoch());
1793 dout(7) << __func__
<< ": " << *pg
<< " no longer have map for "
1794 << m
->get_map_epoch() << ", dropping" << dendl
;
1797 pg_t _pgid
= m
->get_raw_pg();
1799 if ((m
->get_flags() & CEPH_OSD_FLAG_PGOP
) == 0)
1800 _pgid
= opmap
->raw_pg_to_pg(_pgid
);
1801 if (opmap
->get_primary_shard(_pgid
, &pgid
) &&
1802 pgid
.shard
!= pg
->pg_id
.shard
) {
1803 dout(7) << __func__
<< ": " << *pg
<< " primary changed since "
1804 << m
->get_map_epoch() << ", dropping" << dendl
;
1809 dout(7) << *pg
<< " misdirected op in " << m
->get_map_epoch() << dendl
;
1810 clog
->warn() << m
->get_source_inst() << " misdirected " << m
->get_reqid()
1811 << " pg " << m
->get_raw_pg()
1812 << " to osd." << whoami
1813 << " not " << pg
->get_acting()
1814 << " in e" << m
->get_map_epoch() << "/" << osdmap
->get_epoch();
1817 void OSDService::enqueue_back(OpQueueItem
&& qi
)
1819 osd
->op_shardedwq
.queue(std::move(qi
));
1822 void OSDService::enqueue_front(OpQueueItem
&& qi
)
1824 osd
->op_shardedwq
.queue_front(std::move(qi
));
1827 void OSDService::queue_recovery_context(
1829 GenContext
<ThreadPool::TPHandle
&> *c
)
1831 epoch_t e
= get_osdmap_epoch();
1834 unique_ptr
<OpQueueItem::OpQueueable
>(
1835 new PGRecoveryContext(pg
->get_pgid(), c
, e
)),
1836 cct
->_conf
->osd_recovery_cost
,
1837 cct
->_conf
->osd_recovery_priority
,
1843 void OSDService::queue_for_snap_trim(PG
*pg
)
1845 dout(10) << "queueing " << *pg
<< " for snaptrim" << dendl
;
1848 unique_ptr
<OpQueueItem::OpQueueable
>(
1849 new PGSnapTrim(pg
->get_pgid(), pg
->get_osdmap_epoch())),
1850 cct
->_conf
->osd_snap_trim_cost
,
1851 cct
->_conf
->osd_snap_trim_priority
,
1854 pg
->get_osdmap_epoch()));
1857 void OSDService::queue_for_scrub(PG
*pg
, bool with_high_priority
)
1859 unsigned scrub_queue_priority
= pg
->scrubber
.priority
;
1860 if (with_high_priority
&& scrub_queue_priority
< cct
->_conf
->osd_client_op_priority
) {
1861 scrub_queue_priority
= cct
->_conf
->osd_client_op_priority
;
1863 const auto epoch
= pg
->get_osdmap_epoch();
1866 unique_ptr
<OpQueueItem::OpQueueable
>(new PGScrub(pg
->get_pgid(), epoch
)),
1867 cct
->_conf
->osd_scrub_cost
,
1868 scrub_queue_priority
,
1874 void OSDService::queue_for_pg_delete(spg_t pgid
, epoch_t e
)
1876 dout(10) << __func__
<< " on " << pgid
<< " e " << e
<< dendl
;
1879 unique_ptr
<OpQueueItem::OpQueueable
>(
1880 new PGDelete(pgid
, e
)),
1881 cct
->_conf
->osd_pg_delete_cost
,
1882 cct
->_conf
->osd_pg_delete_priority
,
1888 bool OSDService::try_finish_pg_delete(PG
*pg
, unsigned old_pg_num
)
1890 return osd
->try_finish_pg_delete(pg
, old_pg_num
);
1895 void OSDService::set_ready_to_merge_source(PG
*pg
, eversion_t version
)
1897 std::lock_guard
l(merge_lock
);
1898 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1899 ready_to_merge_source
[pg
->pg_id
.pgid
] = version
;
1900 assert(not_ready_to_merge_source
.count(pg
->pg_id
.pgid
) == 0);
1901 _send_ready_to_merge();
1904 void OSDService::set_ready_to_merge_target(PG
*pg
,
1906 epoch_t last_epoch_started
,
1907 epoch_t last_epoch_clean
)
1909 std::lock_guard
l(merge_lock
);
1910 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1911 ready_to_merge_target
.insert(make_pair(pg
->pg_id
.pgid
,
1914 last_epoch_clean
)));
1915 assert(not_ready_to_merge_target
.count(pg
->pg_id
.pgid
) == 0);
1916 _send_ready_to_merge();
1919 void OSDService::set_not_ready_to_merge_source(pg_t source
)
1921 std::lock_guard
l(merge_lock
);
1922 dout(10) << __func__
<< " " << source
<< dendl
;
1923 not_ready_to_merge_source
.insert(source
);
1924 assert(ready_to_merge_source
.count(source
) == 0);
1925 _send_ready_to_merge();
1928 void OSDService::set_not_ready_to_merge_target(pg_t target
, pg_t source
)
1930 std::lock_guard
l(merge_lock
);
1931 dout(10) << __func__
<< " " << target
<< " source " << source
<< dendl
;
1932 not_ready_to_merge_target
[target
] = source
;
1933 assert(ready_to_merge_target
.count(target
) == 0);
1934 _send_ready_to_merge();
1937 void OSDService::send_ready_to_merge()
1939 std::lock_guard
l(merge_lock
);
1940 _send_ready_to_merge();
1943 void OSDService::_send_ready_to_merge()
1945 dout(20) << __func__
1946 << " ready_to_merge_source " << ready_to_merge_source
1947 << " not_ready_to_merge_source " << not_ready_to_merge_source
1948 << " ready_to_merge_target " << ready_to_merge_target
1949 << " not_ready_to_merge_target " << not_ready_to_merge_target
1950 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1952 for (auto src
: not_ready_to_merge_source
) {
1953 if (sent_ready_to_merge_source
.count(src
) == 0) {
1954 monc
->send_mon_message(new MOSDPGReadyToMerge(
1958 osdmap
->get_epoch()));
1959 sent_ready_to_merge_source
.insert(src
);
1962 for (auto p
: not_ready_to_merge_target
) {
1963 if (sent_ready_to_merge_source
.count(p
.second
) == 0) {
1964 monc
->send_mon_message(new MOSDPGReadyToMerge(
1968 osdmap
->get_epoch()));
1969 sent_ready_to_merge_source
.insert(p
.second
);
1972 for (auto src
: ready_to_merge_source
) {
1973 if (not_ready_to_merge_source
.count(src
.first
) ||
1974 not_ready_to_merge_target
.count(src
.first
.get_parent())) {
1977 auto p
= ready_to_merge_target
.find(src
.first
.get_parent());
1978 if (p
!= ready_to_merge_target
.end() &&
1979 sent_ready_to_merge_source
.count(src
.first
) == 0) {
1980 monc
->send_mon_message(new MOSDPGReadyToMerge(
1981 src
.first
, // source pgid
1982 src
.second
, // src version
1983 std::get
<0>(p
->second
), // target version
1984 std::get
<1>(p
->second
), // PG's last_epoch_started
1985 std::get
<2>(p
->second
), // PG's last_epoch_clean
1987 osdmap
->get_epoch()));
1988 sent_ready_to_merge_source
.insert(src
.first
);
1993 void OSDService::clear_ready_to_merge(PG
*pg
)
1995 std::lock_guard
l(merge_lock
);
1996 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1997 ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
1998 ready_to_merge_target
.erase(pg
->pg_id
.pgid
);
1999 not_ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
2000 not_ready_to_merge_target
.erase(pg
->pg_id
.pgid
);
2001 sent_ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
2004 void OSDService::clear_sent_ready_to_merge()
2006 std::lock_guard
l(merge_lock
);
2007 sent_ready_to_merge_source
.clear();
2010 void OSDService::prune_sent_ready_to_merge(OSDMapRef
& osdmap
)
2012 std::lock_guard
l(merge_lock
);
2013 auto i
= sent_ready_to_merge_source
.begin();
2014 while (i
!= sent_ready_to_merge_source
.end()) {
2015 if (!osdmap
->pg_exists(*i
)) {
2016 dout(10) << __func__
<< " " << *i
<< dendl
;
2017 i
= sent_ready_to_merge_source
.erase(i
);
2026 void OSDService::_queue_for_recovery(
2027 std::pair
<epoch_t
, PGRef
> p
,
2028 uint64_t reserved_pushes
)
2030 ceph_assert(recovery_lock
.is_locked_by_me());
2033 unique_ptr
<OpQueueItem::OpQueueable
>(
2035 p
.second
->get_pgid(), p
.first
, reserved_pushes
)),
2036 cct
->_conf
->osd_recovery_cost
,
2037 cct
->_conf
->osd_recovery_priority
,
2043 // ====================================================================
2047 #define dout_prefix *_dout
2049 // Commands shared between OSD's console and admin console:
2051 namespace osd_cmds
{
2053 int heap(CephContext
& cct
, const cmdmap_t
& cmdmap
, Formatter
& f
, std::ostream
& os
);
2055 }} // namespace ceph::osd_cmds
2057 int OSD::mkfs(CephContext
*cct
, ObjectStore
*store
, uuid_d fsid
, int whoami
)
2063 ObjectStore::CollectionHandle ch
;
2065 // if we are fed a uuid for this osd, use it.
2066 store
->set_fsid(cct
->_conf
->osd_uuid
);
2068 ret
= store
->mkfs();
2070 derr
<< "OSD::mkfs: ObjectStore::mkfs failed with error "
2071 << cpp_strerror(ret
) << dendl
;
2075 store
->set_cache_shards(1); // doesn't matter for mkfs!
2077 ret
= store
->mount();
2079 derr
<< "OSD::mkfs: couldn't mount ObjectStore: error "
2080 << cpp_strerror(ret
) << dendl
;
2084 ch
= store
->open_collection(coll_t::meta());
2086 ret
= store
->read(ch
, OSD_SUPERBLOCK_GOBJECT
, 0, 0, sbbl
);
2088 derr
<< "OSD::mkfs: have meta collection but no superblock" << dendl
;
2091 /* if we already have superblock, check content of superblock */
2092 dout(0) << " have superblock" << dendl
;
2093 auto p
= sbbl
.cbegin();
2095 if (whoami
!= sb
.whoami
) {
2096 derr
<< "provided osd id " << whoami
<< " != superblock's " << sb
.whoami
2101 if (fsid
!= sb
.cluster_fsid
) {
2102 derr
<< "provided cluster fsid " << fsid
2103 << " != superblock's " << sb
.cluster_fsid
<< dendl
;
2108 // create superblock
2109 sb
.cluster_fsid
= fsid
;
2110 sb
.osd_fsid
= store
->get_fsid();
2112 sb
.compat_features
= get_osd_initial_compat_set();
2117 ObjectStore::CollectionHandle ch
= store
->create_new_collection(
2119 ObjectStore::Transaction t
;
2120 t
.create_collection(coll_t::meta(), 0);
2121 t
.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT
, 0, bl
.length(), bl
);
2122 ret
= store
->queue_transaction(ch
, std::move(t
));
2124 derr
<< "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
2125 << "queue_transaction returned " << cpp_strerror(ret
) << dendl
;
2130 ret
= write_meta(cct
, store
, sb
.cluster_fsid
, sb
.osd_fsid
, whoami
);
2132 derr
<< "OSD::mkfs: failed to write fsid file: error "
2133 << cpp_strerror(ret
) << dendl
;
2147 int OSD::write_meta(CephContext
*cct
, ObjectStore
*store
, uuid_d
& cluster_fsid
, uuid_d
& osd_fsid
, int whoami
)
2152 snprintf(val
, sizeof(val
), "%s", CEPH_OSD_ONDISK_MAGIC
);
2153 r
= store
->write_meta("magic", val
);
2157 snprintf(val
, sizeof(val
), "%d", whoami
);
2158 r
= store
->write_meta("whoami", val
);
2162 cluster_fsid
.print(val
);
2163 r
= store
->write_meta("ceph_fsid", val
);
2167 string key
= cct
->_conf
.get_val
<string
>("key");
2169 r
= store
->write_meta("osd_key", key
);
2173 string keyfile
= cct
->_conf
.get_val
<string
>("keyfile");
2174 if (!keyfile
.empty()) {
2177 r
= keybl
.read_file(keyfile
.c_str(), &err
);
2179 derr
<< __func__
<< " failed to read keyfile " << keyfile
<< ": "
2180 << err
<< ": " << cpp_strerror(r
) << dendl
;
2183 r
= store
->write_meta("osd_key", keybl
.to_str());
2189 r
= store
->write_meta("ready", "ready");
2196 int OSD::peek_meta(ObjectStore
*store
,
2198 uuid_d
*cluster_fsid
,
2201 int *require_osd_release
)
2205 int r
= store
->read_meta("magic", &val
);
2210 r
= store
->read_meta("whoami", &val
);
2213 *whoami
= atoi(val
.c_str());
2215 r
= store
->read_meta("ceph_fsid", &val
);
2218 r
= cluster_fsid
->parse(val
.c_str());
2222 r
= store
->read_meta("fsid", &val
);
2224 *osd_fsid
= uuid_d();
2226 r
= osd_fsid
->parse(val
.c_str());
2231 r
= store
->read_meta("require_osd_release", &val
);
2233 *require_osd_release
= atoi(val
.c_str());
2241 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2245 OSD::OSD(CephContext
*cct_
, ObjectStore
*store_
,
2247 Messenger
*internal_messenger
,
2248 Messenger
*external_messenger
,
2249 Messenger
*hb_client_front
,
2250 Messenger
*hb_client_back
,
2251 Messenger
*hb_front_serverm
,
2252 Messenger
*hb_back_serverm
,
2253 Messenger
*osdc_messenger
,
2255 const std::string
&dev
, const std::string
&jdev
) :
2257 osd_lock("OSD::osd_lock"),
2258 tick_timer(cct
, osd_lock
),
2259 tick_timer_lock("OSD::tick_timer_lock"),
2260 tick_timer_without_osd_lock(cct
, tick_timer_lock
),
2261 gss_ktfile_client(cct
->_conf
.get_val
<std::string
>("gss_ktab_client_file")),
2262 cluster_messenger(internal_messenger
),
2263 client_messenger(external_messenger
),
2264 objecter_messenger(osdc_messenger
),
2266 mgrc(cct_
, client_messenger
),
2268 recoverystate_perf(NULL
),
2270 log_client(cct
, client_messenger
, &mc
->monmap
, LogClient::NO_FLAGS
),
2271 clog(log_client
.create_channel()),
2273 dev_path(dev
), journal_path(jdev
),
2274 store_is_rotational(store
->is_rotational()),
2275 trace_endpoint("0.0.0.0", 0, "osd"),
2277 m_osd_pg_epoch_max_lag_factor(cct
->_conf
.get_val
<double>(
2278 "osd_pg_epoch_max_lag_factor")),
2279 osd_compat(get_osd_compat_set()),
2280 osd_op_tp(cct
, "OSD::osd_op_tp", "tp_osd_tp",
2281 get_num_op_threads()),
2282 command_tp(cct
, "OSD::command_tp", "tp_osd_cmd", 1),
2283 session_waiting_lock("OSD::session_waiting_lock"),
2284 osdmap_subscribe_lock("OSD::osdmap_subscribe_lock"),
2285 heartbeat_lock("OSD::heartbeat_lock"),
2286 heartbeat_stop(false),
2287 heartbeat_need_update(true),
2288 hb_front_client_messenger(hb_client_front
),
2289 hb_back_client_messenger(hb_client_back
),
2290 hb_front_server_messenger(hb_front_serverm
),
2291 hb_back_server_messenger(hb_back_serverm
),
2293 heartbeat_thread(this),
2294 heartbeat_dispatcher(this),
2295 op_tracker(cct
, cct
->_conf
->osd_enable_op_tracker
,
2296 cct
->_conf
->osd_num_op_tracker_shard
),
2297 test_ops_hook(NULL
),
2298 op_queue(get_io_queue()),
2299 op_prio_cutoff(get_io_prio_cut()),
2302 cct
->_conf
->osd_op_thread_timeout
,
2303 cct
->_conf
->osd_op_thread_suicide_timeout
,
2305 map_lock("OSD::map_lock"),
2306 last_pg_create_epoch(0),
2307 mon_report_lock("OSD::mon_report_lock"),
2310 requested_full_first(0),
2311 requested_full_last(0),
2314 cct
->_conf
->osd_command_thread_timeout
,
2315 cct
->_conf
->osd_command_thread_suicide_timeout
,
2320 if (!gss_ktfile_client
.empty()) {
2321 // Assert we can export environment variable
2323 The default client keytab is used, if it is present and readable,
2324 to automatically obtain initial credentials for GSSAPI client
2325 applications. The principal name of the first entry in the client
2326 keytab is used by default when obtaining initial credentials.
2327 1. The KRB5_CLIENT_KTNAME environment variable.
2328 2. The default_client_keytab_name profile variable in [libdefaults].
2329 3. The hardcoded default, DEFCKTNAME.
2331 const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
2332 gss_ktfile_client
.c_str(), 1));
2333 ceph_assert(set_result
== 0);
2336 monc
->set_messenger(client_messenger
);
2337 op_tracker
.set_complaint_and_threshold(cct
->_conf
->osd_op_complaint_time
,
2338 cct
->_conf
->osd_op_log_threshold
);
2339 op_tracker
.set_history_size_and_duration(cct
->_conf
->osd_op_history_size
,
2340 cct
->_conf
->osd_op_history_duration
);
2341 op_tracker
.set_history_slow_op_size_and_threshold(cct
->_conf
->osd_op_history_slow_op_size
,
2342 cct
->_conf
->osd_op_history_slow_op_threshold
);
2344 std::stringstream ss
;
2345 ss
<< "osd." << whoami
;
2346 trace_endpoint
.copy_name(ss
.str());
2349 // initialize shards
2350 num_shards
= get_num_op_shards();
2351 for (uint32_t i
= 0; i
< num_shards
; i
++) {
2352 OSDShard
*one_shard
= new OSDShard(
2356 cct
->_conf
->osd_op_pq_max_tokens_per_priority
,
2357 cct
->_conf
->osd_op_pq_min_cost
,
2359 shards
.push_back(one_shard
);
2365 while (!shards
.empty()) {
2366 delete shards
.back();
2369 delete class_handler
;
2370 cct
->get_perfcounters_collection()->remove(recoverystate_perf
);
2371 cct
->get_perfcounters_collection()->remove(logger
);
2372 delete recoverystate_perf
;
2377 double OSD::get_tick_interval() const
2379 // vary +/- 5% to avoid scrub scheduling livelocks
2380 constexpr auto delta
= 0.05;
2381 return (OSD_TICK_INTERVAL
*
2382 ceph::util::generate_random_number(1.0 - delta
, 1.0 + delta
));
// Forward declaration: registers the OSD's ClassHandler with the
// object-class (cls) subsystem; the definition lives in the class
// loading code, not in this file.
void cls_initialize(ClassHandler *ch);
2387 void OSD::handle_signal(int signum
)
2389 ceph_assert(signum
== SIGINT
|| signum
== SIGTERM
);
2390 derr
<< "*** Got signal " << sig_str(signum
) << " ***" << dendl
;
2396 std::lock_guard
lock(osd_lock
);
2400 if (store
->test_mount_in_use()) {
2401 derr
<< "OSD::pre_init: object store '" << dev_path
<< "' is "
2402 << "currently in use. (Is ceph-osd already running?)" << dendl
;
2406 cct
->_conf
.add_observer(this);
2410 int OSD::set_numa_affinity()
2412 // storage numa node
2413 int store_node
= -1;
2414 store
->get_numa_node(&store_node
, nullptr, nullptr);
2415 if (store_node
>= 0) {
2416 dout(1) << __func__
<< " storage numa node " << store_node
<< dendl
;
2419 // check network numa node(s)
2420 int front_node
= -1, back_node
= -1;
2421 string front_iface
= pick_iface(
2423 client_messenger
->get_myaddrs().front().get_sockaddr_storage());
2424 string back_iface
= pick_iface(
2426 cluster_messenger
->get_myaddrs().front().get_sockaddr_storage());
2427 int r
= get_iface_numa_node(front_iface
, &front_node
);
2429 dout(1) << __func__
<< " public network " << front_iface
<< " numa node "
2430 << front_node
<< dendl
;
2431 r
= get_iface_numa_node(back_iface
, &back_node
);
2433 dout(1) << __func__
<< " cluster network " << back_iface
<< " numa node "
2434 << back_node
<< dendl
;
2435 if (front_node
== back_node
&&
2436 front_node
== store_node
) {
2437 dout(1) << " objectstore and network numa nodes all match" << dendl
;
2438 if (g_conf().get_val
<bool>("osd_numa_auto_affinity")) {
2439 numa_node
= front_node
;
2442 dout(1) << __func__
<< " objectstore and network numa nodes do not match"
2447 derr
<< __func__
<< " unable to identify public interface '" << front_iface
2448 << "' numa node: " << cpp_strerror(r
) << dendl
;
2450 if (int node
= g_conf().get_val
<int64_t>("osd_numa_node"); node
>= 0) {
2451 // this takes precedence over the automagic logic above
2454 if (numa_node
>= 0) {
2455 int r
= get_numa_node_cpu_set(numa_node
, &numa_cpu_set_size
, &numa_cpu_set
);
2457 dout(1) << __func__
<< " unable to determine numa node " << numa_node
2458 << " CPUs" << dendl
;
2461 dout(1) << __func__
<< " setting numa affinity to node " << numa_node
2463 << cpu_set_to_str_list(numa_cpu_set_size
, &numa_cpu_set
)
2465 r
= sched_setaffinity(getpid(), numa_cpu_set_size
, &numa_cpu_set
);
2468 derr
<< __func__
<< " failed to set numa affinity: " << cpp_strerror(r
)
2474 dout(1) << __func__
<< " not setting numa affinity" << dendl
;
2481 class OSDSocketHook
: public AdminSocketHook
{
2484 explicit OSDSocketHook(OSD
*o
) : osd(o
) {}
2485 bool call(std::string_view admin_command
, const cmdmap_t
& cmdmap
,
2486 std::string_view format
, bufferlist
& out
) override
{
2490 r
= osd
->asok_command(admin_command
, cmdmap
, format
, ss
);
2491 } catch (const bad_cmd_get
& e
) {
2500 std::set
<int64_t> OSD::get_mapped_pools()
2502 std::set
<int64_t> pools
;
2503 std::vector
<spg_t
> pgids
;
2505 for (const auto &pgid
: pgids
) {
2506 pools
.insert(pgid
.pool());
2511 bool OSD::asok_command(std::string_view admin_command
, const cmdmap_t
& cmdmap
,
2512 std::string_view format
, ostream
& ss
)
2514 Formatter
*f
= Formatter::create(format
, "json-pretty", "json-pretty");
2515 if (admin_command
== "status") {
2516 f
->open_object_section("status");
2517 f
->dump_stream("cluster_fsid") << superblock
.cluster_fsid
;
2518 f
->dump_stream("osd_fsid") << superblock
.osd_fsid
;
2519 f
->dump_unsigned("whoami", superblock
.whoami
);
2520 f
->dump_string("state", get_state_name(get_state()));
2521 f
->dump_unsigned("oldest_map", superblock
.oldest_map
);
2522 f
->dump_unsigned("newest_map", superblock
.newest_map
);
2523 f
->dump_unsigned("num_pgs", num_pgs
);
2525 } else if (admin_command
== "flush_journal") {
2526 store
->flush_journal();
2527 } else if (admin_command
== "dump_ops_in_flight" ||
2528 admin_command
== "ops" ||
2529 admin_command
== "dump_blocked_ops" ||
2530 admin_command
== "dump_historic_ops" ||
2531 admin_command
== "dump_historic_ops_by_duration" ||
2532 admin_command
== "dump_historic_slow_ops") {
2534 const string error_str
= "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2535 even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2536 will start to track new ops received afterwards.";
2538 set
<string
> filters
;
2539 vector
<string
> filter_str
;
2540 if (cmd_getval(cct
, cmdmap
, "filterstr", filter_str
)) {
2541 copy(filter_str
.begin(), filter_str
.end(),
2542 inserter(filters
, filters
.end()));
2545 if (admin_command
== "dump_ops_in_flight" ||
2546 admin_command
== "ops") {
2547 if (!op_tracker
.dump_ops_in_flight(f
, false, filters
)) {
2551 if (admin_command
== "dump_blocked_ops") {
2552 if (!op_tracker
.dump_ops_in_flight(f
, true, filters
)) {
2556 if (admin_command
== "dump_historic_ops") {
2557 if (!op_tracker
.dump_historic_ops(f
, false, filters
)) {
2561 if (admin_command
== "dump_historic_ops_by_duration") {
2562 if (!op_tracker
.dump_historic_ops(f
, true, filters
)) {
2566 if (admin_command
== "dump_historic_slow_ops") {
2567 if (!op_tracker
.dump_historic_slow_ops(f
, filters
)) {
2571 } else if (admin_command
== "dump_op_pq_state") {
2572 f
->open_object_section("pq");
2573 op_shardedwq
.dump(f
);
2575 } else if (admin_command
== "dump_blacklist") {
2576 list
<pair
<entity_addr_t
,utime_t
> > bl
;
2577 OSDMapRef curmap
= service
.get_osdmap();
2579 f
->open_array_section("blacklist");
2580 curmap
->get_blacklist(&bl
);
2581 for (list
<pair
<entity_addr_t
,utime_t
> >::iterator it
= bl
.begin();
2582 it
!= bl
.end(); ++it
) {
2583 f
->open_object_section("entry");
2584 f
->open_object_section("entity_addr_t");
2586 f
->close_section(); //entity_addr_t
2587 it
->second
.localtime(f
->dump_stream("expire_time"));
2588 f
->close_section(); //entry
2590 f
->close_section(); //blacklist
2591 } else if (admin_command
== "dump_watchers") {
2592 list
<obj_watch_item_t
> watchers
;
2596 for (auto& pg
: pgs
) {
2597 list
<obj_watch_item_t
> pg_watchers
;
2598 pg
->get_watchers(&pg_watchers
);
2599 watchers
.splice(watchers
.end(), pg_watchers
);
2602 f
->open_array_section("watchers");
2603 for (list
<obj_watch_item_t
>::iterator it
= watchers
.begin();
2604 it
!= watchers
.end(); ++it
) {
2606 f
->open_object_section("watch");
2608 f
->dump_string("namespace", it
->obj
.nspace
);
2609 f
->dump_string("object", it
->obj
.oid
.name
);
2611 f
->open_object_section("entity_name");
2612 it
->wi
.name
.dump(f
);
2613 f
->close_section(); //entity_name_t
2615 f
->dump_unsigned("cookie", it
->wi
.cookie
);
2616 f
->dump_unsigned("timeout", it
->wi
.timeout_seconds
);
2618 f
->open_object_section("entity_addr_t");
2619 it
->wi
.addr
.dump(f
);
2620 f
->close_section(); //entity_addr_t
2622 f
->close_section(); //watch
2625 f
->close_section(); //watchers
2626 } else if (admin_command
== "dump_recovery_reservations") {
2627 f
->open_object_section("reservations");
2628 f
->open_object_section("local_reservations");
2629 service
.local_reserver
.dump(f
);
2631 f
->open_object_section("remote_reservations");
2632 service
.remote_reserver
.dump(f
);
2635 } else if (admin_command
== "dump_scrub_reservations") {
2636 f
->open_object_section("scrub_reservations");
2637 service
.dump_scrub_reservations(f
);
2639 } else if (admin_command
== "get_latest_osdmap") {
2640 get_latest_osdmap();
2641 } else if (admin_command
== "heap") {
2642 auto result
= ceph::osd_cmds::heap(*cct
, cmdmap
, *f
, ss
);
2644 // Note: Failed heap profile commands won't necessarily trigger an error:
2645 f
->open_object_section("result");
2646 f
->dump_string("error", cpp_strerror(result
));
2647 f
->dump_bool("success", result
>= 0);
2649 } else if (admin_command
== "set_heap_property") {
2653 bool success
= false;
2654 if (!cmd_getval(cct
, cmdmap
, "property", property
)) {
2655 error
= "unable to get property";
2657 } else if (!cmd_getval(cct
, cmdmap
, "value", value
)) {
2658 error
= "unable to get value";
2660 } else if (value
< 0) {
2661 error
= "negative value not allowed";
2663 } else if (!ceph_heap_set_numeric_property(property
.c_str(), (size_t)value
)) {
2664 error
= "invalid property";
2669 f
->open_object_section("result");
2670 f
->dump_string("error", error
);
2671 f
->dump_bool("success", success
);
2673 } else if (admin_command
== "get_heap_property") {
2677 bool success
= false;
2678 if (!cmd_getval(cct
, cmdmap
, "property", property
)) {
2679 error
= "unable to get property";
2681 } else if (!ceph_heap_get_numeric_property(property
.c_str(), &value
)) {
2682 error
= "invalid property";
2687 f
->open_object_section("result");
2688 f
->dump_string("error", error
);
2689 f
->dump_bool("success", success
);
2690 f
->dump_int("value", value
);
2692 } else if (admin_command
== "dump_objectstore_kv_stats") {
2693 store
->get_db_statistics(f
);
2694 } else if (admin_command
== "dump_scrubs") {
2695 service
.dumps_scrub(f
);
2696 } else if (admin_command
== "calc_objectstore_db_histogram") {
2697 store
->generate_db_histogram(f
);
2698 } else if (admin_command
== "flush_store_cache") {
2699 store
->flush_cache(&ss
);
2700 } else if (admin_command
== "dump_pgstate_history") {
2701 f
->open_object_section("pgstate_history");
2704 for (auto& pg
: pgs
) {
2705 f
->dump_stream("pg") << pg
->pg_id
;
2706 pg
->dump_pgstate_history(f
);
2709 } else if (admin_command
== "compact") {
2710 dout(1) << "triggering manual compaction" << dendl
;
2711 auto start
= ceph::coarse_mono_clock::now();
2713 auto end
= ceph::coarse_mono_clock::now();
2714 double duration
= std::chrono::duration
<double>(end
-start
).count();
2715 dout(1) << "finished manual compaction in "
2717 << " seconds" << dendl
;
2718 f
->open_object_section("compact_result");
2719 f
->dump_float("elapsed_time", duration
);
2721 } else if (admin_command
== "get_mapped_pools") {
2722 f
->open_array_section("mapped_pools");
2723 set
<int64_t> poollist
= get_mapped_pools();
2724 for (auto pool
: poollist
) {
2725 f
->dump_int("pool_id", pool
);
2728 } else if (admin_command
== "smart") {
2730 cmd_getval(cct
, cmdmap
, "devid", devid
);
2731 probe_smart(devid
, ss
);
2732 } else if (admin_command
== "list_devices") {
2733 set
<string
> devnames
;
2734 store
->get_devices(&devnames
);
2735 f
->open_object_section("list_devices");
2736 for (auto dev
: devnames
) {
2737 if (dev
.find("dm-") == 0) {
2740 f
->dump_string("device", "/dev/" + dev
);
2743 } else if (admin_command
== "send_beacon") {
2745 send_beacon(ceph::coarse_mono_clock::now());
2747 } else if (admin_command
== "dump_osd_network") {
2749 if (!(cmd_getval(cct
, cmdmap
, "value", value
))) {
2750 // Convert milliseconds to microseconds
2751 value
= static_cast<int64_t>(g_conf().get_val
<double>("mon_warn_on_slow_ping_time")) * 1000;
2753 double ratio
= g_conf().get_val
<double>("mon_warn_on_slow_ping_ratio");
2754 value
= g_conf().get_val
<int64_t>("osd_heartbeat_grace");
2755 value
*= 1000000 * ratio
; // Seconds of grace to microseconds at ratio
2758 // Convert user input to microseconds
2761 if (value
< 0) value
= 0;
2763 struct osd_ping_time_t
{
2767 std::array
<uint32_t,3> times
;
2768 std::array
<uint32_t,3> min
;
2769 std::array
<uint32_t,3> max
;
2771 uint32_t last_update
;
2773 bool operator<(const osd_ping_time_t
& rhs
) const {
2774 if (pingtime
< rhs
.pingtime
)
2776 if (pingtime
> rhs
.pingtime
)
2786 set
<osd_ping_time_t
> sorted
;
2787 // Get pingtimes under lock and not on the stack
2788 map
<int, osd_stat_t::Interfaces
> *pingtimes
= new map
<int, osd_stat_t::Interfaces
>;
2789 service
.get_hb_pingtime(pingtimes
);
2790 for (auto j
: *pingtimes
) {
2791 if (j
.second
.last_update
== 0)
2793 osd_ping_time_t item
;
2794 item
.pingtime
= std::max(j
.second
.back_pingtime
[0], j
.second
.back_pingtime
[1]);
2795 item
.pingtime
= std::max(item
.pingtime
, j
.second
.back_pingtime
[2]);
2796 if (item
.pingtime
>= value
) {
2798 item
.times
[0] = j
.second
.back_pingtime
[0];
2799 item
.times
[1] = j
.second
.back_pingtime
[1];
2800 item
.times
[2] = j
.second
.back_pingtime
[2];
2801 item
.min
[0] = j
.second
.back_min
[0];
2802 item
.min
[1] = j
.second
.back_min
[1];
2803 item
.min
[2] = j
.second
.back_min
[2];
2804 item
.max
[0] = j
.second
.back_max
[0];
2805 item
.max
[1] = j
.second
.back_max
[1];
2806 item
.max
[2] = j
.second
.back_max
[2];
2807 item
.last
= j
.second
.back_last
;
2809 item
.last_update
= j
.second
.last_update
;
2810 sorted
.emplace(item
);
2812 if (j
.second
.front_last
== 0)
2814 item
.pingtime
= std::max(j
.second
.front_pingtime
[0], j
.second
.front_pingtime
[1]);
2815 item
.pingtime
= std::max(item
.pingtime
, j
.second
.front_pingtime
[2]);
2816 if (item
.pingtime
>= value
) {
2818 item
.times
[0] = j
.second
.front_pingtime
[0];
2819 item
.times
[1] = j
.second
.front_pingtime
[1];
2820 item
.times
[2] = j
.second
.front_pingtime
[2];
2821 item
.min
[0] = j
.second
.front_min
[0];
2822 item
.min
[1] = j
.second
.front_min
[1];
2823 item
.min
[2] = j
.second
.front_min
[2];
2824 item
.max
[0] = j
.second
.front_max
[0];
2825 item
.max
[1] = j
.second
.front_max
[1];
2826 item
.max
[2] = j
.second
.front_max
[2];
2827 item
.last
= j
.second
.front_last
;
2828 item
.last_update
= j
.second
.last_update
;
2830 sorted
.emplace(item
);
2835 // Network ping times (1min 5min 15min)
2836 f
->open_object_section("network_ping_times");
2837 f
->dump_int("threshold", value
/ 1000);
2838 f
->open_array_section("entries");
2839 for (auto &sitem
: boost::adaptors::reverse(sorted
)) {
2840 ceph_assert(sitem
.pingtime
>= value
);
2841 f
->open_object_section("entry");
2843 const time_t lu(sitem
.last_update
);
2845 string
lustr(ctime_r(&lu
, buffer
));
2846 lustr
.pop_back(); // Remove trailing \n
2847 auto stale
= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_stale");
2848 f
->dump_string("last update", lustr
);
2849 f
->dump_bool("stale", ceph_clock_now().sec() - sitem
.last_update
> stale
);
2850 f
->dump_int("from osd", whoami
);
2851 f
->dump_int("to osd", sitem
.to
);
2852 f
->dump_string("interface", (sitem
.back
? "back" : "front"));
2853 f
->open_object_section("average");
2854 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.times
[0],3).c_str());
2855 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.times
[1],3).c_str());
2856 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.times
[2],3).c_str());
2857 f
->close_section(); // average
2858 f
->open_object_section("min");
2859 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.max
[0],3).c_str());
2860 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.max
[1],3).c_str());
2861 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.max
[2],3).c_str());
2862 f
->close_section(); // min
2863 f
->open_object_section("max");
2864 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.max
[0],3).c_str());
2865 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.max
[1],3).c_str());
2866 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.max
[2],3).c_str());
2867 f
->close_section(); // max
2868 f
->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem
.last
,3).c_str());
2869 f
->close_section(); // entry
2871 f
->close_section(); // entries
2872 f
->close_section(); // network_ping_times
2874 ceph_abort_msg("broken asok registration");
2881 class TestOpsSocketHook
: public AdminSocketHook
{
2882 OSDService
*service
;
2885 TestOpsSocketHook(OSDService
*s
, ObjectStore
*st
) : service(s
), store(st
) {}
2886 bool call(std::string_view command
, const cmdmap_t
& cmdmap
,
2887 std::string_view format
, bufferlist
& out
) override
{
2890 test_ops(service
, store
, command
, cmdmap
, ss
);
2891 } catch (const bad_cmd_get
& e
) {
2897 void test_ops(OSDService
*service
, ObjectStore
*store
,
2898 std::string_view command
, const cmdmap_t
& cmdmap
, ostream
&ss
);
2902 class OSD::C_Tick
: public Context
{
2905 explicit C_Tick(OSD
*o
) : osd(o
) {}
2906 void finish(int r
) override
{
2911 class OSD::C_Tick_WithoutOSDLock
: public Context
{
2914 explicit C_Tick_WithoutOSDLock(OSD
*o
) : osd(o
) {}
2915 void finish(int r
) override
{
2916 osd
->tick_without_osd_lock();
2920 int OSD::enable_disable_fuse(bool stop
)
2924 string mntpath
= cct
->_conf
->osd_data
+ "/fuse";
2925 if (fuse_store
&& (stop
|| !cct
->_conf
->osd_objectstore_fuse
)) {
2926 dout(1) << __func__
<< " disabling" << dendl
;
2930 r
= ::rmdir(mntpath
.c_str());
2933 derr
<< __func__
<< " failed to rmdir " << mntpath
<< ": "
2934 << cpp_strerror(r
) << dendl
;
2939 if (!fuse_store
&& cct
->_conf
->osd_objectstore_fuse
) {
2940 dout(1) << __func__
<< " enabling" << dendl
;
2941 r
= ::mkdir(mntpath
.c_str(), 0700);
2944 if (r
< 0 && r
!= -EEXIST
) {
2945 derr
<< __func__
<< " unable to create " << mntpath
<< ": "
2946 << cpp_strerror(r
) << dendl
;
2949 fuse_store
= new FuseStore(store
, mntpath
);
2950 r
= fuse_store
->start();
2952 derr
<< __func__
<< " unable to start fuse: " << cpp_strerror(r
) << dendl
;
2958 #endif // HAVE_LIBFUSE
2962 int OSD::get_num_op_shards()
2964 if (cct
->_conf
->osd_op_num_shards
)
2965 return cct
->_conf
->osd_op_num_shards
;
2966 if (store_is_rotational
)
2967 return cct
->_conf
->osd_op_num_shards_hdd
;
2969 return cct
->_conf
->osd_op_num_shards_ssd
;
2972 int OSD::get_num_op_threads()
2974 if (cct
->_conf
->osd_op_num_threads_per_shard
)
2975 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard
;
2976 if (store_is_rotational
)
2977 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard_hdd
;
2979 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard_ssd
;
2982 float OSD::get_osd_recovery_sleep()
2984 if (cct
->_conf
->osd_recovery_sleep
)
2985 return cct
->_conf
->osd_recovery_sleep
;
2986 if (!store_is_rotational
&& !journal_is_rotational
)
2987 return cct
->_conf
->osd_recovery_sleep_ssd
;
2988 else if (store_is_rotational
&& !journal_is_rotational
)
2989 return cct
->_conf
.get_val
<double>("osd_recovery_sleep_hybrid");
2991 return cct
->_conf
->osd_recovery_sleep_hdd
;
2994 float OSD::get_osd_delete_sleep()
2996 float osd_delete_sleep
= cct
->_conf
.get_val
<double>("osd_delete_sleep");
2997 if (osd_delete_sleep
> 0)
2998 return osd_delete_sleep
;
2999 if (!store_is_rotational
&& !journal_is_rotational
)
3000 return cct
->_conf
.get_val
<double>("osd_delete_sleep_ssd");
3001 if (store_is_rotational
&& !journal_is_rotational
)
3002 return cct
->_conf
.get_val
<double>("osd_delete_sleep_hybrid");
3003 return cct
->_conf
.get_val
<double>("osd_delete_sleep_hdd");
3006 float OSD::get_osd_snap_trim_sleep()
3008 float osd_snap_trim_sleep
= cct
->_conf
.get_val
<double>("osd_snap_trim_sleep");
3009 if (osd_snap_trim_sleep
> 0)
3010 return osd_snap_trim_sleep
;
3011 if (!store_is_rotational
&& !journal_is_rotational
)
3012 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_ssd");
3013 if (store_is_rotational
&& !journal_is_rotational
)
3014 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_hybrid");
3015 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_hdd");
3020 CompatSet initial
, diff
;
3021 std::lock_guard
lock(osd_lock
);
3026 tick_timer_without_osd_lock
.init();
3027 service
.recovery_request_timer
.init();
3028 service
.sleep_timer
.init();
3030 boot_finisher
.start();
3034 store
->read_meta("require_osd_release", &val
);
3035 last_require_osd_release
= atoi(val
.c_str());
3039 dout(2) << "init " << dev_path
3040 << " (looks like " << (store_is_rotational
? "hdd" : "ssd") << ")"
3042 dout(2) << "journal " << journal_path
<< dendl
;
3043 ceph_assert(store
); // call pre_init() first!
3045 store
->set_cache_shards(get_num_op_shards());
3047 int r
= store
->mount();
3049 derr
<< "OSD:init: unable to mount object store" << dendl
;
3052 journal_is_rotational
= store
->is_journal_rotational();
3053 dout(2) << "journal looks like " << (journal_is_rotational
? "hdd" : "ssd")
3056 enable_disable_fuse(false);
3058 dout(2) << "boot" << dendl
;
3060 service
.meta_ch
= store
->open_collection(coll_t::meta());
3062 // initialize the daily loadavg with current 15min loadavg
3064 if (getloadavg(loadavgs
, 3) == 3) {
3065 daily_loadavg
= loadavgs
[2];
3067 derr
<< "OSD::init() : couldn't read loadavgs\n" << dendl
;
3068 daily_loadavg
= 1.0;
3071 int rotating_auth_attempts
= 0;
3072 auto rotating_auth_timeout
=
3073 g_conf().get_val
<int64_t>("rotating_keys_bootstrap_timeout");
3075 // sanity check long object name handling
3078 l
.oid
.name
= string(cct
->_conf
->osd_max_object_name_len
, 'n');
3079 l
.set_key(string(cct
->_conf
->osd_max_object_name_len
, 'k'));
3080 l
.nspace
= string(cct
->_conf
->osd_max_object_namespace_len
, 's');
3081 r
= store
->validate_hobject_key(l
);
3083 derr
<< "backend (" << store
->get_type() << ") is unable to support max "
3084 << "object name[space] len" << dendl
;
3085 derr
<< " osd max object name len = "
3086 << cct
->_conf
->osd_max_object_name_len
<< dendl
;
3087 derr
<< " osd max object namespace len = "
3088 << cct
->_conf
->osd_max_object_namespace_len
<< dendl
;
3089 derr
<< cpp_strerror(r
) << dendl
;
3090 if (cct
->_conf
->osd_check_max_object_name_len_on_startup
) {
3093 derr
<< "osd_check_max_object_name_len_on_startup = false, starting anyway"
3096 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl
;
3101 r
= read_superblock();
3103 derr
<< "OSD::init() : unable to read osd superblock" << dendl
;
3108 if (osd_compat
.compare(superblock
.compat_features
) < 0) {
3109 derr
<< "The disk uses features unsupported by the executable." << dendl
;
3110 derr
<< " ondisk features " << superblock
.compat_features
<< dendl
;
3111 derr
<< " daemon features " << osd_compat
<< dendl
;
3113 if (osd_compat
.writeable(superblock
.compat_features
)) {
3114 CompatSet diff
= osd_compat
.unsupported(superblock
.compat_features
);
3115 derr
<< "it is still writeable, though. Missing features: " << diff
<< dendl
;
3120 CompatSet diff
= osd_compat
.unsupported(superblock
.compat_features
);
3121 derr
<< "Cannot write to disk! Missing features: " << diff
<< dendl
;
3127 assert_warn(whoami
== superblock
.whoami
);
3128 if (whoami
!= superblock
.whoami
) {
3129 derr
<< "OSD::init: superblock says osd"
3130 << superblock
.whoami
<< " but I am osd." << whoami
<< dendl
;
3135 // load up "current" osdmap
3136 assert_warn(!osdmap
);
3138 derr
<< "OSD::init: unable to read current osdmap" << dendl
;
3142 osdmap
= get_map(superblock
.current_epoch
);
3144 // make sure we don't have legacy pgs deleting
3147 int r
= store
->list_collections(ls
);
3148 ceph_assert(r
>= 0);
3151 if (c
.is_pg(&pgid
) &&
3152 !osdmap
->have_pg_pool(pgid
.pool())) {
3153 ghobject_t oid
= make_final_pool_info_oid(pgid
.pool());
3154 if (!store
->exists(service
.meta_ch
, oid
)) {
3155 derr
<< __func__
<< " missing pg_pool_t for deleted pool "
3156 << pgid
.pool() << " for pg " << pgid
3157 << "; please downgrade to luminous and allow "
3158 << "pg deletion to complete before upgrading" << dendl
;
3165 initial
= get_osd_initial_compat_set();
3166 diff
= superblock
.compat_features
.unsupported(initial
);
3167 if (superblock
.compat_features
.merge(initial
)) {
3168 // We need to persist the new compat_set before we
3170 dout(5) << "Upgrading superblock adding: " << diff
<< dendl
;
3171 ObjectStore::Transaction t
;
3172 write_superblock(t
);
3173 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3178 // make sure snap mapper object exists
3179 if (!store
->exists(service
.meta_ch
, OSD::make_snapmapper_oid())) {
3180 dout(10) << "init creating/touching snapmapper object" << dendl
;
3181 ObjectStore::Transaction t
;
3182 t
.touch(coll_t::meta(), OSD::make_snapmapper_oid());
3183 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3188 class_handler
= new ClassHandler(cct
);
3189 cls_initialize(class_handler
);
3191 if (cct
->_conf
->osd_open_classes_on_start
) {
3192 int r
= class_handler
->open_all_classes();
3194 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r
) << dendl
;
3197 check_osdmap_features();
3199 create_recoverystate_perf();
3202 epoch_t bind_epoch
= osdmap
->get_epoch();
3203 service
.set_epochs(NULL
, NULL
, &bind_epoch
);
3206 clear_temp_objects();
3208 // initialize osdmap references in sharded wq
3209 for (auto& shard
: shards
) {
3210 std::lock_guard
l(shard
->osdmap_lock
);
3211 shard
->shard_osdmap
= osdmap
;
3214 // load up pgs (as they previously existed)
3217 dout(2) << "superblock: I am osd." << superblock
.whoami
<< dendl
;
3218 dout(0) << "using " << op_queue
<< " op queue with priority op cut off at " <<
3219 op_prio_cutoff
<< "." << dendl
;
3225 struct store_statfs_t stbuf
;
3226 osd_alert_list_t alerts
;
3227 int r
= store
->statfs(&stbuf
, &alerts
);
3228 ceph_assert(r
== 0);
3229 service
.set_statfs(stbuf
, alerts
);
3232 // client_messenger auth_client is already set up by monc.
3233 for (auto m
: { cluster_messenger
,
3235 hb_front_client_messenger
,
3236 hb_back_client_messenger
,
3237 hb_front_server_messenger
,
3238 hb_back_server_messenger
} ) {
3239 m
->set_auth_client(monc
);
3241 for (auto m
: { client_messenger
,
3243 hb_front_server_messenger
,
3244 hb_back_server_messenger
}) {
3245 m
->set_auth_server(monc
);
3247 monc
->set_handle_authentication_dispatcher(this);
3249 monc
->set_want_keys(CEPH_ENTITY_TYPE_MON
| CEPH_ENTITY_TYPE_OSD
3250 | CEPH_ENTITY_TYPE_MGR
);
3255 mgrc
.set_pgstats_cb([this](){ return collect_pg_stats(); });
3256 mgrc
.set_perf_metric_query_cb(
3257 [this](const std::map
<OSDPerfMetricQuery
, OSDPerfMetricLimits
> &queries
) {
3258 set_perf_queries(queries
);
3260 [this](std::map
<OSDPerfMetricQuery
, OSDPerfMetricReport
> *reports
) {
3261 get_perf_reports(reports
);
3265 // tell monc about log_client so it will know about mon session resets
3266 monc
->set_log_client(&log_client
);
3267 update_log_config();
3270 client_messenger
->add_dispatcher_tail(&mgrc
);
3271 client_messenger
->add_dispatcher_tail(this);
3272 cluster_messenger
->add_dispatcher_head(this);
3274 hb_front_client_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3275 hb_back_client_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3276 hb_front_server_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3277 hb_back_server_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3279 objecter_messenger
->add_dispatcher_head(service
.objecter
);
3282 service
.publish_map(osdmap
);
3283 service
.publish_superblock(superblock
);
3284 service
.max_oldest_map
= superblock
.oldest_map
;
3286 for (auto& shard
: shards
) {
3287 // put PGs in a temporary set because we may modify pg_slots
3288 // unordered_map below.
3290 for (auto& i
: shard
->pg_slots
) {
3291 PGRef pg
= i
.second
->pg
;
3297 for (auto pg
: pgs
) {
3299 set
<pair
<spg_t
,epoch_t
>> new_children
;
3300 set
<pair
<spg_t
,epoch_t
>> merge_pgs
;
3301 service
.identify_splits_and_merges(pg
->get_osdmap(), osdmap
, pg
->pg_id
,
3302 &new_children
, &merge_pgs
);
3303 if (!new_children
.empty()) {
3304 for (auto shard
: shards
) {
3305 shard
->prime_splits(osdmap
, &new_children
);
3307 assert(new_children
.empty());
3309 if (!merge_pgs
.empty()) {
3310 for (auto shard
: shards
) {
3311 shard
->prime_merges(osdmap
, &merge_pgs
);
3313 assert(merge_pgs
.empty());
3322 // start the heartbeat
3323 heartbeat_thread
.create("osd_srv_heartbt");
3326 tick_timer
.add_event_after(get_tick_interval(),
3329 std::lock_guard
l(tick_timer_lock
);
3330 tick_timer_without_osd_lock
.add_event_after(get_tick_interval(),
3331 new C_Tick_WithoutOSDLock(this));
3336 r
= monc
->authenticate();
3338 derr
<< __func__
<< " authentication failed: " << cpp_strerror(r
)
3343 while (monc
->wait_auth_rotating(rotating_auth_timeout
) < 0) {
3344 derr
<< "unable to obtain rotating service keys; retrying" << dendl
;
3345 ++rotating_auth_attempts
;
3346 if (rotating_auth_attempts
> g_conf()->max_rotating_auth_attempts
) {
3347 derr
<< __func__
<< " wait_auth_rotating timed out" << dendl
;
3352 r
= update_crush_device_class();
3354 derr
<< __func__
<< " unable to update_crush_device_class: "
3355 << cpp_strerror(r
) << dendl
;
3359 r
= update_crush_location();
3361 derr
<< __func__
<< " unable to update_crush_location: "
3362 << cpp_strerror(r
) << dendl
;
3370 // start objecter *after* we have authenticated, so that we don't ignore
3371 // the OSDMaps it requests.
3372 service
.final_init();
3376 dout(10) << "ensuring pgs have consumed prior maps" << dendl
;
3379 dout(0) << "done with init, starting boot process" << dendl
;
3381 // subscribe to any pg creations
3382 monc
->sub_want("osd_pg_creates", last_pg_create_epoch
, 0);
3384 // MgrClient needs this (it doesn't have MonClient reference itself)
3385 monc
->sub_want("mgrmap", 0, 0);
3387 // we don't need to ask for an osdmap here; objecter will
3388 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
3397 enable_disable_fuse(true);
3404 void OSD::final_init()
3406 AdminSocket
*admin_socket
= cct
->get_admin_socket();
3407 asok_hook
= new OSDSocketHook(this);
3408 int r
= admin_socket
->register_command("status", "status", asok_hook
,
3409 "high-level status of OSD");
3410 ceph_assert(r
== 0);
3411 r
= admin_socket
->register_command("flush_journal", "flush_journal",
3413 "flush the journal to permanent store");
3414 ceph_assert(r
== 0);
3415 r
= admin_socket
->register_command("dump_ops_in_flight",
3416 "dump_ops_in_flight " \
3417 "name=filterstr,type=CephString,n=N,req=false",
3419 "show the ops currently in flight");
3420 ceph_assert(r
== 0);
3421 r
= admin_socket
->register_command("ops",
3423 "name=filterstr,type=CephString,n=N,req=false",
3425 "show the ops currently in flight");
3426 ceph_assert(r
== 0);
3427 r
= admin_socket
->register_command("dump_blocked_ops",
3428 "dump_blocked_ops " \
3429 "name=filterstr,type=CephString,n=N,req=false",
3431 "show the blocked ops currently in flight");
3432 ceph_assert(r
== 0);
3433 r
= admin_socket
->register_command("dump_historic_ops",
3434 "dump_historic_ops " \
3435 "name=filterstr,type=CephString,n=N,req=false",
3438 ceph_assert(r
== 0);
3439 r
= admin_socket
->register_command("dump_historic_slow_ops",
3440 "dump_historic_slow_ops " \
3441 "name=filterstr,type=CephString,n=N,req=false",
3443 "show slowest recent ops");
3444 ceph_assert(r
== 0);
3445 r
= admin_socket
->register_command("dump_historic_ops_by_duration",
3446 "dump_historic_ops_by_duration " \
3447 "name=filterstr,type=CephString,n=N,req=false",
3449 "show slowest recent ops, sorted by duration");
3450 ceph_assert(r
== 0);
3451 r
= admin_socket
->register_command("dump_op_pq_state", "dump_op_pq_state",
3453 "dump op priority queue state");
3454 ceph_assert(r
== 0);
3455 r
= admin_socket
->register_command("dump_blacklist", "dump_blacklist",
3457 "dump blacklisted clients and times");
3458 ceph_assert(r
== 0);
3459 r
= admin_socket
->register_command("dump_watchers", "dump_watchers",
3461 "show clients which have active watches,"
3462 " and on which objects");
3463 ceph_assert(r
== 0);
3464 r
= admin_socket
->register_command("dump_recovery_reservations", "dump_recovery_reservations",
3466 "show recovery reservations");
3467 ceph_assert(r
== 0);
3468 r
= admin_socket
->register_command("dump_scrub_reservations", "dump_scrub_reservations",
3470 "show scrub reservations");
3471 ceph_assert(r
== 0);
3472 r
= admin_socket
->register_command("get_latest_osdmap", "get_latest_osdmap",
3474 "force osd to update the latest map from "
3476 ceph_assert(r
== 0);
3478 r
= admin_socket
->register_command( "heap",
3480 "name=heapcmd,type=CephString " \
3481 "name=value,type=CephString,req=false",
3483 "show heap usage info (available only if "
3484 "compiled with tcmalloc)");
3485 ceph_assert(r
== 0);
3487 r
= admin_socket
->register_command("set_heap_property",
3488 "set_heap_property " \
3489 "name=property,type=CephString " \
3490 "name=value,type=CephInt",
3492 "update malloc extension heap property");
3493 ceph_assert(r
== 0);
3495 r
= admin_socket
->register_command("get_heap_property",
3496 "get_heap_property " \
3497 "name=property,type=CephString",
3499 "get malloc extension heap property");
3500 ceph_assert(r
== 0);
3502 r
= admin_socket
->register_command("dump_objectstore_kv_stats",
3503 "dump_objectstore_kv_stats",
3505 "print statistics of kvdb which used by bluestore");
3506 ceph_assert(r
== 0);
3508 r
= admin_socket
->register_command("dump_scrubs",
3511 "print scheduled scrubs");
3512 ceph_assert(r
== 0);
3514 r
= admin_socket
->register_command("calc_objectstore_db_histogram",
3515 "calc_objectstore_db_histogram",
3517 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
3518 ceph_assert(r
== 0);
3520 r
= admin_socket
->register_command("flush_store_cache",
3521 "flush_store_cache",
3523 "Flush bluestore internal cache");
3524 ceph_assert(r
== 0);
3525 r
= admin_socket
->register_command("dump_pgstate_history", "dump_pgstate_history",
3527 "show recent state history");
3528 ceph_assert(r
== 0);
3530 r
= admin_socket
->register_command("compact", "compact",
3532 "Commpact object store's omap."
3533 " WARNING: Compaction probably slows your requests");
3534 ceph_assert(r
== 0);
3536 r
= admin_socket
->register_command("get_mapped_pools", "get_mapped_pools",
3538 "dump pools whose PG(s) are mapped to this OSD.");
3540 ceph_assert(r
== 0);
3542 r
= admin_socket
->register_command("smart", "smart name=devid,type=CephString,req=False",
3544 "probe OSD devices for SMART data.");
3546 ceph_assert(r
== 0);
3548 r
= admin_socket
->register_command("list_devices", "list_devices",
3550 "list OSD devices.");
3551 r
= admin_socket
->register_command("send_beacon", "send_beacon",
3553 "send OSD beacon to mon immediately");
3555 r
= admin_socket
->register_command("dump_osd_network", "dump_osd_network name=value,type=CephInt,req=false", asok_hook
,
3556 "Dump osd heartbeat network ping times");
3557 ceph_assert(r
== 0);
3559 test_ops_hook
= new TestOpsSocketHook(&(this->service
), this->store
);
3560 // Note: pools are CephString instead of CephPoolname because
3561 // these commands traditionally support both pool names and numbers
3562 r
= admin_socket
->register_command(
3565 "name=pool,type=CephString " \
3566 "name=objname,type=CephObjectname " \
3567 "name=key,type=CephString "\
3568 "name=val,type=CephString",
3571 ceph_assert(r
== 0);
3572 r
= admin_socket
->register_command(
3575 "name=pool,type=CephString " \
3576 "name=objname,type=CephObjectname " \
3577 "name=key,type=CephString",
3580 ceph_assert(r
== 0);
3581 r
= admin_socket
->register_command(
3584 "name=pool,type=CephString " \
3585 "name=objname,type=CephObjectname " \
3586 "name=header,type=CephString",
3589 ceph_assert(r
== 0);
3591 r
= admin_socket
->register_command(
3594 "name=pool,type=CephString " \
3595 "name=objname,type=CephObjectname",
3597 "output entire object map");
3598 ceph_assert(r
== 0);
3600 r
= admin_socket
->register_command(
3603 "name=pool,type=CephString " \
3604 "name=objname,type=CephObjectname " \
3605 "name=len,type=CephInt",
3607 "truncate object to length");
3608 ceph_assert(r
== 0);
3610 r
= admin_socket
->register_command(
3613 "name=pool,type=CephString " \
3614 "name=objname,type=CephObjectname " \
3615 "name=shardid,type=CephInt,req=false,range=0|255",
3617 "inject data error to an object");
3618 ceph_assert(r
== 0);
3620 r
= admin_socket
->register_command(
3623 "name=pool,type=CephString " \
3624 "name=objname,type=CephObjectname " \
3625 "name=shardid,type=CephInt,req=false,range=0|255",
3627 "inject metadata error to an object");
3628 ceph_assert(r
== 0);
3629 r
= admin_socket
->register_command(
3630 "set_recovery_delay",
3631 "set_recovery_delay " \
3632 "name=utime,type=CephInt,req=false",
3634 "Delay osd recovery by specified seconds");
3635 ceph_assert(r
== 0);
3636 r
= admin_socket
->register_command(
3639 "name=pgid,type=CephString " \
3640 "name=time,type=CephInt,req=false",
3642 "Trigger a scheduled scrub ");
3643 ceph_assert(r
== 0);
3644 r
= admin_socket
->register_command(
3645 "trigger_deep_scrub",
3646 "trigger_deep_scrub " \
3647 "name=pgid,type=CephString " \
3648 "name=time,type=CephInt,req=false",
3650 "Trigger a scheduled deep scrub ");
3651 ceph_assert(r
== 0);
3652 r
= admin_socket
->register_command(
3655 "name=type,type=CephString,req=false " \
3656 "name=count,type=CephInt,req=false ",
3658 "Inject a full disk (optional count times)");
3659 ceph_assert(r
== 0);
3662 void OSD::create_logger()
3664 dout(10) << "create_logger" << dendl
;
3666 PerfCountersBuilder
osd_plb(cct
, "osd", l_osd_first
, l_osd_last
);
3668 // Latency axis configuration for op histograms, values are in nanoseconds
3669 PerfHistogramCommon::axis_config_d op_hist_x_axis_config
{
3671 PerfHistogramCommon::SCALE_LOG2
, ///< Latency in logarithmic scale
3673 100000, ///< Quantization unit is 100usec
3674 32, ///< Enough to cover much longer than slow requests
3677 // Op size axis configuration for op histograms, values are in bytes
3678 PerfHistogramCommon::axis_config_d op_hist_y_axis_config
{
3679 "Request size (bytes)",
3680 PerfHistogramCommon::SCALE_LOG2
, ///< Request size in logarithmic scale
3682 512, ///< Quantization unit is 512 bytes
3683 32, ///< Enough to cover requests larger than GB
3687 // All the basic OSD operation stats are to be considered useful
3688 osd_plb
.set_prio_default(PerfCountersBuilder::PRIO_USEFUL
);
3691 l_osd_op_wip
, "op_wip",
3692 "Replication operations currently being processed (primary)");
3693 osd_plb
.add_u64_counter(
3695 "Client operations",
3696 "ops", PerfCountersBuilder::PRIO_CRITICAL
);
3697 osd_plb
.add_u64_counter(
3698 l_osd_op_inb
, "op_in_bytes",
3699 "Client operations total write size",
3700 "wr", PerfCountersBuilder::PRIO_INTERESTING
, unit_t(UNIT_BYTES
));
3701 osd_plb
.add_u64_counter(
3702 l_osd_op_outb
, "op_out_bytes",
3703 "Client operations total read size",
3704 "rd", PerfCountersBuilder::PRIO_INTERESTING
, unit_t(UNIT_BYTES
));
3705 osd_plb
.add_time_avg(
3706 l_osd_op_lat
, "op_latency",
3707 "Latency of client operations (including queue time)",
3709 osd_plb
.add_time_avg(
3710 l_osd_op_process_lat
, "op_process_latency",
3711 "Latency of client operations (excluding queue time)");
3712 osd_plb
.add_time_avg(
3713 l_osd_op_prepare_lat
, "op_prepare_latency",
3714 "Latency of client operations (excluding queue time and wait for finished)");
3716 osd_plb
.add_u64_counter(
3717 l_osd_op_r
, "op_r", "Client read operations");
3718 osd_plb
.add_u64_counter(
3719 l_osd_op_r_outb
, "op_r_out_bytes", "Client data read", NULL
, PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
3720 osd_plb
.add_time_avg(
3721 l_osd_op_r_lat
, "op_r_latency",
3722 "Latency of read operation (including queue time)");
3723 osd_plb
.add_u64_counter_histogram(
3724 l_osd_op_r_lat_outb_hist
, "op_r_latency_out_bytes_histogram",
3725 op_hist_x_axis_config
, op_hist_y_axis_config
,
3726 "Histogram of operation latency (including queue time) + data read");
3727 osd_plb
.add_time_avg(
3728 l_osd_op_r_process_lat
, "op_r_process_latency",
3729 "Latency of read operation (excluding queue time)");
3730 osd_plb
.add_time_avg(
3731 l_osd_op_r_prepare_lat
, "op_r_prepare_latency",
3732 "Latency of read operations (excluding queue time and wait for finished)");
3733 osd_plb
.add_u64_counter(
3734 l_osd_op_w
, "op_w", "Client write operations");
3735 osd_plb
.add_u64_counter(
3736 l_osd_op_w_inb
, "op_w_in_bytes", "Client data written");
3737 osd_plb
.add_time_avg(
3738 l_osd_op_w_lat
, "op_w_latency",
3739 "Latency of write operation (including queue time)");
3740 osd_plb
.add_u64_counter_histogram(
3741 l_osd_op_w_lat_inb_hist
, "op_w_latency_in_bytes_histogram",
3742 op_hist_x_axis_config
, op_hist_y_axis_config
,
3743 "Histogram of operation latency (including queue time) + data written");
3744 osd_plb
.add_time_avg(
3745 l_osd_op_w_process_lat
, "op_w_process_latency",
3746 "Latency of write operation (excluding queue time)");
3747 osd_plb
.add_time_avg(
3748 l_osd_op_w_prepare_lat
, "op_w_prepare_latency",
3749 "Latency of write operations (excluding queue time and wait for finished)");
3750 osd_plb
.add_u64_counter(
3751 l_osd_op_rw
, "op_rw",
3752 "Client read-modify-write operations");
3753 osd_plb
.add_u64_counter(
3754 l_osd_op_rw_inb
, "op_rw_in_bytes",
3755 "Client read-modify-write operations write in", NULL
, PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
3756 osd_plb
.add_u64_counter(
3757 l_osd_op_rw_outb
,"op_rw_out_bytes",
3758 "Client read-modify-write operations read out ", NULL
, PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
3759 osd_plb
.add_time_avg(
3760 l_osd_op_rw_lat
, "op_rw_latency",
3761 "Latency of read-modify-write operation (including queue time)");
3762 osd_plb
.add_u64_counter_histogram(
3763 l_osd_op_rw_lat_inb_hist
, "op_rw_latency_in_bytes_histogram",
3764 op_hist_x_axis_config
, op_hist_y_axis_config
,
3765 "Histogram of rw operation latency (including queue time) + data written");
3766 osd_plb
.add_u64_counter_histogram(
3767 l_osd_op_rw_lat_outb_hist
, "op_rw_latency_out_bytes_histogram",
3768 op_hist_x_axis_config
, op_hist_y_axis_config
,
3769 "Histogram of rw operation latency (including queue time) + data read");
3770 osd_plb
.add_time_avg(
3771 l_osd_op_rw_process_lat
, "op_rw_process_latency",
3772 "Latency of read-modify-write operation (excluding queue time)");
3773 osd_plb
.add_time_avg(
3774 l_osd_op_rw_prepare_lat
, "op_rw_prepare_latency",
3775 "Latency of read-modify-write operations (excluding queue time and wait for finished)");
3777 // Now we move on to some more obscure stats, revert to assuming things
3778 // are low priority unless otherwise specified.
3779 osd_plb
.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY
);
3781 osd_plb
.add_time_avg(l_osd_op_before_queue_op_lat
, "op_before_queue_op_lat",
3782 "Latency of IO before calling queue(before really queue into ShardedOpWq)"); // client io before queue op_wq latency
3783 osd_plb
.add_time_avg(l_osd_op_before_dequeue_op_lat
, "op_before_dequeue_op_lat",
3784 "Latency of IO before calling dequeue_op(already dequeued and get PG lock)"); // client io before dequeue_op latency
3786 osd_plb
.add_u64_counter(
3787 l_osd_sop
, "subop", "Suboperations");
3788 osd_plb
.add_u64_counter(
3789 l_osd_sop_inb
, "subop_in_bytes", "Suboperations total size", NULL
, 0, unit_t(UNIT_BYTES
));
3790 osd_plb
.add_time_avg(l_osd_sop_lat
, "subop_latency", "Suboperations latency");
3792 osd_plb
.add_u64_counter(l_osd_sop_w
, "subop_w", "Replicated writes");
3793 osd_plb
.add_u64_counter(
3794 l_osd_sop_w_inb
, "subop_w_in_bytes", "Replicated written data size", NULL
, 0, unit_t(UNIT_BYTES
));
3795 osd_plb
.add_time_avg(
3796 l_osd_sop_w_lat
, "subop_w_latency", "Replicated writes latency");
3797 osd_plb
.add_u64_counter(
3798 l_osd_sop_pull
, "subop_pull", "Suboperations pull requests");
3799 osd_plb
.add_time_avg(
3800 l_osd_sop_pull_lat
, "subop_pull_latency", "Suboperations pull latency");
3801 osd_plb
.add_u64_counter(
3802 l_osd_sop_push
, "subop_push", "Suboperations push messages");
3803 osd_plb
.add_u64_counter(
3804 l_osd_sop_push_inb
, "subop_push_in_bytes", "Suboperations pushed size", NULL
, 0, unit_t(UNIT_BYTES
));
3805 osd_plb
.add_time_avg(
3806 l_osd_sop_push_lat
, "subop_push_latency", "Suboperations push latency");
3808 osd_plb
.add_u64_counter(l_osd_pull
, "pull", "Pull requests sent");
3809 osd_plb
.add_u64_counter(l_osd_push
, "push", "Push messages sent");
3810 osd_plb
.add_u64_counter(l_osd_push_outb
, "push_out_bytes", "Pushed size", NULL
, 0, unit_t(UNIT_BYTES
));
3812 osd_plb
.add_u64_counter(
3813 l_osd_rop
, "recovery_ops",
3814 "Started recovery operations",
3815 "rop", PerfCountersBuilder::PRIO_INTERESTING
);
3817 osd_plb
.add_u64_counter(
3818 l_osd_rbytes
, "recovery_bytes",
3820 "rbt", PerfCountersBuilder::PRIO_INTERESTING
);
3822 osd_plb
.add_u64(l_osd_loadavg
, "loadavg", "CPU load");
3824 l_osd_cached_crc
, "cached_crc", "Total number getting crc from crc_cache");
3826 l_osd_cached_crc_adjusted
, "cached_crc_adjusted",
3827 "Total number getting crc from crc_cache with adjusting");
3828 osd_plb
.add_u64(l_osd_missed_crc
, "missed_crc",
3829 "Total number of crc cache misses");
3831 osd_plb
.add_u64(l_osd_pg
, "numpg", "Placement groups",
3832 "pgs", PerfCountersBuilder::PRIO_USEFUL
);
3834 l_osd_pg_primary
, "numpg_primary",
3835 "Placement groups for which this osd is primary");
3837 l_osd_pg_replica
, "numpg_replica",
3838 "Placement groups for which this osd is replica");
3840 l_osd_pg_stray
, "numpg_stray",
3841 "Placement groups ready to be deleted from this osd");
3843 l_osd_pg_removing
, "numpg_removing",
3844 "Placement groups queued for local deletion", "pgsr",
3845 PerfCountersBuilder::PRIO_USEFUL
);
3847 l_osd_hb_to
, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");
3848 osd_plb
.add_u64_counter(l_osd_map
, "map_messages", "OSD map messages");
3849 osd_plb
.add_u64_counter(l_osd_mape
, "map_message_epochs", "OSD map epochs");
3850 osd_plb
.add_u64_counter(
3851 l_osd_mape_dup
, "map_message_epoch_dups", "OSD map duplicates");
3852 osd_plb
.add_u64_counter(
3853 l_osd_waiting_for_map
, "messages_delayed_for_map",
3854 "Operations waiting for OSD map");
3856 osd_plb
.add_u64_counter(
3857 l_osd_map_cache_hit
, "osd_map_cache_hit", "osdmap cache hit");
3858 osd_plb
.add_u64_counter(
3859 l_osd_map_cache_miss
, "osd_map_cache_miss", "osdmap cache miss");
3860 osd_plb
.add_u64_counter(
3861 l_osd_map_cache_miss_low
, "osd_map_cache_miss_low",
3862 "osdmap cache miss below cache lower bound");
3863 osd_plb
.add_u64_avg(
3864 l_osd_map_cache_miss_low_avg
, "osd_map_cache_miss_low_avg",
3865 "osdmap cache miss, avg distance below cache lower bound");
3866 osd_plb
.add_u64_counter(
3867 l_osd_map_bl_cache_hit
, "osd_map_bl_cache_hit",
3868 "OSDMap buffer cache hits");
3869 osd_plb
.add_u64_counter(
3870 l_osd_map_bl_cache_miss
, "osd_map_bl_cache_miss",
3871 "OSDMap buffer cache misses");
3874 l_osd_stat_bytes
, "stat_bytes", "OSD size", "size",
3875 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
3877 l_osd_stat_bytes_used
, "stat_bytes_used", "Used space", "used",
3878 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
3879 osd_plb
.add_u64(l_osd_stat_bytes_avail
, "stat_bytes_avail", "Available space", NULL
, 0, unit_t(UNIT_BYTES
));
3881 osd_plb
.add_u64_counter(
3882 l_osd_copyfrom
, "copyfrom", "Rados \"copy-from\" operations");
3884 osd_plb
.add_u64_counter(l_osd_tier_promote
, "tier_promote", "Tier promotions");
3885 osd_plb
.add_u64_counter(l_osd_tier_flush
, "tier_flush", "Tier flushes");
3886 osd_plb
.add_u64_counter(
3887 l_osd_tier_flush_fail
, "tier_flush_fail", "Failed tier flushes");
3888 osd_plb
.add_u64_counter(
3889 l_osd_tier_try_flush
, "tier_try_flush", "Tier flush attempts");
3890 osd_plb
.add_u64_counter(
3891 l_osd_tier_try_flush_fail
, "tier_try_flush_fail",
3892 "Failed tier flush attempts");
3893 osd_plb
.add_u64_counter(
3894 l_osd_tier_evict
, "tier_evict", "Tier evictions");
3895 osd_plb
.add_u64_counter(
3896 l_osd_tier_whiteout
, "tier_whiteout", "Tier whiteouts");
3897 osd_plb
.add_u64_counter(
3898 l_osd_tier_dirty
, "tier_dirty", "Dirty tier flag set");
3899 osd_plb
.add_u64_counter(
3900 l_osd_tier_clean
, "tier_clean", "Dirty tier flag cleaned");
3901 osd_plb
.add_u64_counter(
3902 l_osd_tier_delay
, "tier_delay", "Tier delays (agent waiting)");
3903 osd_plb
.add_u64_counter(
3904 l_osd_tier_proxy_read
, "tier_proxy_read", "Tier proxy reads");
3905 osd_plb
.add_u64_counter(
3906 l_osd_tier_proxy_write
, "tier_proxy_write", "Tier proxy writes");
3908 osd_plb
.add_u64_counter(
3909 l_osd_agent_wake
, "agent_wake", "Tiering agent wake up");
3910 osd_plb
.add_u64_counter(
3911 l_osd_agent_skip
, "agent_skip", "Objects skipped by agent");
3912 osd_plb
.add_u64_counter(
3913 l_osd_agent_flush
, "agent_flush", "Tiering agent flushes");
3914 osd_plb
.add_u64_counter(
3915 l_osd_agent_evict
, "agent_evict", "Tiering agent evictions");
3917 osd_plb
.add_u64_counter(
3918 l_osd_object_ctx_cache_hit
, "object_ctx_cache_hit", "Object context cache hits");
3919 osd_plb
.add_u64_counter(
3920 l_osd_object_ctx_cache_total
, "object_ctx_cache_total", "Object context cache lookups");
3922 osd_plb
.add_u64_counter(l_osd_op_cache_hit
, "op_cache_hit");
3923 osd_plb
.add_time_avg(
3924 l_osd_tier_flush_lat
, "osd_tier_flush_lat", "Object flush latency");
3925 osd_plb
.add_time_avg(
3926 l_osd_tier_promote_lat
, "osd_tier_promote_lat", "Object promote latency");
3927 osd_plb
.add_time_avg(
3928 l_osd_tier_r_lat
, "osd_tier_r_lat", "Object proxy read latency");
3930 osd_plb
.add_u64_counter(
3931 l_osd_pg_info
, "osd_pg_info", "PG updated its info (using any method)");
3932 osd_plb
.add_u64_counter(
3933 l_osd_pg_fastinfo
, "osd_pg_fastinfo",
3934 "PG updated its info using fastinfo attr");
3935 osd_plb
.add_u64_counter(
3936 l_osd_pg_biginfo
, "osd_pg_biginfo", "PG updated its biginfo attr");
3938 logger
= osd_plb
.create_perf_counters();
3939 cct
->get_perfcounters_collection()->add(logger
);
3942 void OSD::create_recoverystate_perf()
3944 dout(10) << "create_recoverystate_perf" << dendl
;
3946 PerfCountersBuilder
rs_perf(cct
, "recoverystate_perf", rs_first
, rs_last
);
3948 rs_perf
.add_time_avg(rs_initial_latency
, "initial_latency", "Initial recovery state latency");
3949 rs_perf
.add_time_avg(rs_started_latency
, "started_latency", "Started recovery state latency");
3950 rs_perf
.add_time_avg(rs_reset_latency
, "reset_latency", "Reset recovery state latency");
3951 rs_perf
.add_time_avg(rs_start_latency
, "start_latency", "Start recovery state latency");
3952 rs_perf
.add_time_avg(rs_primary_latency
, "primary_latency", "Primary recovery state latency");
3953 rs_perf
.add_time_avg(rs_peering_latency
, "peering_latency", "Peering recovery state latency");
3954 rs_perf
.add_time_avg(rs_backfilling_latency
, "backfilling_latency", "Backfilling recovery state latency");
3955 rs_perf
.add_time_avg(rs_waitremotebackfillreserved_latency
, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency");
3956 rs_perf
.add_time_avg(rs_waitlocalbackfillreserved_latency
, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency");
3957 rs_perf
.add_time_avg(rs_notbackfilling_latency
, "notbackfilling_latency", "Notbackfilling recovery state latency");
3958 rs_perf
.add_time_avg(rs_repnotrecovering_latency
, "repnotrecovering_latency", "Repnotrecovering recovery state latency");
3959 rs_perf
.add_time_avg(rs_repwaitrecoveryreserved_latency
, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency");
3960 rs_perf
.add_time_avg(rs_repwaitbackfillreserved_latency
, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency");
3961 rs_perf
.add_time_avg(rs_reprecovering_latency
, "reprecovering_latency", "RepRecovering recovery state latency");
3962 rs_perf
.add_time_avg(rs_activating_latency
, "activating_latency", "Activating recovery state latency");
3963 rs_perf
.add_time_avg(rs_waitlocalrecoveryreserved_latency
, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency");
3964 rs_perf
.add_time_avg(rs_waitremoterecoveryreserved_latency
, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency");
3965 rs_perf
.add_time_avg(rs_recovering_latency
, "recovering_latency", "Recovering recovery state latency");
3966 rs_perf
.add_time_avg(rs_recovered_latency
, "recovered_latency", "Recovered recovery state latency");
3967 rs_perf
.add_time_avg(rs_clean_latency
, "clean_latency", "Clean recovery state latency");
3968 rs_perf
.add_time_avg(rs_active_latency
, "active_latency", "Active recovery state latency");
3969 rs_perf
.add_time_avg(rs_replicaactive_latency
, "replicaactive_latency", "Replicaactive recovery state latency");
3970 rs_perf
.add_time_avg(rs_stray_latency
, "stray_latency", "Stray recovery state latency");
3971 rs_perf
.add_time_avg(rs_getinfo_latency
, "getinfo_latency", "Getinfo recovery state latency");
3972 rs_perf
.add_time_avg(rs_getlog_latency
, "getlog_latency", "Getlog recovery state latency");
3973 rs_perf
.add_time_avg(rs_waitactingchange_latency
, "waitactingchange_latency", "Waitactingchange recovery state latency");
3974 rs_perf
.add_time_avg(rs_incomplete_latency
, "incomplete_latency", "Incomplete recovery state latency");
3975 rs_perf
.add_time_avg(rs_down_latency
, "down_latency", "Down recovery state latency");
3976 rs_perf
.add_time_avg(rs_getmissing_latency
, "getmissing_latency", "Getmissing recovery state latency");
3977 rs_perf
.add_time_avg(rs_waitupthru_latency
, "waitupthru_latency", "Waitupthru recovery state latency");
3978 rs_perf
.add_time_avg(rs_notrecovering_latency
, "notrecovering_latency", "Notrecovering recovery state latency");
3980 recoverystate_perf
= rs_perf
.create_perf_counters();
3981 cct
->get_perfcounters_collection()->add(recoverystate_perf
);
// NOTE(review): this is the body of what appears to be OSD::shutdown() —
// the signature line is missing from this line-mangled dump, and many
// interior lines were dropped by the extraction (numbering gaps).  Code
// tokens below are kept byte-identical; only comments have been added.
3986 if (!service
.prepare_to_stop())
3987 return 0; // already shutting down
3989 if (is_stopping()) {
3993 dout(0) << "shutdown" << dendl
;
3995 set_state(STATE_STOPPING
);
// Crank debug levels up on the way down when osd_debug_shutdown is set, so
// shutdown-ordering problems are captured in the logs.
3998 if (cct
->_conf
.get_val
<bool>("osd_debug_shutdown")) {
3999 cct
->_conf
.set_val("debug_osd", "100");
4000 cct
->_conf
.set_val("debug_journal", "100");
4001 cct
->_conf
.set_val("debug_filestore", "100");
4002 cct
->_conf
.set_val("debug_bluestore", "100");
4003 cct
->_conf
.set_val("debug_ms", "100");
4004 cct
->_conf
.apply_changes(nullptr);
4007 // stop MgrClient earlier as it's more like an internal consumer of OSD
4010 service
.start_shutdown();
4012 // stop sending work to pgs. this just prevents any new work in _process
4013 // from racing with on_shutdown and potentially entering the pg after.
4014 op_shardedwq
.drain();
4020 for (auto pg
: pgs
) {
4025 // drain op queue again (in case PGs requeued something)
4026 op_shardedwq
.drain();
4028 finished
.clear(); // zap waiters (bleh, this is messy)
4029 waiting_for_osdmap
.clear();
4032 // unregister commands
4033 cct
->get_admin_socket()->unregister_commands(asok_hook
);
4037 cct
->get_admin_socket()->unregister_commands(test_ops_hook
);
4038 delete test_ops_hook
;
4039 test_ops_hook
= NULL
;
// Stop the heartbeat thread: set the stop flag and signal under the lock,
// then join outside it.
4043 heartbeat_lock
.Lock();
4044 heartbeat_stop
= true;
4045 heartbeat_cond
.Signal();
4046 heartbeat_lock
.Unlock();
4047 heartbeat_thread
.join();
4051 dout(10) << "op sharded tp stopped" << dendl
;
4055 dout(10) << "command tp stopped" << dendl
;
4057 dout(10) << "stopping agent" << dendl
;
4058 service
.agent_stop();
4060 boot_finisher
.wait_for_empty();
4064 boot_finisher
.stop();
4065 reset_heartbeat_peers(true);
4067 tick_timer
.shutdown();
4070 std::lock_guard
l(tick_timer_lock
);
4071 tick_timer_without_osd_lock
.shutdown();
4074 // note unmount epoch
4075 dout(10) << "noting clean unmount in epoch " << osdmap
->get_epoch() << dendl
;
// Record boot/clean-thru epochs in the superblock and persist it via a
// transaction on the meta collection.
4076 superblock
.mounted
= service
.get_boot_epoch();
4077 superblock
.clean_thru
= osdmap
->get_epoch();
4078 ObjectStore::Transaction t
;
4079 write_superblock(t
);
4080 int r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
4082 derr
<< "OSD::shutdown: error writing superblock: "
4083 << cpp_strerror(r
) << dendl
;
4087 service
.shutdown_reserver();
4090 #ifdef PG_DEBUG_REFS
4091 service
.dump_live_pgids();
4095 _get_pgs(&pgs
, true);
// Sanity-check PG reference counts on the way down; anything with more
// than the single remaining ref is reported (and, presumably, asserted on
// when osd_shutdown_pgref_assert is set — the branch body is missing here).
4099 for (auto& pg
: pgs
) {
4100 if (pg
->is_deleted()) {
4103 dout(20) << " kicking pg " << pg
<< dendl
;
4105 if (pg
->get_num_ref() != 1) {
4106 derr
<< "pgid " << pg
->get_pgid() << " has ref count of "
4107 << pg
->get_num_ref() << dendl
;
4108 #ifdef PG_DEBUG_REFS
4109 pg
->dump_live_ids();
4111 if (cct
->_conf
->osd_shutdown_pgref_assert
) {
4119 #ifdef PG_DEBUG_REFS
4120 service
.dump_live_pgids();
4124 cct
->_conf
.remove_observer(this);
4127 service
.meta_ch
.reset();
4129 dout(10) << "syncing store" << dendl
;
4130 enable_disable_fuse(true);
4132 if (cct
->_conf
->osd_journal_flush_on_shutdown
) {
4133 dout(10) << "flushing journal" << dendl
;
4134 store
->flush_journal();
// Drop OSDMap references — the global one under map_lock, then each
// shard's copy under its own osdmap_lock.
4140 map_lock
.get_write();
4141 osdmap
= OSDMapRef();
4142 map_lock
.put_write();
4144 for (auto s
: shards
) {
4145 std::lock_guard
l(s
->osdmap_lock
);
4146 s
->shard_osdmap
= OSDMapRef();
4150 std::lock_guard
lock(osd_lock
);
4154 dout(10) << "Store synced" << dendl
;
4156 op_tracker
.on_shutdown();
4158 class_handler
->shutdown();
// Shut down every messenger this OSD owns.
4159 client_messenger
->shutdown();
4160 cluster_messenger
->shutdown();
4161 hb_front_client_messenger
->shutdown();
4162 hb_back_client_messenger
->shutdown();
4163 objecter_messenger
->shutdown();
4164 hb_front_server_messenger
->shutdown();
4165 hb_back_server_messenger
->shutdown();
4170 int OSD::mon_cmd_maybe_osd_create(string
&cmd
)
4172 bool created
= false;
4174 dout(10) << __func__
<< " cmd: " << cmd
<< dendl
;
4175 vector
<string
> vcmd
{cmd
};
4179 monc
->start_mon_command(vcmd
, inbl
, NULL
, &outs
, &w
);
4182 if (r
== -ENOENT
&& !created
) {
4183 string newcmd
= "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami
)
4184 + ", \"uuid\": \"" + stringify(superblock
.osd_fsid
) + "\"}";
4185 vector
<string
> vnewcmd
{newcmd
};
4189 monc
->start_mon_command(vnewcmd
, inbl
, NULL
, &outs
, &w
);
4192 derr
<< __func__
<< " fail: osd does not exist and created failed: "
4193 << cpp_strerror(r
) << dendl
;
4199 derr
<< __func__
<< " fail: '" << outs
<< "': " << cpp_strerror(r
) << dendl
;
4208 int OSD::update_crush_location()
4210 if (!cct
->_conf
->osd_crush_update_on_start
) {
4211 dout(10) << __func__
<< " osd_crush_update_on_start = false" << dendl
;
4216 if (cct
->_conf
->osd_crush_initial_weight
>= 0) {
4217 snprintf(weight
, sizeof(weight
), "%.4lf", cct
->_conf
->osd_crush_initial_weight
);
4219 struct store_statfs_t st
;
4220 osd_alert_list_t alerts
;
4221 int r
= store
->statfs(&st
, &alerts
);
4223 derr
<< "statfs: " << cpp_strerror(r
) << dendl
;
4226 snprintf(weight
, sizeof(weight
), "%.4lf",
4229 double(1ull << 40 /* TB */)));
4232 std::multimap
<string
,string
> loc
= cct
->crush_location
.get_location();
4233 dout(10) << __func__
<< " crush location is " << loc
<< dendl
;
4236 string("{\"prefix\": \"osd crush create-or-move\", ") +
4237 string("\"id\": ") + stringify(whoami
) + string(", ") +
4238 string("\"weight\":") + weight
+ string(", ") +
4239 string("\"args\": [");
4240 for (multimap
<string
,string
>::iterator p
= loc
.begin(); p
!= loc
.end(); ++p
) {
4241 if (p
!= loc
.begin())
4243 cmd
+= "\"" + p
->first
+ "=" + p
->second
+ "\"";
4247 return mon_cmd_maybe_osd_create(cmd
);
4250 int OSD::update_crush_device_class()
4252 if (!cct
->_conf
->osd_class_update_on_start
) {
4253 dout(10) << __func__
<< " osd_class_update_on_start = false" << dendl
;
4257 string device_class
;
4258 int r
= store
->read_meta("crush_device_class", &device_class
);
4259 if (r
< 0 || device_class
.empty()) {
4260 device_class
= store
->get_default_device_class();
4263 if (device_class
.empty()) {
4264 dout(20) << __func__
<< " no device class stored locally" << dendl
;
4269 string("{\"prefix\": \"osd crush set-device-class\", ") +
4270 string("\"class\": \"") + device_class
+ string("\", ") +
4271 string("\"ids\": [\"") + stringify(whoami
) + string("\"]}");
4273 r
= mon_cmd_maybe_osd_create(cmd
);
4275 // good, already bound to a device-class
4282 void OSD::write_superblock(ObjectStore::Transaction
& t
)
4284 dout(10) << "write_superblock " << superblock
<< dendl
;
4286 //hack: at minimum it's using the baseline feature set
4287 if (!superblock
.compat_features
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE
))
4288 superblock
.compat_features
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE
);
4291 encode(superblock
, bl
);
4292 t
.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT
, 0, bl
.length(), bl
);
4295 int OSD::read_superblock()
4298 int r
= store
->read(service
.meta_ch
, OSD_SUPERBLOCK_GOBJECT
, 0, 0, bl
);
4302 auto p
= bl
.cbegin();
4303 decode(superblock
, p
);
4305 dout(10) << "read_superblock " << superblock
<< dendl
;
4310 void OSD::clear_temp_objects()
4312 dout(10) << __func__
<< dendl
;
4314 store
->list_collections(ls
);
4315 for (vector
<coll_t
>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
4317 if (!p
->is_pg(&pgid
))
4320 // list temp objects
4321 dout(20) << " clearing temps in " << *p
<< " pgid " << pgid
<< dendl
;
4323 vector
<ghobject_t
> temps
;
4326 vector
<ghobject_t
> objects
;
4327 auto ch
= store
->open_collection(*p
);
4329 store
->collection_list(ch
, next
, ghobject_t::get_max(),
4330 store
->get_ideal_list_max(),
4332 if (objects
.empty())
4334 vector
<ghobject_t
>::iterator q
;
4335 for (q
= objects
.begin(); q
!= objects
.end(); ++q
) {
4336 // Hammer set pool for temps to -1, so check for clean-up
4337 if (q
->hobj
.is_temp() || (q
->hobj
.pool
== -1)) {
4338 temps
.push_back(*q
);
4343 // If we saw a non-temp object and hit the break above we can
4344 // break out of the while loop too.
4345 if (q
!= objects
.end())
4348 if (!temps
.empty()) {
4349 ObjectStore::Transaction t
;
4351 for (vector
<ghobject_t
>::iterator q
= temps
.begin(); q
!= temps
.end(); ++q
) {
4352 dout(20) << " removing " << *p
<< " object " << *q
<< dendl
;
4354 if (++removed
> cct
->_conf
->osd_target_transaction_size
) {
4355 store
->queue_transaction(service
.meta_ch
, std::move(t
));
4356 t
= ObjectStore::Transaction();
4361 store
->queue_transaction(service
.meta_ch
, std::move(t
));
4367 void OSD::recursive_remove_collection(CephContext
* cct
,
4368 ObjectStore
*store
, spg_t pgid
,
4374 make_snapmapper_oid());
4376 ObjectStore::CollectionHandle ch
= store
->open_collection(tmp
);
4377 ObjectStore::Transaction t
;
4378 SnapMapper
mapper(cct
, &driver
, 0, 0, 0, pgid
.shard
);
4381 int max
= cct
->_conf
->osd_target_transaction_size
;
4382 vector
<ghobject_t
> objects
;
4383 objects
.reserve(max
);
4386 store
->collection_list(ch
, next
, ghobject_t::get_max(),
4387 max
, &objects
, &next
);
4388 generic_dout(10) << __func__
<< " " << objects
<< dendl
;
4389 if (objects
.empty())
4391 for (auto& p
: objects
) {
4392 OSDriver::OSTransaction
_t(driver
.get_transaction(&t
));
4393 int r
= mapper
.remove_oid(p
.hobj
, &_t
);
4394 if (r
!= 0 && r
!= -ENOENT
)
4398 int r
= store
->queue_transaction(ch
, std::move(t
));
4399 ceph_assert(r
== 0);
4400 t
= ObjectStore::Transaction();
4402 t
.remove_collection(tmp
);
4403 int r
= store
->queue_transaction(ch
, std::move(t
));
4404 ceph_assert(r
== 0);
4407 if (!ch
->flush_commit(&waiter
)) {
4413 // ======================================================
4417 OSDMapRef createmap
,
4420 dout(10) << __func__
<< " " << pgid
<< dendl
;
4422 map
<string
,string
> ec_profile
;
4424 if (createmap
->have_pg_pool(pgid
.pool())) {
4425 pi
= *createmap
->get_pg_pool(pgid
.pool());
4426 name
= createmap
->get_pool_name(pgid
.pool());
4427 if (pi
.is_erasure()) {
4428 ec_profile
= createmap
->get_erasure_code_profile(pi
.erasure_code_profile
);
4431 // pool was deleted; grab final pg_pool_t off disk.
4432 ghobject_t oid
= make_final_pool_info_oid(pgid
.pool());
4434 int r
= store
->read(service
.meta_ch
, oid
, 0, 0, bl
);
4436 derr
<< __func__
<< " missing pool " << pgid
.pool() << " tombstone"
4440 ceph_assert(r
>= 0);
4441 auto p
= bl
.cbegin();
4444 if (p
.end()) { // dev release v13.0.2 did not include ec_profile
4445 derr
<< __func__
<< " missing ec_profile from pool " << pgid
.pool()
4446 << " tombstone" << dendl
;
4449 decode(ec_profile
, p
);
4451 PGPool
pool(cct
, createmap
, pgid
.pool(), pi
, name
);
4453 if (pi
.type
== pg_pool_t::TYPE_REPLICATED
||
4454 pi
.type
== pg_pool_t::TYPE_ERASURE
)
4455 pg
= new PrimaryLogPG(&service
, createmap
, pool
, ec_profile
, pgid
);
4461 void OSD::_get_pgs(vector
<PGRef
> *v
, bool clear_too
)
4464 v
->reserve(get_num_pgs());
4465 for (auto& s
: shards
) {
4466 std::lock_guard
l(s
->shard_lock
);
4467 for (auto& j
: s
->pg_slots
) {
4469 !j
.second
->pg
->is_deleted()) {
4470 v
->push_back(j
.second
->pg
);
4472 s
->_detach_pg(j
.second
.get());
4479 void OSD::_get_pgids(vector
<spg_t
> *v
)
4482 v
->reserve(get_num_pgs());
4483 for (auto& s
: shards
) {
4484 std::lock_guard
l(s
->shard_lock
);
4485 for (auto& j
: s
->pg_slots
) {
4487 !j
.second
->pg
->is_deleted()) {
4488 v
->push_back(j
.first
);
4494 void OSD::register_pg(PGRef pg
)
4496 spg_t pgid
= pg
->get_pgid();
4497 uint32_t shard_index
= pgid
.hash_to_shard(num_shards
);
4498 auto sdata
= shards
[shard_index
];
4499 std::lock_guard
l(sdata
->shard_lock
);
4500 auto r
= sdata
->pg_slots
.emplace(pgid
, make_unique
<OSDShardPGSlot
>());
4501 ceph_assert(r
.second
);
4502 auto *slot
= r
.first
->second
.get();
4503 dout(20) << __func__
<< " " << pgid
<< " " << pg
<< dendl
;
4504 sdata
->_attach_pg(slot
, pg
.get());
// Detach a fully-deleted PG from its shard slot and drop the per-role
// PG counter.  Returns false (presumably — the return lines are
// missing from this extraction) if the slot is gone or is still
// waiting on a merge epoch.
// NOTE(review): this listing is a line-numbered extraction; statements
// are split across lines and some interior lines (braces, early
// returns) were dropped, so only comments are added here.
4507 bool OSD::try_finish_pg_delete(PG
*pg
, unsigned old_pg_num
)
// each PG caches the shard that owns its slot
4509 auto sdata
= pg
->osd_shard
;
// slot lookup and detach happen under the owning shard's lock
4512 std::lock_guard
l(sdata
->shard_lock
);
4513 auto p
= sdata
->pg_slots
.find(pg
->pg_id
);
// no slot for this pg -> nothing to finish
4514 if (p
== sdata
->pg_slots
.end() ||
4516 dout(20) << __func__
<< " " << pg
->pg_id
<< " not found" << dendl
;
// a slot still waiting on a merge epoch cannot be torn down yet
4519 if (p
->second
->waiting_for_merge_epoch
) {
4520 dout(20) << __func__
<< " " << pg
->pg_id
<< " waiting for merge" << dendl
;
4523 dout(20) << __func__
<< " " << pg
->pg_id
<< " " << pg
<< dendl
;
4524 sdata
->_detach_pg(p
->second
.get());
// forget any split children primed from this (now gone) parent
4527 for (auto shard
: shards
) {
4528 shard
->unprime_split_children(pg
->pg_id
, old_pg_num
);
4531 // update pg count now since we might not get an osdmap any time soon.
4532 if (pg
->is_primary())
4533 service
.logger
->dec(l_osd_pg_primary
);
4534 else if (pg
->is_replica())
4535 service
.logger
->dec(l_osd_pg_replica
);
// neither primary nor replica -> counted as a stray
4537 service
.logger
->dec(l_osd_pg_stray
);
4542 PGRef
OSD::_lookup_pg(spg_t pgid
)
4544 uint32_t shard_index
= pgid
.hash_to_shard(num_shards
);
4545 auto sdata
= shards
[shard_index
];
4546 std::lock_guard
l(sdata
->shard_lock
);
4547 auto p
= sdata
->pg_slots
.find(pgid
);
4548 if (p
== sdata
->pg_slots
.end()) {
4551 return p
->second
->pg
;
// Look up pgid and, per the name, return it locked; only a PG that is
// not deleted is returned.
// NOTE(review): the locking and early-return lines are missing from
// this extraction — verify the lock/unlock sequence against upstream.
4554 PGRef
OSD::_lookup_lock_pg(spg_t pgid
)
4556 PGRef pg
= _lookup_pg(pgid
);
// presumably pg->lock() happens between the lookup and this check
// (those lines were dropped) — confirm
4561 if (!pg
->is_deleted()) {
// Public wrapper: forwards straight to _lookup_lock_pg().
4568 PGRef
OSD::lookup_lock_pg(spg_t pgid
)
4570 return _lookup_lock_pg(pgid
);
4573 void OSD::load_pgs()
4575 ceph_assert(osd_lock
.is_locked());
4576 dout(0) << "load_pgs" << dendl
;
4579 auto pghist
= make_pg_num_history_oid();
4581 int r
= store
->read(service
.meta_ch
, pghist
, 0, 0, bl
, 0);
4582 if (r
>= 0 && bl
.length() > 0) {
4583 auto p
= bl
.cbegin();
4584 decode(pg_num_history
, p
);
4586 dout(20) << __func__
<< " pg_num_history " << pg_num_history
<< dendl
;
4590 int r
= store
->list_collections(ls
);
4592 derr
<< "failed to list pgs: " << cpp_strerror(-r
) << dendl
;
4596 for (vector
<coll_t
>::iterator it
= ls
.begin();
4600 if (it
->is_temp(&pgid
) ||
4601 (it
->is_pg(&pgid
) && PG::_has_removal_flag(store
, pgid
))) {
4602 dout(10) << "load_pgs " << *it
4603 << " removing, legacy or flagged for removal pg" << dendl
;
4604 recursive_remove_collection(cct
, store
, pgid
, *it
);
4608 if (!it
->is_pg(&pgid
)) {
4609 dout(10) << "load_pgs ignoring unrecognized " << *it
<< dendl
;
4613 dout(10) << "pgid " << pgid
<< " coll " << coll_t(pgid
) << dendl
;
4614 epoch_t map_epoch
= 0;
4615 int r
= PG::peek_map_epoch(store
, pgid
, &map_epoch
);
4617 derr
<< __func__
<< " unable to peek at " << pgid
<< " metadata, skipping"
4623 if (map_epoch
> 0) {
4624 OSDMapRef pgosdmap
= service
.try_get_map(map_epoch
);
4626 if (!osdmap
->have_pg_pool(pgid
.pool())) {
4627 derr
<< __func__
<< ": could not find map for epoch " << map_epoch
4628 << " on pg " << pgid
<< ", but the pool is not present in the "
4629 << "current map, so this is probably a result of bug 10617. "
4630 << "Skipping the pg for now, you can use ceph-objectstore-tool "
4631 << "to clean it up later." << dendl
;
4634 derr
<< __func__
<< ": have pgid " << pgid
<< " at epoch "
4635 << map_epoch
<< ", but missing map. Crashing."
4637 ceph_abort_msg("Missing map in load_pgs");
4640 pg
= _make_pg(pgosdmap
, pgid
);
4642 pg
= _make_pg(osdmap
, pgid
);
4645 recursive_remove_collection(cct
, store
, pgid
, *it
);
4649 // there can be no waiters here, so we don't call _wake_pg_slot
4652 pg
->ch
= store
->open_collection(pg
->coll
);
4654 // read pg state, log
4655 pg
->read_state(store
);
4658 dout(10) << "load_pgs " << *it
<< " deleting dne" << dendl
;
4661 recursive_remove_collection(cct
, store
, pgid
, *it
);
4665 uint32_t shard_index
= pgid
.hash_to_shard(shards
.size());
4666 assert(NULL
!= shards
[shard_index
]);
4667 store
->set_collection_commit_queue(pg
->coll
, &(shards
[shard_index
]->context_queue
));
4670 pg
->reg_next_scrub();
4672 dout(10) << __func__
<< " loaded " << *pg
<< dendl
;
4678 dout(0) << __func__
<< " opened " << num
<< " pgs" << dendl
;
4682 PGRef
OSD::handle_pg_create_info(const OSDMapRef
& osdmap
,
4683 const PGCreateInfo
*info
)
4685 spg_t pgid
= info
->pgid
;
4687 if (maybe_wait_for_max_pg(osdmap
, pgid
, info
->by_mon
)) {
4688 dout(10) << __func__
<< " hit max pg, dropping" << dendl
;
4692 PG::RecoveryCtx rctx
= create_context();
4694 OSDMapRef startmap
= get_map(info
->epoch
);
4697 int64_t pool_id
= pgid
.pgid
.pool();
4698 const pg_pool_t
*pool
= osdmap
->get_pg_pool(pool_id
);
4700 dout(10) << __func__
<< " ignoring " << pgid
<< ", pool dne" << dendl
;
4703 if (osdmap
->require_osd_release
>= CEPH_RELEASE_NAUTILUS
&&
4704 !pool
->has_flag(pg_pool_t::FLAG_CREATING
)) {
4705 // this ensures we do not process old creating messages after the
4706 // pool's initial pgs have been created (and pg are subsequently
4707 // allowed to split or merge).
4708 dout(20) << __func__
<< " dropping " << pgid
4709 << "create, pool does not have CREATING flag set" << dendl
;
4714 int up_primary
, acting_primary
;
4715 vector
<int> up
, acting
;
4716 startmap
->pg_to_up_acting_osds(
4717 pgid
.pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
4719 const pg_pool_t
* pp
= startmap
->get_pg_pool(pgid
.pool());
4720 if (pp
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
) &&
4721 store
->get_type() != "bluestore") {
4722 clog
->warn() << "pg " << pgid
4723 << " is at risk of silent data corruption: "
4724 << "the pool allows ec overwrites but is not stored in "
4725 << "bluestore, so deep scrubbing will not detect bitrot";
4727 PG::_create(*rctx
.transaction
, pgid
, pgid
.get_split_bits(pp
->get_pg_num()));
4728 PG::_init(*rctx
.transaction
, pgid
, pp
);
4730 int role
= startmap
->calc_pg_role(whoami
, acting
, acting
.size());
4731 if (!pp
->is_replicated() && role
!= pgid
.shard
) {
4735 PGRef pg
= _make_pg(startmap
, pgid
);
4736 pg
->ch
= store
->create_new_collection(pg
->coll
);
4739 uint32_t shard_index
= pgid
.hash_to_shard(shards
.size());
4740 assert(NULL
!= shards
[shard_index
]);
4741 store
->set_collection_commit_queue(pg
->coll
, &(shards
[shard_index
]->context_queue
));
4746 // we are holding the shard lock
4747 ceph_assert(!pg
->is_deleted());
4756 info
->past_intervals
,
4760 if (pg
->is_primary()) {
4761 Mutex::Locker
locker(m_perf_queries_lock
);
4762 pg
->set_dynamic_perf_stats_queries(m_perf_queries
);
4765 pg
->handle_initialize(&rctx
);
4766 pg
->handle_activate_map(&rctx
);
4768 dispatch_context(rctx
, pg
.get(), osdmap
, nullptr);
4770 dout(10) << __func__
<< " new pg " << *pg
<< dendl
;
4774 bool OSD::maybe_wait_for_max_pg(const OSDMapRef
& osdmap
,
4778 const auto max_pgs_per_osd
=
4779 (cct
->_conf
.get_val
<uint64_t>("mon_max_pg_per_osd") *
4780 cct
->_conf
.get_val
<double>("osd_max_pg_per_osd_hard_ratio"));
4782 if (num_pgs
< max_pgs_per_osd
) {
4786 std::lock_guard
l(pending_creates_lock
);
4787 if (is_mon_create
) {
4788 pending_creates_from_mon
++;
4790 bool is_primary
= osdmap
->get_pg_acting_rank(pgid
.pgid
, whoami
) == 0;
4791 pending_creates_from_osd
.emplace(pgid
.pgid
, is_primary
);
4793 dout(1) << __func__
<< " withhold creation of pg " << pgid
4794 << ": " << num_pgs
<< " >= "<< max_pgs_per_osd
<< dendl
;
// Peering only restarts when the PG mapping actually changes (see
// PG::should_restart_peering()), and OSDMap::pg_to_up_acting_osds()
// falls back to the up set when pg_temp is empty — so an empty
// pg_temp cannot be used to force a change.  Instead, produce a
// pg_temp value guaranteed to differ from the current acting set:
// truncate it to its first member when it has more than one,
// otherwise pad it with a -1 entry.
static vector<int32_t> twiddle(const vector<int>& acting) {
  vector<int32_t> twiddled;
  if (acting.size() > 1) {
    twiddled.push_back(acting[0]);
  } else {
    twiddled.assign(acting.begin(), acting.end());
    twiddled.push_back(-1);
  }
  return twiddled;
}
4811 void OSD::resume_creating_pg()
4813 bool do_sub_pg_creates
= false;
4814 bool have_pending_creates
= false;
4816 const auto max_pgs_per_osd
=
4817 (cct
->_conf
.get_val
<uint64_t>("mon_max_pg_per_osd") *
4818 cct
->_conf
.get_val
<double>("osd_max_pg_per_osd_hard_ratio"));
4819 if (max_pgs_per_osd
<= num_pgs
) {
4820 // this could happen if admin decreases this setting before a PG is removed
4823 unsigned spare_pgs
= max_pgs_per_osd
- num_pgs
;
4824 std::lock_guard
l(pending_creates_lock
);
4825 if (pending_creates_from_mon
> 0) {
4826 dout(20) << __func__
<< " pending_creates_from_mon "
4827 << pending_creates_from_mon
<< dendl
;
4828 do_sub_pg_creates
= true;
4829 if (pending_creates_from_mon
>= spare_pgs
) {
4830 spare_pgs
= pending_creates_from_mon
= 0;
4832 spare_pgs
-= pending_creates_from_mon
;
4833 pending_creates_from_mon
= 0;
4836 auto pg
= pending_creates_from_osd
.cbegin();
4837 while (spare_pgs
> 0 && pg
!= pending_creates_from_osd
.cend()) {
4838 dout(20) << __func__
<< " pg " << pg
->first
<< dendl
;
4840 osdmap
->pg_to_up_acting_osds(pg
->first
, nullptr, nullptr, &acting
, nullptr);
4841 service
.queue_want_pg_temp(pg
->first
, twiddle(acting
), true);
4842 pg
= pending_creates_from_osd
.erase(pg
);
4843 do_sub_pg_creates
= true;
4846 have_pending_creates
= (pending_creates_from_mon
> 0 ||
4847 !pending_creates_from_osd
.empty());
4850 bool do_renew_subs
= false;
4851 if (do_sub_pg_creates
) {
4852 if (monc
->sub_want("osd_pg_creates", last_pg_create_epoch
, 0)) {
4853 dout(4) << __func__
<< ": resolicit pg creates from mon since "
4854 << last_pg_create_epoch
<< dendl
;
4855 do_renew_subs
= true;
4858 version_t start
= osdmap
->get_epoch() + 1;
4859 if (have_pending_creates
) {
4860 // don't miss any new osdmap deleting PGs
4861 if (monc
->sub_want("osdmap", start
, 0)) {
4862 dout(4) << __func__
<< ": resolicit osdmap from mon since "
4864 do_renew_subs
= true;
4866 } else if (do_sub_pg_creates
) {
4867 // no need to subscribe the osdmap continuously anymore
4868 // once the pgtemp and/or mon_subscribe(pg_creates) is sent
4869 if (monc
->sub_want_increment("osdmap", start
, CEPH_SUBSCRIBE_ONETIME
)) {
4870 dout(4) << __func__
<< ": re-subscribe osdmap(onetime) since "
4872 do_renew_subs
= true;
4876 if (do_renew_subs
) {
4880 service
.send_pg_temp();
4883 void OSD::build_initial_pg_history(
4886 utime_t created_stamp
,
4890 dout(10) << __func__
<< " " << pgid
<< " created " << created
<< dendl
;
4891 h
->epoch_created
= created
;
4892 h
->epoch_pool_created
= created
;
4893 h
->same_interval_since
= created
;
4894 h
->same_up_since
= created
;
4895 h
->same_primary_since
= created
;
4896 h
->last_scrub_stamp
= created_stamp
;
4897 h
->last_deep_scrub_stamp
= created_stamp
;
4898 h
->last_clean_scrub_stamp
= created_stamp
;
4900 OSDMapRef lastmap
= service
.get_map(created
);
4901 int up_primary
, acting_primary
;
4902 vector
<int> up
, acting
;
4903 lastmap
->pg_to_up_acting_osds(
4904 pgid
.pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
4906 ostringstream debug
;
4907 for (epoch_t e
= created
+ 1; e
<= osdmap
->get_epoch(); ++e
) {
4908 OSDMapRef osdmap
= service
.get_map(e
);
4909 int new_up_primary
, new_acting_primary
;
4910 vector
<int> new_up
, new_acting
;
4911 osdmap
->pg_to_up_acting_osds(
4912 pgid
.pgid
, &new_up
, &new_up_primary
, &new_acting
, &new_acting_primary
);
4914 // this is a bit imprecise, but sufficient?
4915 struct min_size_predicate_t
: public IsPGRecoverablePredicate
{
4916 const pg_pool_t
*pi
;
4917 bool operator()(const set
<pg_shard_t
> &have
) const {
4918 return have
.size() >= pi
->min_size
;
4920 explicit min_size_predicate_t(const pg_pool_t
*i
) : pi(i
) {}
4921 } min_size_predicate(osdmap
->get_pg_pool(pgid
.pgid
.pool()));
4923 bool new_interval
= PastIntervals::check_new_interval(
4930 h
->same_interval_since
,
4931 h
->last_epoch_clean
,
4935 &min_size_predicate
,
4939 h
->same_interval_since
= e
;
4941 h
->same_up_since
= e
;
4943 if (acting_primary
!= new_acting_primary
) {
4944 h
->same_primary_since
= e
;
4946 if (pgid
.pgid
.is_split(lastmap
->get_pg_num(pgid
.pgid
.pool()),
4947 osdmap
->get_pg_num(pgid
.pgid
.pool()),
4949 h
->last_epoch_split
= e
;
4952 acting
= new_acting
;
4953 up_primary
= new_up_primary
;
4954 acting_primary
= new_acting_primary
;
4958 dout(20) << __func__
<< " " << debug
.str() << dendl
;
4959 dout(10) << __func__
<< " " << *h
<< " " << *pi
4960 << " [" << (pi
->empty() ? pair
<epoch_t
,epoch_t
>(0,0) :
4961 pi
->get_bounds()) << ")"
4965 void OSD::_add_heartbeat_peer(int p
)
4971 map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.find(p
);
4972 if (i
== heartbeat_peers
.end()) {
4973 pair
<ConnectionRef
,ConnectionRef
> cons
= service
.get_con_osd_hb(p
, osdmap
->get_epoch());
4976 hi
= &heartbeat_peers
[p
];
4978 RefCountedPtr s
{new HeartbeatSession
{p
}, false};
4979 hi
->hb_interval_start
= ceph_clock_now();
4980 hi
->con_back
= cons
.first
.get();
4981 hi
->con_back
->set_priv(s
);
4983 hi
->con_front
= cons
.second
.get();
4984 hi
->con_front
->set_priv(s
);
4985 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4986 << " " << hi
->con_back
->get_peer_addr()
4987 << " " << hi
->con_front
->get_peer_addr()
4990 hi
->con_front
.reset(NULL
);
4991 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4992 << " " << hi
->con_back
->get_peer_addr()
4998 hi
->epoch
= osdmap
->get_epoch();
5001 void OSD::_remove_heartbeat_peer(int n
)
5003 map
<int,HeartbeatInfo
>::iterator q
= heartbeat_peers
.find(n
);
5004 ceph_assert(q
!= heartbeat_peers
.end());
5005 dout(20) << " removing heartbeat peer osd." << n
5006 << " " << q
->second
.con_back
->get_peer_addr()
5007 << " " << (q
->second
.con_front
? q
->second
.con_front
->get_peer_addr() : entity_addr_t())
5009 q
->second
.con_back
->mark_down();
5010 if (q
->second
.con_front
) {
5011 q
->second
.con_front
->mark_down();
5013 heartbeat_peers
.erase(q
);
// Flag the heartbeat peer set as needing a rebuild; the actual
// recomputation is deferred to maybe_update_heartbeat_peers().
5016 void OSD::need_heartbeat_peer_update()
5020 dout(20) << "need_heartbeat_peer_update" << dendl
;
5021 heartbeat_set_peers_need_update();
5024 void OSD::maybe_update_heartbeat_peers()
5026 ceph_assert(osd_lock
.is_locked());
5028 if (is_waiting_for_healthy() || is_active()) {
5029 utime_t now
= ceph_clock_now();
5030 if (last_heartbeat_resample
== utime_t()) {
5031 last_heartbeat_resample
= now
;
5032 heartbeat_set_peers_need_update();
5033 } else if (!heartbeat_peers_need_update()) {
5034 utime_t dur
= now
- last_heartbeat_resample
;
5035 if (dur
> cct
->_conf
->osd_heartbeat_grace
) {
5036 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur
<< " seconds" << dendl
;
5037 heartbeat_set_peers_need_update();
5038 last_heartbeat_resample
= now
;
5039 // automatically clean up any stale heartbeat peers
5040 // if we are unhealthy, then clean all
5041 reset_heartbeat_peers(is_waiting_for_healthy());
5046 if (!heartbeat_peers_need_update())
5048 heartbeat_clear_peers_need_update();
5050 std::lock_guard
l(heartbeat_lock
);
5052 dout(10) << "maybe_update_heartbeat_peers updating" << dendl
;
5055 // build heartbeat from set
5059 for (auto& pg
: pgs
) {
5060 pg
->with_heartbeat_peers([&](int peer
) {
5061 if (osdmap
->is_up(peer
)) {
5062 _add_heartbeat_peer(peer
);
5068 // include next and previous up osds to ensure we have a fully-connected set
5069 set
<int> want
, extras
;
5070 const int next
= osdmap
->get_next_up_osd_after(whoami
);
5073 int prev
= osdmap
->get_previous_up_osd_before(whoami
);
5074 if (prev
>= 0 && prev
!= next
)
5077 // make sure we have at least **min_down** osds coming from different
5078 // subtree level (e.g., hosts) for fast failure detection.
5079 auto min_down
= cct
->_conf
.get_val
<uint64_t>("mon_osd_min_down_reporters");
5080 auto subtree
= cct
->_conf
.get_val
<string
>("mon_osd_reporter_subtree_level");
5081 osdmap
->get_random_up_osds_by_subtree(
5082 whoami
, subtree
, min_down
, want
, &want
);
5084 for (set
<int>::iterator p
= want
.begin(); p
!= want
.end(); ++p
) {
5085 dout(10) << " adding neighbor peer osd." << *p
<< dendl
;
5087 _add_heartbeat_peer(*p
);
5090 // remove down peers; enumerate extras
5091 map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5092 while (p
!= heartbeat_peers
.end()) {
5093 if (!osdmap
->is_up(p
->first
)) {
5096 _remove_heartbeat_peer(o
);
5099 if (p
->second
.epoch
< osdmap
->get_epoch()) {
5100 extras
.insert(p
->first
);
5106 for (int n
= next
; n
>= 0; ) {
5107 if ((int)heartbeat_peers
.size() >= cct
->_conf
->osd_heartbeat_min_peers
)
5109 if (!extras
.count(n
) && !want
.count(n
) && n
!= whoami
) {
5110 dout(10) << " adding random peer osd." << n
<< dendl
;
5112 _add_heartbeat_peer(n
);
5114 n
= osdmap
->get_next_up_osd_after(n
);
5116 break; // came full circle; stop
5120 for (set
<int>::iterator p
= extras
.begin();
5121 (int)heartbeat_peers
.size() > cct
->_conf
->osd_heartbeat_min_peers
&& p
!= extras
.end();
5125 _remove_heartbeat_peer(*p
);
5128 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers
.size() << " peers, extras " << extras
<< dendl
;
// Drop heartbeat peers: every one of them when 'all' is set, otherwise
// only those whose info is older than the osd_heartbeat_stale cutoff.
// Requires osd_lock held (asserted); takes heartbeat_lock itself.
// NOTE(review): this listing is a line-numbered extraction with some
// interior lines dropped, so only comments are added here.
5131 void OSD::reset_heartbeat_peers(bool all
)
5133 ceph_assert(osd_lock
.is_locked());
5134 dout(10) << "reset_heartbeat_peers" << dendl
;
// staleness cutoff = now - osd_heartbeat_stale seconds
5135 utime_t stale
= ceph_clock_now();
5136 stale
-= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_stale");
5137 std::lock_guard
l(heartbeat_lock
);
5138 for (auto it
= heartbeat_peers
.begin(); it
!= heartbeat_peers
.end();) {
5139 HeartbeatInfo
& hi
= it
->second
;
5140 if (all
|| hi
.is_stale(stale
)) {
// close both connections; a guard for the optional front connection
// is presumably on the dropped line 5142 — confirm
5141 hi
.con_back
->mark_down();
5143 hi
.con_front
->mark_down();
5145 // stop sending failure_report to mon too
5146 failure_queue
.erase(it
->first
);
// erase via post-increment so the loop iterator stays valid
5147 heartbeat_peers
.erase(it
++);
5154 void OSD::handle_osd_ping(MOSDPing
*m
)
5156 if (superblock
.cluster_fsid
!= m
->fsid
) {
5157 dout(20) << "handle_osd_ping from " << m
->get_source_inst()
5158 << " bad fsid " << m
->fsid
<< " != " << superblock
.cluster_fsid
<< dendl
;
5163 int from
= m
->get_source().num();
5165 heartbeat_lock
.Lock();
5166 if (is_stopping()) {
5167 heartbeat_lock
.Unlock();
5172 OSDMapRef curmap
= service
.get_osdmap();
5174 heartbeat_lock
.Unlock();
5181 case MOSDPing::PING
:
5183 if (cct
->_conf
->osd_debug_drop_ping_probability
> 0) {
5184 auto heartbeat_drop
= debug_heartbeat_drops_remaining
.find(from
);
5185 if (heartbeat_drop
!= debug_heartbeat_drops_remaining
.end()) {
5186 if (heartbeat_drop
->second
== 0) {
5187 debug_heartbeat_drops_remaining
.erase(heartbeat_drop
);
5189 --heartbeat_drop
->second
;
5190 dout(5) << "Dropping heartbeat from " << from
5191 << ", " << heartbeat_drop
->second
5192 << " remaining to drop" << dendl
;
5195 } else if (cct
->_conf
->osd_debug_drop_ping_probability
>
5196 ((((double)(rand()%100))/100.0))) {
5198 debug_heartbeat_drops_remaining
.insert(std::make_pair(from
,
5199 cct
->_conf
->osd_debug_drop_ping_duration
)).first
;
5200 dout(5) << "Dropping heartbeat from " << from
5201 << ", " << heartbeat_drop
->second
5202 << " remaining to drop" << dendl
;
5207 if (!cct
->get_heartbeat_map()->is_healthy()) {
5208 dout(10) << "internal heartbeat not healthy, dropping ping request" << dendl
;
5212 Message
*r
= new MOSDPing(monc
->get_fsid(),
5213 curmap
->get_epoch(),
5214 MOSDPing::PING_REPLY
, m
->stamp
,
5215 cct
->_conf
->osd_heartbeat_min_size
);
5216 m
->get_connection()->send_message(r
);
5218 if (curmap
->is_up(from
)) {
5219 service
.note_peer_epoch(from
, m
->map_epoch
);
5221 ConnectionRef con
= service
.get_con_osd_cluster(from
, curmap
->get_epoch());
5223 service
.share_map_peer(from
, con
.get());
5226 } else if (!curmap
->exists(from
) ||
5227 curmap
->get_down_at(from
) > m
->map_epoch
) {
5228 // tell them they have died
5229 Message
*r
= new MOSDPing(monc
->get_fsid(),
5230 curmap
->get_epoch(),
5233 cct
->_conf
->osd_heartbeat_min_size
);
5234 m
->get_connection()->send_message(r
);
5239 case MOSDPing::PING_REPLY
:
5241 map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.find(from
);
5242 if (i
!= heartbeat_peers
.end()) {
5243 auto acked
= i
->second
.ping_history
.find(m
->stamp
);
5244 if (acked
!= i
->second
.ping_history
.end()) {
5245 utime_t now
= ceph_clock_now();
5246 int &unacknowledged
= acked
->second
.second
;
5247 if (m
->get_connection() == i
->second
.con_back
) {
5248 dout(25) << "handle_osd_ping got reply from osd." << from
5249 << " first_tx " << i
->second
.first_tx
5250 << " last_tx " << i
->second
.last_tx
5251 << " last_rx_back " << i
->second
.last_rx_back
<< " -> " << now
5252 << " last_rx_front " << i
->second
.last_rx_front
5254 i
->second
.last_rx_back
= now
;
5255 ceph_assert(unacknowledged
> 0);
5257 // if there is no front con, set both stamps.
5258 if (i
->second
.con_front
== NULL
) {
5259 i
->second
.last_rx_front
= now
;
5260 ceph_assert(unacknowledged
> 0);
5263 } else if (m
->get_connection() == i
->second
.con_front
) {
5264 dout(25) << "handle_osd_ping got reply from osd." << from
5265 << " first_tx " << i
->second
.first_tx
5266 << " last_tx " << i
->second
.last_tx
5267 << " last_rx_back " << i
->second
.last_rx_back
5268 << " last_rx_front " << i
->second
.last_rx_front
<< " -> " << now
5270 i
->second
.last_rx_front
= now
;
5271 ceph_assert(unacknowledged
> 0);
5275 if (unacknowledged
== 0) {
5276 // succeeded in getting all replies
5277 dout(25) << "handle_osd_ping got all replies from osd." << from
5278 << " , erase pending ping(sent at " << m
->stamp
<< ")"
5279 << " and older pending ping(s)"
5282 #define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
5283 ++i
->second
.hb_average_count
;
5284 uint32_t back_pingtime
= ROUND_S_TO_USEC(i
->second
.last_rx_back
- m
->stamp
);
5285 i
->second
.hb_total_back
+= back_pingtime
;
5286 if (back_pingtime
< i
->second
.hb_min_back
)
5287 i
->second
.hb_min_back
= back_pingtime
;
5288 if (back_pingtime
> i
->second
.hb_max_back
)
5289 i
->second
.hb_max_back
= back_pingtime
;
5290 uint32_t front_pingtime
= ROUND_S_TO_USEC(i
->second
.last_rx_front
- m
->stamp
);
5291 i
->second
.hb_total_front
+= front_pingtime
;
5292 if (front_pingtime
< i
->second
.hb_min_front
)
5293 i
->second
.hb_min_front
= front_pingtime
;
5294 if (front_pingtime
> i
->second
.hb_max_front
)
5295 i
->second
.hb_max_front
= front_pingtime
;
5297 ceph_assert(i
->second
.hb_interval_start
!= utime_t());
5298 if (i
->second
.hb_interval_start
== utime_t())
5299 i
->second
.hb_interval_start
= now
;
5300 int64_t hb_avg_time_period
= 60;
5301 if (cct
->_conf
.get_val
<int64_t>("debug_heartbeat_testing_span")) {
5302 hb_avg_time_period
= cct
->_conf
.get_val
<int64_t>("debug_heartbeat_testing_span");
5304 if (now
- i
->second
.hb_interval_start
>= utime_t(hb_avg_time_period
, 0)) {
5305 uint32_t back_avg
= i
->second
.hb_total_back
/ i
->second
.hb_average_count
;
5306 uint32_t back_min
= i
->second
.hb_min_back
;
5307 uint32_t back_max
= i
->second
.hb_max_back
;
5308 uint32_t front_avg
= i
->second
.hb_total_front
/ i
->second
.hb_average_count
;
5309 uint32_t front_min
= i
->second
.hb_min_front
;
5310 uint32_t front_max
= i
->second
.hb_max_front
;
5312 // Reset for new interval
5313 i
->second
.hb_average_count
= 0;
5314 i
->second
.hb_interval_start
= now
;
5315 i
->second
.hb_total_back
= i
->second
.hb_max_back
= 0;
5316 i
->second
.hb_min_back
= UINT_MAX
;
5317 i
->second
.hb_total_front
= i
->second
.hb_max_front
= 0;
5318 i
->second
.hb_min_front
= UINT_MAX
;
5320 // Record per osd interace ping times
5321 // Based on osd_heartbeat_interval ignoring that it is randomly short than this interval
5322 if (i
->second
.hb_back_pingtime
.size() == 0) {
5323 ceph_assert(i
->second
.hb_front_pingtime
.size() == 0);
5324 for (unsigned k
= 0 ; k
< hb_vector_size
; ++k
) {
5325 i
->second
.hb_back_pingtime
.push_back(back_avg
);
5326 i
->second
.hb_back_min
.push_back(back_min
);
5327 i
->second
.hb_back_max
.push_back(back_max
);
5328 i
->second
.hb_front_pingtime
.push_back(front_avg
);
5329 i
->second
.hb_front_min
.push_back(front_min
);
5330 i
->second
.hb_front_max
.push_back(front_max
);
5331 ++i
->second
.hb_index
;
5334 int index
= i
->second
.hb_index
& (hb_vector_size
- 1);
5335 i
->second
.hb_back_pingtime
[index
] = back_avg
;
5336 i
->second
.hb_back_min
[index
] = back_min
;
5337 i
->second
.hb_back_max
[index
] = back_max
;
5338 i
->second
.hb_front_pingtime
[index
] = front_avg
;
5339 i
->second
.hb_front_min
[index
] = front_min
;
5340 i
->second
.hb_front_max
[index
] = front_max
;
5341 ++i
->second
.hb_index
;
5345 std::lock_guard
l(service
.stat_lock
);
5346 service
.osd_stat
.hb_pingtime
[from
].last_update
= now
.sec();
5347 service
.osd_stat
.hb_pingtime
[from
].back_last
= back_pingtime
;
5350 uint32_t min
= UINT_MAX
;
5354 uint32_t size
= (uint32_t)i
->second
.hb_back_pingtime
.size();
5355 for (int32_t k
= size
- 1 ; k
>= 0; --k
) {
5357 int index
= (i
->second
.hb_index
+ k
) % size
;
5358 total
+= i
->second
.hb_back_pingtime
[index
];
5359 if (i
->second
.hb_back_min
[index
] < min
)
5360 min
= i
->second
.hb_back_min
[index
];
5361 if (i
->second
.hb_back_max
[index
] > max
)
5362 max
= i
->second
.hb_back_max
[index
];
5363 if (count
== 1 || count
== 5 || count
== 15) {
5364 service
.osd_stat
.hb_pingtime
[from
].back_pingtime
[which
] = total
/ count
;
5365 service
.osd_stat
.hb_pingtime
[from
].back_min
[which
] = min
;
5366 service
.osd_stat
.hb_pingtime
[from
].back_max
[which
] = max
;
5373 if (i
->second
.con_front
!= NULL
) {
5374 service
.osd_stat
.hb_pingtime
[from
].front_last
= front_pingtime
;
5381 for (int32_t k
= size
- 1 ; k
>= 0; --k
) {
5383 int index
= (i
->second
.hb_index
+ k
) % size
;
5384 total
+= i
->second
.hb_front_pingtime
[index
];
5385 if (i
->second
.hb_front_min
[index
] < min
)
5386 min
= i
->second
.hb_front_min
[index
];
5387 if (i
->second
.hb_front_max
[index
] > max
)
5388 max
= i
->second
.hb_front_max
[index
];
5389 if (count
== 1 || count
== 5 || count
== 15) {
5390 service
.osd_stat
.hb_pingtime
[from
].front_pingtime
[which
] = total
/ count
;
5391 service
.osd_stat
.hb_pingtime
[from
].front_min
[which
] = min
;
5392 service
.osd_stat
.hb_pingtime
[from
].front_max
[which
] = max
;
5401 std::lock_guard
l(service
.stat_lock
);
5402 service
.osd_stat
.hb_pingtime
[from
].back_last
= back_pingtime
;
5403 if (i
->second
.con_front
!= NULL
)
5404 service
.osd_stat
.hb_pingtime
[from
].front_last
= front_pingtime
;
5406 i
->second
.ping_history
.erase(i
->second
.ping_history
.begin(), ++acked
);
5409 if (i
->second
.is_healthy(now
)) {
5410 // Cancel false reports
5411 auto failure_queue_entry
= failure_queue
.find(from
);
5412 if (failure_queue_entry
!= failure_queue
.end()) {
5413 dout(10) << "handle_osd_ping canceling queued "
5414 << "failure report for osd." << from
<< dendl
;
5415 failure_queue
.erase(failure_queue_entry
);
5418 auto failure_pending_entry
= failure_pending
.find(from
);
5419 if (failure_pending_entry
!= failure_pending
.end()) {
5420 dout(10) << "handle_osd_ping canceling in-flight "
5421 << "failure report for osd." << from
<< dendl
;
5422 send_still_alive(curmap
->get_epoch(),
5424 failure_pending_entry
->second
.second
);
5425 failure_pending
.erase(failure_pending_entry
);
5429 // old replies, deprecated by newly sent pings.
5430 dout(10) << "handle_osd_ping no pending ping(sent at " << m
->stamp
5431 << ") is found, treat as covered by newly sent pings "
5438 curmap
->is_up(from
)) {
5439 service
.note_peer_epoch(from
, m
->map_epoch
);
5441 ConnectionRef con
= service
.get_con_osd_cluster(from
, curmap
->get_epoch());
5443 service
.share_map_peer(from
, con
.get());
5450 case MOSDPing::YOU_DIED
:
5451 dout(10) << "handle_osd_ping " << m
->get_source_inst()
5452 << " says i am down in " << m
->map_epoch
<< dendl
;
5453 osdmap_subscribe(curmap
->get_epoch()+1, false);
5457 heartbeat_lock
.Unlock();
// Heartbeat thread main loop: keep sending pings until heartbeat_stop
// is set, sleeping between rounds.  The sleep interval is randomized
// (0.5x..1.4x of osd_heartbeat_interval) to avoid synchronized ping
// storms, unless debug_disable_randomized_ping forces a fixed period.
// NOTE(review): line-numbered extraction; some interior lines (the
// heartbeat() call, loop braces) were dropped — comments only.
5461 void OSD::heartbeat_entry()
5463 std::lock_guard
l(heartbeat_lock
);
5466 while (!heartbeat_stop
) {
// fixed interval for testing...
5470 if (cct
->_conf
.get_val
<bool>("debug_disable_randomized_ping")) {
5471 wait
= (float)cct
->_conf
->osd_heartbeat_interval
;
// ...otherwise randomize the wait within [0.5, 1.4) * interval
5473 wait
= .5 + ((float)(rand() % 10)/10.0) * (float)cct
->_conf
->osd_heartbeat_interval
;
5476 w
.set_from_double(wait
);
5477 dout(30) << "heartbeat_entry sleeping for " << wait
<< dendl
;
// cond wait releases heartbeat_lock while sleeping, reacquires on wake
5478 heartbeat_cond
.WaitInterval(heartbeat_lock
, w
);
5481 dout(30) << "heartbeat_entry woke up" << dendl
;
5485 void OSD::heartbeat_check()
5487 ceph_assert(heartbeat_lock
.is_locked());
5488 utime_t now
= ceph_clock_now();
5490 // check for incoming heartbeats (move me elsewhere?)
5491 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5492 p
!= heartbeat_peers
.end();
5495 if (p
->second
.first_tx
== utime_t()) {
5496 dout(25) << "heartbeat_check we haven't sent ping to osd." << p
->first
5497 << " yet, skipping" << dendl
;
5501 dout(25) << "heartbeat_check osd." << p
->first
5502 << " first_tx " << p
->second
.first_tx
5503 << " last_tx " << p
->second
.last_tx
5504 << " last_rx_back " << p
->second
.last_rx_back
5505 << " last_rx_front " << p
->second
.last_rx_front
5507 if (p
->second
.is_unhealthy(now
)) {
5508 utime_t oldest_deadline
= p
->second
.ping_history
.begin()->second
.first
;
5509 if (p
->second
.last_rx_back
== utime_t() ||
5510 p
->second
.last_rx_front
== utime_t()) {
5511 derr
<< "heartbeat_check: no reply from "
5512 << p
->second
.con_front
->get_peer_addr().get_sockaddr()
5513 << " osd." << p
->first
5514 << " ever on either front or back, first ping sent "
5515 << p
->second
.first_tx
5516 << " (oldest deadline " << oldest_deadline
<< ")"
5519 failure_queue
[p
->first
] = p
->second
.first_tx
;
5521 derr
<< "heartbeat_check: no reply from "
5522 << p
->second
.con_front
->get_peer_addr().get_sockaddr()
5523 << " osd." << p
->first
<< " since back " << p
->second
.last_rx_back
5524 << " front " << p
->second
.last_rx_front
5525 << " (oldest deadline " << oldest_deadline
<< ")"
5528 failure_queue
[p
->first
] = std::min(p
->second
.last_rx_back
, p
->second
.last_rx_front
);
5534 void OSD::heartbeat()
5536 ceph_assert(heartbeat_lock
.is_locked_by_me());
5537 dout(30) << "heartbeat" << dendl
;
5541 int hb_interval
= cct
->_conf
->osd_heartbeat_interval
;
5542 int n_samples
= 86400;
5543 if (hb_interval
> 1) {
5544 n_samples
/= hb_interval
;
5549 if (getloadavg(loadavgs
, 1) == 1) {
5550 logger
->set(l_osd_loadavg
, 100 * loadavgs
[0]);
5551 daily_loadavg
= (daily_loadavg
* (n_samples
- 1) + loadavgs
[0]) / n_samples
;
5552 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg
<< dendl
;
5555 dout(30) << "heartbeat checking stats" << dendl
;
5557 // refresh peer list and osd stats
5558 vector
<int> hb_peers
;
5559 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5560 p
!= heartbeat_peers
.end();
5562 hb_peers
.push_back(p
->first
);
5564 auto new_stat
= service
.set_osd_stat(hb_peers
, get_num_pgs());
5565 dout(5) << __func__
<< " " << new_stat
<< dendl
;
5566 ceph_assert(new_stat
.statfs
.total
);
5569 float ratio
= service
.compute_adjusted_ratio(new_stat
, &pratio
);
5571 service
.check_full_status(ratio
, pratio
);
5573 utime_t now
= ceph_clock_now();
5574 utime_t deadline
= now
;
5575 deadline
+= cct
->_conf
->osd_heartbeat_grace
;
5578 for (map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.begin();
5579 i
!= heartbeat_peers
.end();
5581 int peer
= i
->first
;
5582 i
->second
.last_tx
= now
;
5583 if (i
->second
.first_tx
== utime_t())
5584 i
->second
.first_tx
= now
;
5585 i
->second
.ping_history
[now
] = make_pair(deadline
,
5586 HeartbeatInfo::HEARTBEAT_MAX_CONN
);
5587 if (i
->second
.hb_interval_start
== utime_t())
5588 i
->second
.hb_interval_start
= now
;
5589 dout(30) << "heartbeat sending ping to osd." << peer
<< dendl
;
5590 i
->second
.con_back
->send_message(new MOSDPing(monc
->get_fsid(),
5591 service
.get_osdmap_epoch(),
5592 MOSDPing::PING
, now
,
5593 cct
->_conf
->osd_heartbeat_min_size
));
5595 if (i
->second
.con_front
)
5596 i
->second
.con_front
->send_message(new MOSDPing(monc
->get_fsid(),
5597 service
.get_osdmap_epoch(),
5598 MOSDPing::PING
, now
,
5599 cct
->_conf
->osd_heartbeat_min_size
));
5602 logger
->set(l_osd_hb_to
, heartbeat_peers
.size());
5604 // hmm.. am i all alone?
5605 dout(30) << "heartbeat lonely?" << dendl
;
5606 if (heartbeat_peers
.empty()) {
5607 if (now
- last_mon_heartbeat
> cct
->_conf
->osd_mon_heartbeat_interval
&& is_active()) {
5608 last_mon_heartbeat
= now
;
5609 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl
;
5610 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
5614 dout(30) << "heartbeat done" << dendl
;
5617 bool OSD::heartbeat_reset(Connection
*con
)
5619 std::lock_guard
l(heartbeat_lock
);
5620 auto s
= con
->get_priv();
5621 con
->set_priv(nullptr);
5623 if (is_stopping()) {
5626 auto heartbeat_session
= static_cast<HeartbeatSession
*>(s
.get());
5627 auto p
= heartbeat_peers
.find(heartbeat_session
->peer
);
5628 if (p
!= heartbeat_peers
.end() &&
5629 (p
->second
.con_back
== con
||
5630 p
->second
.con_front
== con
)) {
5631 dout(10) << "heartbeat_reset failed hb con " << con
<< " for osd." << p
->second
.peer
5632 << ", reopening" << dendl
;
5633 if (con
!= p
->second
.con_back
) {
5634 p
->second
.con_back
->mark_down();
5636 p
->second
.con_back
.reset(NULL
);
5637 if (p
->second
.con_front
&& con
!= p
->second
.con_front
) {
5638 p
->second
.con_front
->mark_down();
5640 p
->second
.con_front
.reset(NULL
);
5641 pair
<ConnectionRef
,ConnectionRef
> newcon
= service
.get_con_osd_hb(p
->second
.peer
, p
->second
.epoch
);
5643 p
->second
.con_back
= newcon
.first
.get();
5644 p
->second
.con_back
->set_priv(s
);
5645 if (newcon
.second
) {
5646 p
->second
.con_front
= newcon
.second
.get();
5647 p
->second
.con_front
->set_priv(s
);
5649 p
->second
.ping_history
.clear();
5651 dout(10) << "heartbeat_reset failed hb con " << con
<< " for osd." << p
->second
.peer
5652 << ", raced with osdmap update, closing out peer" << dendl
;
5653 heartbeat_peers
.erase(p
);
5656 dout(10) << "heartbeat_reset closing (old) failed hb con " << con
<< dendl
;
5664 // =========================================
5668 ceph_assert(osd_lock
.is_locked());
5669 dout(10) << "tick" << dendl
;
5671 if (is_active() || is_waiting_for_healthy()) {
5672 maybe_update_heartbeat_peers();
5675 if (is_waiting_for_healthy()) {
5679 if (is_waiting_for_healthy() || is_booting()) {
5680 std::lock_guard
l(heartbeat_lock
);
5681 utime_t now
= ceph_clock_now();
5682 if (now
- last_mon_heartbeat
> cct
->_conf
->osd_mon_heartbeat_interval
) {
5683 last_mon_heartbeat
= now
;
5684 dout(1) << __func__
<< " checking mon for new map" << dendl
;
5685 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
5691 tick_timer
.add_event_after(get_tick_interval(), new C_Tick(this));
5694 void OSD::tick_without_osd_lock()
5696 ceph_assert(tick_timer_lock
.is_locked());
5697 dout(10) << "tick_without_osd_lock" << dendl
;
5699 logger
->set(l_osd_cached_crc
, buffer::get_cached_crc());
5700 logger
->set(l_osd_cached_crc_adjusted
, buffer::get_cached_crc_adjusted());
5701 logger
->set(l_osd_missed_crc
, buffer::get_missed_crc());
5703 // refresh osd stats
5704 struct store_statfs_t stbuf
;
5705 osd_alert_list_t alerts
;
5706 int r
= store
->statfs(&stbuf
, &alerts
);
5707 ceph_assert(r
== 0);
5708 service
.set_statfs(stbuf
, alerts
);
5710 // osd_lock is not being held, which means the OSD state
5711 // might change when doing the monitor report
5712 if (is_active() || is_waiting_for_healthy()) {
5713 heartbeat_lock
.Lock();
5715 heartbeat_lock
.Unlock();
5717 map_lock
.get_read();
5718 std::lock_guard
l(mon_report_lock
);
5721 utime_t now
= ceph_clock_now();
5722 if (service
.need_fullness_update() ||
5723 now
- last_mon_report
> cct
->_conf
->osd_mon_report_interval
) {
5724 last_mon_report
= now
;
5728 map_lock
.put_read();
5730 epoch_t max_waiting_epoch
= 0;
5731 for (auto s
: shards
) {
5732 max_waiting_epoch
= std::max(max_waiting_epoch
,
5733 s
->get_max_waiting_epoch());
5735 if (max_waiting_epoch
> get_osdmap()->get_epoch()) {
5736 dout(20) << __func__
<< " max_waiting_epoch " << max_waiting_epoch
5737 << ", requesting new map" << dendl
;
5738 osdmap_subscribe(superblock
.newest_map
+ 1, false);
5743 if (!scrub_random_backoff()) {
5746 service
.promote_throttle_recalibrate();
5747 resume_creating_pg();
5748 bool need_send_beacon
= false;
5749 const auto now
= ceph::coarse_mono_clock::now();
5751 // borrow lec lock to pretect last_sent_beacon from changing
5752 std::lock_guard l
{min_last_epoch_clean_lock
};
5753 const auto elapsed
= now
- last_sent_beacon
;
5754 if (chrono::duration_cast
<chrono::seconds
>(elapsed
).count() >
5755 cct
->_conf
->osd_beacon_report_interval
) {
5756 need_send_beacon
= true;
5759 if (need_send_beacon
) {
5764 mgrc
.update_daemon_health(get_health_metrics());
5765 service
.kick_recovery_queue();
5766 tick_timer_without_osd_lock
.add_event_after(get_tick_interval(),
5767 new C_Tick_WithoutOSDLock(this));
5771 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5772 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
5773 // setomapheader <pool-id> [namespace/]<obj-name> <header>
5774 // getomap <pool> [namespace/]<obj-name>
5775 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
5776 // injectmdataerr [namespace/]<obj-name> [shardid]
5777 // injectdataerr [namespace/]<obj-name> [shardid]
5779 // set_recovery_delay [utime]
5780 void TestOpsSocketHook::test_ops(OSDService
*service
, ObjectStore
*store
,
5781 std::string_view command
,
5782 const cmdmap_t
& cmdmap
, ostream
&ss
)
5785 //Support changing the omap on a single osd by using the Admin Socket to
5786 //directly request the osd make a change.
5787 if (command
== "setomapval" || command
== "rmomapkey" ||
5788 command
== "setomapheader" || command
== "getomap" ||
5789 command
== "truncobj" || command
== "injectmdataerr" ||
5790 command
== "injectdataerr"
5794 OSDMapRef curmap
= service
->get_osdmap();
5799 cmd_getval(service
->cct
, cmdmap
, "pool", poolstr
);
5800 pool
= curmap
->lookup_pg_pool_name(poolstr
);
5801 //If we can't find it by name then maybe id specified
5802 if (pool
< 0 && isdigit(poolstr
[0]))
5803 pool
= atoll(poolstr
.c_str());
5805 ss
<< "Invalid pool '" << poolstr
<< "''";
5809 string objname
, nspace
;
5810 cmd_getval(service
->cct
, cmdmap
, "objname", objname
);
5811 std::size_t found
= objname
.find_first_of('/');
5812 if (found
!= string::npos
) {
5813 nspace
= objname
.substr(0, found
);
5814 objname
= objname
.substr(found
+1);
5816 object_locator_t
oloc(pool
, nspace
);
5817 r
= curmap
->object_locator_to_pg(object_t(objname
), oloc
, rawpg
);
5820 ss
<< "Invalid namespace/objname";
5825 cmd_getval(service
->cct
, cmdmap
, "shardid", shardid
, int64_t(shard_id_t::NO_SHARD
));
5826 hobject_t
obj(object_t(objname
), string(""), CEPH_NOSNAP
, rawpg
.ps(), pool
, nspace
);
5827 ghobject_t
gobj(obj
, ghobject_t::NO_GEN
, shard_id_t(uint8_t(shardid
)));
5828 spg_t
pgid(curmap
->raw_pg_to_pg(rawpg
), shard_id_t(shardid
));
5829 if (curmap
->pg_is_ec(rawpg
)) {
5830 if ((command
!= "injectdataerr") && (command
!= "injectmdataerr")) {
5831 ss
<< "Must not call on ec pool, except injectdataerr or injectmdataerr";
5836 ObjectStore::Transaction t
;
5838 if (command
== "setomapval") {
5839 map
<string
, bufferlist
> newattrs
;
5842 cmd_getval(service
->cct
, cmdmap
, "key", key
);
5843 cmd_getval(service
->cct
, cmdmap
, "val", valstr
);
5846 newattrs
[key
] = val
;
5847 t
.omap_setkeys(coll_t(pgid
), ghobject_t(obj
), newattrs
);
5848 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
5850 ss
<< "error=" << r
;
5853 } else if (command
== "rmomapkey") {
5856 cmd_getval(service
->cct
, cmdmap
, "key", key
);
5859 t
.omap_rmkeys(coll_t(pgid
), ghobject_t(obj
), keys
);
5860 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
5862 ss
<< "error=" << r
;
5865 } else if (command
== "setomapheader") {
5866 bufferlist newheader
;
5869 cmd_getval(service
->cct
, cmdmap
, "header", headerstr
);
5870 newheader
.append(headerstr
);
5871 t
.omap_setheader(coll_t(pgid
), ghobject_t(obj
), newheader
);
5872 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
5874 ss
<< "error=" << r
;
5877 } else if (command
== "getomap") {
5878 //Debug: Output entire omap
5880 map
<string
, bufferlist
> keyvals
;
5881 auto ch
= store
->open_collection(coll_t(pgid
));
5883 ss
<< "unable to open collection for " << pgid
;
5886 r
= store
->omap_get(ch
, ghobject_t(obj
), &hdrbl
, &keyvals
);
5888 ss
<< "header=" << string(hdrbl
.c_str(), hdrbl
.length());
5889 for (map
<string
, bufferlist
>::iterator it
= keyvals
.begin();
5890 it
!= keyvals
.end(); ++it
)
5891 ss
<< " key=" << (*it
).first
<< " val="
5892 << string((*it
).second
.c_str(), (*it
).second
.length());
5894 ss
<< "error=" << r
;
5897 } else if (command
== "truncobj") {
5899 cmd_getval(service
->cct
, cmdmap
, "len", trunclen
);
5900 t
.truncate(coll_t(pgid
), ghobject_t(obj
), trunclen
);
5901 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
5903 ss
<< "error=" << r
;
5906 } else if (command
== "injectdataerr") {
5907 store
->inject_data_error(gobj
);
5909 } else if (command
== "injectmdataerr") {
5910 store
->inject_mdata_error(gobj
);
5915 if (command
== "set_recovery_delay") {
5917 cmd_getval(service
->cct
, cmdmap
, "utime", delay
, (int64_t)0);
5920 int r
= service
->cct
->_conf
.set_val("osd_recovery_delay_start",
5923 ss
<< "set_recovery_delay: error setting "
5924 << "osd_recovery_delay_start to '" << delay
<< "': error "
5928 service
->cct
->_conf
.apply_changes(nullptr);
5929 ss
<< "set_recovery_delay: set osd_recovery_delay_start "
5930 << "to " << service
->cct
->_conf
->osd_recovery_delay_start
;
5933 if (command
== "trigger_scrub" || command
== "trigger_deep_scrub") {
5935 bool deep
= (command
== "trigger_deep_scrub");
5936 OSDMapRef curmap
= service
->get_osdmap();
5940 cmd_getval(service
->cct
, cmdmap
, "pgid", pgidstr
);
5941 if (!pgid
.parse(pgidstr
.c_str())) {
5942 ss
<< "Invalid pgid specified";
5947 cmd_getval(service
->cct
, cmdmap
, "time", time
, (int64_t)0);
5949 PGRef pg
= service
->osd
->_lookup_lock_pg(pgid
);
5950 if (pg
== nullptr) {
5951 ss
<< "Can't find pg " << pgid
;
5955 if (pg
->is_primary()) {
5956 pg
->unreg_next_scrub();
5957 const pg_pool_t
*p
= curmap
->get_pg_pool(pgid
.pool());
5958 double pool_scrub_max_interval
= 0;
5959 double scrub_max_interval
;
5961 p
->opts
.get(pool_opts_t::DEEP_SCRUB_INTERVAL
, &pool_scrub_max_interval
);
5962 scrub_max_interval
= pool_scrub_max_interval
> 0 ?
5963 pool_scrub_max_interval
: g_conf()->osd_deep_scrub_interval
;
5965 p
->opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &pool_scrub_max_interval
);
5966 scrub_max_interval
= pool_scrub_max_interval
> 0 ?
5967 pool_scrub_max_interval
: g_conf()->osd_scrub_max_interval
;
5969 // Instead of marking must_scrub force a schedule scrub
5970 utime_t stamp
= ceph_clock_now();
5972 stamp
-= scrub_max_interval
;
5974 stamp
-= (float)time
;
5975 stamp
-= 100.0; // push back last scrub more for good measure
5977 pg
->set_last_deep_scrub_stamp(stamp
);
5979 pg
->set_last_scrub_stamp(stamp
);
5981 pg
->reg_next_scrub();
5982 pg
->publish_stats_to_osd();
5983 ss
<< "ok - set" << (deep
? " deep" : "" ) << " stamp " << stamp
;
5985 ss
<< "Not primary";
5990 if (command
== "injectfull") {
5993 OSDService::s_names state
;
5994 cmd_getval(service
->cct
, cmdmap
, "type", type
, string("full"));
5995 cmd_getval(service
->cct
, cmdmap
, "count", count
, (int64_t)-1);
5996 if (type
== "none" || count
== 0) {
6000 state
= service
->get_full_state(type
);
6001 if (state
== OSDService::s_names::INVALID
) {
6002 ss
<< "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
6005 service
->set_injectfull(state
, count
);
6008 ss
<< "Internal error - command=" << command
;
6011 // =========================================
6013 void OSD::ms_handle_connect(Connection
*con
)
6015 dout(10) << __func__
<< " con " << con
<< dendl
;
6016 if (con
->get_peer_type() == CEPH_ENTITY_TYPE_MON
) {
6017 std::lock_guard
l(osd_lock
);
6020 dout(10) << __func__
<< " on mon" << dendl
;
6024 } else if (is_booting()) {
6025 _send_boot(); // resend boot message
6027 map_lock
.get_read();
6028 std::lock_guard
l2(mon_report_lock
);
6030 utime_t now
= ceph_clock_now();
6031 last_mon_report
= now
;
6033 // resend everything, it's a new session
6036 service
.requeue_pg_temp();
6037 service
.clear_sent_ready_to_merge();
6038 service
.send_pg_temp();
6039 service
.send_ready_to_merge();
6040 service
.send_pg_created();
6044 map_lock
.put_read();
6046 send_beacon(ceph::coarse_mono_clock::now());
6050 // full map requests may happen while active or pre-boot
6051 if (requested_full_first
) {
6052 rerequest_full_maps();
6057 void OSD::ms_handle_fast_connect(Connection
*con
)
6059 if (con
->get_peer_type() != CEPH_ENTITY_TYPE_MON
&&
6060 con
->get_peer_type() != CEPH_ENTITY_TYPE_MGR
) {
6061 auto priv
= con
->get_priv();
6062 auto s
= static_cast<Session
*>(priv
.get());
6064 s
= new Session
{cct
, con
};
6065 con
->set_priv(RefCountedPtr
{s
, false});
6066 dout(10) << " new session (outgoing) " << s
<< " con=" << s
->con
6067 << " addr=" << s
->con
->get_peer_addr() << dendl
;
6068 // we don't connect to clients
6069 ceph_assert(con
->get_peer_type() == CEPH_ENTITY_TYPE_OSD
);
6070 s
->entity_name
.set_type(CEPH_ENTITY_TYPE_OSD
);
6075 void OSD::ms_handle_fast_accept(Connection
*con
)
6077 if (con
->get_peer_type() != CEPH_ENTITY_TYPE_MON
&&
6078 con
->get_peer_type() != CEPH_ENTITY_TYPE_MGR
) {
6079 auto priv
= con
->get_priv();
6080 auto s
= static_cast<Session
*>(priv
.get());
6082 s
= new Session
{cct
, con
};
6083 con
->set_priv(RefCountedPtr
{s
, false});
6084 dout(10) << "new session (incoming)" << s
<< " con=" << con
6085 << " addr=" << con
->get_peer_addr()
6086 << " must have raced with connect" << dendl
;
6087 ceph_assert(con
->get_peer_type() == CEPH_ENTITY_TYPE_OSD
);
6088 s
->entity_name
.set_type(CEPH_ENTITY_TYPE_OSD
);
6093 bool OSD::ms_handle_reset(Connection
*con
)
6095 auto s
= con
->get_priv();
6096 auto session
= static_cast<Session
*>(s
.get());
6097 dout(2) << "ms_handle_reset con " << con
<< " session " << session
<< dendl
;
6100 session
->wstate
.reset(con
);
6101 session
->con
->set_priv(nullptr);
6102 session
->con
.reset(); // break con <-> session ref cycle
6103 // note that we break session->con *before* the session_handle_reset
6104 // cleanup below. this avoids a race between us and
6105 // PG::add_backoff, Session::check_backoff, etc.
6106 session_handle_reset(SessionRef
{session
});
6110 bool OSD::ms_handle_refused(Connection
*con
)
6112 if (!cct
->_conf
->osd_fast_fail_on_connection_refused
)
6115 auto priv
= con
->get_priv();
6116 auto session
= static_cast<Session
*>(priv
.get());
6117 dout(2) << "ms_handle_refused con " << con
<< " session " << session
<< dendl
;
6120 int type
= con
->get_peer_type();
6121 // handle only OSD failures here
6122 if (monc
&& (type
== CEPH_ENTITY_TYPE_OSD
)) {
6123 OSDMapRef osdmap
= get_osdmap();
6125 int id
= osdmap
->identify_osd_on_all_channels(con
->get_peer_addr());
6126 if (id
>= 0 && osdmap
->is_up(id
)) {
6127 // I'm cheating mon heartbeat grace logic, because we know it's not going
6128 // to respawn alone. +1 so we won't hit any boundary case.
6129 monc
->send_mon_message(
6133 osdmap
->get_addrs(id
),
6134 cct
->_conf
->osd_heartbeat_grace
+ 1,
6135 osdmap
->get_epoch(),
6136 MOSDFailure::FLAG_IMMEDIATE
| MOSDFailure::FLAG_FAILED
6144 struct C_OSD_GetVersion
: public Context
{
6146 uint64_t oldest
, newest
;
6147 explicit C_OSD_GetVersion(OSD
*o
) : osd(o
), oldest(0), newest(0) {}
6148 void finish(int r
) override
{
6150 osd
->_got_mon_epochs(oldest
, newest
);
6154 void OSD::start_boot()
6156 if (!_is_healthy()) {
6157 // if we are not healthy, do not mark ourselves up (yet)
6158 dout(1) << "not healthy; waiting to boot" << dendl
;
6159 if (!is_waiting_for_healthy())
6160 start_waiting_for_healthy();
6161 // send pings sooner rather than later
6165 dout(1) << __func__
<< dendl
;
6166 set_state(STATE_PREBOOT
);
6167 dout(10) << "start_boot - have maps " << superblock
.oldest_map
6168 << ".." << superblock
.newest_map
<< dendl
;
6169 C_OSD_GetVersion
*c
= new C_OSD_GetVersion(this);
6170 monc
->get_version("osdmap", &c
->newest
, &c
->oldest
, c
);
6173 void OSD::_got_mon_epochs(epoch_t oldest
, epoch_t newest
)
6175 std::lock_guard
l(osd_lock
);
6177 _preboot(oldest
, newest
);
6181 void OSD::_preboot(epoch_t oldest
, epoch_t newest
)
6183 ceph_assert(is_preboot());
6184 dout(10) << __func__
<< " _preboot mon has osdmaps "
6185 << oldest
<< ".." << newest
<< dendl
;
6187 // ensure our local fullness awareness is accurate
6189 std::lock_guard
l(heartbeat_lock
);
6193 // if our map within recent history, try to add ourselves to the osdmap.
6194 if (osdmap
->get_epoch() == 0) {
6195 derr
<< "waiting for initial osdmap" << dendl
;
6196 } else if (osdmap
->is_destroyed(whoami
)) {
6197 derr
<< "osdmap says I am destroyed" << dendl
;
6198 // provide a small margin so we don't livelock seeing if we
6199 // un-destroyed ourselves.
6200 if (osdmap
->get_epoch() > newest
- 1) {
6203 } else if (osdmap
->is_noup(whoami
)) {
6204 derr
<< "osdmap NOUP flag is set, waiting for it to clear" << dendl
;
6205 } else if (!osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
6206 derr
<< "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
6208 } else if (osdmap
->require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
6209 derr
<< "osdmap require_osd_release < luminous; please upgrade to luminous"
6211 } else if (service
.need_fullness_update()) {
6212 derr
<< "osdmap fullness state needs update" << dendl
;
6214 } else if (osdmap
->get_epoch() >= oldest
- 1 &&
6215 osdmap
->get_epoch() + cct
->_conf
->osd_map_message_max
> newest
) {
6217 // wait for pgs to fully catch up in a different thread, since
6218 // this thread might be required for splitting and merging PGs to
6220 boot_finisher
.queue(
6221 new FunctionContext(
6223 std::lock_guard
l(osd_lock
);
6225 dout(10) << __func__
<< " waiting for peering work to drain"
6228 for (auto shard
: shards
) {
6229 shard
->wait_min_pg_epoch(osdmap
->get_epoch());
6240 // get all the latest maps
6241 if (osdmap
->get_epoch() + 1 >= oldest
)
6242 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
6244 osdmap_subscribe(oldest
- 1, true);
6247 void OSD::send_full_update()
6249 if (!service
.need_fullness_update())
6252 if (service
.is_full()) {
6253 state
= CEPH_OSD_FULL
;
6254 } else if (service
.is_backfillfull()) {
6255 state
= CEPH_OSD_BACKFILLFULL
;
6256 } else if (service
.is_nearfull()) {
6257 state
= CEPH_OSD_NEARFULL
;
6260 OSDMap::calc_state_set(state
, s
);
6261 dout(10) << __func__
<< " want state " << s
<< dendl
;
6262 monc
->send_mon_message(new MOSDFull(osdmap
->get_epoch(), state
));
6265 void OSD::start_waiting_for_healthy()
6267 dout(1) << "start_waiting_for_healthy" << dendl
;
6268 set_state(STATE_WAITING_FOR_HEALTHY
);
6269 last_heartbeat_resample
= utime_t();
6271 // subscribe to osdmap updates, in case our peers really are known to be dead
6272 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
6275 bool OSD::_is_healthy()
6277 if (!cct
->get_heartbeat_map()->is_healthy()) {
6278 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl
;
6282 if (is_waiting_for_healthy()) {
6283 utime_t now
= ceph_clock_now();
6284 utime_t grace
= utime_t(cct
->_conf
->osd_max_markdown_period
, 0);
6285 while (!osd_markdown_log
.empty() &&
6286 osd_markdown_log
.front() + grace
< now
)
6287 osd_markdown_log
.pop_front();
6288 if (osd_markdown_log
.size() <= 1) {
6289 dout(5) << __func__
<< " first time marked as down,"
6290 << " try reboot unconditionally" << dendl
;
6293 std::lock_guard
l(heartbeat_lock
);
6294 int num
= 0, up
= 0;
6295 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
6296 p
!= heartbeat_peers
.end();
6298 if (p
->second
.is_healthy(now
))
6302 if ((float)up
< (float)num
* cct
->_conf
->osd_heartbeat_min_healthy_ratio
) {
6303 dout(1) << "is_healthy false -- only " << up
<< "/" << num
<< " up peers (less than "
6304 << int(cct
->_conf
->osd_heartbeat_min_healthy_ratio
* 100.0) << "%)" << dendl
;
6312 void OSD::_send_boot()
6314 dout(10) << "_send_boot" << dendl
;
6315 Connection
*local_connection
=
6316 cluster_messenger
->get_loopback_connection().get();
6317 entity_addrvec_t client_addrs
= client_messenger
->get_myaddrs();
6318 entity_addrvec_t cluster_addrs
= cluster_messenger
->get_myaddrs();
6319 entity_addrvec_t hb_back_addrs
= hb_back_server_messenger
->get_myaddrs();
6320 entity_addrvec_t hb_front_addrs
= hb_front_server_messenger
->get_myaddrs();
6322 dout(20) << " initial client_addrs " << client_addrs
6323 << ", cluster_addrs " << cluster_addrs
6324 << ", hb_back_addrs " << hb_back_addrs
6325 << ", hb_front_addrs " << hb_front_addrs
6327 if (cluster_messenger
->set_addr_unknowns(client_addrs
)) {
6328 dout(10) << " assuming cluster_addrs match client_addrs "
6329 << client_addrs
<< dendl
;
6330 cluster_addrs
= cluster_messenger
->get_myaddrs();
6332 if (auto session
= local_connection
->get_priv(); !session
) {
6333 cluster_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6336 local_connection
= hb_back_server_messenger
->get_loopback_connection().get();
6337 if (hb_back_server_messenger
->set_addr_unknowns(cluster_addrs
)) {
6338 dout(10) << " assuming hb_back_addrs match cluster_addrs "
6339 << cluster_addrs
<< dendl
;
6340 hb_back_addrs
= hb_back_server_messenger
->get_myaddrs();
6342 if (auto session
= local_connection
->get_priv(); !session
) {
6343 hb_back_server_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6346 local_connection
= hb_front_server_messenger
->get_loopback_connection().get();
6347 if (hb_front_server_messenger
->set_addr_unknowns(client_addrs
)) {
6348 dout(10) << " assuming hb_front_addrs match client_addrs "
6349 << client_addrs
<< dendl
;
6350 hb_front_addrs
= hb_front_server_messenger
->get_myaddrs();
6352 if (auto session
= local_connection
->get_priv(); !session
) {
6353 hb_front_server_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6356 // we now know what our front and back addrs will be, and we are
6357 // about to tell the mon what our metadata (including numa bindings)
6358 // are, so now is a good time!
6359 set_numa_affinity();
6361 MOSDBoot
*mboot
= new MOSDBoot(
6362 superblock
, get_osdmap_epoch(), service
.get_boot_epoch(),
6363 hb_back_addrs
, hb_front_addrs
, cluster_addrs
,
6365 dout(10) << " final client_addrs " << client_addrs
6366 << ", cluster_addrs " << cluster_addrs
6367 << ", hb_back_addrs " << hb_back_addrs
6368 << ", hb_front_addrs " << hb_front_addrs
6370 _collect_metadata(&mboot
->metadata
);
6371 monc
->send_mon_message(mboot
);
6372 set_state(STATE_BOOTING
);
6375 void OSD::_collect_metadata(map
<string
,string
> *pm
)
6378 (*pm
)["osd_data"] = dev_path
;
6379 if (store
->get_type() == "filestore") {
6380 // not applicable for bluestore
6381 (*pm
)["osd_journal"] = journal_path
;
6383 (*pm
)["front_addr"] = stringify(client_messenger
->get_myaddrs());
6384 (*pm
)["back_addr"] = stringify(cluster_messenger
->get_myaddrs());
6385 (*pm
)["hb_front_addr"] = stringify(hb_front_server_messenger
->get_myaddrs());
6386 (*pm
)["hb_back_addr"] = stringify(hb_back_server_messenger
->get_myaddrs());
6389 (*pm
)["osd_objectstore"] = store
->get_type();
6390 (*pm
)["rotational"] = store_is_rotational
? "1" : "0";
6391 (*pm
)["journal_rotational"] = journal_is_rotational
? "1" : "0";
6392 (*pm
)["default_device_class"] = store
->get_default_device_class();
6393 store
->collect_metadata(pm
);
6395 collect_sys_info(pm
, cct
);
6397 (*pm
)["front_iface"] = pick_iface(
6399 client_messenger
->get_myaddrs().front().get_sockaddr_storage());
6400 (*pm
)["back_iface"] = pick_iface(
6402 cluster_messenger
->get_myaddrs().front().get_sockaddr_storage());
6408 set
<string
> unknown
;
6409 for (auto nm
: { "front_iface", "back_iface" }) {
6410 if (!(*pm
)[nm
].size()) {
6415 int r
= get_iface_numa_node((*pm
)[nm
], &n
);
6417 unknown
.insert((*pm
)[nm
]);
6425 if (unknown
.size()) {
6426 (*pm
)["network_numa_unknown_ifaces"] = stringify(unknown
);
6428 if (!nodes
.empty()) {
6429 (*pm
)["network_numa_nodes"] = stringify(nodes
);
6431 if (node
>= 0 && nodes
.size() == 1 && unknown
.empty()) {
6432 (*pm
)["network_numa_node"] = stringify(node
);
6436 if (numa_node
>= 0) {
6437 (*pm
)["numa_node"] = stringify(numa_node
);
6438 (*pm
)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size
,
6442 set
<string
> devnames
;
6443 store
->get_devices(&devnames
);
6444 (*pm
)["devices"] = stringify(devnames
);
6446 for (auto& dev
: devnames
) {
6448 string id
= get_device_id(dev
, &err
);
6450 if (!devids
.empty()) {
6453 devids
+= dev
+ "=" + id
;
6455 dout(10) << __func__
<< " no unique device id for " << dev
<< ": "
6459 (*pm
)["device_ids"] = devids
;
6461 dout(10) << __func__
<< " " << *pm
<< dendl
;
6464 void OSD::queue_want_up_thru(epoch_t want
)
6466 map_lock
.get_read();
6467 epoch_t cur
= osdmap
->get_up_thru(whoami
);
6468 std::lock_guard
l(mon_report_lock
);
6469 if (want
> up_thru_wanted
) {
6470 dout(10) << "queue_want_up_thru now " << want
<< " (was " << up_thru_wanted
<< ")"
6471 << ", currently " << cur
6473 up_thru_wanted
= want
;
6476 dout(10) << "queue_want_up_thru want " << want
<< " <= queued " << up_thru_wanted
6477 << ", currently " << cur
6480 map_lock
.put_read();
6483 void OSD::send_alive()
6485 ceph_assert(mon_report_lock
.is_locked());
6486 if (!osdmap
->exists(whoami
))
6488 epoch_t up_thru
= osdmap
->get_up_thru(whoami
);
6489 dout(10) << "send_alive up_thru currently " << up_thru
<< " want " << up_thru_wanted
<< dendl
;
6490 if (up_thru_wanted
> up_thru
) {
6491 dout(10) << "send_alive want " << up_thru_wanted
<< dendl
;
6492 monc
->send_mon_message(new MOSDAlive(osdmap
->get_epoch(), up_thru_wanted
));
6496 void OSD::request_full_map(epoch_t first
, epoch_t last
)
6498 dout(10) << __func__
<< " " << first
<< ".." << last
6499 << ", previously requested "
6500 << requested_full_first
<< ".." << requested_full_last
<< dendl
;
6501 ceph_assert(osd_lock
.is_locked());
6502 ceph_assert(first
> 0 && last
> 0);
6503 ceph_assert(first
<= last
);
6504 ceph_assert(first
>= requested_full_first
); // we shouldn't ever ask for older maps
6505 if (requested_full_first
== 0) {
6507 requested_full_first
= first
;
6508 requested_full_last
= last
;
6509 } else if (last
<= requested_full_last
) {
6513 // additional request
6514 first
= requested_full_last
+ 1;
6515 requested_full_last
= last
;
6517 MMonGetOSDMap
*req
= new MMonGetOSDMap
;
6518 req
->request_full(first
, last
);
6519 monc
->send_mon_message(req
);
6522 void OSD::got_full_map(epoch_t e
)
6524 ceph_assert(requested_full_first
<= requested_full_last
);
6525 ceph_assert(osd_lock
.is_locked());
6526 if (requested_full_first
== 0) {
6527 dout(20) << __func__
<< " " << e
<< ", nothing requested" << dendl
;
6530 if (e
< requested_full_first
) {
6531 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6532 << ".." << requested_full_last
6533 << ", ignoring" << dendl
;
6536 if (e
>= requested_full_last
) {
6537 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6538 << ".." << requested_full_last
<< ", resetting" << dendl
;
6539 requested_full_first
= requested_full_last
= 0;
6543 requested_full_first
= e
+ 1;
6545 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6546 << ".." << requested_full_last
6547 << ", still need more" << dendl
;
6550 void OSD::requeue_failures()
6552 std::lock_guard
l(heartbeat_lock
);
6553 unsigned old_queue
= failure_queue
.size();
6554 unsigned old_pending
= failure_pending
.size();
6555 for (auto p
= failure_pending
.begin(); p
!= failure_pending
.end(); ) {
6556 failure_queue
[p
->first
] = p
->second
.first
;
6557 failure_pending
.erase(p
++);
6559 dout(10) << __func__
<< " " << old_queue
<< " + " << old_pending
<< " -> "
6560 << failure_queue
.size() << dendl
;
6563 void OSD::send_failures()
6565 ceph_assert(map_lock
.is_locked());
6566 ceph_assert(mon_report_lock
.is_locked());
6567 std::lock_guard
l(heartbeat_lock
);
6568 utime_t now
= ceph_clock_now();
6569 while (!failure_queue
.empty()) {
6570 int osd
= failure_queue
.begin()->first
;
6571 if (!failure_pending
.count(osd
)) {
6572 int failed_for
= (int)(double)(now
- failure_queue
.begin()->second
);
6573 monc
->send_mon_message(
6577 osdmap
->get_addrs(osd
),
6579 osdmap
->get_epoch()));
6580 failure_pending
[osd
] = make_pair(failure_queue
.begin()->second
,
6581 osdmap
->get_addrs(osd
));
6583 failure_queue
.erase(osd
);
6587 void OSD::send_still_alive(epoch_t epoch
, int osd
, const entity_addrvec_t
&addrs
)
6589 MOSDFailure
*m
= new MOSDFailure(monc
->get_fsid(), osd
, addrs
, 0, epoch
,
6590 MOSDFailure::FLAG_ALIVE
);
6591 monc
->send_mon_message(m
);
6594 void OSD::cancel_pending_failures()
6596 std::lock_guard
l(heartbeat_lock
);
6597 auto it
= failure_pending
.begin();
6598 while (it
!= failure_pending
.end()) {
6599 dout(10) << __func__
<< " canceling in-flight failure report for osd."
6600 << it
->first
<< dendl
;
6601 send_still_alive(osdmap
->get_epoch(), it
->first
, it
->second
.second
);
6602 failure_pending
.erase(it
++);
6606 void OSD::send_beacon(const ceph::coarse_mono_clock::time_point
& now
)
6608 const auto& monmap
= monc
->monmap
;
6609 // send beacon to mon even if we are just connected, and the monmap is not
6610 // initialized yet by then.
6611 if (monmap
.epoch
> 0 &&
6612 monmap
.get_required_features().contains_all(
6613 ceph::features::mon::FEATURE_LUMINOUS
)) {
6614 dout(20) << __func__
<< " sending" << dendl
;
6615 MOSDBeacon
* beacon
= nullptr;
6617 std::lock_guard l
{min_last_epoch_clean_lock
};
6618 beacon
= new MOSDBeacon(osdmap
->get_epoch(), min_last_epoch_clean
);
6619 beacon
->pgs
= min_last_epoch_clean_pgs
;
6620 last_sent_beacon
= now
;
6622 monc
->send_mon_message(beacon
);
6624 dout(20) << __func__
<< " not sending" << dendl
;
6628 void OSD::handle_command(MMonCommand
*m
)
6630 if (!require_mon_peer(m
)) {
6635 Command
*c
= new Command(m
->cmd
, m
->get_tid(), m
->get_data(), NULL
);
6636 command_wq
.queue(c
);
6640 void OSD::handle_command(MCommand
*m
)
6642 ConnectionRef con
= m
->get_connection();
6643 auto priv
= con
->get_priv();
6644 auto session
= static_cast<Session
*>(priv
.get());
6646 con
->send_message(new MCommandReply(m
, -EPERM
));
6651 OSDCap
& caps
= session
->caps
;
6654 if (!caps
.allow_all() || m
->get_source().is_mon()) {
6655 con
->send_message(new MCommandReply(m
, -EPERM
));
6660 Command
*c
= new Command(m
->cmd
, m
->get_tid(), m
->get_data(), con
.get());
6661 command_wq
.queue(c
);
6671 } osd_commands
[] = {
6673 #define COMMAND(parsesig, helptext, module, perm) \
6674 {parsesig, helptext, module, perm},
6676 // yes, these are really pg commands, but there's a limit to how
6677 // much work it's worth. The OSD returns all of them. Make this
6678 // form (pg <pgid> <cmd>) valid only for the cli.
6679 // Rest uses "tell <pgid> <cmd>"
6682 "name=pgid,type=CephPgid " \
6683 "name=cmd,type=CephChoices,strings=query", \
6684 "show details of a specific pg", "osd", "r")
6686 "name=pgid,type=CephPgid " \
6687 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
6688 "name=mulcmd,type=CephChoices,strings=revert|delete", \
6689 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
6692 "name=pgid,type=CephPgid " \
6693 "name=cmd,type=CephChoices,strings=list_unfound " \
6694 "name=offset,type=CephString,req=false",
6695 "list unfound objects on this pg, perhaps starting at an offset given in JSON",
6698 // new form: tell <pgid> <cmd> for both cli and rest
6701 "show details of a specific pg", "osd", "r")
6702 COMMAND("mark_unfound_lost " \
6703 "name=mulcmd,type=CephChoices,strings=revert|delete", \
6704 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
6706 COMMAND("list_unfound " \
6707 "name=offset,type=CephString,req=false",
6708 "list unfound objects on this pg, perhaps starting at an offset given in JSON",
6710 COMMAND("perf histogram dump "
6711 "name=logger,type=CephString,req=false "
6712 "name=counter,type=CephString,req=false",
6713 "Get histogram data",
6716 // tell <osd.n> commands. Validation of osd.n must be special-cased in client
6717 COMMAND("version", "report version of OSD", "osd", "r")
6718 COMMAND("get_command_descriptions", "list commands descriptions", "osd", "r")
6719 COMMAND("injectargs " \
6720 "name=injected_args,type=CephString,n=N",
6721 "inject configuration arguments into running OSD",
6723 COMMAND("config set " \
6724 "name=key,type=CephString name=value,type=CephString",
6725 "Set a configuration option at runtime (not persistent)",
6727 COMMAND("config get " \
6728 "name=key,type=CephString",
6729 "Get a configuration option at runtime",
6731 COMMAND("config unset " \
6732 "name=key,type=CephString",
6733 "Unset a configuration option at runtime (not persistent)",
6735 COMMAND("cluster_log " \
6736 "name=level,type=CephChoices,strings=error,warning,info,debug " \
6737 "name=message,type=CephString,n=N",
6738 "log a message to the cluster log",
6741 "name=count,type=CephInt,req=false " \
6742 "name=size,type=CephInt,req=false " \
6743 "name=object_size,type=CephInt,req=false " \
6744 "name=object_num,type=CephInt,req=false ", \
6745 "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
6746 "(default count=1G default size=4MB). Results in log.",
6748 COMMAND("flush_pg_stats", "flush pg stats", "osd", "rw")
6750 "name=heapcmd,type=CephChoices,strings="\
6751 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
6752 "name=value,type=CephString,req=false",
6753 "show heap usage info (available only if compiled with tcmalloc)",
6755 COMMAND("debug dump_missing " \
6756 "name=filename,type=CephFilepath",
6757 "dump missing objects to a named file", "osd", "r")
6758 COMMAND("debug kick_recovery_wq " \
6759 "name=delay,type=CephInt,range=0",
6760 "set osd_recovery_delay_start to <val>", "osd", "rw")
6761 COMMAND("cpu_profiler " \
6762 "name=arg,type=CephChoices,strings=status|flush",
6763 "run cpu profiling on daemon", "osd", "rw")
6764 COMMAND("dump_pg_recovery_stats", "dump pg recovery statistics",
6766 COMMAND("reset_pg_recovery_stats", "reset pg recovery statistics",
6769 "compact object store's omap. "
6770 "WARNING: Compaction probably slows your requests",
6772 COMMAND("smart name=devid,type=CephString,req=False",
6773 "runs smartctl on this osd devices. ",
6775 COMMAND("cache drop",
6776 "Drop all OSD caches",
6778 COMMAND("cache status",
6779 "Get OSD caches statistics",
6781 COMMAND("send_beacon",
6782 "Send OSD beacon to mon immediately",
6786 void OSD::do_command(
6787 Connection
*con
, ceph_tid_t tid
, vector
<string
>& cmd
, bufferlist
& data
)
6789 dout(20) << "do_command tid " << tid
<< " " << cmd
<< dendl
;
6792 stringstream ss
, ds
;
6796 ss
<< "no command given";
6799 if (!cmdmap_from_json(cmd
, &cmdmap
, ss
)) {
6805 r
= _do_command(con
, cmdmap
, tid
, data
, odata
, ss
, ds
);
6806 } catch (const bad_cmd_get
& e
) {
6814 string rs
= ss
.str();
6816 dout(0) << "do_command r=" << r
<< " " << rs
<< dendl
;
6819 MCommandReply
*reply
= new MCommandReply(r
, rs
);
6820 reply
->set_tid(tid
);
6821 reply
->set_data(odata
);
6822 con
->send_message(reply
);
6827 class unlock_guard
{
6830 explicit unlock_guard(Mutex
& mutex
)
6835 unlock_guard(unlock_guard
&) = delete;
6842 int OSD::_do_command(
6843 Connection
*con
, cmdmap_t
& cmdmap
, ceph_tid_t tid
, bufferlist
& data
,
6844 bufferlist
& odata
, stringstream
& ss
, stringstream
& ds
)
6850 boost::scoped_ptr
<Formatter
> f
;
6852 cmd_getval(cct
, cmdmap
, "prefix", prefix
);
6854 if (prefix
== "get_command_descriptions") {
6856 JSONFormatter
*f
= new JSONFormatter();
6857 f
->open_object_section("command_descriptions");
6858 for (OSDCommand
*cp
= osd_commands
;
6859 cp
< &osd_commands
[std::size(osd_commands
)]; cp
++) {
6861 ostringstream secname
;
6862 secname
<< "cmd" << setfill('0') << std::setw(3) << cmdnum
;
6863 dump_cmddesc_to_json(f
, con
->get_features(),
6864 secname
.str(), cp
->cmdstring
, cp
->helpstring
,
6865 cp
->module
, cp
->perm
, 0);
6868 f
->close_section(); // command_descriptions
6875 cmd_getval(cct
, cmdmap
, "format", format
);
6876 f
.reset(Formatter::create(format
));
6878 if (prefix
== "version") {
6880 f
->open_object_section("version");
6881 f
->dump_string("version", pretty_version_to_str());
6885 ds
<< pretty_version_to_str();
6889 else if (prefix
== "injectargs") {
6890 vector
<string
> argsvec
;
6891 cmd_getval(cct
, cmdmap
, "injected_args", argsvec
);
6893 if (argsvec
.empty()) {
6895 ss
<< "ignoring empty injectargs";
6898 string args
= argsvec
.front();
6899 for (vector
<string
>::iterator a
= ++argsvec
.begin(); a
!= argsvec
.end(); ++a
)
6901 unlock_guard unlock
{osd_lock
};
6902 r
= cct
->_conf
.injectargs(args
, &ss
);
6904 else if (prefix
== "config set") {
6907 cmd_getval(cct
, cmdmap
, "key", key
);
6908 cmd_getval(cct
, cmdmap
, "value", val
);
6909 unlock_guard unlock
{osd_lock
};
6910 r
= cct
->_conf
.set_val(key
, val
, &ss
);
6912 cct
->_conf
.apply_changes(nullptr);
6915 else if (prefix
== "config get") {
6917 cmd_getval(cct
, cmdmap
, "key", key
);
6918 unlock_guard unlock
{osd_lock
};
6920 r
= cct
->_conf
.get_val(key
, &val
);
6925 else if (prefix
== "config unset") {
6927 cmd_getval(cct
, cmdmap
, "key", key
);
6928 unlock_guard unlock
{osd_lock
};
6929 r
= cct
->_conf
.rm_val(key
);
6931 cct
->_conf
.apply_changes(nullptr);
6934 r
= 0; // make command idempotent
6937 else if (prefix
== "cluster_log") {
6939 cmd_getval(cct
, cmdmap
, "message", msg
);
6942 ss
<< "ignoring empty log message";
6945 string message
= msg
.front();
6946 for (vector
<string
>::iterator a
= ++msg
.begin(); a
!= msg
.end(); ++a
)
6947 message
+= " " + *a
;
6949 cmd_getval(cct
, cmdmap
, "level", lvl
);
6950 clog_type level
= string_to_clog_type(lvl
);
6953 ss
<< "unknown level '" << lvl
<< "'";
6956 clog
->do_log(level
, message
);
6959 // either 'pg <pgid> <command>' or
6960 // 'tell <pgid>' (which comes in without any of that prefix)?
6962 else if (prefix
== "pg" ||
6963 prefix
== "query" ||
6964 prefix
== "mark_unfound_lost" ||
6965 prefix
== "list_unfound"
6969 if (!cmd_getval(cct
, cmdmap
, "pgid", pgidstr
)) {
6970 ss
<< "no pgid specified";
6972 } else if (!pgid
.parse(pgidstr
.c_str())) {
6973 ss
<< "couldn't parse pgid '" << pgidstr
<< "'";
6978 if (osdmap
->get_primary_shard(pgid
, &pcand
) &&
6979 (pg
= _lookup_lock_pg(pcand
))) {
6980 if (pg
->is_primary()) {
6981 // simulate pg <pgid> cmd= for pg->do-command
6983 cmd_putval(cct
, cmdmap
, "cmd", prefix
);
6985 r
= pg
->do_command(cmdmap
, ss
, data
, odata
, con
, tid
);
6986 } catch (const bad_cmd_get
& e
) {
6993 // don't reply, pg will do so async
6997 ss
<< "not primary for pgid " << pgid
;
6999 // send them the latest diff to ensure they realize the mapping
7001 service
.send_incremental_map(osdmap
->get_epoch() - 1, con
, osdmap
);
7003 // do not reply; they will get newer maps and realize they
7010 ss
<< "i don't have pgid " << pgid
;
7016 else if (prefix
== "bench") {
7019 int64_t osize
, onum
;
7020 // default count 1G, size 4MB
7021 cmd_getval(cct
, cmdmap
, "count", count
, (int64_t)1 << 30);
7022 cmd_getval(cct
, cmdmap
, "size", bsize
, (int64_t)4 << 20);
7023 cmd_getval(cct
, cmdmap
, "object_size", osize
, (int64_t)0);
7024 cmd_getval(cct
, cmdmap
, "object_num", onum
, (int64_t)0);
7026 uint32_t duration
= cct
->_conf
->osd_bench_duration
;
7028 if (bsize
> (int64_t) cct
->_conf
->osd_bench_max_block_size
) {
7029 // let us limit the block size because the next checks rely on it
7030 // having a sane value. If we allow any block size to be set things
7031 // can still go sideways.
7032 ss
<< "block 'size' values are capped at "
7033 << byte_u_t(cct
->_conf
->osd_bench_max_block_size
) << ". If you wish to use"
7034 << " a higher value, please adjust 'osd_bench_max_block_size'";
7037 } else if (bsize
< (int64_t) (1 << 20)) {
7038 // entering the realm of small block sizes.
7039 // limit the count to a sane value, assuming a configurable amount of
7040 // IOPS and duration, so that the OSD doesn't get hung up on this,
7041 // preventing timeouts from going off
7043 bsize
* duration
* cct
->_conf
->osd_bench_small_size_max_iops
;
7044 if (count
> max_count
) {
7045 ss
<< "'count' values greater than " << max_count
7046 << " for a block size of " << byte_u_t(bsize
) << ", assuming "
7047 << cct
->_conf
->osd_bench_small_size_max_iops
<< " IOPS,"
7048 << " for " << duration
<< " seconds,"
7049 << " can cause ill effects on osd. "
7050 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
7051 << " value if you wish to use a higher 'count'.";
7056 // 1MB block sizes are big enough so that we get more stuff done.
7057 // However, to avoid the osd from getting hung on this and having
7058 // timers being triggered, we are going to limit the count assuming
7059 // a configurable throughput and duration.
7060 // NOTE: max_count is the total amount of bytes that we believe we
7061 // will be able to write during 'duration' for the given
7062 // throughput. The block size hardly impacts this unless it's
7063 // way too big. Given we already check how big the block size
7064 // is, it's safe to assume everything will check out.
7066 cct
->_conf
->osd_bench_large_size_max_throughput
* duration
;
7067 if (count
> max_count
) {
7068 ss
<< "'count' values greater than " << max_count
7069 << " for a block size of " << byte_u_t(bsize
) << ", assuming "
7070 << byte_u_t(cct
->_conf
->osd_bench_large_size_max_throughput
) << "/s,"
7071 << " for " << duration
<< " seconds,"
7072 << " can cause ill effects on osd. "
7073 << " Please adjust 'osd_bench_large_size_max_throughput'"
7074 << " with a higher value if you wish to use a higher 'count'.";
7080 if (osize
&& bsize
> osize
)
7083 dout(1) << " bench count " << count
7084 << " bsize " << byte_u_t(bsize
) << dendl
;
7086 ObjectStore::Transaction cleanupt
;
7088 if (osize
&& onum
) {
7090 bufferptr
bp(osize
);
7092 bl
.push_back(std::move(bp
));
7093 bl
.rebuild_page_aligned();
7094 for (int i
=0; i
<onum
; ++i
) {
7096 snprintf(nm
, sizeof(nm
), "disk_bw_test_%d", i
);
7098 hobject_t
soid(sobject_t(oid
, 0));
7099 ObjectStore::Transaction t
;
7100 t
.write(coll_t(), ghobject_t(soid
), 0, osize
, bl
);
7101 store
->queue_transaction(service
.meta_ch
, std::move(t
), NULL
);
7102 cleanupt
.remove(coll_t(), ghobject_t(soid
));
7107 bufferptr
bp(bsize
);
7109 bl
.push_back(std::move(bp
));
7110 bl
.rebuild_page_aligned();
7114 if (!service
.meta_ch
->flush_commit(&waiter
)) {
7119 utime_t start
= ceph_clock_now();
7120 for (int64_t pos
= 0; pos
< count
; pos
+= bsize
) {
7122 unsigned offset
= 0;
7123 if (onum
&& osize
) {
7124 snprintf(nm
, sizeof(nm
), "disk_bw_test_%d", (int)(rand() % onum
));
7125 offset
= rand() % (osize
/ bsize
) * bsize
;
7127 snprintf(nm
, sizeof(nm
), "disk_bw_test_%lld", (long long)pos
);
7130 hobject_t
soid(sobject_t(oid
, 0));
7131 ObjectStore::Transaction t
;
7132 t
.write(coll_t::meta(), ghobject_t(soid
), offset
, bsize
, bl
);
7133 store
->queue_transaction(service
.meta_ch
, std::move(t
), NULL
);
7134 if (!onum
|| !osize
)
7135 cleanupt
.remove(coll_t::meta(), ghobject_t(soid
));
7140 if (!service
.meta_ch
->flush_commit(&waiter
)) {
7144 utime_t end
= ceph_clock_now();
7147 store
->queue_transaction(service
.meta_ch
, std::move(cleanupt
), NULL
);
7150 if (!service
.meta_ch
->flush_commit(&waiter
)) {
7155 double elapsed
= end
- start
;
7156 double rate
= count
/ elapsed
;
7157 double iops
= rate
/ bsize
;
7159 f
->open_object_section("osd_bench_results");
7160 f
->dump_int("bytes_written", count
);
7161 f
->dump_int("blocksize", bsize
);
7162 f
->dump_float("elapsed_sec", elapsed
);
7163 f
->dump_float("bytes_per_sec", rate
);
7164 f
->dump_float("iops", iops
);
7168 ds
<< "bench: wrote " << byte_u_t(count
)
7169 << " in blocks of " << byte_u_t(bsize
) << " in "
7170 << elapsed
<< " sec at " << byte_u_t(rate
) << "/sec "
7171 << si_u_t(iops
) << " IOPS";
7175 else if (prefix
== "flush_pg_stats") {
7176 mgrc
.send_pgstats();
7177 ds
<< service
.get_osd_stat_seq() << "\n";
7180 else if (prefix
== "heap") {
7181 r
= ceph::osd_cmds::heap(*cct
, cmdmap
, *f
, ds
);
7184 else if (prefix
== "debug dump_missing") {
7186 f
.reset(new JSONFormatter(true));
7188 f
->open_array_section("pgs");
7191 for (auto& pg
: pgs
) {
7192 string s
= stringify(pg
->pg_id
);
7193 f
->open_array_section(s
.c_str());
7195 pg
->dump_missing(f
.get());
7202 else if (prefix
== "debug kick_recovery_wq") {
7204 cmd_getval(cct
, cmdmap
, "delay", delay
);
7207 unlock_guard unlock
{osd_lock
};
7208 r
= cct
->_conf
.set_val("osd_recovery_delay_start", oss
.str().c_str());
7210 ss
<< "kick_recovery_wq: error setting "
7211 << "osd_recovery_delay_start to '" << delay
<< "': error "
7215 cct
->_conf
.apply_changes(nullptr);
7216 ss
<< "kicking recovery queue. set osd_recovery_delay_start "
7217 << "to " << cct
->_conf
->osd_recovery_delay_start
;
7220 else if (prefix
== "cpu_profiler") {
7222 cmd_getval(cct
, cmdmap
, "arg", arg
);
7223 vector
<string
> argvec
;
7224 get_str_vec(arg
, argvec
);
7225 cpu_profiler_handle_command(argvec
, ds
);
7228 else if (prefix
== "dump_pg_recovery_stats") {
7231 pg_recovery_stats
.dump_formatted(f
.get());
7234 pg_recovery_stats
.dump(s
);
7235 ds
<< "dump pg recovery stats: " << s
.str();
7239 else if (prefix
== "reset_pg_recovery_stats") {
7240 ss
<< "reset pg recovery stats";
7241 pg_recovery_stats
.reset();
7244 else if (prefix
== "perf histogram dump") {
7246 std::string counter
;
7247 cmd_getval(cct
, cmdmap
, "logger", logger
);
7248 cmd_getval(cct
, cmdmap
, "counter", counter
);
7250 cct
->get_perfcounters_collection()->dump_formatted_histograms(
7251 f
.get(), false, logger
, counter
);
7256 else if (prefix
== "compact") {
7257 dout(1) << "triggering manual compaction" << dendl
;
7258 auto start
= ceph::coarse_mono_clock::now();
7260 auto end
= ceph::coarse_mono_clock::now();
7261 double duration
= std::chrono::duration
<double>(end
-start
).count();
7262 dout(1) << "finished manual compaction in "
7264 << " seconds" << dendl
;
7265 ss
<< "compacted omap in " << duration
<< " seconds";
7268 else if (prefix
== "smart") {
7270 cmd_getval(cct
, cmdmap
, "devid", devid
);
7271 probe_smart(devid
, ds
);
7274 else if (prefix
== "cache drop") {
7275 dout(20) << "clearing all caches" << dendl
;
7276 // Clear the objectstore's cache - onode and buffer for Bluestore,
7277 // system's pagecache for Filestore
7278 r
= store
->flush_cache(&ss
);
7280 ds
<< "Error flushing objectstore cache: " << cpp_strerror(r
);
7283 // Clear the objectcontext cache (per PG)
7286 for (auto& pg
: pgs
) {
7291 else if (prefix
== "cache status") {
7292 int obj_ctx_count
= 0;
7295 for (auto& pg
: pgs
) {
7296 obj_ctx_count
+= pg
->get_cache_obj_count();
7299 f
->open_object_section("cache_status");
7300 f
->dump_int("object_ctx", obj_ctx_count
);
7301 store
->dump_cache_stats(f
.get());
7305 ds
<< "object_ctx: " << obj_ctx_count
;
7306 store
->dump_cache_stats(ds
);
7309 else if (prefix
== "send_beacon") {
7311 send_beacon(ceph::coarse_mono_clock::now());
7314 ss
<< "unrecognized command '" << prefix
<< "'";
7322 void OSD::probe_smart(const string
& only_devid
, ostream
& ss
)
7324 set
<string
> devnames
;
7325 store
->get_devices(&devnames
);
7326 uint64_t smart_timeout
= cct
->_conf
.get_val
<uint64_t>(
7327 "osd_smart_report_timeout");
7329 // == typedef std::map<std::string, mValue> mObject;
7330 json_spirit::mObject json_map
;
7332 for (auto dev
: devnames
) {
7333 // smartctl works only on physical devices; filter out any logical device
7334 if (dev
.find("dm-") == 0) {
7339 string devid
= get_device_id(dev
, &err
);
7340 if (devid
.size() == 0) {
7341 dout(10) << __func__
<< " no unique id for dev " << dev
<< " ("
7342 << err
<< "), skipping" << dendl
;
7345 if (only_devid
.size() && devid
!= only_devid
) {
7349 json_spirit::mValue smart_json
;
7350 if (block_device_get_metrics(dev
, smart_timeout
,
7352 dout(10) << "block_device_get_metrics failed for /dev/" << dev
<< dendl
;
7355 json_map
[devid
] = smart_json
;
7357 json_spirit::write(json_map
, ss
, json_spirit::pretty_print
);
7360 bool OSD::heartbeat_dispatch(Message
*m
)
7362 dout(30) << "heartbeat_dispatch " << m
<< dendl
;
7363 switch (m
->get_type()) {
7366 dout(10) << "ping from " << m
->get_source_inst() << dendl
;
7371 handle_osd_ping(static_cast<MOSDPing
*>(m
));
7375 dout(0) << "dropping unexpected message " << *m
<< " from " << m
->get_source_inst() << dendl
;
7382 bool OSD::ms_dispatch(Message
*m
)
7384 dout(20) << "OSD::ms_dispatch: " << *m
<< dendl
;
7385 if (m
->get_type() == MSG_OSD_MARK_ME_DOWN
) {
7386 service
.got_stop_ack();
7394 if (is_stopping()) {
7408 void OSD::maybe_share_map(
7413 if (!op
->check_send_map
) {
7416 epoch_t last_sent_epoch
= 0;
7418 session
->sent_epoch_lock
.lock();
7419 last_sent_epoch
= session
->last_sent_epoch
;
7420 session
->sent_epoch_lock
.unlock();
7422 // assume the peer has the newer of the op's sent_epoch and what
7423 // we think we sent them.
7424 epoch_t from
= std::max(last_sent_epoch
, op
->sent_epoch
);
7426 const Message
*m
= op
->get_req();
7429 m
->get_connection().get(),
7432 session
? &last_sent_epoch
: NULL
);
7434 session
->sent_epoch_lock
.lock();
7435 if (session
->last_sent_epoch
< last_sent_epoch
) {
7436 session
->last_sent_epoch
= last_sent_epoch
;
7438 session
->sent_epoch_lock
.unlock();
7440 op
->check_send_map
= false;
7443 void OSD::dispatch_session_waiting(SessionRef session
, OSDMapRef osdmap
)
7445 ceph_assert(session
->session_dispatch_lock
.is_locked());
7447 auto i
= session
->waiting_on_map
.begin();
7448 while (i
!= session
->waiting_on_map
.end()) {
7449 OpRequestRef op
= &(*i
);
7450 ceph_assert(ms_can_fast_dispatch(op
->get_req()));
7451 const MOSDFastDispatchOp
*m
= static_cast<const MOSDFastDispatchOp
*>(
7453 if (m
->get_min_epoch() > osdmap
->get_epoch()) {
7456 session
->waiting_on_map
.erase(i
++);
7460 if (m
->get_type() == CEPH_MSG_OSD_OP
) {
7461 pg_t actual_pgid
= osdmap
->raw_pg_to_pg(
7462 static_cast<const MOSDOp
*>(m
)->get_pg());
7463 if (!osdmap
->get_primary_shard(actual_pgid
, &pgid
)) {
7467 pgid
= m
->get_spg();
7469 enqueue_op(pgid
, std::move(op
), m
->get_map_epoch());
7472 if (session
->waiting_on_map
.empty()) {
7473 clear_session_waiting_on_map(session
);
7475 register_session_waiting_on_map(session
);
7479 void OSD::ms_fast_dispatch(Message
*m
)
7482 if (service
.is_stopping()) {
7488 switch (m
->get_type()) {
7490 dout(10) << "ping from " << m
->get_source() << dendl
;
7493 case MSG_MON_COMMAND
:
7494 handle_command(static_cast<MMonCommand
*>(m
));
7496 case MSG_OSD_FORCE_RECOVERY
:
7497 handle_fast_force_recovery(static_cast<MOSDForceRecovery
*>(m
));
7499 case MSG_OSD_SCRUB2
:
7500 handle_fast_scrub(static_cast<MOSDScrub2
*>(m
));
7503 case MSG_OSD_PG_CREATE2
:
7504 return handle_fast_pg_create(static_cast<MOSDPGCreate2
*>(m
));
7505 case MSG_OSD_PG_QUERY
:
7506 return handle_fast_pg_query(static_cast<MOSDPGQuery
*>(m
));
7507 case MSG_OSD_PG_NOTIFY
:
7508 return handle_fast_pg_notify(static_cast<MOSDPGNotify
*>(m
));
7509 case MSG_OSD_PG_INFO
:
7510 return handle_fast_pg_info(static_cast<MOSDPGInfo
*>(m
));
7511 case MSG_OSD_PG_REMOVE
:
7512 return handle_fast_pg_remove(static_cast<MOSDPGRemove
*>(m
));
7514 // these are single-pg messages that handle themselves
7515 case MSG_OSD_PG_LOG
:
7516 case MSG_OSD_PG_TRIM
:
7517 case MSG_OSD_BACKFILL_RESERVE
:
7518 case MSG_OSD_RECOVERY_RESERVE
:
7520 MOSDPeeringOp
*pm
= static_cast<MOSDPeeringOp
*>(m
);
7521 if (require_osd_peer(pm
)) {
7522 enqueue_peering_evt(
7524 PGPeeringEventRef(pm
->get_event()));
7531 OpRequestRef op
= op_tracker
.create_request
<OpRequest
, Message
*>(m
);
7534 osd_reqid_t reqid
= op
->get_reqid();
7536 tracepoint(osd
, ms_fast_dispatch
, reqid
.name
._type
,
7537 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
7541 op
->osd_trace
.init("osd op", &trace_endpoint
, &m
->trace
);
7543 // note sender epoch, min req's epoch
7544 op
->sent_epoch
= static_cast<MOSDFastDispatchOp
*>(m
)->get_map_epoch();
7545 op
->min_epoch
= static_cast<MOSDFastDispatchOp
*>(m
)->get_min_epoch();
7546 ceph_assert(op
->min_epoch
<= op
->sent_epoch
); // sanity check!
7548 service
.maybe_inject_dispatch_delay();
7550 if (m
->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT
) ||
7551 m
->get_type() != CEPH_MSG_OSD_OP
) {
7552 // queue it directly
7554 static_cast<MOSDFastDispatchOp
*>(m
)->get_spg(),
7556 static_cast<MOSDFastDispatchOp
*>(m
)->get_map_epoch());
7558 // legacy client, and this is an MOSDOp (the *only* fast dispatch
7559 // message that didn't have an explicit spg_t); we need to map
7560 // them to an spg_t while preserving delivery order.
7561 auto priv
= m
->get_connection()->get_priv();
7562 if (auto session
= static_cast<Session
*>(priv
.get()); session
) {
7563 std::lock_guard l
{session
->session_dispatch_lock
};
7565 session
->waiting_on_map
.push_back(*op
);
7566 OSDMapRef nextmap
= service
.get_nextmap_reserved();
7567 dispatch_session_waiting(session
, nextmap
);
7568 service
.release_map(nextmap
);
7571 OID_EVENT_TRACE_WITH_MSG(m
, "MS_FAST_DISPATCH_END", false);
7574 bool OSD::ms_get_authorizer(int dest_type
, AuthAuthorizer
**authorizer
)
7576 dout(10) << "OSD::ms_get_authorizer type=" << ceph_entity_type_name(dest_type
) << dendl
;
7578 if (is_stopping()) {
7579 dout(10) << __func__
<< " bailing, we are shutting down" << dendl
;
7583 if (dest_type
== CEPH_ENTITY_TYPE_MON
)
7586 *authorizer
= monc
->build_authorizer(dest_type
);
7587 return *authorizer
!= NULL
;
7590 KeyStore
*OSD::ms_get_auth1_authorizer_keystore()
7592 return monc
->rotating_secrets
.get();
7595 int OSD::ms_handle_authentication(Connection
*con
)
7598 auto priv
= con
->get_priv();
7599 Session
*s
= static_cast<Session
*>(priv
.get());
7601 s
= new Session(cct
, con
);
7602 con
->set_priv(RefCountedPtr
{s
, false});
7603 s
->entity_name
= con
->get_peer_entity_name();
7604 dout(10) << __func__
<< " new session " << s
<< " con " << s
->con
7605 << " entity " << s
->entity_name
7606 << " addr " << con
->get_peer_addrs() << dendl
;
7608 dout(10) << __func__
<< " existing session " << s
<< " con " << s
->con
7609 << " entity " << s
->entity_name
7610 << " addr " << con
->get_peer_addrs() << dendl
;
7613 AuthCapsInfo
&caps_info
= con
->get_peer_caps_info();
7614 if (caps_info
.allow_all
)
7615 s
->caps
.set_allow_all();
7617 if (caps_info
.caps
.length() > 0) {
7618 bufferlist::const_iterator p
= caps_info
.caps
.cbegin();
7623 catch (buffer::error
& e
) {
7624 dout(10) << __func__
<< " session " << s
<< " " << s
->entity_name
7625 << " failed to decode caps string" << dendl
;
7629 bool success
= s
->caps
.parse(str
);
7631 dout(10) << __func__
<< " session " << s
7632 << " " << s
->entity_name
7633 << " has caps " << s
->caps
<< " '" << str
<< "'" << dendl
;
7636 dout(10) << __func__
<< " session " << s
<< " " << s
->entity_name
7637 << " failed to parse caps '" << str
<< "'" << dendl
;
7645 void OSD::do_waiters()
7647 ceph_assert(osd_lock
.is_locked());
7649 dout(10) << "do_waiters -- start" << dendl
;
7650 while (!finished
.empty()) {
7651 OpRequestRef next
= finished
.front();
7652 finished
.pop_front();
7655 dout(10) << "do_waiters -- finish" << dendl
;
7658 void OSD::dispatch_op(OpRequestRef op
)
7660 switch (op
->get_req()->get_type()) {
7662 case MSG_OSD_PG_CREATE
:
7663 handle_pg_create(op
);
7668 void OSD::_dispatch(Message
*m
)
7670 ceph_assert(osd_lock
.is_locked());
7671 dout(20) << "_dispatch " << m
<< " " << *m
<< dendl
;
7673 switch (m
->get_type()) {
7674 // -- don't need OSDMap --
7676 // map and replication
7677 case CEPH_MSG_OSD_MAP
:
7678 handle_osd_map(static_cast<MOSDMap
*>(m
));
7683 handle_scrub(static_cast<MOSDScrub
*>(m
));
7687 handle_command(static_cast<MCommand
*>(m
));
7690 // -- need OSDMap --
7692 case MSG_OSD_PG_CREATE
:
7694 OpRequestRef op
= op_tracker
.create_request
<OpRequest
, Message
*>(m
);
7696 op
->osd_trace
.init("osd op", &trace_endpoint
, &m
->trace
);
7697 // no map? starting up?
7699 dout(7) << "no OSDMap, not booted" << dendl
;
7700 logger
->inc(l_osd_waiting_for_map
);
7701 waiting_for_osdmap
.push_back(op
);
7702 op
->mark_delayed("no osdmap");
7712 // remove me post-nautilus
7713 void OSD::handle_scrub(MOSDScrub
*m
)
7715 dout(10) << "handle_scrub " << *m
<< dendl
;
7716 if (!require_mon_or_mgr_peer(m
)) {
7720 if (m
->fsid
!= monc
->get_fsid()) {
7721 dout(0) << "handle_scrub fsid " << m
->fsid
<< " != " << monc
->get_fsid()
7730 if (!m
->scrub_pgs
.empty()) {
7732 for (auto pgid
: m
->scrub_pgs
) {
7734 if (osdmap
->get_primary_shard(pgid
, &pcand
) &&
7735 std::find(spgs
.begin(), spgs
.end(), pcand
) != spgs
.end()) {
7742 for (auto pgid
: spgs
) {
7743 enqueue_peering_evt(
7746 std::make_shared
<PGPeeringEvent
>(
7749 PG::RequestScrub(m
->deep
, m
->repair
))));
7755 void OSD::handle_fast_scrub(MOSDScrub2
*m
)
7757 dout(10) << __func__
<< " " << *m
<< dendl
;
7758 if (!require_mon_or_mgr_peer(m
)) {
7762 if (m
->fsid
!= monc
->get_fsid()) {
7763 dout(0) << __func__
<< " fsid " << m
->fsid
<< " != " << monc
->get_fsid()
7768 for (auto pgid
: m
->scrub_pgs
) {
7769 enqueue_peering_evt(
7772 std::make_shared
<PGPeeringEvent
>(
7775 PG::RequestScrub(m
->deep
, m
->repair
))));
7780 bool OSD::scrub_random_backoff()
7782 bool coin_flip
= (rand() / (double)RAND_MAX
>=
7783 cct
->_conf
->osd_scrub_backoff_ratio
);
7785 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl
;
7791 OSDService::ScrubJob::ScrubJob(CephContext
* cct
,
7792 const spg_t
& pg
, const utime_t
& timestamp
,
7793 double pool_scrub_min_interval
,
7794 double pool_scrub_max_interval
, bool must
)
7797 sched_time(timestamp
),
7800 // if not explicitly requested, postpone the scrub with a random delay
7802 double scrub_min_interval
= pool_scrub_min_interval
> 0 ?
7803 pool_scrub_min_interval
: cct
->_conf
->osd_scrub_min_interval
;
7804 double scrub_max_interval
= pool_scrub_max_interval
> 0 ?
7805 pool_scrub_max_interval
: cct
->_conf
->osd_scrub_max_interval
;
7807 sched_time
+= scrub_min_interval
;
7808 double r
= rand() / (double)RAND_MAX
;
7810 scrub_min_interval
* cct
->_conf
->osd_scrub_interval_randomize_ratio
* r
;
7811 if (scrub_max_interval
== 0) {
7812 deadline
= utime_t();
7814 deadline
+= scrub_max_interval
;
7820 bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob
& rhs
) const {
7821 if (sched_time
< rhs
.sched_time
)
7823 if (sched_time
> rhs
.sched_time
)
7825 return pgid
< rhs
.pgid
;
7828 bool OSD::scrub_time_permit(utime_t now
)
7831 time_t tt
= now
.sec();
7832 localtime_r(&tt
, &bdt
);
7834 bool day_permit
= false;
7835 if (cct
->_conf
->osd_scrub_begin_week_day
< cct
->_conf
->osd_scrub_end_week_day
) {
7836 if (bdt
.tm_wday
>= cct
->_conf
->osd_scrub_begin_week_day
&& bdt
.tm_wday
< cct
->_conf
->osd_scrub_end_week_day
) {
7840 if (bdt
.tm_wday
>= cct
->_conf
->osd_scrub_begin_week_day
|| bdt
.tm_wday
< cct
->_conf
->osd_scrub_end_week_day
) {
7846 dout(20) << __func__
<< " should run between week day " << cct
->_conf
->osd_scrub_begin_week_day
7847 << " - " << cct
->_conf
->osd_scrub_end_week_day
7848 << " now " << bdt
.tm_wday
<< " = no" << dendl
;
7852 bool time_permit
= false;
7853 if (cct
->_conf
->osd_scrub_begin_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7854 if (bdt
.tm_hour
>= cct
->_conf
->osd_scrub_begin_hour
&& bdt
.tm_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7858 if (bdt
.tm_hour
>= cct
->_conf
->osd_scrub_begin_hour
|| bdt
.tm_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7863 dout(20) << __func__
<< " should run between " << cct
->_conf
->osd_scrub_begin_hour
7864 << " - " << cct
->_conf
->osd_scrub_end_hour
7865 << " now " << bdt
.tm_hour
<< " = no" << dendl
;
7867 dout(20) << __func__
<< " should run between " << cct
->_conf
->osd_scrub_begin_hour
7868 << " - " << cct
->_conf
->osd_scrub_end_hour
7869 << " now " << bdt
.tm_hour
<< " = yes" << dendl
;
7874 bool OSD::scrub_load_below_threshold()
7877 if (getloadavg(loadavgs
, 3) != 3) {
7878 dout(10) << __func__
<< " couldn't read loadavgs\n" << dendl
;
7882 // allow scrub if below configured threshold
7883 long cpus
= sysconf(_SC_NPROCESSORS_ONLN
);
7884 double loadavg_per_cpu
= cpus
> 0 ? loadavgs
[0] / cpus
: loadavgs
[0];
7885 if (loadavg_per_cpu
< cct
->_conf
->osd_scrub_load_threshold
) {
7886 dout(20) << __func__
<< " loadavg per cpu " << loadavg_per_cpu
7887 << " < max " << cct
->_conf
->osd_scrub_load_threshold
7888 << " = yes" << dendl
;
7892 // allow scrub if below daily avg and currently decreasing
7893 if (loadavgs
[0] < daily_loadavg
&& loadavgs
[0] < loadavgs
[2]) {
7894 dout(20) << __func__
<< " loadavg " << loadavgs
[0]
7895 << " < daily_loadavg " << daily_loadavg
7896 << " and < 15m avg " << loadavgs
[2]
7897 << " = yes" << dendl
;
7901 dout(20) << __func__
<< " loadavg " << loadavgs
[0]
7902 << " >= max " << cct
->_conf
->osd_scrub_load_threshold
7903 << " and ( >= daily_loadavg " << daily_loadavg
7904 << " or >= 15m avg " << loadavgs
[2]
7905 << ") = no" << dendl
;
// Pick the next PG(s) eligible for scrubbing and kick off their scrub.
// Honors osd_scrub_during_recovery / osd_repair_during_recovery gates,
// the configured scrub time window (scrub_time_permit) and load threshold
// (scrub_load_below_threshold), then walks the scrub-job queue in
// sched_time order via first_scrub_stamp()/next_scrub_stamp().
// NOTE(review): this extract is lossy — several original lines (returns,
// continues, closing braces) are missing below; tokens are kept as-is.
void OSD::sched_scrub()
  // if not permitted, fail fast
  if (!service.can_inc_scrubs()) {
  // When recovery is active we may restrict scheduling to explicitly
  // requested repairs only, or suppress scrubbing entirely, per config.
  bool allow_requested_repair_only = false;
  if (service.is_recovery_active()) {
    if (!cct->_conf->osd_scrub_during_recovery && cct->_conf->osd_repair_during_recovery) {
      dout(10) << __func__
               << " will only schedule explicitly requested repair due to active recovery"
      allow_requested_repair_only = true;
    } else if (!cct->_conf->osd_scrub_during_recovery && !cct->_conf->osd_repair_during_recovery) {
      dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl;
  utime_t now = ceph_clock_now();
  bool time_permit = scrub_time_permit(now);
  bool load_is_low = scrub_load_below_threshold();
  dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;
  OSDService::ScrubJob scrub;
  if (service.first_scrub_stamp(&scrub)) {
    // Jobs come back ordered by sched_time; examine each in turn.
    dout(30) << "sched_scrub examine " << scrub.pgid
             << " at " << scrub.sched_time << dendl;
    if (scrub.sched_time > now) {
      // save ourselves some effort
      dout(10) << "sched_scrub " << scrub.pgid
               << " scheduled at " << scrub.sched_time
               << " > " << now << dendl;
    // A job whose deadline has not passed may still be deferred when the
    // time window or load forbids scrubbing right now.
    if ((scrub.deadline.is_zero() || scrub.deadline >= now) && !(time_permit && load_is_low)) {
      dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to "
               << (!time_permit ? "time not permit" : "high load") << dendl;
    PGRef pg = _lookup_lock_pg(scrub.pgid);
    // This has already started, so go on to the next scrub job
    if (pg->scrubber.active) {
      dout(30) << __func__ << ": already in progress pgid " << scrub.pgid << dendl;
    // Skip other kinds of scrubing if only explicitly requested repairing is allowed
    if (allow_requested_repair_only && !pg->scrubber.must_repair) {
      dout(10) << __func__ << " skip " << scrub.pgid
               << " because repairing is not explicitly requested on it"
    // If it is reserving, let it resolve before going to the next scrub job
    if (pg->scrubber.local_reserved && !pg->scrubber.active) {
      dout(30) << __func__ << ": reserve in progress pgid " << scrub.pgid << dendl;
    dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
             << (pg->get_must_scrub() ? ", explicitly requested" :
                 (load_is_low ? ", load_is_low" : " deadline < now"))
    if (pg->sched_scrub()) {
  } while (service.next_scrub_stamp(scrub, &scrub));
  dout(20) << "sched_scrub done" << dendl;
// Walk every queued scrub job and, for PGs that have no explicit
// must_scrub / need_auto request pending, poke on_info_history_change()
// so their scrub schedule is recomputed (e.g. after a config change).
// NOTE(review): lossy extract — loop/brace lines from the original are
// missing; tokens are reproduced as-is.
void OSD::resched_all_scrubs()
  dout(10) << __func__ << ": start" << dendl;
  OSDService::ScrubJob scrub;
  if (service.first_scrub_stamp(&scrub)) {
    dout(20) << __func__ << ": examine " << scrub.pgid << dendl;
    PGRef pg = _lookup_lock_pg(scrub.pgid);
    // Explicitly requested scrubs keep their schedule; only reschedule
    // automatic ones.
    if (!pg->scrubber.must_scrub && !pg->scrubber.need_auto) {
      dout(20) << __func__ << ": reschedule " << scrub.pgid << dendl;
      pg->on_info_history_change();
  } while (service.next_scrub_stamp(scrub, &scrub));
  dout(10) << __func__ << ": done" << dendl;
// Build an MPGStats message containing this OSD's stats plus per-PG stats
// for every primary PG, and per-pool statfs where the store supports it
// (pool_statfs returning -ENOTSUP disables per-pool reporting).
// Also refreshes min_last_epoch_clean / min_last_epoch_clean_pgs under
// min_last_epoch_clean_lock. Caller owns the returned message.
// NOTE(review): lossy extract — some declarations (e.g. `pgs`, `st`) and
// closing braces from the original are missing; tokens kept as-is.
MPGStats* OSD::collect_pg_stats()
  // This implementation unconditionally sends every is_primary PG's
  // stats every time we're called. This has equivalent cost to the
  // previous implementation's worst case where all PGs are busy and
  // their stats are always enqueued for sending.
  RWLock::RLocker l(map_lock);
  utime_t had_for = ceph_clock_now() - had_map_since;
  osd_stat_t cur_stat = service.get_osd_stat();
  cur_stat.os_perf_stat = store->get_cur_stats();
  auto m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);
  m->osd_stat = cur_stat;
  std::lock_guard lec{min_last_epoch_clean_lock};
  min_last_epoch_clean = osdmap->get_epoch();
  min_last_epoch_clean_pgs.clear();
  std::set<int64_t> pool_set;
  for (auto& pg : pgs) {
    auto pool = pg->pg_id.pgid.pool();
    pool_set.emplace((int64_t)pool);
    // Non-primary PGs contribute their pool id above but no stats.
    if (!pg->is_primary()) {
    pg->get_pg_stats([&](const pg_stat_t& s, epoch_t lec) {
      m->pg_stat[pg->pg_id.pgid] = s;
      min_last_epoch_clean = min(min_last_epoch_clean, lec);
      min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid);
  bool per_pool_stats = false;
  for (auto p : pool_set) {
    int r = store->pool_statfs(p, &st);
    if (r == -ENOTSUP) {
    m->pool_stat[p] = st;
    per_pool_stats = true;
  // indicate whether we are reporting per-pool stats
  m->osd_stat.num_osds = 1;
  m->osd_stat.num_per_pool_osds = per_pool_stats ? 1 : 0;
// Collect daemon health metrics for the mgr: SLOW_OPS (ops in flight older
// than osd_op_complaint_time, with the oldest op's age) and
// PENDING_CREATING_PGS (creates requested by mon plus primaries pending
// from OSD-side creates, counted under pending_creates_lock).
// NOTE(review): lossy extract — `too_old` initialization, the `slow`
// counter and several braces are missing from this view; tokens kept as-is.
vector<DaemonHealthMetric> OSD::get_health_metrics()
  vector<DaemonHealthMetric> metrics;
  utime_t oldest_secs;
  const utime_t now = ceph_clock_now();
  // too_old is presumably initialized to `now` on the elided line above —
  // TODO confirm against the full source.
  too_old -= cct->_conf.get_val<double>("osd_op_complaint_time");
  TrackedOpRef oldest_op;
  // Visitor run over every in-flight op; counts the slow ones and tracks
  // the oldest.
  auto count_slow_ops = [&](TrackedOp& op) {
    if (op.get_initiated() < too_old) {
      lgeneric_subdout(cct,osd,20) << "slow op " << op.get_desc()
                                   << op.get_initiated() << dendl;
    if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
  if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) {
    derr << __func__ << " reporting " << slow << " slow ops, oldest is "
         << oldest_op->get_desc() << dendl;
    metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
  // no news is not good news.
  metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0);
  std::lock_guard l(pending_creates_lock);
  auto n_primaries = pending_creates_from_mon;
  for (const auto& create : pending_creates_from_osd) {
    if (create.second) {
  metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries);
8111 // =====================================================
// Park an op until a newer OSDMap arrives: subscribe to the next epoch
// on the first waiter, then queue the op and mark it delayed for
// op-tracking purposes.
// NOTE(review): lossy extract — braces from the original are missing.
void OSD::wait_for_new_map(OpRequestRef op)
  // Only the first waiter triggers the subscription; later waiters just
  // queue behind it.
  if (waiting_for_osdmap.empty()) {
    osdmap_subscribe(osdmap->get_epoch() + 1, false);
  logger->inc(l_osd_waiting_for_map);
  waiting_for_osdmap.push_back(op);
  op->mark_delayed("wait for new map");
8128 * assimilate new OSDMap(s). scan pgs, etc.
// React to a peer OSD going down: sever its cluster connections, drop any
// queued/pending failure reports for it, and tear down its heartbeat
// connections (back and, if present, front) under heartbeat_lock.
// Caller must hold osd_lock (asserted).
// NOTE(review): lossy extract — some closing braces are missing.
void OSD::note_down_osd(int peer)
  ceph_assert(osd_lock.is_locked());
  cluster_messenger->mark_down_addrs(osdmap->get_cluster_addrs(peer));
  heartbeat_lock.Lock();
  failure_queue.erase(peer);
  failure_pending.erase(peer);
  map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
  if (p != heartbeat_peers.end()) {
    p->second.con_back->mark_down();
    // con_front may be null (single-network configs); check before use.
    if (p->second.con_front) {
      p->second.con_front->mark_down();
    heartbeat_peers.erase(p);
  heartbeat_lock.Unlock();
// React to a peer OSD coming (back) up: forget any epoch we had cached
// for it prior to this map, and flag the heartbeat peer set for refresh.
// NOTE(review): lossy extract — the function's braces are missing here.
void OSD::note_up_osd(int peer)
  service.forget_peer_epoch(peer, osdmap->get_epoch() - 1);
  heartbeat_set_peers_need_update();
// Completion context registered on the map-store transaction in
// handle_osd_map(); on commit it invokes OSD::_committed_osd_maps() for
// the stored epoch range and message.
// NOTE(review): lossy extract — the member declarations for the OSD* and
// MOSDMap* fields named in the initializer list are missing from this view.
struct C_OnMapCommit : public Context {
  epoch_t first, last;
  C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
    : osd(o), first(f), last(l), msg(m) {}
  void finish(int r) override {
    osd->_committed_osd_maps(first, last, msg);
// Ask the monitor for OSDMaps starting at `epoch` (one-time subscription).
// Deduplicates against latest_subscribed_epoch unless force_request is set;
// all state is guarded by osdmap_subscribe_lock.
// NOTE(review): lossy extract — the second half of the final condition and
// the monc renew call are missing from this view.
void OSD::osdmap_subscribe(version_t epoch, bool force_request)
  std::lock_guard l(osdmap_subscribe_lock);
  // Already asked for this epoch (or newer) — nothing to do unless forced.
  if (latest_subscribed_epoch >= epoch && !force_request)
  latest_subscribed_epoch = std::max<uint64_t>(epoch, latest_subscribed_epoch);
  if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
// Delete stored OSDMaps (full + incremental) older than min(oldest,
// lowest epoch still pinned in the map cache), advancing
// superblock.oldest_map as it goes. Work is batched: once a transaction
// reaches osd_target_transaction_size (and at least nreceived removals)
// it is flushed via the meta channel, then a final flush commits the rest.
// NOTE(review): lossy extract — the `num` counter declaration/update, an
// early return, and some braces are missing from this view.
void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
  // Never trim past what the map cache still has pinned.
  epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
  if (min <= superblock.oldest_map)
  ObjectStore::Transaction t;
  for (epoch_t e = superblock.oldest_map; e < min; ++e) {
    dout(20) << " removing old osdmap epoch " << e << dendl;
    t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
    t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
    superblock.oldest_map = e + 1;
    // Flush a batch when the transaction gets big enough.
    if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
      service.publish_superblock(superblock);
      write_superblock(t);
      int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
      ceph_assert(tr == 0);
  // skip_maps leaves us with a range of old maps if we fail to remove all
  // of them before moving superblock.oldest_map forward to the first map
  // in the incoming MOSDMap msg. so we should continue removing them in
  // this case, even we could do huge series of delete transactions all at
  service.publish_superblock(superblock);
  write_superblock(t);
  int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
  ceph_assert(tr == 0);
  // we should not remove the cached maps
  ceph_assert(min <= service.map_cache.cached_key_lower_bound());
// Ingest an MOSDMap from a mon/peer: throttle if PGs lag too far behind,
// validate the sender (fsid, mon/osd session), persist the new full and
// incremental maps (reconstructing + CRC-checking fulls from incrementals),
// update the superblock epochs, record pg_num changes and pool deletions
// in pg_num_history, and commit with a C_OnMapCommit continuation that
// calls _committed_osd_maps().
// NOTE(review): this extract is lossy — numerous original lines (returns,
// braces, local declarations such as `obl`, `fbl`, `bl`, decode calls, the
// queue_transaction arguments) are missing; tokens are reproduced as-is.
void OSD::handle_osd_map(MOSDMap *m)
  // wait for pgs to catch up
  // we extend the map cache pins to accomodate pgs slow to consume maps
  // for some period, until we hit the max_lag_factor bound, at which point
  // we block here to stop injesting more maps than they are able to keep
  epoch_t max_lag = cct->_conf->osd_map_cache_size *
    m_osd_pg_epoch_max_lag_factor;
  ceph_assert(max_lag > 0);
  epoch_t osd_min = 0;
  // Find the minimum PG epoch across all shards.
  for (auto shard : shards) {
    epoch_t min = shard->get_min_pg_epoch();
    if (osd_min == 0 || min < osd_min) {
  // If the slowest PG is more than max_lag epochs behind, block (with
  // osd_lock temporarily dropped) until shards catch up.
  osdmap->get_epoch() > max_lag &&
  osdmap->get_epoch() - max_lag > osd_min) {
    epoch_t need = osdmap->get_epoch() - max_lag;
    dout(10) << __func__ << " waiting for pgs to catch up (need " << need
             << " max_lag " << max_lag << ")" << dendl;
    for (auto shard : shards) {
      epoch_t min = shard->get_min_pg_epoch();
      dout(10) << __func__ << " waiting for pgs to consume " << need
               << " (shard " << shard->shard_id << " min " << min
               << ", map cache is " << cct->_conf->osd_map_cache_size
               << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
      unlock_guard unlock{osd_lock};
      shard->wait_min_pg_epoch(need);
  ceph_assert(osd_lock.is_locked());
  map<epoch_t,OSDMapRef> added_maps;
  map<epoch_t,bufferlist> added_maps_bl;
  // Reject maps from a different cluster.
  if (m->fsid != monc->get_fsid()) {
    dout(0) << "handle_osd_map fsid " << m->fsid << " != "
            << monc->get_fsid() << dendl;
  if (is_initializing()) {
    dout(0) << "ignoring osdmap until we have initialized" << dendl;
  // Only accept maps from mon or osd peers.
  auto priv = m->get_connection()->get_priv();
  if (auto session = static_cast<Session *>(priv.get());
      session && !(session->entity_name.is_mon() ||
                   session->entity_name.is_osd())) {
    dout(10) << "got osd map from Session " << session
             << " which we can't take maps from (not a mon or osd)" << dendl;
  // share with the objecter
  service.objecter->handle_osd_map(m);
  epoch_t first = m->get_first();
  epoch_t last = m->get_last();
  dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
          << superblock.newest_map
          << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
  logger->inc(l_osd_map);
  logger->inc(l_osd_mape, last - first + 1);
  if (first <= superblock.newest_map)
    logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
  if (service.max_oldest_map < m->oldest_map) {
    service.max_oldest_map = m->oldest_map;
    ceph_assert(service.max_oldest_map >= superblock.oldest_map);
  // make sure there is something new, here, before we bother flushing
  // the queues and such
  if (last <= superblock.newest_map) {
    dout(10) << " no new maps here, dropping" << dendl;
  // Detect a gap between what we have and what the message starts at,
  // and subscribe for the missing range where possible.
  bool skip_maps = false;
  if (first > superblock.newest_map + 1) {
    dout(10) << "handle_osd_map message skips epochs "
             << superblock.newest_map + 1 << ".." << (first-1) << dendl;
    if (m->oldest_map <= superblock.newest_map + 1) {
      osdmap_subscribe(superblock.newest_map + 1, false);
    // always try to get the full range of maps--as many as we can. this
    // 1- is good to have
    // 2- is at present the only way to ensure that we get a *full* map as
    if (m->oldest_map < first) {
      osdmap_subscribe(m->oldest_map - 1, true);
  ObjectStore::Transaction t;
  uint64_t txn_size = 0;
  // store new maps: queue for disk and put in the osdmap cache
  epoch_t start = std::max(superblock.newest_map + 1, first);
  for (epoch_t e = start; e <= last; e++) {
    // Guard against uint64 wrap of the running transaction size.
    if (txn_size >= t.get_num_bytes()) {
      derr << __func__ << " transaction size overflowed" << dendl;
      ceph_assert(txn_size < t.get_num_bytes());
    txn_size = t.get_num_bytes();
    map<epoch_t,bufferlist>::iterator p;
    p = m->maps.find(e);
    if (p != m->maps.end()) {
      // Full map supplied directly: persist and cache it.
      dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
      OSDMap *o = new OSDMap;
      bufferlist& bl = p->second;
      ghobject_t fulloid = get_osdmap_pobject_name(e);
      t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
      added_maps[e] = add_map(o);
      added_maps_bl[e] = bl;
    p = m->incremental_maps.find(e);
    if (p != m->incremental_maps.end()) {
      // Incremental map: persist it, then reconstruct the full map from
      // the previous epoch's full map plus this increment.
      dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
      bufferlist& bl = p->second;
      ghobject_t oid = get_inc_osdmap_pobject_name(e);
      t.write(coll_t::meta(), oid, 0, bl.length(), bl);
      OSDMap *o = new OSDMap;
      bool got = get_map_bl(e - 1, obl);
      auto p = added_maps_bl.find(e - 1);
      ceph_assert(p != added_maps_bl.end());
      OSDMap::Incremental inc;
      auto p = bl.cbegin();
      if (o->apply_incremental(inc) < 0) {
        derr << "ERROR: bad fsid?  i have " << osdmap->get_fsid() << " and inc has " << inc.fsid << dendl;
        ceph_abort_msg("bad fsid");
      o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
      // Optional fault injection for CRC-mismatch handling.
      bool injected_failure = false;
      if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
          (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
        derr << __func__ << " injecting map crc failure" << dendl;
        injected_failure = true;
      // If our re-encoded full map's CRC doesn't match the mon's, fall
      // back to requesting the full maps outright.
      if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
        dout(2) << "got incremental " << e
                << " but failed to encode full with correct crc; requesting"
        clog->warn() << "failed to encode map e" << e << " with expected crc";
        dout(20) << "my encoded map was:\n";
        fbl.hexdump(*_dout);
        request_full_map(e, last);
      ghobject_t fulloid = get_osdmap_pobject_name(e);
      t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
      added_maps[e] = add_map(o);
      added_maps_bl[e] = fbl;
    ceph_abort_msg("MOSDMap lied about what maps it had?");
  // even if this map isn't from a mon, we may have satisfied our subscription
  monc->sub_got("osdmap", last);
  if (!m->maps.empty() && requested_full_first) {
    dout(10) << __func__ << " still missing full maps " << requested_full_first
             << ".." << requested_full_last << dendl;
    rerequest_full_maps();
  if (superblock.oldest_map) {
    // make sure we at least keep pace with incoming maps
    trim_maps(m->oldest_map, last - first + 1, skip_maps);
    pg_num_history.prune(superblock.oldest_map);
  if (!superblock.oldest_map || skip_maps)
    superblock.oldest_map = first;
  superblock.newest_map = last;
  superblock.current_epoch = last;
  // note in the superblock that we were clean thru the prior epoch
  epoch_t boot_epoch = service.get_boot_epoch();
  if (boot_epoch && boot_epoch >= superblock.mounted) {
    superblock.mounted = boot_epoch;
    superblock.clean_thru = last;
  // check for pg_num changes and deleted pools
  for (auto& i : added_maps) {
    if (!(lastmap = service.try_get_map(i.first - 1))) {
      dout(10) << __func__ << " can't get previous map " << i.first - 1
               << " probably first start of this osd" << dendl;
    ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
    for (auto& j : lastmap->get_pools()) {
      if (!i.second->have_pg_pool(j.first)) {
        // Pool deleted in this epoch: record it and stash its final
        // pg_pool_t / EC profile / name for later zombie-PG instantiation.
        pg_num_history.log_pool_delete(i.first, j.first);
        dout(10) << __func__ << " recording final pg_pool_t for pool "
                 << j.first << dendl;
        // this information is needed by _make_pg() if have to restart before
        // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
        ghobject_t obj = make_final_pool_info_oid(j.first);
        encode(j.second, bl, CEPH_FEATURES_ALL);
        string name = lastmap->get_pool_name(j.first);
        map<string,string> profile;
        if (lastmap->get_pg_pool(j.first)->is_erasure()) {
          profile = lastmap->get_erasure_code_profile(
            lastmap->get_pg_pool(j.first)->erasure_code_profile);
        encode(profile, bl);
        t.write(coll_t::meta(), obj, 0, bl.length(), bl);
        service.store_deleted_pool_pg_num(j.first, j.second.get_pg_num());
      } else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
                 new_pg_num != j.second.get_pg_num()) {
        dout(10) << __func__ << " recording pool " << j.first << " pg_num "
                 << j.second.get_pg_num() << " -> " << new_pg_num << dendl;
        pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
    // Also record brand-new pools' initial pg_num.
    for (auto& j : i.second->get_pools()) {
      if (!lastmap->have_pg_pool(j.first)) {
        dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
                 << j.second.get_pg_num() << dendl;
        pg_num_history.log_pg_num_change(i.first, j.first,
                                         j.second.get_pg_num());
  pg_num_history.epoch = last;
  ::encode(pg_num_history, bl);
  t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
  dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
  // superblock and commit
  write_superblock(t);
  t.register_on_commit(new C_OnMapCommit(this, start, last, m));
  store->queue_transaction(
  service.publish_superblock(superblock);
// Runs after the new maps from handle_osd_map() commit to disk: advances
// the published osdmap epoch by epoch, kills connections to newly-down
// peers, tracks boot/up epochs, transitions booting -> active, and — if
// the map says we don't exist / are down / have wrong addrs — either marks
// us down (rebinding messengers to restart) or shuts the OSD down.
// NOTE(review): this extract is lossy — many original lines (publish of
// each map, the `old` set declaration, condition heads, braces, the final
// start_boot/restart path) are missing; tokens are reproduced as-is.
void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
  dout(10) << __func__ << " " << first << ".." << last << dendl;
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
  std::lock_guard l(osd_lock);
  // Re-check under osd_lock — shutdown may have raced us.
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
  map_lock.get_write();
  bool do_shutdown = false;
  bool do_restart = false;
  bool network_error = false;
  // advance through the new maps
  for (epoch_t cur = first; cur <= last; cur++) {
    dout(10) << " advance to epoch " << cur
             << " (<= last " << last
             << " <= newest_map " << superblock.newest_map
    OSDMapRef newmap = get_map(cur);
    ceph_assert(newmap);  // we just cached it above!
    // start blacklisting messages sent to peers that go down.
    service.pre_publish_map(newmap);
    // kill connections to newly down osds
    bool waited_for_reservations = false;
    osdmap->get_all_osds(old);
    for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
      osdmap->is_up(*p) &&   // in old map
      newmap->is_down(*p)) {  // but not the new one
        // Before severing, make sure any reserved maps are consumed (once).
        if (!waited_for_reservations) {
          service.await_reserved_maps();
          waited_for_reservations = true;
      } else if (*p != whoami &&
                 osdmap->is_down(*p) &&
                 newmap->is_up(*p)) {
    if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) {
      dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
      // this captures the case where we sent the boot message while
      // NOUP was being set on the mon and our boot request was
      // dropped, and then later it is cleared.  it imperfectly
      // handles the case where our original boot message was not
      // dropped and we restart even though we might have booted, but
      // that is harmless (boot will just take slightly longer).
  service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
  // Record up/boot epochs once the map shows us up at our own address.
  osdmap->is_up(whoami) &&
  osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) {
    up_epoch = osdmap->get_epoch();
    dout(10) << "up_epoch is " << up_epoch << dendl;
    boot_epoch = osdmap->get_epoch();
    dout(10) << "boot_epoch is " << boot_epoch << dendl;
    service.set_epochs(&boot_epoch, &up_epoch, NULL);
  had_map_since = ceph_clock_now();
  epoch_t _bind_epoch = service.get_bind_epoch();
  // booting -> active transition: map marks us up at our current address
  // and the up_from epoch postdates our last bind.
  if (osdmap->is_up(whoami) &&
      osdmap->get_addrs(whoami).legacy_equals(
        client_messenger->get_myaddrs()) &&
      _bind_epoch < osdmap->get_up_from(whoami)) {
    dout(1) << "state: booting -> active" << dendl;
    set_state(STATE_ACTIVE);
    // set incarnation so that osd_reqid_t's we generate for our
    // objecter requests are unique across restarts.
    service.objecter->set_client_incarnation(osdmap->get_epoch());
    cancel_pending_failures();
  if (osdmap->get_epoch() > 0 &&
    if (!osdmap->exists(whoami)) {
      dout(0) << "map says i do not exist.  shutting down." << dendl;
      do_shutdown = true;   // don't call shutdown() while we have
      // everything paused
    } else if (!osdmap->is_up(whoami) ||
               !osdmap->get_addrs(whoami).legacy_equals(
                 client_messenger->get_myaddrs()) ||
               !osdmap->get_cluster_addrs(whoami).legacy_equals(
                 cluster_messenger->get_myaddrs()) ||
               !osdmap->get_hb_back_addrs(whoami).legacy_equals(
                 hb_back_server_messenger->get_myaddrs()) ||
               !osdmap->get_hb_front_addrs(whoami).legacy_equals(
                 hb_front_server_messenger->get_myaddrs())) {
      // We are marked down, or one of our published addresses is wrong:
      // log which, then (unless stopping) mark down, rebind, and restart.
      if (!osdmap->is_up(whoami)) {
        if (service.is_preparing_to_stop() || service.is_stopping()) {
          service.got_stop_ack();
        clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
          "but it is still running";
        clog->debug() << "map e" << osdmap->get_epoch()
                      << " wrongly marked me down at e"
                      << osdmap->get_down_at(whoami);
      } else if (!osdmap->get_addrs(whoami).legacy_equals(
                   client_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong client addr (" << osdmap->get_addrs(whoami)
                      << " != my " << client_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals(
                   cluster_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong cluster addr ("
                      << osdmap->get_cluster_addrs(whoami)
                      << " != my " << cluster_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals(
                   hb_back_server_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong heartbeat back addr ("
                      << osdmap->get_hb_back_addrs(whoami)
                      << " != my " << hb_back_server_messenger->get_myaddrs()
      } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals(
                   hb_front_server_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong heartbeat front addr ("
                      << osdmap->get_hb_front_addrs(whoami)
                      << " != my " << hb_front_server_messenger->get_myaddrs()
      if (!service.is_stopping()) {
        epoch_t up_epoch = 0;
        epoch_t bind_epoch = osdmap->get_epoch();
        service.set_epochs(NULL,&up_epoch, &bind_epoch);
        // Track how often we've been marked down recently; too many within
        // the grace window means something is wrong — shut down.
        utime_t now = ceph_clock_now();
        utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
        osd_markdown_log.push_back(now);
        //clear all out-of-date log
        while (!osd_markdown_log.empty() &&
               osd_markdown_log.front() + grace < now)
          osd_markdown_log.pop_front();
        if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
          dout(0) << __func__ << " marked down "
                  << osd_markdown_log.size()
                  << " > osd_max_markdown_count "
                  << cct->_conf->osd_max_markdown_count
                  << " in last " << grace << " seconds, shutting down"
        start_waiting_for_healthy();
        set<int> avoid_ports;
#if defined(__FreeBSD__)
        // prevent FreeBSD from grabbing the client_messenger port during
        // rebinding. In which case a cluster_meesneger will connect also
        client_messenger->get_myaddrs().get_ports(&avoid_ports);
        cluster_messenger->get_myaddrs().get_ports(&avoid_ports);
        hb_back_server_messenger->get_myaddrs().get_ports(&avoid_ports);
        hb_front_server_messenger->get_myaddrs().get_ports(&avoid_ports);
        // Rebind each server messenger; any failure forces shutdown.
        int r = cluster_messenger->rebind(avoid_ports);
          do_shutdown = true;  // FIXME: do_restart?
          network_error = true;
          dout(0) << __func__ << " marked down:"
                  << " rebind cluster_messenger failed" << dendl;
        r = hb_back_server_messenger->rebind(avoid_ports);
          do_shutdown = true;  // FIXME: do_restart?
          network_error = true;
          dout(0) << __func__ << " marked down:"
                  << " rebind hb_back_server_messenger failed" << dendl;
        r = hb_front_server_messenger->rebind(avoid_ports);
          do_shutdown = true;  // FIXME: do_restart?
          network_error = true;
          dout(0) << __func__ << " marked down:"
                  << " rebind hb_front_server_messenger failed" << dendl;
        hb_front_client_messenger->mark_down_all();
        hb_back_client_messenger->mark_down_all();
        reset_heartbeat_peers(true);
  map_lock.put_write();
  check_osdmap_features();
  if (is_active() || is_waiting_for_healthy())
    maybe_update_heartbeat_peers();
  if (network_error) {
    cancel_pending_failures();
    // trigger shutdown in a different thread
    dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
    queue_async_signal(SIGINT);
  else if (m->newest_map && m->newest_map > last) {
    dout(10) << " msg say newest map is " << m->newest_map
             << ", requesting more" << dendl;
    osdmap_subscribe(osdmap->get_epoch()+1, false);
  else if (is_preboot()) {
    if (m->get_source().is_mon())
      _preboot(m->oldest_map, m->newest_map);
  else if (do_restart)
// Reconcile messenger feature requirements and on-disk compat flags with
// the current OSDMap: adjusts required feature bits for client, mon and
// osd policies, enables the SHARDS on-disk compat feature once, relaxes
// heartbeat authorizer requirements pre-Nautilus, and persists
// require_osd_release when it changes.
// NOTE(review): lossy extract — the `mask` declarations, scoping braces
// and early-return lines from the original are missing; tokens kept as-is.
void OSD::check_osdmap_features()
  // adjust required feature bits?
  // we have to be a bit careful here, because we are accessing the
  // Policy structures without taking any lock.  in particular, only
  // modify integer values that can safely be read by a racing CPU.
  // since we are only accessing existing Policy structures a their
  // current memory location, and setting or clearing bits in integer
  // fields, and we are the only writer, this is not a problem.
  // Client policy: only touch the masked feature bits.
  Messenger::Policy p = client_messenger->get_default_policy();
  uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
  if ((p.features_required & mask) != features) {
    dout(0) << "crush map has features " << features
            << ", adjusting msgr requires for clients" << dendl;
    p.features_required = (p.features_required & ~mask) | features;
    client_messenger->set_default_policy(p);
  // Mon policy.
  Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
  uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
  if ((p.features_required & mask) != features) {
    dout(0) << "crush map has features " << features
            << " was " << p.features_required
            << ", adjusting msgr requires for mons" << dendl;
    p.features_required = (p.features_required & ~mask) | features;
    client_messenger->set_policy(entity_name_t::TYPE_MON, p);
  // OSD-to-OSD (cluster) policy.
  Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
  uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);
  if ((p.features_required & mask) != features) {
    dout(0) << "crush map has features " << features
            << ", adjusting msgr requires for osds" << dendl;
    p.features_required = (p.features_required & ~mask) | features;
    cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
  // One-time superblock upgrade: record the SHARDS incompat feature.
  if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
    dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
    superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
    ObjectStore::Transaction t;
    write_superblock(t);
    int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
    ceph_assert(err == 0);
  if (osdmap->require_osd_release < CEPH_RELEASE_NAUTILUS) {
    heartbeat_dispatcher.ms_set_require_authorizer(false);
  if (osdmap->require_osd_release != last_require_osd_release) {
    dout(1) << __func__ << " require_osd_release " << last_require_osd_release
            << " -> " << to_string(osdmap->require_osd_release) << dendl;
    store->write_meta("require_osd_release",
                      stringify((int)osdmap->require_osd_release));
    last_require_osd_release = osdmap->require_osd_release;
// Completion context used after split transactions apply: hands the set
// of new child PGs to OSD::_finish_splits() for registration.
// NOTE(review): lossy extract — the member declarations for the OSD* and
// PG set named in the initializer list are missing from this view.
struct C_FinishSplits : public Context {
  C_FinishSplits(OSD *osd, const set<PGRef> &in)
    : osd(osd), pgs(in) {}
  void finish(int r) override {
    osd->_finish_splits(pgs);
// Finish registering split-child PGs: initialize each child, queue a null
// event at its map epoch, flush its transaction, then hand it to the
// owning shard (by pg_id hash) to be woken, and finally dispatch the
// shared recovery context.
// NOTE(review): lossy extract — the loop's increment/body braces and the
// `pg` local binding are missing from this view; tokens kept as-is.
void OSD::_finish_splits(set<PGRef>& pgs)
  dout(10) << __func__ << " " << pgs << dendl;
  PG::RecoveryCtx rctx = create_context();
  for (set<PGRef>::iterator i = pgs.begin();
    dout(10) << __func__ << " " << *pg << dendl;
    epoch_t e = pg->get_osdmap_epoch();
    pg->handle_initialize(&rctx);
    pg->queue_null(e, e);
    dispatch_context_transaction(rctx, pg);
    // Route the child to the shard that owns its pg_id hash slot.
    unsigned shard_index = pg->pg_id.hash_to_shard(num_shards);
    shards[shard_index]->register_and_wake_split_child(pg);
  dispatch_context(rctx, 0, service.get_osdmap());
// Record a merge-source PG as waiting on its merge target for the given
// map epoch (under merge_lock). Returns true once all expected sources
// (`need`, declared on an elided signature line) have arrived, i.e. the
// merge can proceed.
// NOTE(review): lossy extract — the final signature parameter line and
// braces are missing from this view.
bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src,
  std::lock_guard l(merge_lock);
  // merge_waiters is keyed by epoch then by target pgid.
  auto& p = merge_waiters[nextmap->get_epoch()][target];
  p[src->pg_id] = src;
  dout(10) << __func__ << " added merge_waiter " << src->pg_id
           << " for " << target << ", have " << p.size() << "/" << need
  return p.size() == need;
// Advance one PG through consecutive OSDMap epochs up to osd_epoch.
// Handles pg_num changes along the way: detaches merge-source PGs (parking
// them in merge_waiters until the target collects them), performs the
// merge on the target once all sources have arrived, reschedules scrubs
// when pool scrub-interval options change, and collects split children
// into new_pgs, finishing them via C_FinishSplits on transaction apply.
// NOTE(review): this extract is lossy — the first signature parameters
// (osd_epoch, pg), many condition tails, event payloads, braces and the
// merge_from call head are missing from this view; tokens kept as-is.
bool OSD::advance_pg(
  ThreadPool::TPHandle &handle,
  PG::RecoveryCtx *rctx)
  // Nothing to do if the PG is already at (or past) the requested epoch.
  if (osd_epoch <= pg->get_osdmap_epoch()) {
  ceph_assert(pg->is_locked());
  OSDMapRef lastmap = pg->get_osdmap();
  ceph_assert(lastmap->get_epoch() < osd_epoch);
  set<PGRef> new_pgs;  // any split children
  unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ?
    lastmap->get_pg_num(pg->pg_id.pool()) : 0;
  for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1;
       next_epoch <= osd_epoch;
    OSDMapRef nextmap = service.try_get_map(next_epoch);
      // Map not cached — stop here; the PG will be advanced again later.
      dout(20) << __func__ << " missing map " << next_epoch << dendl;
    unsigned new_pg_num =
      (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ?
      nextmap->get_pg_num(pg->pg_id.pool()) : 0;
    if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) {
      if (nextmap->have_pg_pool(pg->pg_id.pool())) {
        if (pg->pg_id.is_merge_source(
          // we are merge source
          PGRef spg = pg;  // carry a ref
          dout(1) << __func__ << " " << pg->pg_id
                  << " is merge source, target is " << parent
          pg->write_if_dirty(rctx);
          dispatch_context_transaction(*rctx, pg, &handle);
          // release backoffs explicitly, since the on_shutdown path
          // aggressively tears down backoff state.
          if (pg->is_primary()) {
            pg->release_pg_backoffs();
          // Detach this PG from its shard slot under the shard lock.
          OSDShard *sdata = pg->osd_shard;
          std::lock_guard l(sdata->shard_lock);
          sdata->_detach_pg(pg->pg_slot);
          // update pg count now since we might not get an osdmap
          if (pg->is_primary())
            logger->dec(l_osd_pg_primary);
          else if (pg->is_replica())
            logger->dec(l_osd_pg_replica);
            logger->dec(l_osd_pg_stray);
          set<spg_t> children;
          parent.is_split(new_pg_num, old_pg_num, &children);
          // If we are the last expected source, wake the merge target.
          if (add_merge_waiter(nextmap, parent, pg, children.size())) {
            enqueue_peering_evt(
              std::make_shared<PGPeeringEvent>(
                nextmap->get_epoch(),
                nextmap->get_epoch(),
        } else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) {
          // we are merge target
          set<spg_t> children;
          pg->pg_id.is_split(new_pg_num, old_pg_num, &children);
          dout(20) << __func__ << " " << pg->pg_id
                   << " is merge target, sources are " << children
          map<spg_t,PGRef> sources;
          std::lock_guard l(merge_lock);
          auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id];
          unsigned need = children.size();
          dout(20) << __func__ << " have " << s.size() << "/"
          if (s.size() == need) {
            // All sources arrived: claim them and drop the waiter entry.
            merge_waiters[nextmap->get_epoch()].erase(pg->pg_id);
            if (merge_waiters[nextmap->get_epoch()].empty()) {
              merge_waiters.erase(nextmap->get_epoch());
          if (!sources.empty()) {
            unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool());
            unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
            dout(1) << __func__ << " merging " << pg->pg_id << dendl;
              sources, rctx, split_bits,
              nextmap->get_pg_pool(
                pg->pg_id.pool())->last_pg_merge_meta);
            pg->pg_slot->waiting_for_merge_epoch = 0;
            dout(20) << __func__ << " not ready to merge yet" << dendl;
            pg->write_if_dirty(rctx);
            // kick source(s) to get them ready
            for (auto& i : children) {
              dout(20) << __func__ << " kicking source " << i << dendl;
              enqueue_peering_evt(
                std::make_shared<PGPeeringEvent>(
                  nextmap->get_epoch(),
                  nextmap->get_epoch(),
    // Normal epoch advance: recompute up/acting and feed the map to the PG.
    vector<int> newup, newacting;
    int up_primary, acting_primary;
    nextmap->pg_to_up_acting_osds(
      &newup, &up_primary,
      &newacting, &acting_primary);
    pg->handle_advance_map(
      nextmap, lastmap, newup, up_primary,
      newacting, acting_primary, rctx);
    auto oldpool = lastmap->get_pools().find(pg->pg_id.pool());
    auto newpool = nextmap->get_pools().find(pg->pg_id.pool());
    if (oldpool != lastmap->get_pools().end()
        && newpool != nextmap->get_pools().end()) {
      dout(20) << __func__
               << " new pool opts " << newpool->second.opts
               << " old pool opts " << oldpool->second.opts
      double old_min_interval = 0, new_min_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval);
      double old_max_interval = 0, new_max_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval);
      // Assume if an interval is change from set to unset or vice versa the actual config
      // is different.  Keep it simple even if it is possible to call resched_all_scrub()
      if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) {
        pg->on_info_history_change();
    if (new_pg_num && old_pg_num != new_pg_num) {
      // check for split
      set<spg_t> children;
      if (pg->pg_id.is_split(
        pg, children, &new_pgs, lastmap, nextmap,
      old_pg_num = new_pg_num;
      handle.reset_tp_timeout();
    pg->handle_activate_map(rctx);
  if (!new_pgs.empty()) {
    rctx->transaction->register_on_applied(new C_FinishSplits(this, new_pgs));
9095 void OSD::consume_map()
9097 ceph_assert(osd_lock
.is_locked());
9098 dout(7) << "consume_map version " << osdmap
->get_epoch() << dendl
;
9100 /** make sure the cluster is speaking in SORTBITWISE, because we don't
9101 * speak the older sorting version any more. Be careful not to force
9102 * a shutdown if we are merely processing old maps, though.
9104 if (!osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
) && is_active()) {
9105 derr
<< __func__
<< " SORTBITWISE flag is not set" << dendl
;
9109 service
.pre_publish_map(osdmap
);
9110 service
.await_reserved_maps();
9111 service
.publish_map(osdmap
);
9113 // prime splits and merges
9114 set
<pair
<spg_t
,epoch_t
>> newly_split
; // splits, and when
9115 set
<pair
<spg_t
,epoch_t
>> merge_pgs
; // merge participants, and when
9116 for (auto& shard
: shards
) {
9117 shard
->identify_splits_and_merges(osdmap
, &newly_split
, &merge_pgs
);
9119 if (!newly_split
.empty()) {
9120 for (auto& shard
: shards
) {
9121 shard
->prime_splits(osdmap
, &newly_split
);
9123 ceph_assert(newly_split
.empty());
9126 // prune sent_ready_to_merge
9127 service
.prune_sent_ready_to_merge(osdmap
);
9129 // FIXME, maybe: We could race against an incoming peering message
9130 // that instantiates a merge PG after identify_merges() below and
9131 // never set up its peer to complete the merge. An OSD restart
9132 // would clear it up. This is a hard race to resolve,
9133 // extraordinarily rare (we only merge PGs that are stable and
9134 // clean, so it'd have to be an imported PG to an OSD with a
9135 // slightly stale OSDMap...), so I'm ignoring it for now. We plan to
9136 // replace all of this with a seastar-based code soon anyway.
9137 if (!merge_pgs
.empty()) {
9138 // mark the pgs we already have, or create new and empty merge
9139 // participants for those we are missing. do this all under the
9140 // shard lock so we don't have to worry about racing pg creates
9142 for (auto& shard
: shards
) {
9143 shard
->prime_merges(osdmap
, &merge_pgs
);
9145 ceph_assert(merge_pgs
.empty());
9148 service
.prune_pg_created();
9150 unsigned pushes_to_free
= 0;
9151 for (auto& shard
: shards
) {
9152 shard
->consume_map(osdmap
, &pushes_to_free
);
9155 vector
<spg_t
> pgids
;
9158 // count (FIXME, probably during seastar rewrite)
9159 int num_pg_primary
= 0, num_pg_replica
= 0, num_pg_stray
= 0;
9162 for (auto& pg
: pgs
) {
9163 // FIXME (probably during seastar rewrite): this is lockless and
9164 // racy, but we don't want to take pg lock here.
9165 if (pg
->is_primary())
9167 else if (pg
->is_replica())
9174 // FIXME (as part of seastar rewrite): move to OSDShard
9175 std::lock_guard
l(pending_creates_lock
);
9176 for (auto pg
= pending_creates_from_osd
.begin();
9177 pg
!= pending_creates_from_osd
.end();) {
9178 if (osdmap
->get_pg_acting_rank(pg
->first
, whoami
) < 0) {
9179 dout(10) << __func__
<< " pg " << pg
->first
<< " doesn't map here, "
9180 << "discarding pending_create_from_osd" << dendl
;
9181 pg
= pending_creates_from_osd
.erase(pg
);
9188 service
.maybe_inject_dispatch_delay();
9190 dispatch_sessions_waiting_on_map();
9192 service
.maybe_inject_dispatch_delay();
9194 service
.release_reserved_pushes(pushes_to_free
);
9196 // queue null events to push maps down to individual PGs
9197 for (auto pgid
: pgids
) {
9198 enqueue_peering_evt(
9201 std::make_shared
<PGPeeringEvent
>(
9202 osdmap
->get_epoch(),
9203 osdmap
->get_epoch(),
9206 logger
->set(l_osd_pg
, pgids
.size());
9207 logger
->set(l_osd_pg_primary
, num_pg_primary
);
9208 logger
->set(l_osd_pg_replica
, num_pg_replica
);
9209 logger
->set(l_osd_pg_stray
, num_pg_stray
);
9212 void OSD::activate_map()
9214 ceph_assert(osd_lock
.is_locked());
9216 dout(7) << "activate_map version " << osdmap
->get_epoch() << dendl
;
9218 if (osdmap
->test_flag(CEPH_OSDMAP_FULL
)) {
9219 dout(10) << " osdmap flagged full, doing onetime osdmap subscribe" << dendl
;
9220 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
9224 if (osdmap
->test_flag(CEPH_OSDMAP_NORECOVER
)) {
9225 if (!service
.recovery_is_paused()) {
9226 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl
;
9227 service
.pause_recovery();
9230 if (service
.recovery_is_paused()) {
9231 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl
;
9232 service
.unpause_recovery();
9236 service
.activate_map();
9239 take_waiters(waiting_for_osdmap
);
9242 bool OSD::require_mon_peer(const Message
*m
)
9244 if (!m
->get_connection()->peer_is_mon()) {
9245 dout(0) << "require_mon_peer received from non-mon "
9246 << m
->get_connection()->get_peer_addr()
9247 << " " << *m
<< dendl
;
9253 bool OSD::require_mon_or_mgr_peer(const Message
*m
)
9255 if (!m
->get_connection()->peer_is_mon() &&
9256 !m
->get_connection()->peer_is_mgr()) {
9257 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
9258 << m
->get_connection()->get_peer_addr()
9259 << " " << *m
<< dendl
;
9265 bool OSD::require_osd_peer(const Message
*m
)
9267 if (!m
->get_connection()->peer_is_osd()) {
9268 dout(0) << "require_osd_peer received from non-osd "
9269 << m
->get_connection()->get_peer_addr()
9270 << " " << *m
<< dendl
;
9276 bool OSD::require_self_aliveness(const Message
*m
, epoch_t epoch
)
9278 epoch_t up_epoch
= service
.get_up_epoch();
9279 if (epoch
< up_epoch
) {
9280 dout(7) << "from pre-up epoch " << epoch
<< " < " << up_epoch
<< dendl
;
9285 dout(7) << "still in boot state, dropping message " << *m
<< dendl
;
9292 bool OSD::require_same_peer_instance(const Message
*m
, OSDMapRef
& map
,
9293 bool is_fast_dispatch
)
9295 int from
= m
->get_source().num();
9297 if (map
->is_down(from
) ||
9298 (map
->get_cluster_addrs(from
) != m
->get_source_addrs())) {
9299 dout(5) << "from dead osd." << from
<< ", marking down, "
9300 << " msg was " << m
->get_source_inst().addr
9302 << (map
->is_up(from
) ?
9303 map
->get_cluster_addrs(from
) : entity_addrvec_t())
9305 ConnectionRef con
= m
->get_connection();
9307 auto priv
= con
->get_priv();
9308 if (auto s
= static_cast<Session
*>(priv
.get()); s
) {
9309 if (!is_fast_dispatch
)
9310 s
->session_dispatch_lock
.Lock();
9311 clear_session_waiting_on_map(s
);
9312 con
->set_priv(nullptr); // break ref <-> session cycle, if any
9314 if (!is_fast_dispatch
)
9315 s
->session_dispatch_lock
.Unlock();
9324 * require that we have same (or newer) map, and that
9325 * the source is the pg primary.
9327 bool OSD::require_same_or_newer_map(OpRequestRef
& op
, epoch_t epoch
,
9328 bool is_fast_dispatch
)
9330 const Message
*m
= op
->get_req();
9331 dout(15) << "require_same_or_newer_map " << epoch
9332 << " (i am " << osdmap
->get_epoch() << ") " << m
<< dendl
;
9334 ceph_assert(osd_lock
.is_locked());
9336 // do they have a newer map?
9337 if (epoch
> osdmap
->get_epoch()) {
9338 dout(7) << "waiting for newer map epoch " << epoch
9339 << " > my " << osdmap
->get_epoch() << " with " << m
<< dendl
;
9340 wait_for_new_map(op
);
9344 if (!require_self_aliveness(op
->get_req(), epoch
)) {
9348 // ok, our map is same or newer.. do they still exist?
9349 if (m
->get_connection()->get_messenger() == cluster_messenger
&&
9350 !require_same_peer_instance(op
->get_req(), osdmap
, is_fast_dispatch
)) {
9361 // ----------------------------------------
9364 void OSD::split_pgs(
9366 const set
<spg_t
> &childpgids
, set
<PGRef
> *out_pgs
,
9369 PG::RecoveryCtx
*rctx
)
9371 unsigned pg_num
= nextmap
->get_pg_num(parent
->pg_id
.pool());
9372 parent
->update_snap_mapper_bits(parent
->get_pgid().get_split_bits(pg_num
));
9374 vector
<object_stat_sum_t
> updated_stats
;
9375 parent
->start_split_stats(childpgids
, &updated_stats
);
9377 vector
<object_stat_sum_t
>::iterator stat_iter
= updated_stats
.begin();
9378 for (set
<spg_t
>::const_iterator i
= childpgids
.begin();
9379 i
!= childpgids
.end();
9381 ceph_assert(stat_iter
!= updated_stats
.end());
9382 dout(10) << __func__
<< " splitting " << *parent
<< " into " << *i
<< dendl
;
9383 PG
* child
= _make_pg(nextmap
, *i
);
9385 out_pgs
->insert(child
);
9386 child
->ch
= store
->create_new_collection(child
->coll
);
9389 uint32_t shard_index
= i
->hash_to_shard(shards
.size());
9390 assert(NULL
!= shards
[shard_index
]);
9391 store
->set_collection_commit_queue(child
->coll
, &(shards
[shard_index
]->context_queue
));
9394 unsigned split_bits
= i
->get_split_bits(pg_num
);
9395 dout(10) << " pg_num is " << pg_num
9396 << ", m_seed " << i
->ps()
9397 << ", split_bits is " << split_bits
<< dendl
;
9398 parent
->split_colls(
9402 &child
->get_pool().info
,
9409 child
->finish_split_stats(*stat_iter
, rctx
->transaction
);
9412 ceph_assert(stat_iter
!= updated_stats
.end());
9413 parent
->finish_split_stats(*stat_iter
, rctx
->transaction
);
9419 void OSD::handle_pg_create(OpRequestRef op
)
9421 const MOSDPGCreate
*m
= static_cast<const MOSDPGCreate
*>(op
->get_req());
9422 ceph_assert(m
->get_type() == MSG_OSD_PG_CREATE
);
9424 dout(10) << "handle_pg_create " << *m
<< dendl
;
9426 if (!require_mon_peer(op
->get_req())) {
9430 if (!require_same_or_newer_map(op
, m
->epoch
, false))
9435 map
<pg_t
,utime_t
>::const_iterator ci
= m
->ctimes
.begin();
9436 for (map
<pg_t
,pg_create_t
>::const_iterator p
= m
->mkpg
.begin();
9439 ceph_assert(ci
!= m
->ctimes
.end() && ci
->first
== p
->first
);
9440 epoch_t created
= p
->second
.created
;
9441 if (p
->second
.split_bits
) // Skip split pgs
9445 if (!osdmap
->have_pg_pool(on
.pool())) {
9446 dout(20) << "ignoring pg on deleted pool " << on
<< dendl
;
9450 dout(20) << "mkpg " << on
<< " e" << created
<< "@" << ci
->second
<< dendl
;
9452 // is it still ours?
9453 vector
<int> up
, acting
;
9454 int up_primary
= -1;
9455 int acting_primary
= -1;
9456 osdmap
->pg_to_up_acting_osds(on
, &up
, &up_primary
, &acting
, &acting_primary
);
9457 int role
= osdmap
->calc_pg_role(whoami
, acting
, acting
.size());
9459 if (acting_primary
!= whoami
) {
9460 dout(10) << "mkpg " << on
<< " not acting_primary (" << acting_primary
9461 << "), my role=" << role
<< ", skipping" << dendl
;
9466 bool mapped
= osdmap
->get_primary_shard(on
, &pgid
);
9467 ceph_assert(mapped
);
9470 pg_history_t history
;
9471 build_initial_pg_history(pgid
, created
, ci
->second
, &history
, &pi
);
9473 // The mon won't resend unless the primary changed, so we ignore
9474 // same_interval_since. We'll pass this history with the current
9475 // epoch as the event.
9476 if (history
.same_primary_since
> m
->epoch
) {
9477 dout(10) << __func__
<< ": got obsolete pg create on pgid "
9478 << pgid
<< " from epoch " << m
->epoch
9479 << ", primary changed in " << history
.same_primary_since
9483 enqueue_peering_evt(
9486 std::make_shared
<PGPeeringEvent
>(
9487 osdmap
->get_epoch(),
9488 osdmap
->get_epoch(),
9493 osdmap
->get_epoch(),
9501 std::lock_guard
l(pending_creates_lock
);
9502 if (pending_creates_from_mon
== 0) {
9503 last_pg_create_epoch
= m
->epoch
;
9507 maybe_update_heartbeat_peers();
9511 // ----------------------------------------
9512 // peering and recovery
9514 PG::RecoveryCtx
OSD::create_context()
9516 ObjectStore::Transaction
*t
= new ObjectStore::Transaction
;
9517 map
<int, map
<spg_t
,pg_query_t
> > *query_map
=
9518 new map
<int, map
<spg_t
, pg_query_t
> >;
9519 map
<int,vector
<pair
<pg_notify_t
, PastIntervals
> > > *notify_list
=
9520 new map
<int, vector
<pair
<pg_notify_t
, PastIntervals
> > >;
9521 map
<int,vector
<pair
<pg_notify_t
, PastIntervals
> > > *info_map
=
9522 new map
<int,vector
<pair
<pg_notify_t
, PastIntervals
> > >;
9523 PG::RecoveryCtx
rctx(query_map
, info_map
, notify_list
, t
);
9527 void OSD::dispatch_context_transaction(PG::RecoveryCtx
&ctx
, PG
*pg
,
9528 ThreadPool::TPHandle
*handle
)
9530 if (!ctx
.transaction
->empty() || ctx
.transaction
->has_contexts()) {
9531 int tr
= store
->queue_transaction(
9533 std::move(*ctx
.transaction
), TrackedOpRef(), handle
);
9534 ceph_assert(tr
== 0);
9535 delete (ctx
.transaction
);
9536 ctx
.transaction
= new ObjectStore::Transaction
;
9540 void OSD::dispatch_context(PG::RecoveryCtx
&ctx
, PG
*pg
, OSDMapRef curmap
,
9541 ThreadPool::TPHandle
*handle
)
9543 if (!service
.get_osdmap()->is_up(whoami
)) {
9544 dout(20) << __func__
<< " not up in osdmap" << dendl
;
9545 } else if (!is_active()) {
9546 dout(20) << __func__
<< " not active" << dendl
;
9548 do_notifies(*ctx
.notify_list
, curmap
);
9549 do_queries(*ctx
.query_map
, curmap
);
9550 do_infos(*ctx
.info_map
, curmap
);
9552 if ((!ctx
.transaction
->empty() || ctx
.transaction
->has_contexts()) && pg
) {
9553 int tr
= store
->queue_transaction(
9555 std::move(*ctx
.transaction
), TrackedOpRef(),
9557 ceph_assert(tr
== 0);
9559 delete ctx
.notify_list
;
9560 delete ctx
.query_map
;
9561 delete ctx
.info_map
;
9562 delete ctx
.transaction
;
9565 void OSD::discard_context(PG::RecoveryCtx
& ctx
)
9567 delete ctx
.notify_list
;
9568 delete ctx
.query_map
;
9569 delete ctx
.info_map
;
9570 delete ctx
.transaction
;
9575 * Send an MOSDPGNotify to a primary, with a list of PGs that I have
9576 * content for, and they are primary for.
9579 void OSD::do_notifies(
9580 map
<int,vector
<pair
<pg_notify_t
,PastIntervals
> > >& notify_list
,
9584 vector
<pair
<pg_notify_t
,PastIntervals
> > >::iterator it
=
9585 notify_list
.begin();
9586 it
!= notify_list
.end();
9588 if (!curmap
->is_up(it
->first
)) {
9589 dout(20) << __func__
<< " skipping down osd." << it
->first
<< dendl
;
9592 ConnectionRef con
= service
.get_con_osd_cluster(
9593 it
->first
, curmap
->get_epoch());
9595 dout(20) << __func__
<< " skipping osd." << it
->first
9596 << " (NULL con)" << dendl
;
9599 service
.share_map_peer(it
->first
, con
.get(), curmap
);
9600 dout(7) << __func__
<< " osd." << it
->first
9601 << " on " << it
->second
.size() << " PGs" << dendl
;
9602 MOSDPGNotify
*m
= new MOSDPGNotify(curmap
->get_epoch(),
9604 con
->send_message(m
);
9610 * send out pending queries for info | summaries
9612 void OSD::do_queries(map
<int, map
<spg_t
,pg_query_t
> >& query_map
,
9615 for (map
<int, map
<spg_t
,pg_query_t
> >::iterator pit
= query_map
.begin();
9616 pit
!= query_map
.end();
9618 if (!curmap
->is_up(pit
->first
)) {
9619 dout(20) << __func__
<< " skipping down osd." << pit
->first
<< dendl
;
9622 int who
= pit
->first
;
9623 ConnectionRef con
= service
.get_con_osd_cluster(who
, curmap
->get_epoch());
9625 dout(20) << __func__
<< " skipping osd." << who
9626 << " (NULL con)" << dendl
;
9629 service
.share_map_peer(who
, con
.get(), curmap
);
9630 dout(7) << __func__
<< " querying osd." << who
9631 << " on " << pit
->second
.size() << " PGs" << dendl
;
9632 MOSDPGQuery
*m
= new MOSDPGQuery(curmap
->get_epoch(), pit
->second
);
9633 con
->send_message(m
);
9638 void OSD::do_infos(map
<int,
9639 vector
<pair
<pg_notify_t
, PastIntervals
> > >& info_map
,
9643 vector
<pair
<pg_notify_t
, PastIntervals
> > >::iterator p
=
9645 p
!= info_map
.end();
9647 if (!curmap
->is_up(p
->first
)) {
9648 dout(20) << __func__
<< " skipping down osd." << p
->first
<< dendl
;
9651 for (vector
<pair
<pg_notify_t
,PastIntervals
> >::iterator i
= p
->second
.begin();
9652 i
!= p
->second
.end();
9654 dout(20) << __func__
<< " sending info " << i
->first
.info
9655 << " to shard " << p
->first
<< dendl
;
9657 ConnectionRef con
= service
.get_con_osd_cluster(
9658 p
->first
, curmap
->get_epoch());
9660 dout(20) << __func__
<< " skipping osd." << p
->first
9661 << " (NULL con)" << dendl
;
9664 service
.share_map_peer(p
->first
, con
.get(), curmap
);
9665 MOSDPGInfo
*m
= new MOSDPGInfo(curmap
->get_epoch());
9666 m
->pg_list
= p
->second
;
9667 con
->send_message(m
);
9672 void OSD::handle_fast_pg_create(MOSDPGCreate2
*m
)
9674 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9675 if (!require_mon_peer(m
)) {
9679 for (auto& p
: m
->pgs
) {
9680 spg_t pgid
= p
.first
;
9681 epoch_t created
= p
.second
.first
;
9682 utime_t created_stamp
= p
.second
.second
;
9683 dout(20) << __func__
<< " " << pgid
<< " e" << created
9684 << "@" << created_stamp
<< dendl
;
9686 h
.epoch_created
= created
;
9687 h
.epoch_pool_created
= created
;
9688 h
.same_up_since
= created
;
9689 h
.same_interval_since
= created
;
9690 h
.same_primary_since
= created
;
9691 h
.last_scrub_stamp
= created_stamp
;
9692 h
.last_deep_scrub_stamp
= created_stamp
;
9693 h
.last_clean_scrub_stamp
= created_stamp
;
9695 enqueue_peering_evt(
9698 std::make_shared
<PGPeeringEvent
>(
9713 std::lock_guard
l(pending_creates_lock
);
9714 if (pending_creates_from_mon
== 0) {
9715 last_pg_create_epoch
= m
->epoch
;
9722 void OSD::handle_fast_pg_query(MOSDPGQuery
*m
)
9724 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9725 if (!require_osd_peer(m
)) {
9729 int from
= m
->get_source().num();
9730 for (auto& p
: m
->pg_list
) {
9731 enqueue_peering_evt(
9734 std::make_shared
<PGPeeringEvent
>(
9735 p
.second
.epoch_sent
, p
.second
.epoch_sent
,
9738 pg_shard_t(from
, p
.second
.from
),
9740 p
.second
.epoch_sent
),
9747 void OSD::handle_fast_pg_notify(MOSDPGNotify
* m
)
9749 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9750 if (!require_osd_peer(m
)) {
9754 int from
= m
->get_source().num();
9755 for (auto& p
: m
->get_pg_list()) {
9756 spg_t
pgid(p
.first
.info
.pgid
.pgid
, p
.first
.to
);
9757 enqueue_peering_evt(
9760 std::make_shared
<PGPeeringEvent
>(
9762 p
.first
.query_epoch
,
9764 pgid
, pg_shard_t(from
, p
.first
.from
),
9766 m
->get_connection()->get_features(),
9771 p
.first
.query_epoch
,
9772 p
.first
.info
.history
,
9780 void OSD::handle_fast_pg_info(MOSDPGInfo
* m
)
9782 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9783 if (!require_osd_peer(m
)) {
9787 int from
= m
->get_source().num();
9788 for (auto& p
: m
->pg_list
) {
9789 enqueue_peering_evt(
9790 spg_t(p
.first
.info
.pgid
.pgid
, p
.first
.to
),
9792 std::make_shared
<PGPeeringEvent
>(
9793 p
.first
.epoch_sent
, p
.first
.query_epoch
,
9795 pg_shard_t(from
, p
.first
.from
),
9797 p
.first
.epoch_sent
)))
9803 void OSD::handle_fast_pg_remove(MOSDPGRemove
*m
)
9805 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9806 if (!require_osd_peer(m
)) {
9810 for (auto& pgid
: m
->pg_list
) {
9811 enqueue_peering_evt(
9814 std::make_shared
<PGPeeringEvent
>(
9815 m
->get_epoch(), m
->get_epoch(),
9816 PG::DeleteStart())));
9821 void OSD::handle_fast_force_recovery(MOSDForceRecovery
*m
)
9823 dout(10) << __func__
<< " " << *m
<< dendl
;
9824 if (!require_mon_or_mgr_peer(m
)) {
9828 epoch_t epoch
= get_osdmap_epoch();
9829 for (auto pgid
: m
->forced_pgs
) {
9830 if (m
->options
& OFR_BACKFILL
) {
9831 if (m
->options
& OFR_CANCEL
) {
9832 enqueue_peering_evt(
9835 std::make_shared
<PGPeeringEvent
>(
9837 PG::UnsetForceBackfill())));
9839 enqueue_peering_evt(
9842 std::make_shared
<PGPeeringEvent
>(
9844 PG::SetForceBackfill())));
9846 } else if (m
->options
& OFR_RECOVERY
) {
9847 if (m
->options
& OFR_CANCEL
) {
9848 enqueue_peering_evt(
9851 std::make_shared
<PGPeeringEvent
>(
9853 PG::UnsetForceRecovery())));
9855 enqueue_peering_evt(
9858 std::make_shared
<PGPeeringEvent
>(
9860 PG::SetForceRecovery())));
9867 void OSD::handle_pg_query_nopg(const MQuery
& q
)
9869 spg_t pgid
= q
.pgid
;
9870 dout(10) << __func__
<< " " << pgid
<< dendl
;
9872 OSDMapRef osdmap
= get_osdmap();
9873 if (!osdmap
->have_pg_pool(pgid
.pool()))
9876 dout(10) << " pg " << pgid
<< " dne" << dendl
;
9877 pg_info_t
empty(spg_t(pgid
.pgid
, q
.query
.to
));
9878 ConnectionRef con
= service
.get_con_osd_cluster(q
.from
.osd
, osdmap
->get_epoch());
9881 if (q
.query
.type
== pg_query_t::LOG
||
9882 q
.query
.type
== pg_query_t::FULLLOG
) {
9884 q
.query
.from
, q
.query
.to
,
9885 osdmap
->get_epoch(), empty
,
9886 q
.query
.epoch_sent
);
9888 vector
<pair
<pg_notify_t
,PastIntervals
>> ls
;
9892 q
.query
.from
, q
.query
.to
,
9894 osdmap
->get_epoch(),
9897 m
= new MOSDPGNotify(osdmap
->get_epoch(), ls
);
9899 service
.share_map_peer(q
.from
.osd
, con
.get(), osdmap
);
9900 con
->send_message(m
);
9905 // =========================================================
9908 void OSDService::_maybe_queue_recovery() {
9909 ceph_assert(recovery_lock
.is_locked_by_me());
9910 uint64_t available_pushes
;
9911 while (!awaiting_throttle
.empty() &&
9912 _recover_now(&available_pushes
)) {
9913 uint64_t to_start
= std::min(
9915 cct
->_conf
->osd_recovery_max_single_start
);
9916 _queue_for_recovery(awaiting_throttle
.front(), to_start
);
9917 awaiting_throttle
.pop_front();
9918 dout(10) << __func__
<< " starting " << to_start
9919 << ", recovery_ops_reserved " << recovery_ops_reserved
9920 << " -> " << (recovery_ops_reserved
+ to_start
) << dendl
;
9921 recovery_ops_reserved
+= to_start
;
9925 bool OSDService::_recover_now(uint64_t *available_pushes
)
9927 if (available_pushes
)
9928 *available_pushes
= 0;
9930 if (ceph_clock_now() < defer_recovery_until
) {
9931 dout(15) << __func__
<< " defer until " << defer_recovery_until
<< dendl
;
9935 if (recovery_paused
) {
9936 dout(15) << __func__
<< " paused" << dendl
;
9940 uint64_t max
= cct
->_conf
->osd_recovery_max_active
;
9941 if (max
<= recovery_ops_active
+ recovery_ops_reserved
) {
9942 dout(15) << __func__
<< " active " << recovery_ops_active
9943 << " + reserved " << recovery_ops_reserved
9944 << " >= max " << max
<< dendl
;
9948 if (available_pushes
)
9949 *available_pushes
= max
- recovery_ops_active
- recovery_ops_reserved
;
9954 void OSD::do_recovery(
9955 PG
*pg
, epoch_t queued
, uint64_t reserved_pushes
,
9956 ThreadPool::TPHandle
&handle
)
9958 uint64_t started
= 0;
9961 * When the value of osd_recovery_sleep is set greater than zero, recovery
9962 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9963 * recovery event's schedule time. This is done by adding a
9964 * recovery_requeue_callback event, which re-queues the recovery op using
9965 * queue_recovery_after_sleep.
9967 float recovery_sleep
= get_osd_recovery_sleep();
9969 std::lock_guard
l(service
.sleep_lock
);
9970 if (recovery_sleep
> 0 && service
.recovery_needs_sleep
) {
9972 auto recovery_requeue_callback
= new FunctionContext([this, pgref
, queued
, reserved_pushes
](int r
) {
9973 dout(20) << "do_recovery wake up at "
9975 << ", re-queuing recovery" << dendl
;
9976 std::lock_guard
l(service
.sleep_lock
);
9977 service
.recovery_needs_sleep
= false;
9978 service
.queue_recovery_after_sleep(pgref
.get(), queued
, reserved_pushes
);
9981 // This is true for the first recovery op and when the previous recovery op
9982 // has been scheduled in the past. The next recovery op is scheduled after
9983 // completing the sleep from now.
9984 if (service
.recovery_schedule_time
< ceph_clock_now()) {
9985 service
.recovery_schedule_time
= ceph_clock_now();
9987 service
.recovery_schedule_time
+= recovery_sleep
;
9988 service
.sleep_timer
.add_event_at(service
.recovery_schedule_time
,
9989 recovery_requeue_callback
);
9990 dout(20) << "Recovery event scheduled at "
9991 << service
.recovery_schedule_time
<< dendl
;
9998 std::lock_guard
l(service
.sleep_lock
);
9999 service
.recovery_needs_sleep
= true;
10002 if (pg
->pg_has_reset_since(queued
)) {
10006 dout(10) << "do_recovery starting " << reserved_pushes
<< " " << *pg
<< dendl
;
10007 #ifdef DEBUG_RECOVERY_OIDS
10008 dout(20) << " active was " << service
.recovery_oids
[pg
->pg_id
] << dendl
;
10011 bool do_unfound
= pg
->start_recovery_ops(reserved_pushes
, handle
, &started
);
10012 dout(10) << "do_recovery started " << started
<< "/" << reserved_pushes
10013 << " on " << *pg
<< dendl
;
10016 PG::RecoveryCtx rctx
= create_context();
10017 rctx
.handle
= &handle
;
10018 pg
->find_unfound(queued
, &rctx
);
10019 dispatch_context(rctx
, pg
, pg
->get_osdmap());
10024 ceph_assert(started
<= reserved_pushes
);
10025 service
.release_reserved_pushes(reserved_pushes
);
10028 void OSDService::start_recovery_op(PG
*pg
, const hobject_t
& soid
)
10030 std::lock_guard
l(recovery_lock
);
10031 dout(10) << "start_recovery_op " << *pg
<< " " << soid
10032 << " (" << recovery_ops_active
<< "/"
10033 << cct
->_conf
->osd_recovery_max_active
<< " rops)"
10035 recovery_ops_active
++;
10037 #ifdef DEBUG_RECOVERY_OIDS
10038 dout(20) << " active was " << recovery_oids
[pg
->pg_id
] << dendl
;
10039 ceph_assert(recovery_oids
[pg
->pg_id
].count(soid
) == 0);
10040 recovery_oids
[pg
->pg_id
].insert(soid
);
10044 void OSDService::finish_recovery_op(PG
*pg
, const hobject_t
& soid
, bool dequeue
)
10046 std::lock_guard
l(recovery_lock
);
10047 dout(10) << "finish_recovery_op " << *pg
<< " " << soid
10048 << " dequeue=" << dequeue
10049 << " (" << recovery_ops_active
<< "/" << cct
->_conf
->osd_recovery_max_active
<< " rops)"
10053 ceph_assert(recovery_ops_active
> 0);
10054 recovery_ops_active
--;
10056 #ifdef DEBUG_RECOVERY_OIDS
10057 dout(20) << " active oids was " << recovery_oids
[pg
->pg_id
] << dendl
;
10058 ceph_assert(recovery_oids
[pg
->pg_id
].count(soid
));
10059 recovery_oids
[pg
->pg_id
].erase(soid
);
10062 _maybe_queue_recovery();
10065 bool OSDService::is_recovery_active()
10067 if (cct
->_conf
->osd_debug_pretend_recovery_active
) {
10070 return local_reserver
.has_reservation() || remote_reserver
.has_reservation();
10073 void OSDService::release_reserved_pushes(uint64_t pushes
)
10075 std::lock_guard
l(recovery_lock
);
10076 dout(10) << __func__
<< "(" << pushes
<< "), recovery_ops_reserved "
10077 << recovery_ops_reserved
<< " -> " << (recovery_ops_reserved
-pushes
)
10079 ceph_assert(recovery_ops_reserved
>= pushes
);
10080 recovery_ops_reserved
-= pushes
;
10081 _maybe_queue_recovery();
10084 // =========================================================
10087 bool OSD::op_is_discardable(const MOSDOp
*op
)
10089 // drop client request if they are not connected and can't get the
10091 if (!op
->get_connection()->is_connected()) {
10097 void OSD::enqueue_op(spg_t pg
, OpRequestRef
&& op
, epoch_t epoch
)
10099 const utime_t stamp
= op
->get_req()->get_recv_stamp();
10100 const utime_t latency
= ceph_clock_now() - stamp
;
10101 const unsigned priority
= op
->get_req()->get_priority();
10102 const int cost
= op
->get_req()->get_cost();
10103 const uint64_t owner
= op
->get_req()->get_source().num();
10105 dout(15) << "enqueue_op " << op
<< " prio " << priority
10106 << " cost " << cost
10107 << " latency " << latency
10108 << " epoch " << epoch
10109 << " " << *(op
->get_req()) << dendl
;
10110 op
->osd_trace
.event("enqueue op");
10111 op
->osd_trace
.keyval("priority", priority
);
10112 op
->osd_trace
.keyval("cost", cost
);
10113 op
->mark_queued_for_pg();
10114 logger
->tinc(l_osd_op_before_queue_op_lat
, latency
);
10115 op_shardedwq
.queue(
10117 unique_ptr
<OpQueueItem::OpQueueable
>(new PGOpItem(pg
, std::move(op
))),
10118 cost
, priority
, stamp
, owner
, epoch
));
10121 void OSD::enqueue_peering_evt(spg_t pgid
, PGPeeringEventRef evt
)
10123 dout(15) << __func__
<< " " << pgid
<< " " << evt
->get_desc() << dendl
;
10124 op_shardedwq
.queue(
10126 unique_ptr
<OpQueueItem::OpQueueable
>(new PGPeeringItem(pgid
, evt
)),
10128 cct
->_conf
->osd_peering_op_priority
,
10131 evt
->get_epoch_sent()));
10134 void OSD::enqueue_peering_evt_front(spg_t pgid
, PGPeeringEventRef evt
)
10136 dout(15) << __func__
<< " " << pgid
<< " " << evt
->get_desc() << dendl
;
10137 op_shardedwq
.queue_front(
10139 unique_ptr
<OpQueueItem::OpQueueable
>(new PGPeeringItem(pgid
, evt
)),
10141 cct
->_conf
->osd_peering_op_priority
,
10144 evt
->get_epoch_sent()));
10148 * NOTE: dequeue called in worker thread, with pg lock
10150 void OSD::dequeue_op(
10151 PGRef pg
, OpRequestRef op
,
10152 ThreadPool::TPHandle
&handle
)
10155 OID_EVENT_TRACE_WITH_MSG(op
->get_req(), "DEQUEUE_OP_BEGIN", false);
10157 utime_t now
= ceph_clock_now();
10158 op
->set_dequeued_time(now
);
10159 utime_t latency
= now
- op
->get_req()->get_recv_stamp();
10160 dout(10) << "dequeue_op " << op
<< " prio " << op
->get_req()->get_priority()
10161 << " cost " << op
->get_req()->get_cost()
10162 << " latency " << latency
10163 << " " << *(op
->get_req())
10164 << " pg " << *pg
<< dendl
;
10166 logger
->tinc(l_osd_op_before_dequeue_op_lat
, latency
);
10168 auto priv
= op
->get_req()->get_connection()->get_priv();
10169 if (auto session
= static_cast<Session
*>(priv
.get()); session
) {
10170 maybe_share_map(session
, op
, pg
->get_osdmap());
10173 if (pg
->is_deleting())
10176 op
->mark_reached_pg();
10177 op
->osd_trace
.event("dequeue_op");
10179 pg
->do_request(op
, handle
);
10182 dout(10) << "dequeue_op " << op
<< " finish" << dendl
;
10183 OID_EVENT_TRACE_WITH_MSG(op
->get_req(), "DEQUEUE_OP_END", false);
10187 void OSD::dequeue_peering_evt(
10190 PGPeeringEventRef evt
,
10191 ThreadPool::TPHandle
& handle
)
10193 PG::RecoveryCtx rctx
= create_context();
10194 auto curmap
= sdata
->get_osdmap();
10195 epoch_t need_up_thru
= 0, same_interval_since
= 0;
10197 if (const MQuery
*q
= dynamic_cast<const MQuery
*>(evt
->evt
.get())) {
10198 handle_pg_query_nopg(*q
);
10200 derr
<< __func__
<< " unrecognized pg-less event " << evt
->get_desc() << dendl
;
10203 } else if (advance_pg(curmap
->get_epoch(), pg
, handle
, &rctx
)) {
10204 pg
->do_peering_event(evt
, &rctx
);
10205 if (pg
->is_deleted()) {
10206 // do not dispatch rctx; the final _delete_some already did it.
10207 discard_context(rctx
);
10211 dispatch_context_transaction(rctx
, pg
, &handle
);
10212 need_up_thru
= pg
->get_need_up_thru();
10213 same_interval_since
= pg
->get_same_interval_since();
10217 if (need_up_thru
) {
10218 queue_want_up_thru(same_interval_since
);
10220 dispatch_context(rctx
, pg
, curmap
, &handle
);
10222 service
.send_pg_temp();
10225 void OSD::dequeue_delete(
10229 ThreadPool::TPHandle
& handle
)
10231 dequeue_peering_evt(
10235 std::make_shared
<PGPeeringEvent
>(
10237 PG::DeleteSome())),
10243 // --------------------------------
10245 const char** OSD::get_tracked_conf_keys() const
10247 static const char* KEYS
[] = {
10248 "osd_max_backfills",
10249 "osd_min_recovery_priority",
10250 "osd_max_trimming_pgs",
10251 "osd_op_complaint_time",
10252 "osd_op_log_threshold",
10253 "osd_op_history_size",
10254 "osd_op_history_duration",
10255 "osd_op_history_slow_op_size",
10256 "osd_op_history_slow_op_threshold",
10257 "osd_enable_op_tracker",
10258 "osd_map_cache_size",
10259 "osd_pg_epoch_max_lag_factor",
10260 "osd_pg_epoch_persisted_max_stale",
10261 // clog & admin clog
10262 "clog_to_monitors",
10264 "clog_to_syslog_facility",
10265 "clog_to_syslog_level",
10266 "osd_objectstore_fuse",
10268 "clog_to_graylog_host",
10269 "clog_to_graylog_port",
10272 "osd_recovery_delay_start",
10273 "osd_client_message_size_cap",
10274 "osd_client_message_cap",
10275 "osd_heartbeat_min_size",
10276 "osd_heartbeat_interval",
10277 "osd_scrub_min_interval",
10278 "osd_scrub_max_interval",
10284 void OSD::handle_conf_change(const ConfigProxy
& conf
,
10285 const std::set
<std::string
> &changed
)
10287 Mutex::Locker
l(osd_lock
);
10288 if (changed
.count("osd_max_backfills")) {
10289 service
.local_reserver
.set_max(cct
->_conf
->osd_max_backfills
);
10290 service
.remote_reserver
.set_max(cct
->_conf
->osd_max_backfills
);
10292 if (changed
.count("osd_min_recovery_priority")) {
10293 service
.local_reserver
.set_min_priority(cct
->_conf
->osd_min_recovery_priority
);
10294 service
.remote_reserver
.set_min_priority(cct
->_conf
->osd_min_recovery_priority
);
10296 if (changed
.count("osd_max_trimming_pgs")) {
10297 service
.snap_reserver
.set_max(cct
->_conf
->osd_max_trimming_pgs
);
10299 if (changed
.count("osd_op_complaint_time") ||
10300 changed
.count("osd_op_log_threshold")) {
10301 op_tracker
.set_complaint_and_threshold(cct
->_conf
->osd_op_complaint_time
,
10302 cct
->_conf
->osd_op_log_threshold
);
10304 if (changed
.count("osd_op_history_size") ||
10305 changed
.count("osd_op_history_duration")) {
10306 op_tracker
.set_history_size_and_duration(cct
->_conf
->osd_op_history_size
,
10307 cct
->_conf
->osd_op_history_duration
);
10309 if (changed
.count("osd_op_history_slow_op_size") ||
10310 changed
.count("osd_op_history_slow_op_threshold")) {
10311 op_tracker
.set_history_slow_op_size_and_threshold(cct
->_conf
->osd_op_history_slow_op_size
,
10312 cct
->_conf
->osd_op_history_slow_op_threshold
);
10314 if (changed
.count("osd_enable_op_tracker")) {
10315 op_tracker
.set_tracking(cct
->_conf
->osd_enable_op_tracker
);
10317 if (changed
.count("osd_map_cache_size")) {
10318 service
.map_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
10319 service
.map_bl_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
10320 service
.map_bl_inc_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
10322 if (changed
.count("clog_to_monitors") ||
10323 changed
.count("clog_to_syslog") ||
10324 changed
.count("clog_to_syslog_level") ||
10325 changed
.count("clog_to_syslog_facility") ||
10326 changed
.count("clog_to_graylog") ||
10327 changed
.count("clog_to_graylog_host") ||
10328 changed
.count("clog_to_graylog_port") ||
10329 changed
.count("host") ||
10330 changed
.count("fsid")) {
10331 update_log_config();
10333 if (changed
.count("osd_pg_epoch_max_lag_factor")) {
10334 m_osd_pg_epoch_max_lag_factor
= conf
.get_val
<double>(
10335 "osd_pg_epoch_max_lag_factor");
10338 #ifdef HAVE_LIBFUSE
10339 if (changed
.count("osd_objectstore_fuse")) {
10341 enable_disable_fuse(false);
10346 if (changed
.count("osd_recovery_delay_start")) {
10347 service
.defer_recovery(cct
->_conf
->osd_recovery_delay_start
);
10348 service
.kick_recovery_queue();
10351 if (changed
.count("osd_client_message_cap")) {
10352 uint64_t newval
= cct
->_conf
->osd_client_message_cap
;
10353 Messenger::Policy pol
= client_messenger
->get_policy(entity_name_t::TYPE_CLIENT
);
10354 if (pol
.throttler_messages
&& newval
> 0) {
10355 pol
.throttler_messages
->reset_max(newval
);
10358 if (changed
.count("osd_client_message_size_cap")) {
10359 uint64_t newval
= cct
->_conf
->osd_client_message_size_cap
;
10360 Messenger::Policy pol
= client_messenger
->get_policy(entity_name_t::TYPE_CLIENT
);
10361 if (pol
.throttler_bytes
&& newval
> 0) {
10362 pol
.throttler_bytes
->reset_max(newval
);
10366 if (changed
.count("osd_scrub_min_interval") ||
10367 changed
.count("osd_scrub_max_interval")) {
10368 resched_all_scrubs();
10369 dout(0) << __func__
<< ": scrub interval change" << dendl
;
10374 void OSD::update_log_config()
10376 map
<string
,string
> log_to_monitors
;
10377 map
<string
,string
> log_to_syslog
;
10378 map
<string
,string
> log_channel
;
10379 map
<string
,string
> log_prio
;
10380 map
<string
,string
> log_to_graylog
;
10381 map
<string
,string
> log_to_graylog_host
;
10382 map
<string
,string
> log_to_graylog_port
;
10386 if (parse_log_client_options(cct
, log_to_monitors
, log_to_syslog
,
10387 log_channel
, log_prio
, log_to_graylog
,
10388 log_to_graylog_host
, log_to_graylog_port
,
10390 clog
->update_config(log_to_monitors
, log_to_syslog
,
10391 log_channel
, log_prio
, log_to_graylog
,
10392 log_to_graylog_host
, log_to_graylog_port
,
10394 derr
<< "log_to_monitors " << log_to_monitors
<< dendl
;
10397 void OSD::check_config()
10399 // some sanity checks
10400 if (cct
->_conf
->osd_map_cache_size
<= (int)cct
->_conf
->osd_pg_epoch_persisted_max_stale
+ 2) {
10401 clog
->warn() << "osd_map_cache_size (" << cct
->_conf
->osd_map_cache_size
<< ")"
10402 << " is not > osd_pg_epoch_persisted_max_stale ("
10403 << cct
->_conf
->osd_pg_epoch_persisted_max_stale
<< ")";
10407 // --------------------------------
10409 void OSD::get_latest_osdmap()
10411 dout(10) << __func__
<< " -- start" << dendl
;
10414 service
.objecter
->wait_for_latest_osdmap(&cond
);
10417 dout(10) << __func__
<< " -- finish" << dendl
;
10420 // --------------------------------
10422 int OSD::init_op_flags(OpRequestRef
& op
)
10424 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
10425 vector
<OSDOp
>::const_iterator iter
;
10427 // client flags have no bearing on whether an op is a read, write, etc.
10430 if (m
->has_flag(CEPH_OSD_FLAG_RWORDERED
)) {
10431 op
->set_force_rwordered();
10434 // set bits based on op codes, called methods.
10435 for (iter
= m
->ops
.begin(); iter
!= m
->ops
.end(); ++iter
) {
10436 if ((iter
->op
.op
== CEPH_OSD_OP_WATCH
&&
10437 iter
->op
.watch
.op
== CEPH_OSD_WATCH_OP_PING
)) {
10438 /* This a bit odd. PING isn't actually a write. It can't
10439 * result in an update to the object_info. PINGs also aren't
10440 * resent, so there's no reason to write out a log entry.
10442 * However, we pipeline them behind writes, so let's force
10443 * the write_ordered flag.
10445 op
->set_force_rwordered();
10447 if (ceph_osd_op_mode_modify(iter
->op
.op
))
10450 if (ceph_osd_op_mode_read(iter
->op
.op
))
10453 // set READ flag if there are src_oids
10454 if (iter
->soid
.oid
.name
.length())
10457 // set PGOP flag if there are PG ops
10458 if (ceph_osd_op_type_pg(iter
->op
.op
))
10461 if (ceph_osd_op_mode_cache(iter
->op
.op
))
10464 // check for ec base pool
10465 int64_t poolid
= m
->get_pg().pool();
10466 const pg_pool_t
*pool
= osdmap
->get_pg_pool(poolid
);
10467 if (pool
&& pool
->is_tier()) {
10468 const pg_pool_t
*base_pool
= osdmap
->get_pg_pool(pool
->tier_of
);
10469 if (base_pool
&& base_pool
->require_rollback()) {
10470 if ((iter
->op
.op
!= CEPH_OSD_OP_READ
) &&
10471 (iter
->op
.op
!= CEPH_OSD_OP_CHECKSUM
) &&
10472 (iter
->op
.op
!= CEPH_OSD_OP_CMPEXT
) &&
10473 (iter
->op
.op
!= CEPH_OSD_OP_STAT
) &&
10474 (iter
->op
.op
!= CEPH_OSD_OP_ISDIRTY
) &&
10475 (iter
->op
.op
!= CEPH_OSD_OP_UNDIRTY
) &&
10476 (iter
->op
.op
!= CEPH_OSD_OP_GETXATTR
) &&
10477 (iter
->op
.op
!= CEPH_OSD_OP_GETXATTRS
) &&
10478 (iter
->op
.op
!= CEPH_OSD_OP_CMPXATTR
) &&
10479 (iter
->op
.op
!= CEPH_OSD_OP_ASSERT_VER
) &&
10480 (iter
->op
.op
!= CEPH_OSD_OP_LIST_WATCHERS
) &&
10481 (iter
->op
.op
!= CEPH_OSD_OP_LIST_SNAPS
) &&
10482 (iter
->op
.op
!= CEPH_OSD_OP_SETALLOCHINT
) &&
10483 (iter
->op
.op
!= CEPH_OSD_OP_WRITEFULL
) &&
10484 (iter
->op
.op
!= CEPH_OSD_OP_ROLLBACK
) &&
10485 (iter
->op
.op
!= CEPH_OSD_OP_CREATE
) &&
10486 (iter
->op
.op
!= CEPH_OSD_OP_DELETE
) &&
10487 (iter
->op
.op
!= CEPH_OSD_OP_SETXATTR
) &&
10488 (iter
->op
.op
!= CEPH_OSD_OP_RMXATTR
) &&
10489 (iter
->op
.op
!= CEPH_OSD_OP_STARTSYNC
) &&
10490 (iter
->op
.op
!= CEPH_OSD_OP_COPY_GET
) &&
10491 (iter
->op
.op
!= CEPH_OSD_OP_COPY_FROM
)) {
10497 switch (iter
->op
.op
) {
10498 case CEPH_OSD_OP_CALL
:
10500 bufferlist::iterator bp
= const_cast<bufferlist
&>(iter
->indata
).begin();
10501 int is_write
, is_read
;
10502 string cname
, mname
;
10503 bp
.copy(iter
->op
.cls
.class_len
, cname
);
10504 bp
.copy(iter
->op
.cls
.method_len
, mname
);
10506 ClassHandler::ClassData
*cls
;
10507 int r
= class_handler
->open_class(cname
, &cls
);
10509 derr
<< "class " << cname
<< " open got " << cpp_strerror(r
) << dendl
;
10512 else if (r
!= -EPERM
) // propagate permission errors
10516 int flags
= cls
->get_method_flags(mname
.c_str());
10518 if (flags
== -ENOENT
)
10524 is_read
= flags
& CLS_METHOD_RD
;
10525 is_write
= flags
& CLS_METHOD_WR
;
10526 bool is_promote
= flags
& CLS_METHOD_PROMOTE
;
10528 dout(10) << "class " << cname
<< " method " << mname
<< " "
10529 << "flags=" << (is_read
? "r" : "")
10530 << (is_write
? "w" : "")
10531 << (is_promote
? "p" : "")
10534 op
->set_class_read();
10536 op
->set_class_write();
10539 op
->add_class(std::move(cname
), std::move(mname
), is_read
, is_write
,
10544 case CEPH_OSD_OP_WATCH
:
10545 // force the read bit for watch since it is depends on previous
10546 // watch state (and may return early if the watch exists) or, in
10547 // the case of ping, is simply a read op.
10550 case CEPH_OSD_OP_NOTIFY
:
10551 case CEPH_OSD_OP_NOTIFY_ACK
:
10557 case CEPH_OSD_OP_DELETE
:
10558 // if we get a delete with FAILOK we can skip handle cache. without
10559 // FAILOK we still need to promote (or do something smarter) to
10560 // determine whether to return ENOENT or 0.
10561 if (iter
== m
->ops
.begin() &&
10562 iter
->op
.flags
== CEPH_OSD_OP_FLAG_FAILOK
) {
10563 op
->set_skip_handle_cache();
10565 // skip promotion when proxying a delete op
10566 if (m
->ops
.size() == 1) {
10567 op
->set_skip_promote();
10571 case CEPH_OSD_OP_CACHE_TRY_FLUSH
:
10572 case CEPH_OSD_OP_CACHE_FLUSH
:
10573 case CEPH_OSD_OP_CACHE_EVICT
:
10574 // If try_flush/flush/evict is the only op, can skip handle cache.
10575 if (m
->ops
.size() == 1) {
10576 op
->set_skip_handle_cache();
10580 case CEPH_OSD_OP_READ
:
10581 case CEPH_OSD_OP_SYNC_READ
:
10582 case CEPH_OSD_OP_SPARSE_READ
:
10583 case CEPH_OSD_OP_CHECKSUM
:
10584 case CEPH_OSD_OP_WRITEFULL
:
10585 if (m
->ops
.size() == 1 &&
10586 (iter
->op
.flags
& CEPH_OSD_OP_FLAG_FADVISE_NOCACHE
||
10587 iter
->op
.flags
& CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
)) {
10588 op
->set_skip_promote();
10592 // force promotion when pin an object in cache tier
10593 case CEPH_OSD_OP_CACHE_PIN
:
10602 if (op
->rmw_flags
== 0)
10608 void OSD::set_perf_queries(
10609 const std::map
<OSDPerfMetricQuery
, OSDPerfMetricLimits
> &queries
) {
10610 dout(10) << "setting " << queries
.size() << " queries" << dendl
;
10612 std::list
<OSDPerfMetricQuery
> supported_queries
;
10613 for (auto &it
: queries
) {
10614 auto &query
= it
.first
;
10615 if (!query
.key_descriptor
.empty()) {
10616 supported_queries
.push_back(query
);
10619 if (supported_queries
.size() < queries
.size()) {
10620 dout(1) << queries
.size() - supported_queries
.size()
10621 << " unsupported queries" << dendl
;
10625 Mutex::Locker
locker(m_perf_queries_lock
);
10626 m_perf_queries
= supported_queries
;
10627 m_perf_limits
= queries
;
10630 std::vector
<PGRef
> pgs
;
10632 for (auto& pg
: pgs
) {
10634 pg
->set_dynamic_perf_stats_queries(supported_queries
);
10639 void OSD::get_perf_reports(
10640 std::map
<OSDPerfMetricQuery
, OSDPerfMetricReport
> *reports
) {
10641 std::vector
<PGRef
> pgs
;
10643 DynamicPerfStats dps
;
10644 for (auto& pg
: pgs
) {
10645 // m_perf_queries can be modified only in set_perf_queries by mgr client
10646 // request, and it is protected by by mgr client's lock, which is held
10647 // when set_perf_queries/get_perf_reports are called, so we may not hold
10648 // m_perf_queries_lock here.
10649 DynamicPerfStats
pg_dps(m_perf_queries
);
10651 pg
->get_dynamic_perf_stats(&pg_dps
);
10655 dps
.add_to_reports(m_perf_limits
, reports
);
10656 dout(20) << "reports for " << reports
->size() << " queries" << dendl
;
10659 // =============================================================
10661 #undef dout_context
10662 #define dout_context cct
10664 #define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
10666 void OSDShard::_attach_pg(OSDShardPGSlot
*slot
, PG
*pg
)
10668 dout(10) << pg
->pg_id
<< " " << pg
<< dendl
;
10670 pg
->osd_shard
= this;
10671 pg
->pg_slot
= slot
;
10672 osd
->inc_num_pgs();
10674 slot
->epoch
= pg
->get_osdmap_epoch();
10675 pg_slots_by_epoch
.insert(*slot
);
10678 void OSDShard::_detach_pg(OSDShardPGSlot
*slot
)
10680 dout(10) << slot
->pg
->pg_id
<< " " << slot
->pg
<< dendl
;
10681 slot
->pg
->osd_shard
= nullptr;
10682 slot
->pg
->pg_slot
= nullptr;
10683 slot
->pg
= nullptr;
10684 osd
->dec_num_pgs();
10686 pg_slots_by_epoch
.erase(pg_slots_by_epoch
.iterator_to(*slot
));
10688 if (waiting_for_min_pg_epoch
) {
10689 min_pg_epoch_cond
.notify_all();
10693 void OSDShard::update_pg_epoch(OSDShardPGSlot
*slot
, epoch_t e
)
10695 std::lock_guard
l(shard_lock
);
10696 dout(30) << "min was " << pg_slots_by_epoch
.begin()->epoch
10697 << " on " << pg_slots_by_epoch
.begin()->pg
->pg_id
<< dendl
;
10698 pg_slots_by_epoch
.erase(pg_slots_by_epoch
.iterator_to(*slot
));
10699 dout(20) << slot
->pg
->pg_id
<< " " << slot
->epoch
<< " -> " << e
<< dendl
;
10701 pg_slots_by_epoch
.insert(*slot
);
10702 dout(30) << "min is now " << pg_slots_by_epoch
.begin()->epoch
10703 << " on " << pg_slots_by_epoch
.begin()->pg
->pg_id
<< dendl
;
10704 if (waiting_for_min_pg_epoch
) {
10705 min_pg_epoch_cond
.notify_all();
10709 epoch_t
OSDShard::get_min_pg_epoch()
10711 std::lock_guard
l(shard_lock
);
10712 auto p
= pg_slots_by_epoch
.begin();
10713 if (p
== pg_slots_by_epoch
.end()) {
10719 void OSDShard::wait_min_pg_epoch(epoch_t need
)
10721 std::unique_lock l
{shard_lock
};
10722 ++waiting_for_min_pg_epoch
;
10723 min_pg_epoch_cond
.wait(l
, [need
, this] {
10724 if (pg_slots_by_epoch
.empty()) {
10726 } else if (pg_slots_by_epoch
.begin()->epoch
>= need
) {
10729 dout(10) << need
<< " waiting on "
10730 << pg_slots_by_epoch
.begin()->epoch
<< dendl
;
10734 --waiting_for_min_pg_epoch
;
10737 epoch_t
OSDShard::get_max_waiting_epoch()
10739 std::lock_guard
l(shard_lock
);
10741 for (auto& i
: pg_slots
) {
10742 if (!i
.second
->waiting_peering
.empty()) {
10743 r
= std::max(r
, i
.second
->waiting_peering
.rbegin()->first
);
10749 void OSDShard::consume_map(
10750 OSDMapRef
& new_osdmap
,
10751 unsigned *pushes_to_free
)
10753 std::lock_guard
l(shard_lock
);
10754 OSDMapRef old_osdmap
;
10756 std::lock_guard
l(osdmap_lock
);
10757 old_osdmap
= std::move(shard_osdmap
);
10758 shard_osdmap
= new_osdmap
;
10760 dout(10) << new_osdmap
->get_epoch()
10761 << " (was " << (old_osdmap
? old_osdmap
->get_epoch() : 0) << ")"
10763 bool queued
= false;
10766 auto p
= pg_slots
.begin();
10767 while (p
!= pg_slots
.end()) {
10768 OSDShardPGSlot
*slot
= p
->second
.get();
10769 const spg_t
& pgid
= p
->first
;
10770 dout(20) << __func__
<< " " << pgid
<< dendl
;
10771 if (!slot
->waiting_for_split
.empty()) {
10772 dout(20) << __func__
<< " " << pgid
10773 << " waiting for split " << slot
->waiting_for_split
<< dendl
;
10777 if (slot
->waiting_for_merge_epoch
> new_osdmap
->get_epoch()) {
10778 dout(20) << __func__
<< " " << pgid
10779 << " waiting for merge by epoch " << slot
->waiting_for_merge_epoch
10784 if (!slot
->waiting_peering
.empty()) {
10785 epoch_t first
= slot
->waiting_peering
.begin()->first
;
10786 if (first
<= new_osdmap
->get_epoch()) {
10787 dout(20) << __func__
<< " " << pgid
10788 << " pending_peering first epoch " << first
10789 << " <= " << new_osdmap
->get_epoch() << ", requeueing" << dendl
;
10790 _wake_pg_slot(pgid
, slot
);
10796 if (!slot
->waiting
.empty()) {
10797 if (new_osdmap
->is_up_acting_osd_shard(pgid
, osd
->get_nodeid())) {
10798 dout(20) << __func__
<< " " << pgid
<< " maps to us, keeping"
10803 while (!slot
->waiting
.empty() &&
10804 slot
->waiting
.front().get_map_epoch() <= new_osdmap
->get_epoch()) {
10805 auto& qi
= slot
->waiting
.front();
10806 dout(20) << __func__
<< " " << pgid
10807 << " waiting item " << qi
10808 << " epoch " << qi
.get_map_epoch()
10809 << " <= " << new_osdmap
->get_epoch()
10811 << (qi
.get_map_epoch() < new_osdmap
->get_epoch() ? "stale" :
10813 << ", dropping" << dendl
;
10814 *pushes_to_free
+= qi
.get_reserved_pushes();
10815 slot
->waiting
.pop_front();
10818 if (slot
->waiting
.empty() &&
10819 slot
->num_running
== 0 &&
10820 slot
->waiting_for_split
.empty() &&
10822 dout(20) << __func__
<< " " << pgid
<< " empty, pruning" << dendl
;
10823 p
= pg_slots
.erase(p
);
10830 std::lock_guard l
{sdata_wait_lock
};
10831 sdata_cond
.notify_one();
10835 void OSDShard::_wake_pg_slot(
10837 OSDShardPGSlot
*slot
)
10839 dout(20) << __func__
<< " " << pgid
10840 << " to_process " << slot
->to_process
10841 << " waiting " << slot
->waiting
10842 << " waiting_peering " << slot
->waiting_peering
<< dendl
;
10843 for (auto i
= slot
->to_process
.rbegin();
10844 i
!= slot
->to_process
.rend();
10846 _enqueue_front(std::move(*i
), osd
->op_prio_cutoff
);
10848 slot
->to_process
.clear();
10849 for (auto i
= slot
->waiting
.rbegin();
10850 i
!= slot
->waiting
.rend();
10852 _enqueue_front(std::move(*i
), osd
->op_prio_cutoff
);
10854 slot
->waiting
.clear();
10855 for (auto i
= slot
->waiting_peering
.rbegin();
10856 i
!= slot
->waiting_peering
.rend();
10858 // this is overkill; we requeue everything, even if some of these
10859 // items are waiting for maps we don't have yet. FIXME, maybe,
10860 // someday, if we decide this inefficiency matters
10861 for (auto j
= i
->second
.rbegin(); j
!= i
->second
.rend(); ++j
) {
10862 _enqueue_front(std::move(*j
), osd
->op_prio_cutoff
);
10865 slot
->waiting_peering
.clear();
10866 ++slot
->requeue_seq
;
10869 void OSDShard::identify_splits_and_merges(
10870 const OSDMapRef
& as_of_osdmap
,
10871 set
<pair
<spg_t
,epoch_t
>> *split_pgs
,
10872 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
10874 std::lock_guard
l(shard_lock
);
10875 if (shard_osdmap
) {
10876 for (auto& i
: pg_slots
) {
10877 const spg_t
& pgid
= i
.first
;
10878 auto *slot
= i
.second
.get();
10880 osd
->service
.identify_splits_and_merges(
10881 shard_osdmap
, as_of_osdmap
, pgid
,
10882 split_pgs
, merge_pgs
);
10883 } else if (!slot
->waiting_for_split
.empty()) {
10884 osd
->service
.identify_splits_and_merges(
10885 shard_osdmap
, as_of_osdmap
, pgid
,
10886 split_pgs
, nullptr);
10888 dout(20) << __func__
<< " slot " << pgid
10889 << " has no pg and waiting_for_split "
10890 << slot
->waiting_for_split
<< dendl
;
10896 void OSDShard::prime_splits(const OSDMapRef
& as_of_osdmap
,
10897 set
<pair
<spg_t
,epoch_t
>> *pgids
)
10899 std::lock_guard
l(shard_lock
);
10900 _prime_splits(pgids
);
10901 if (shard_osdmap
->get_epoch() > as_of_osdmap
->get_epoch()) {
10902 set
<pair
<spg_t
,epoch_t
>> newer_children
;
10903 for (auto i
: *pgids
) {
10904 osd
->service
.identify_splits_and_merges(
10905 as_of_osdmap
, shard_osdmap
, i
.first
,
10906 &newer_children
, nullptr);
10908 newer_children
.insert(pgids
->begin(), pgids
->end());
10909 dout(10) << "as_of_osdmap " << as_of_osdmap
->get_epoch() << " < shard "
10910 << shard_osdmap
->get_epoch() << ", new children " << newer_children
10912 _prime_splits(&newer_children
);
10913 // note: we don't care what is left over here for other shards.
10914 // if this shard is ahead of us and one isn't, e.g., one thread is
10915 // calling into prime_splits via _process (due to a newly created
10916 // pg) and this shard has a newer map due to a racing consume_map,
10917 // then any grandchildren left here will be identified (or were
10918 // identified) when the slower shard's osdmap is advanced.
10919 // _prime_splits() will tolerate the case where the pgid is
10924 void OSDShard::_prime_splits(set
<pair
<spg_t
,epoch_t
>> *pgids
)
10926 dout(10) << *pgids
<< dendl
;
10927 auto p
= pgids
->begin();
10928 while (p
!= pgids
->end()) {
10929 unsigned shard_index
= p
->first
.hash_to_shard(osd
->num_shards
);
10930 if (shard_index
== shard_id
) {
10931 auto r
= pg_slots
.emplace(p
->first
, nullptr);
10933 dout(10) << "priming slot " << p
->first
<< " e" << p
->second
<< dendl
;
10934 r
.first
->second
= make_unique
<OSDShardPGSlot
>();
10935 r
.first
->second
->waiting_for_split
.insert(p
->second
);
10938 ceph_assert(q
!= pg_slots
.end());
10939 dout(10) << "priming (existing) slot " << p
->first
<< " e" << p
->second
10941 q
->second
->waiting_for_split
.insert(p
->second
);
10943 p
= pgids
->erase(p
);
10950 void OSDShard::prime_merges(const OSDMapRef
& as_of_osdmap
,
10951 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
10953 std::lock_guard
l(shard_lock
);
10954 dout(20) << __func__
<< " checking shard " << shard_id
10955 << " for remaining merge pgs " << merge_pgs
<< dendl
;
10956 auto p
= merge_pgs
->begin();
10957 while (p
!= merge_pgs
->end()) {
10958 spg_t pgid
= p
->first
;
10959 epoch_t epoch
= p
->second
;
10960 unsigned shard_index
= pgid
.hash_to_shard(osd
->num_shards
);
10961 if (shard_index
!= shard_id
) {
10965 OSDShardPGSlot
*slot
;
10966 auto r
= pg_slots
.emplace(pgid
, nullptr);
10968 r
.first
->second
= make_unique
<OSDShardPGSlot
>();
10970 slot
= r
.first
->second
.get();
10973 dout(20) << __func__
<< " have merge participant pg " << pgid
10974 << " " << slot
->pg
<< dendl
;
10975 } else if (!slot
->waiting_for_split
.empty() &&
10976 *slot
->waiting_for_split
.begin() < epoch
) {
10977 dout(20) << __func__
<< " pending split on merge participant pg " << pgid
10978 << " " << slot
->waiting_for_split
<< dendl
;
10980 dout(20) << __func__
<< " creating empty merge participant " << pgid
10981 << " for merge in " << epoch
<< dendl
;
10982 // leave history zeroed; PG::merge_from() will fill it in.
10983 pg_history_t history
;
10984 PGCreateInfo
cinfo(pgid
, epoch
- 1,
10985 history
, PastIntervals(), false);
10986 PGRef pg
= osd
->handle_pg_create_info(shard_osdmap
, &cinfo
);
10987 _attach_pg(r
.first
->second
.get(), pg
.get());
10988 _wake_pg_slot(pgid
, slot
);
10991 // mark slot for merge
10992 dout(20) << __func__
<< " marking merge participant " << pgid
<< dendl
;
10993 slot
->waiting_for_merge_epoch
= epoch
;
10994 p
= merge_pgs
->erase(p
);
10998 void OSDShard::register_and_wake_split_child(PG
*pg
)
11002 std::lock_guard
l(shard_lock
);
11003 dout(10) << pg
->pg_id
<< " " << pg
<< dendl
;
11004 auto p
= pg_slots
.find(pg
->pg_id
);
11005 ceph_assert(p
!= pg_slots
.end());
11006 auto *slot
= p
->second
.get();
11007 dout(20) << pg
->pg_id
<< " waiting_for_split " << slot
->waiting_for_split
11009 ceph_assert(!slot
->pg
);
11010 ceph_assert(!slot
->waiting_for_split
.empty());
11011 _attach_pg(slot
, pg
);
11013 epoch
= pg
->get_osdmap_epoch();
11014 ceph_assert(slot
->waiting_for_split
.count(epoch
));
11015 slot
->waiting_for_split
.erase(epoch
);
11016 if (slot
->waiting_for_split
.empty()) {
11017 _wake_pg_slot(pg
->pg_id
, slot
);
11019 dout(10) << __func__
<< " still waiting for split on "
11020 << slot
->waiting_for_split
<< dendl
;
11024 // kick child to ensure it pulls up to the latest osdmap
11025 osd
->enqueue_peering_evt(
11028 std::make_shared
<PGPeeringEvent
>(
11033 std::lock_guard l
{sdata_wait_lock
};
11034 sdata_cond
.notify_one();
11037 void OSDShard::unprime_split_children(spg_t parent
, unsigned old_pg_num
)
11039 std::lock_guard
l(shard_lock
);
11040 vector
<spg_t
> to_delete
;
11041 for (auto& i
: pg_slots
) {
11042 if (i
.first
!= parent
&&
11043 i
.first
.get_ancestor(old_pg_num
) == parent
) {
11044 dout(10) << __func__
<< " parent " << parent
<< " clearing " << i
.first
11046 _wake_pg_slot(i
.first
, i
.second
.get());
11047 to_delete
.push_back(i
.first
);
11050 for (auto pgid
: to_delete
) {
11051 pg_slots
.erase(pgid
);
11056 // =============================================================
11058 #undef dout_context
11059 #define dout_context osd->cct
11061 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
11063 void OSD::ShardedOpWQ::_add_slot_waiter(
11065 OSDShardPGSlot
*slot
,
11068 if (qi
.is_peering()) {
11069 dout(20) << __func__
<< " " << pgid
11070 << " peering, item epoch is "
11071 << qi
.get_map_epoch()
11072 << ", will wait on " << qi
<< dendl
;
11073 slot
->waiting_peering
[qi
.get_map_epoch()].push_back(std::move(qi
));
11075 dout(20) << __func__
<< " " << pgid
11076 << " item epoch is "
11077 << qi
.get_map_epoch()
11078 << ", will wait on " << qi
<< dendl
;
11079 slot
->waiting
.push_back(std::move(qi
));
11084 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
11086 void OSD::ShardedOpWQ::_process(uint32_t thread_index
, heartbeat_handle_d
*hb
)
11088 uint32_t shard_index
= thread_index
% osd
->num_shards
;
11089 auto& sdata
= osd
->shards
[shard_index
];
11090 ceph_assert(sdata
);
11092 // If all threads of shards do oncommits, there is a out-of-order
11093 // problem. So we choose the thread which has the smallest
11094 // thread_index(thread_index < num_shards) of shard to do oncommit
11096 bool is_smallest_thread_index
= thread_index
< osd
->num_shards
;
11099 sdata
->shard_lock
.lock();
11100 if (sdata
->pqueue
->empty() &&
11101 (!is_smallest_thread_index
|| sdata
->context_queue
.empty())) {
11102 std::unique_lock wait_lock
{sdata
->sdata_wait_lock
};
11103 if (is_smallest_thread_index
&& !sdata
->context_queue
.empty()) {
11104 // we raced with a context_queue addition, don't wait
11105 wait_lock
.unlock();
11106 } else if (!sdata
->stop_waiting
) {
11107 dout(20) << __func__
<< " empty q, waiting" << dendl
;
11108 osd
->cct
->get_heartbeat_map()->clear_timeout(hb
);
11109 sdata
->shard_lock
.unlock();
11110 sdata
->sdata_cond
.wait(wait_lock
);
11111 wait_lock
.unlock();
11112 sdata
->shard_lock
.lock();
11113 if (sdata
->pqueue
->empty() &&
11114 !(is_smallest_thread_index
&& !sdata
->context_queue
.empty())) {
11115 sdata
->shard_lock
.unlock();
11118 osd
->cct
->get_heartbeat_map()->reset_timeout(hb
,
11119 osd
->cct
->_conf
->threadpool_default_timeout
, 0);
11121 dout(20) << __func__
<< " need return immediately" << dendl
;
11122 wait_lock
.unlock();
11123 sdata
->shard_lock
.unlock();
11128 list
<Context
*> oncommits
;
11129 if (is_smallest_thread_index
&& !sdata
->context_queue
.empty()) {
11130 sdata
->context_queue
.swap(oncommits
);
11133 if (sdata
->pqueue
->empty()) {
11134 if (osd
->is_stopping()) {
11135 sdata
->shard_lock
.unlock();
11136 for (auto c
: oncommits
) {
11137 dout(10) << __func__
<< " discarding in-flight oncommit " << c
<< dendl
;
11140 return; // OSD shutdown, discard.
11142 sdata
->shard_lock
.unlock();
11143 handle_oncommits(oncommits
);
11147 OpQueueItem item
= sdata
->pqueue
->dequeue();
11148 if (osd
->is_stopping()) {
11149 sdata
->shard_lock
.unlock();
11150 for (auto c
: oncommits
) {
11151 dout(10) << __func__
<< " discarding in-flight oncommit " << c
<< dendl
;
11154 return; // OSD shutdown, discard.
11157 const auto token
= item
.get_ordering_token();
11158 auto r
= sdata
->pg_slots
.emplace(token
, nullptr);
11160 r
.first
->second
= make_unique
<OSDShardPGSlot
>();
11162 OSDShardPGSlot
*slot
= r
.first
->second
.get();
11163 dout(20) << __func__
<< " " << token
11164 << (r
.second
? " (new)" : "")
11165 << " to_process " << slot
->to_process
11166 << " waiting " << slot
->waiting
11167 << " waiting_peering " << slot
->waiting_peering
11169 slot
->to_process
.push_back(std::move(item
));
11170 dout(20) << __func__
<< " " << slot
->to_process
.back()
11171 << " queued" << dendl
;
11174 PGRef pg
= slot
->pg
;
11176 // lock pg (if we have it)
11178 // note the requeue seq now...
11179 uint64_t requeue_seq
= slot
->requeue_seq
;
11180 ++slot
->num_running
;
11182 sdata
->shard_lock
.unlock();
11183 osd
->service
.maybe_inject_dispatch_delay();
11185 osd
->service
.maybe_inject_dispatch_delay();
11186 sdata
->shard_lock
.lock();
11188 auto q
= sdata
->pg_slots
.find(token
);
11189 if (q
== sdata
->pg_slots
.end()) {
11190 // this can happen if we race with pg removal.
11191 dout(20) << __func__
<< " slot " << token
<< " no longer there" << dendl
;
11193 sdata
->shard_lock
.unlock();
11194 handle_oncommits(oncommits
);
11197 slot
= q
->second
.get();
11198 --slot
->num_running
;
11200 if (slot
->to_process
.empty()) {
11201 // raced with _wake_pg_slot or consume_map
11202 dout(20) << __func__
<< " " << token
11203 << " nothing queued" << dendl
;
11205 sdata
->shard_lock
.unlock();
11206 handle_oncommits(oncommits
);
11209 if (requeue_seq
!= slot
->requeue_seq
) {
11210 dout(20) << __func__
<< " " << token
11211 << " requeue_seq " << slot
->requeue_seq
<< " > our "
11212 << requeue_seq
<< ", we raced with _wake_pg_slot"
11215 sdata
->shard_lock
.unlock();
11216 handle_oncommits(oncommits
);
11219 if (slot
->pg
!= pg
) {
11220 // this can happen if we race with pg removal.
11221 dout(20) << __func__
<< " slot " << token
<< " no longer attached to "
11228 dout(20) << __func__
<< " " << token
11229 << " to_process " << slot
->to_process
11230 << " waiting " << slot
->waiting
11231 << " waiting_peering " << slot
->waiting_peering
<< dendl
;
11233 ThreadPool::TPHandle
tp_handle(osd
->cct
, hb
, timeout_interval
,
11237 auto qi
= std::move(slot
->to_process
.front());
11238 slot
->to_process
.pop_front();
11239 dout(20) << __func__
<< " " << qi
<< " pg " << pg
<< dendl
;
11240 set
<pair
<spg_t
,epoch_t
>> new_children
;
11244 // should this pg shard exist on this osd in this (or a later) epoch?
11245 osdmap
= sdata
->shard_osdmap
;
11246 const PGCreateInfo
*create_info
= qi
.creates_pg();
11247 if (!slot
->waiting_for_split
.empty()) {
11248 dout(20) << __func__
<< " " << token
11249 << " splitting " << slot
->waiting_for_split
<< dendl
;
11250 _add_slot_waiter(token
, slot
, std::move(qi
));
11251 } else if (qi
.get_map_epoch() > osdmap
->get_epoch()) {
11252 dout(20) << __func__
<< " " << token
11253 << " map " << qi
.get_map_epoch() << " > "
11254 << osdmap
->get_epoch() << dendl
;
11255 _add_slot_waiter(token
, slot
, std::move(qi
));
11256 } else if (qi
.is_peering()) {
11257 if (!qi
.peering_requires_pg()) {
11258 // for pg-less events, we run them under the ordering lock, since
11259 // we don't have the pg lock to keep them ordered.
11260 qi
.run(osd
, sdata
, pg
, tp_handle
);
11261 } else if (osdmap
->is_up_acting_osd_shard(token
, osd
->whoami
)) {
11263 if (create_info
->by_mon
&&
11264 osdmap
->get_pg_acting_primary(token
.pgid
) != osd
->whoami
) {
11265 dout(20) << __func__
<< " " << token
11266 << " no pg, no longer primary, ignoring mon create on "
11269 dout(20) << __func__
<< " " << token
11270 << " no pg, should create on " << qi
<< dendl
;
11271 pg
= osd
->handle_pg_create_info(osdmap
, create_info
);
11273 // we created the pg! drop out and continue "normally"!
11274 sdata
->_attach_pg(slot
, pg
.get());
11275 sdata
->_wake_pg_slot(token
, slot
);
11277 // identify split children between create epoch and shard epoch.
11278 osd
->service
.identify_splits_and_merges(
11279 pg
->get_osdmap(), osdmap
, pg
->pg_id
, &new_children
, nullptr);
11280 sdata
->_prime_splits(&new_children
);
11281 // distribute remaining split children to other shards below!
11284 dout(20) << __func__
<< " ignored create on " << qi
<< dendl
;
11287 dout(20) << __func__
<< " " << token
11288 << " no pg, peering, !create, discarding " << qi
<< dendl
;
11291 dout(20) << __func__
<< " " << token
11292 << " no pg, peering, doesn't map here e" << osdmap
->get_epoch()
11293 << ", discarding " << qi
11296 } else if (osdmap
->is_up_acting_osd_shard(token
, osd
->whoami
)) {
11297 dout(20) << __func__
<< " " << token
11298 << " no pg, should exist e" << osdmap
->get_epoch()
11299 << ", will wait on " << qi
<< dendl
;
11300 _add_slot_waiter(token
, slot
, std::move(qi
));
11302 dout(20) << __func__
<< " " << token
11303 << " no pg, shouldn't exist e" << osdmap
->get_epoch()
11304 << ", dropping " << qi
<< dendl
;
11305 // share map with client?
11306 if (boost::optional
<OpRequestRef
> _op
= qi
.maybe_get_op()) {
11307 auto priv
= (*_op
)->get_req()->get_connection()->get_priv();
11308 if (auto session
= static_cast<Session
*>(priv
.get()); session
) {
11309 osd
->maybe_share_map(session
, *_op
, sdata
->shard_osdmap
);
11312 unsigned pushes_to_free
= qi
.get_reserved_pushes();
11313 if (pushes_to_free
> 0) {
11314 sdata
->shard_lock
.unlock();
11315 osd
->service
.release_reserved_pushes(pushes_to_free
);
11316 handle_oncommits(oncommits
);
11320 sdata
->shard_lock
.unlock();
11321 handle_oncommits(oncommits
);
11324 if (qi
.is_peering()) {
11325 OSDMapRef osdmap
= sdata
->shard_osdmap
;
11326 if (qi
.get_map_epoch() > osdmap
->get_epoch()) {
11327 _add_slot_waiter(token
, slot
, std::move(qi
));
11328 sdata
->shard_lock
.unlock();
11330 handle_oncommits(oncommits
);
11334 sdata
->shard_lock
.unlock();
11336 if (!new_children
.empty()) {
11337 for (auto shard
: osd
->shards
) {
11338 shard
->prime_splits(osdmap
, &new_children
);
11340 ceph_assert(new_children
.empty());
11343 // osd_opwq_process marks the point at which an operation has been dequeued
11344 // and will begin to be handled by a worker thread.
11348 if (boost::optional
<OpRequestRef
> _op
= qi
.maybe_get_op()) {
11349 reqid
= (*_op
)->get_reqid();
11352 tracepoint(osd
, opwq_process_start
, reqid
.name
._type
,
11353 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
11356 lgeneric_subdout(osd
->cct
, osd
, 30) << "dequeue status: ";
11357 Formatter
*f
= Formatter::create("json");
11358 f
->open_object_section("q");
11360 f
->close_section();
11365 qi
.run(osd
, sdata
, pg
, tp_handle
);
11370 if (boost::optional
<OpRequestRef
> _op
= qi
.maybe_get_op()) {
11371 reqid
= (*_op
)->get_reqid();
11374 tracepoint(osd
, opwq_process_finish
, reqid
.name
._type
,
11375 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
11378 handle_oncommits(oncommits
);
11381 void OSD::ShardedOpWQ::_enqueue(OpQueueItem
&& item
) {
11382 uint32_t shard_index
=
11383 item
.get_ordering_token().hash_to_shard(osd
->shards
.size());
11385 OSDShard
* sdata
= osd
->shards
[shard_index
];
11386 assert (NULL
!= sdata
);
11387 unsigned priority
= item
.get_priority();
11388 unsigned cost
= item
.get_cost();
11389 sdata
->shard_lock
.lock();
11391 dout(20) << __func__
<< " " << item
<< dendl
;
11392 if (priority
>= osd
->op_prio_cutoff
)
11393 sdata
->pqueue
->enqueue_strict(
11394 item
.get_owner(), priority
, std::move(item
));
11396 sdata
->pqueue
->enqueue(
11397 item
.get_owner(), priority
, cost
, std::move(item
));
11398 sdata
->shard_lock
.unlock();
11400 std::lock_guard l
{sdata
->sdata_wait_lock
};
11401 sdata
->sdata_cond
.notify_one();
// Requeue an item at the *front* of its shard's queue (used when an item
// must be retried and must run before anything queued after it).
void OSD::ShardedOpWQ::_enqueue_front(OpQueueItem&& item)
{
  auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);
  sdata->shard_lock.lock();
  auto p = sdata->pg_slots.find(item.get_ordering_token());
  if (p != sdata->pg_slots.end() &&
      !p->second->to_process.empty()) {
    // we may be racing with _process, which has dequeued a new item
    // from pqueue, put it on to_process, and is now busy taking the
    // pg lock. ensure this old requeued item is ordered before any
    // such newer item in to_process.
    //
    // Trick: stage the old item at the front of to_process, then pull the
    // slot's newest staged item off the back and push *that* onto the
    // pqueue front instead — net effect is the old item runs first.
    p->second->to_process.push_front(std::move(item));
    item = std::move(p->second->to_process.back());
    p->second->to_process.pop_back();
    dout(20) << __func__
	     << " " << p->second->to_process.front()
	     << " shuffled w/ " << item << dendl;
  } else {
    // No staged work for this token; just requeue the item itself.
    dout(20) << __func__ << " " << item << dendl;
  }
  sdata->_enqueue_front(std::move(item), osd->op_prio_cutoff);
  sdata->shard_lock.unlock();
  // Wake one worker thread waiting on this shard's condvar.
  std::lock_guard l{sdata->sdata_wait_lock};
  sdata->sdata_cond.notify_one();
}
11433 namespace osd_cmds
{
11435 int heap(CephContext
& cct
, const cmdmap_t
& cmdmap
, Formatter
& f
,
11438 if (!ceph_using_tcmalloc()) {
11439 os
<< "could not issue heap profiler command -- not using tcmalloc!";
11440 return -EOPNOTSUPP
;
11444 if (!cmd_getval(&cct
, cmdmap
, "heapcmd", cmd
)) {
11445 os
<< "unable to get value for command \"" << cmd
<< "\"";
11449 std::vector
<std::string
> cmd_vec
;
11450 get_str_vec(cmd
, cmd_vec
);
11453 if (cmd_getval(&cct
, cmdmap
, "value", val
)) {
11454 cmd_vec
.push_back(val
);
11457 ceph_heap_profiler_handle_command(cmd_vec
, os
);
11462 }} // namespace ceph::osd_cmds
11465 std::ostream
& operator<<(std::ostream
& out
, const io_queue
& q
) {
11467 case io_queue::prioritized
:
11468 out
<< "prioritized";
11470 case io_queue::weightedpriority
:
11471 out
<< "weightedpriority";
11473 case io_queue::mclock_opclass
:
11474 out
<< "mclock_opclass";
11476 case io_queue::mclock_client
:
11477 out
<< "mclock_client";