1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
27 #include <boost/scoped_ptr.hpp>
28 #include <boost/range/adaptor/reversed.hpp>
30 #ifdef HAVE_SYS_PARAM_H
31 #include <sys/param.h>
34 #ifdef HAVE_SYS_MOUNT_H
35 #include <sys/mount.h>
40 #include "include/types.h"
41 #include "include/compat.h"
42 #include "include/random.h"
47 #include "osdc/Objecter.h"
49 #include "common/errno.h"
50 #include "common/ceph_argparse.h"
51 #include "common/ceph_time.h"
52 #include "common/version.h"
53 #include "common/pick_address.h"
54 #include "common/blkdev.h"
55 #include "common/numa.h"
57 #include "os/ObjectStore.h"
59 #include "os/FuseStore.h"
62 #include "PrimaryLogPG.h"
64 #include "msg/Messenger.h"
65 #include "msg/Message.h"
67 #include "mon/MonClient.h"
69 #include "messages/MLog.h"
71 #include "messages/MGenericMessage.h"
72 #include "messages/MOSDPing.h"
73 #include "messages/MOSDFailure.h"
74 #include "messages/MOSDMarkMeDown.h"
75 #include "messages/MOSDFull.h"
76 #include "messages/MOSDOp.h"
77 #include "messages/MOSDOpReply.h"
78 #include "messages/MOSDBackoff.h"
79 #include "messages/MOSDBeacon.h"
80 #include "messages/MOSDRepOp.h"
81 #include "messages/MOSDRepOpReply.h"
82 #include "messages/MOSDBoot.h"
83 #include "messages/MOSDPGTemp.h"
84 #include "messages/MOSDPGReadyToMerge.h"
86 #include "messages/MOSDMap.h"
87 #include "messages/MMonGetOSDMap.h"
88 #include "messages/MOSDPGNotify.h"
89 #include "messages/MOSDPGQuery.h"
90 #include "messages/MOSDPGLog.h"
91 #include "messages/MOSDPGRemove.h"
92 #include "messages/MOSDPGInfo.h"
93 #include "messages/MOSDPGCreate.h"
94 #include "messages/MOSDPGCreate2.h"
95 #include "messages/MOSDPGTrim.h"
96 #include "messages/MOSDPGScan.h"
97 #include "messages/MBackfillReserve.h"
98 #include "messages/MRecoveryReserve.h"
99 #include "messages/MOSDForceRecovery.h"
100 #include "messages/MOSDECSubOpWrite.h"
101 #include "messages/MOSDECSubOpWriteReply.h"
102 #include "messages/MOSDECSubOpRead.h"
103 #include "messages/MOSDECSubOpReadReply.h"
104 #include "messages/MOSDPGCreated.h"
105 #include "messages/MOSDPGUpdateLogMissing.h"
106 #include "messages/MOSDPGUpdateLogMissingReply.h"
108 #include "messages/MOSDPeeringOp.h"
110 #include "messages/MOSDAlive.h"
112 #include "messages/MOSDScrub.h"
113 #include "messages/MOSDScrub2.h"
114 #include "messages/MOSDRepScrub.h"
116 #include "messages/MMonCommand.h"
117 #include "messages/MCommand.h"
118 #include "messages/MCommandReply.h"
120 #include "messages/MPGStats.h"
121 #include "messages/MPGStatsAck.h"
123 #include "messages/MWatchNotify.h"
124 #include "messages/MOSDPGPush.h"
125 #include "messages/MOSDPGPushReply.h"
126 #include "messages/MOSDPGPull.h"
128 #include "common/perf_counters.h"
129 #include "common/Timer.h"
130 #include "common/LogClient.h"
131 #include "common/AsyncReserver.h"
132 #include "common/HeartbeatMap.h"
133 #include "common/admin_socket.h"
134 #include "common/ceph_context.h"
136 #include "global/signal_handler.h"
137 #include "global/pidfile.h"
139 #include "include/color.h"
140 #include "perfglue/cpu_profiler.h"
141 #include "perfglue/heap_profiler.h"
143 #include "osd/OpRequest.h"
145 #include "auth/AuthAuthorizeHandler.h"
146 #include "auth/RotatingKeyRing.h"
148 #include "objclass/objclass.h"
150 #include "common/cmdparse.h"
151 #include "include/str_list.h"
152 #include "include/util.h"
154 #include "include/ceph_assert.h"
155 #include "common/config.h"
156 #include "common/EventTrace.h"
158 #include "json_spirit/json_spirit_reader.h"
159 #include "json_spirit/json_spirit_writer.h"
162 #define TRACEPOINT_DEFINE
163 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
164 #include "tracing/osd.h"
165 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
166 #undef TRACEPOINT_DEFINE
168 #define tracepoint(...)
171 #define dout_context cct
172 #define dout_subsys ceph_subsys_osd
174 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
177 static ostream
& _prefix(std::ostream
* _dout
, int whoami
, epoch_t epoch
) {
178 return *_dout
<< "osd." << whoami
<< " " << epoch
<< " ";
181 //Initial features in new superblock.
182 //Features here are also automatically upgraded
183 CompatSet
OSD::get_osd_initial_compat_set() {
184 CompatSet::FeatureSet ceph_osd_feature_compat
;
185 CompatSet::FeatureSet ceph_osd_feature_ro_compat
;
186 CompatSet::FeatureSet ceph_osd_feature_incompat
;
187 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE
);
188 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO
);
189 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC
);
190 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC
);
191 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES
);
192 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL
);
193 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO
);
194 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO
);
195 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG
);
196 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER
);
197 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS
);
198 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA
);
199 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING
);
200 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO
);
201 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES
);
202 return CompatSet(ceph_osd_feature_compat
, ceph_osd_feature_ro_compat
,
203 ceph_osd_feature_incompat
);
206 //Features are added here that this OSD supports.
207 CompatSet
OSD::get_osd_compat_set() {
208 CompatSet compat
= get_osd_initial_compat_set();
209 //Any features here can be set in code, but not in initial superblock
210 compat
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
);
214 OSDService::OSDService(OSD
*osd
) :
217 whoami(osd
->whoami
), store(osd
->store
),
218 log_client(osd
->log_client
), clog(osd
->clog
),
219 pg_recovery_stats(osd
->pg_recovery_stats
),
220 cluster_messenger(osd
->cluster_messenger
),
221 client_messenger(osd
->client_messenger
),
223 recoverystate_perf(osd
->recoverystate_perf
),
225 class_handler(osd
->class_handler
),
226 osd_max_object_size(cct
->_conf
, "osd_max_object_size"),
227 osd_skip_data_digest(cct
->_conf
, "osd_skip_data_digest"),
228 publish_lock
{ceph::make_mutex("OSDService::publish_lock")},
229 pre_publish_lock
{ceph::make_mutex("OSDService::pre_publish_lock")},
231 peer_map_epoch_lock("OSDService::peer_map_epoch_lock"),
232 sched_scrub_lock("OSDService::sched_scrub_lock"),
235 agent_lock("OSDService::agent_lock"),
236 agent_valid_iterator(false),
238 flush_mode_high_count(0),
241 agent_stop_flag(false),
242 agent_timer_lock("OSDService::agent_timer_lock"),
243 agent_timer(osd
->client_messenger
->cct
, agent_timer_lock
),
244 last_recalibrate(ceph_clock_now()),
245 promote_max_objects(0),
246 promote_max_bytes(0),
247 objecter(new Objecter(osd
->client_messenger
->cct
, osd
->objecter_messenger
, osd
->monc
, NULL
, 0, 0)),
248 m_objecter_finishers(cct
->_conf
->osd_objecter_finishers
),
249 watch_lock("OSDService::watch_lock"),
250 watch_timer(osd
->client_messenger
->cct
, watch_lock
),
252 recovery_request_lock("OSDService::recovery_request_lock"),
253 recovery_request_timer(cct
, recovery_request_lock
, false),
254 sleep_lock("OSDService::sleep_lock"),
255 sleep_timer(cct
, sleep_lock
, false),
256 reserver_finisher(cct
),
257 local_reserver(cct
, &reserver_finisher
, cct
->_conf
->osd_max_backfills
,
258 cct
->_conf
->osd_min_recovery_priority
),
259 remote_reserver(cct
, &reserver_finisher
, cct
->_conf
->osd_max_backfills
,
260 cct
->_conf
->osd_min_recovery_priority
),
261 pg_temp_lock("OSDService::pg_temp_lock"),
262 snap_reserver(cct
, &reserver_finisher
,
263 cct
->_conf
->osd_max_trimming_pgs
),
264 recovery_lock("OSDService::recovery_lock"),
265 recovery_ops_active(0),
266 recovery_ops_reserved(0),
267 recovery_paused(false),
268 map_cache_lock("OSDService::map_cache_lock"),
269 map_cache(cct
, cct
->_conf
->osd_map_cache_size
),
270 map_bl_cache(cct
->_conf
->osd_map_cache_size
),
271 map_bl_inc_cache(cct
->_conf
->osd_map_cache_size
),
272 stat_lock("OSDService::stat_lock"),
273 full_status_lock("OSDService::full_status_lock"),
275 cur_ratio(0), physical_ratio(0),
276 epoch_lock("OSDService::epoch_lock"),
277 boot_epoch(0), up_epoch(0), bind_epoch(0),
278 is_stopping_lock("OSDService::is_stopping_lock")
280 , pgid_lock("OSDService::pgid_lock")
285 for (int i
= 0; i
< m_objecter_finishers
; i
++) {
287 str
<< "objecter-finisher-" << i
;
288 Finisher
*fin
= new Finisher(osd
->client_messenger
->cct
, str
.str(), "finisher");
289 objecter_finishers
.push_back(fin
);
293 OSDService::~OSDService()
297 for (auto f
: objecter_finishers
) {
306 void OSDService::add_pgid(spg_t pgid
, PG
*pg
){
307 std::lock_guard
l(pgid_lock
);
308 if (!pgid_tracker
.count(pgid
)) {
311 pgid_tracker
[pgid
]++;
313 void OSDService::remove_pgid(spg_t pgid
, PG
*pg
)
315 std::lock_guard
l(pgid_lock
);
316 ceph_assert(pgid_tracker
.count(pgid
));
317 ceph_assert(pgid_tracker
[pgid
] > 0);
318 pgid_tracker
[pgid
]--;
319 if (pgid_tracker
[pgid
] == 0) {
320 pgid_tracker
.erase(pgid
);
321 live_pgs
.erase(pgid
);
324 void OSDService::dump_live_pgids()
326 std::lock_guard
l(pgid_lock
);
327 derr
<< "live pgids:" << dendl
;
328 for (map
<spg_t
, int>::const_iterator i
= pgid_tracker
.cbegin();
329 i
!= pgid_tracker
.cend();
331 derr
<< "\t" << *i
<< dendl
;
332 live_pgs
[i
->first
]->dump_live_ids();
339 void OSDService::identify_splits_and_merges(
343 set
<pair
<spg_t
,epoch_t
>> *split_children
,
344 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
346 if (!old_map
->have_pg_pool(pgid
.pool())) {
349 int old_pgnum
= old_map
->get_pg_num(pgid
.pool());
350 auto p
= osd
->pg_num_history
.pg_nums
.find(pgid
.pool());
351 if (p
== osd
->pg_num_history
.pg_nums
.end()) {
354 dout(20) << __func__
<< " " << pgid
<< " e" << old_map
->get_epoch()
355 << " to e" << new_map
->get_epoch()
356 << " pg_nums " << p
->second
<< dendl
;
358 queue
.push_back(pgid
);
360 while (!queue
.empty()) {
361 auto cur
= queue
.front();
364 unsigned pgnum
= old_pgnum
;
365 for (auto q
= p
->second
.lower_bound(old_map
->get_epoch());
366 q
!= p
->second
.end() &&
367 q
->first
<= new_map
->get_epoch();
369 if (pgnum
< q
->second
) {
371 if (cur
.ps() < pgnum
) {
373 if (cur
.is_split(pgnum
, q
->second
, &children
)) {
374 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
375 << " pg_num " << pgnum
<< " -> " << q
->second
376 << " children " << children
<< dendl
;
377 for (auto i
: children
) {
378 split_children
->insert(make_pair(i
, q
->first
));
383 } else if (cur
.ps() < q
->second
) {
384 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
385 << " pg_num " << pgnum
<< " -> " << q
->second
386 << " is a child" << dendl
;
387 // normally we'd capture this from the parent, but it's
388 // possible the parent doesn't exist yet (it will be
389 // fabricated to allow an intervening merge). note this PG
390 // as a split child here to be sure we catch it.
391 split_children
->insert(make_pair(cur
, q
->first
));
393 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
394 << " pg_num " << pgnum
<< " -> " << q
->second
395 << " is post-split, skipping" << dendl
;
397 } else if (merge_pgs
) {
399 if (cur
.ps() >= q
->second
) {
400 if (cur
.ps() < pgnum
) {
402 if (cur
.is_merge_source(pgnum
, q
->second
, &parent
)) {
404 parent
.is_split(q
->second
, pgnum
, &children
);
405 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
406 << " pg_num " << pgnum
<< " -> " << q
->second
407 << " is merge source, target " << parent
408 << ", source(s) " << children
<< dendl
;
409 merge_pgs
->insert(make_pair(parent
, q
->first
));
410 if (!did
.count(parent
)) {
411 // queue (and re-scan) parent in case it might not exist yet
412 // and there are some future splits pending on it
413 queue
.push_back(parent
);
415 for (auto c
: children
) {
416 merge_pgs
->insert(make_pair(c
, q
->first
));
422 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
423 << " pg_num " << pgnum
<< " -> " << q
->second
424 << " is beyond old pgnum, skipping" << dendl
;
428 if (cur
.is_split(q
->second
, pgnum
, &children
)) {
429 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
430 << " pg_num " << pgnum
<< " -> " << q
->second
431 << " is merge target, source " << children
<< dendl
;
432 for (auto c
: children
) {
433 merge_pgs
->insert(make_pair(c
, q
->first
));
437 merge_pgs
->insert(make_pair(cur
, q
->first
));
// Forward the "heartbeat peer set may have changed" hint to the owning OSD.
void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}
// First phase of shutdown: stop the timers that could otherwise fire and
// queue new work while the rest of shutdown proceeds.  Each timer is shut
// down under its own lock, in its own scope.
void OSDService::start_shutdown()
{
  {
    std::lock_guard l(agent_timer_lock);
    agent_timer.shutdown();
  }
  {
    std::lock_guard l(sleep_lock);
    sleep_timer.shutdown();
  }
  {
    std::lock_guard l(recovery_request_lock);
    recovery_request_timer.shutdown();
  }
}
// Drain any queued reserver callbacks, then stop the reserver finisher
// thread.
void OSDService::shutdown_reserver()
{
  reserver_finisher.wait_for_empty();
  reserver_finisher.stop();
}
475 void OSDService::shutdown()
478 std::lock_guard
l(watch_lock
);
479 watch_timer
.shutdown();
482 objecter
->shutdown();
483 for (auto f
: objecter_finishers
) {
488 publish_map(OSDMapRef());
489 next_osdmap
= OSDMapRef();
492 void OSDService::init()
494 reserver_finisher
.start();
495 for (auto f
: objecter_finishers
) {
498 objecter
->set_client_incarnation(0);
500 // deprioritize objecter in daemonperf output
501 objecter
->get_logger()->set_prio_adjust(-3);
506 agent_thread
.create("osd_srv_agent");
508 if (cct
->_conf
->osd_recovery_delay_start
)
509 defer_recovery(cct
->_conf
->osd_recovery_delay_start
);
// Last init step: start the objecter against the current OSDMap.
void OSDService::final_init()
{
  objecter->start(osdmap.get());
}
517 void OSDService::activate_map()
519 // wake/unwake the tiering agent
522 !osdmap
->test_flag(CEPH_OSDMAP_NOTIERAGENT
) &&
// Ask the OSD to subscribe for OSDMaps starting at epoch e.
// NOTE(review): second argument presumably means "don't force a
// resubscription" — confirm against OSD::osdmap_subscribe.
void OSDService::request_osdmap_update(epoch_t e)
{
  osd->osdmap_subscribe(e, false);
}
533 class AgentTimeoutCB
: public Context
{
536 explicit AgentTimeoutCB(PGRef _pg
) : pg(_pg
) {}
537 void finish(int) override
{
538 pg
->agent_choose_mode_restart();
542 void OSDService::agent_entry()
544 dout(10) << __func__
<< " start" << dendl
;
547 while (!agent_stop_flag
) {
548 if (agent_queue
.empty()) {
549 dout(20) << __func__
<< " empty queue" << dendl
;
550 agent_cond
.Wait(agent_lock
);
553 uint64_t level
= agent_queue
.rbegin()->first
;
554 set
<PGRef
>& top
= agent_queue
.rbegin()->second
;
556 << " tiers " << agent_queue
.size()
557 << ", top is " << level
558 << " with pgs " << top
.size()
559 << ", ops " << agent_ops
<< "/"
560 << cct
->_conf
->osd_agent_max_ops
561 << (agent_active
? " active" : " NOT ACTIVE")
563 dout(20) << __func__
<< " oids " << agent_oids
<< dendl
;
564 int max
= cct
->_conf
->osd_agent_max_ops
- agent_ops
;
565 int agent_flush_quota
= max
;
566 if (!flush_mode_high_count
)
567 agent_flush_quota
= cct
->_conf
->osd_agent_max_low_ops
- agent_ops
;
568 if (agent_flush_quota
<= 0 || top
.empty() || !agent_active
) {
569 agent_cond
.Wait(agent_lock
);
573 if (!agent_valid_iterator
|| agent_queue_pos
== top
.end()) {
574 agent_queue_pos
= top
.begin();
575 agent_valid_iterator
= true;
577 PGRef pg
= *agent_queue_pos
;
578 dout(10) << "high_count " << flush_mode_high_count
579 << " agent_ops " << agent_ops
580 << " flush_quota " << agent_flush_quota
<< dendl
;
582 if (!pg
->agent_work(max
, agent_flush_quota
)) {
583 dout(10) << __func__
<< " " << pg
->pg_id
584 << " no agent_work, delay for " << cct
->_conf
->osd_agent_delay_time
585 << " seconds" << dendl
;
587 osd
->logger
->inc(l_osd_tier_delay
);
588 // Queue a timer to call agent_choose_mode for this pg in 5 seconds
589 agent_timer_lock
.Lock();
590 Context
*cb
= new AgentTimeoutCB(pg
);
591 agent_timer
.add_event_after(cct
->_conf
->osd_agent_delay_time
, cb
);
592 agent_timer_lock
.Unlock();
597 dout(10) << __func__
<< " finish" << dendl
;
600 void OSDService::agent_stop()
603 std::lock_guard
l(agent_lock
);
605 // By this time all ops should be cancelled
606 ceph_assert(agent_ops
== 0);
607 // By this time all PGs are shutdown and dequeued
608 if (!agent_queue
.empty()) {
609 set
<PGRef
>& top
= agent_queue
.rbegin()->second
;
610 derr
<< "agent queue not empty, for example " << (*top
.begin())->get_pgid() << dendl
;
611 ceph_abort_msg("agent queue not empty");
614 agent_stop_flag
= true;
620 // -------------------------------------
622 void OSDService::promote_throttle_recalibrate()
624 utime_t now
= ceph_clock_now();
625 double dur
= now
- last_recalibrate
;
626 last_recalibrate
= now
;
627 unsigned prob
= promote_probability_millis
;
629 uint64_t target_obj_sec
= cct
->_conf
->osd_tier_promote_max_objects_sec
;
630 uint64_t target_bytes_sec
= cct
->_conf
->osd_tier_promote_max_bytes_sec
;
632 unsigned min_prob
= 1;
634 uint64_t attempts
, obj
, bytes
;
635 promote_counter
.sample_and_attenuate(&attempts
, &obj
, &bytes
);
636 dout(10) << __func__
<< " " << attempts
<< " attempts, promoted "
637 << obj
<< " objects and " << byte_u_t(bytes
) << "; target "
638 << target_obj_sec
<< " obj/sec or "
639 << byte_u_t(target_bytes_sec
) << "/sec"
642 // calculate what the probability *should* be, given the targets
644 if (attempts
&& dur
> 0) {
645 uint64_t avg_size
= 1;
647 avg_size
= std::max
<uint64_t>(bytes
/ obj
, 1);
648 unsigned po
= (double)target_obj_sec
* dur
* 1000.0 / (double)attempts
;
649 unsigned pb
= (double)target_bytes_sec
/ (double)avg_size
* dur
* 1000.0
651 dout(20) << __func__
<< " po " << po
<< " pb " << pb
<< " avg_size "
652 << avg_size
<< dendl
;
653 if (target_obj_sec
&& target_bytes_sec
)
654 new_prob
= std::min(po
, pb
);
655 else if (target_obj_sec
)
657 else if (target_bytes_sec
)
664 dout(20) << __func__
<< " new_prob " << new_prob
<< dendl
;
666 // correct for persistent skew between target rate and actual rate, adjust
669 if (attempts
&& obj
) {
670 actual
= obj
* 1000 / attempts
;
671 ratio
= (double)actual
/ (double)prob
;
672 new_prob
= (double)new_prob
/ ratio
;
674 new_prob
= std::max(new_prob
, min_prob
);
675 new_prob
= std::min(new_prob
, 1000u);
678 prob
= (prob
+ new_prob
) / 2;
679 prob
= std::max(prob
, min_prob
);
680 prob
= std::min(prob
, 1000u);
681 dout(10) << __func__
<< " actual " << actual
682 << ", actual/prob ratio " << ratio
683 << ", adjusted new_prob " << new_prob
684 << ", prob " << promote_probability_millis
<< " -> " << prob
686 promote_probability_millis
= prob
;
688 // set hard limits for this interval to mitigate stampedes
689 promote_max_objects
= target_obj_sec
* osd
->OSD_TICK_INTERVAL
* 2;
690 promote_max_bytes
= target_bytes_sec
* osd
->OSD_TICK_INTERVAL
* 2;
693 // -------------------------------------
695 float OSDService::get_failsafe_full_ratio()
697 float full_ratio
= cct
->_conf
->osd_failsafe_full_ratio
;
698 if (full_ratio
> 1.0) full_ratio
/= 100.0;
702 OSDService::s_names
OSDService::recalc_full_state(float ratio
, float pratio
, string
&inject
)
704 // The OSDMap ratios take precendence. So if the failsafe is .95 and
705 // the admin sets the cluster full to .96, the failsafe moves up to .96
706 // too. (Not that having failsafe == full is ideal, but it's better than
707 // dropping writes before the clusters appears full.)
708 OSDMapRef osdmap
= get_osdmap();
709 if (!osdmap
|| osdmap
->get_epoch() == 0) {
712 float nearfull_ratio
= osdmap
->get_nearfull_ratio();
713 float backfillfull_ratio
= std::max(osdmap
->get_backfillfull_ratio(), nearfull_ratio
);
714 float full_ratio
= std::max(osdmap
->get_full_ratio(), backfillfull_ratio
);
715 float failsafe_ratio
= std::max(get_failsafe_full_ratio(), full_ratio
);
717 if (osdmap
->require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
718 // use the failsafe for nearfull and full; the mon isn't using the
719 // flags anyway because we're mid-upgrade.
720 full_ratio
= failsafe_ratio
;
721 backfillfull_ratio
= failsafe_ratio
;
722 nearfull_ratio
= failsafe_ratio
;
723 } else if (full_ratio
<= 0 ||
724 backfillfull_ratio
<= 0 ||
725 nearfull_ratio
<= 0) {
726 derr
<< __func__
<< " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl
;
727 // use failsafe flag. ick. the monitor did something wrong or the user
728 // did something stupid.
729 full_ratio
= failsafe_ratio
;
730 backfillfull_ratio
= failsafe_ratio
;
731 nearfull_ratio
= failsafe_ratio
;
734 if (injectfull_state
> NONE
&& injectfull
) {
735 inject
= "(Injected)";
736 return injectfull_state
;
737 } else if (pratio
> failsafe_ratio
) {
739 } else if (ratio
> full_ratio
) {
741 } else if (ratio
> backfillfull_ratio
) {
743 } else if (pratio
> nearfull_ratio
) {
749 void OSDService::check_full_status(float ratio
, float pratio
)
751 std::lock_guard
l(full_status_lock
);
754 physical_ratio
= pratio
;
758 new_state
= recalc_full_state(ratio
, pratio
, inject
);
760 dout(20) << __func__
<< " cur ratio " << ratio
761 << ", physical ratio " << pratio
762 << ", new state " << get_full_state_name(new_state
)
767 if (cur_state
!= new_state
) {
768 dout(10) << __func__
<< " " << get_full_state_name(cur_state
)
769 << " -> " << get_full_state_name(new_state
) << dendl
;
770 if (new_state
== FAILSAFE
) {
771 clog
->error() << "full status failsafe engaged, dropping updates, now "
772 << (int)roundf(ratio
* 100) << "% full";
773 } else if (cur_state
== FAILSAFE
) {
774 clog
->error() << "full status failsafe disengaged, no longer dropping "
775 << "updates, now " << (int)roundf(ratio
* 100) << "% full";
777 cur_state
= new_state
;
781 bool OSDService::need_fullness_update()
783 OSDMapRef osdmap
= get_osdmap();
785 if (osdmap
->exists(whoami
)) {
786 if (osdmap
->get_state(whoami
) & CEPH_OSD_FULL
) {
788 } else if (osdmap
->get_state(whoami
) & CEPH_OSD_BACKFILLFULL
) {
790 } else if (osdmap
->get_state(whoami
) & CEPH_OSD_NEARFULL
) {
797 else if (is_backfillfull())
799 else if (is_nearfull())
804 bool OSDService::_check_inject_full(DoutPrefixProvider
*dpp
, s_names type
) const
806 if (injectfull
&& injectfull_state
>= type
) {
807 // injectfull is either a count of the number of times to return failsafe full
808 // or if -1 then always return full
811 ldpp_dout(dpp
, 10) << __func__
<< " Injected " << get_full_state_name(type
) << " OSD ("
812 << (injectfull
< 0 ? "set" : std::to_string(injectfull
)) << ")"
819 bool OSDService::_check_full(DoutPrefixProvider
*dpp
, s_names type
) const
821 std::lock_guard
l(full_status_lock
);
823 if (_check_inject_full(dpp
, type
))
826 if (cur_state
>= type
)
827 ldpp_dout(dpp
, 10) << __func__
<< " current usage is " << cur_ratio
828 << " physical " << physical_ratio
<< dendl
;
830 return cur_state
>= type
;
833 bool OSDService::_tentative_full(DoutPrefixProvider
*dpp
, s_names type
, uint64_t adjust_used
, osd_stat_t adjusted_stat
)
835 ldpp_dout(dpp
, 20) << __func__
<< " type " << get_full_state_name(type
) << " adjust_used " << (adjust_used
>> 10) << "KiB" << dendl
;
837 std::lock_guard
l(full_status_lock
);
838 if (_check_inject_full(dpp
, type
)) {
844 float ratio
= compute_adjusted_ratio(adjusted_stat
, &pratio
, adjust_used
);
847 s_names tentative_state
= recalc_full_state(ratio
, pratio
, notused
);
849 if (tentative_state
>= type
)
850 ldpp_dout(dpp
, 10) << __func__
<< " tentative usage is " << ratio
<< dendl
;
852 return tentative_state
>= type
;
// True when usage is at or beyond the FAILSAFE threshold (or injected).
bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FAILSAFE);
}
// True when usage is at or beyond the FULL threshold (or injected).
bool OSDService::check_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FULL);
}
// Would we cross the BACKFILLFULL threshold if adjust_used additional
// bytes were consumed on top of the given (projected) stats?
bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
{
  return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
}
// True when usage is at or beyond the BACKFILLFULL threshold (or injected).
bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, BACKFILLFULL);
}
// True when usage is at or beyond the NEARFULL threshold (or injected).
bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, NEARFULL);
}
// Cached full-state accessor: exactly FAILSAFE (the most severe state).
bool OSDService::is_failsafe_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state == FAILSAFE;
}
// Cached full-state accessor: FULL or worse (s_names orders severity).
bool OSDService::is_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= FULL;
}
// Cached full-state accessor: BACKFILLFULL or worse.
bool OSDService::is_backfillfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}
// Cached full-state accessor: NEARFULL or worse.
bool OSDService::is_nearfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= NEARFULL;
}
904 void OSDService::set_injectfull(s_names type
, int64_t count
)
906 std::lock_guard
l(full_status_lock
);
907 injectfull_state
= type
;
911 void OSDService::set_statfs(const struct store_statfs_t
&stbuf
,
912 osd_alert_list_t
& alerts
)
914 uint64_t bytes
= stbuf
.total
;
915 uint64_t avail
= stbuf
.available
;
916 uint64_t used
= stbuf
.get_used_raw();
918 // For testing fake statfs values so it doesn't matter if all
919 // OSDs are using the same partition.
920 if (cct
->_conf
->fake_statfs_for_testing
) {
921 uint64_t total_num_bytes
= 0;
925 total_num_bytes
+= p
->get_stats_num_bytes();
927 bytes
= cct
->_conf
->fake_statfs_for_testing
;
928 if (total_num_bytes
< bytes
)
929 avail
= bytes
- total_num_bytes
;
932 dout(0) << __func__
<< " fake total " << cct
->_conf
->fake_statfs_for_testing
933 << " adjust available " << avail
935 used
= bytes
- avail
;
938 osd
->logger
->set(l_osd_stat_bytes
, bytes
);
939 osd
->logger
->set(l_osd_stat_bytes_used
, used
);
940 osd
->logger
->set(l_osd_stat_bytes_avail
, avail
);
942 std::lock_guard
l(stat_lock
);
943 osd_stat
.statfs
= stbuf
;
944 osd_stat
.os_alerts
.clear();
945 osd_stat
.os_alerts
[whoami
].swap(alerts
);
946 if (cct
->_conf
->fake_statfs_for_testing
) {
947 osd_stat
.statfs
.total
= bytes
;
948 osd_stat
.statfs
.available
= avail
;
949 // For testing don't want used to go negative, so clear reserved
950 osd_stat
.statfs
.internally_reserved
= 0;
954 osd_stat_t
OSDService::set_osd_stat(vector
<int>& hb_peers
,
957 utime_t now
= ceph_clock_now();
958 auto stale_time
= g_conf().get_val
<int64_t>("osd_mon_heartbeat_stat_stale");
959 std::lock_guard
l(stat_lock
);
960 osd_stat
.hb_peers
.swap(hb_peers
);
961 osd
->op_tracker
.get_age_ms_histogram(&osd_stat
.op_queue_age_hist
);
962 osd_stat
.num_pgs
= num_pgs
;
963 // Clean entries that aren't updated
964 // This is called often enough that we can just remove 1 at a time
965 for (auto i
: osd_stat
.hb_pingtime
) {
966 if (i
.second
.last_update
== 0)
968 if (stale_time
&& now
.sec() - i
.second
.last_update
> stale_time
) {
969 dout(20) << __func__
<< " time out heartbeat for osd " << i
.first
970 << " last_update " << i
.second
.last_update
<< dendl
;
971 osd_stat
.hb_pingtime
.erase(i
.first
);
// Bump the repaired-shards counter in osd_stat, under stat_lock.
void OSDService::inc_osd_stat_repaired()
{
  std::lock_guard l(stat_lock);
  osd_stat.num_shards_repaired++;
}
985 float OSDService::compute_adjusted_ratio(osd_stat_t new_stat
, float *pratio
,
986 uint64_t adjust_used
)
989 ((float)new_stat
.statfs
.get_used()) / ((float)new_stat
.statfs
.total
);
992 dout(20) << __func__
<< " Before kb_used() " << new_stat
.statfs
.kb_used() << dendl
;
993 if (new_stat
.statfs
.available
> adjust_used
)
994 new_stat
.statfs
.available
-= adjust_used
;
996 new_stat
.statfs
.available
= 0;
997 dout(20) << __func__
<< " After kb_used() " << new_stat
.statfs
.kb_used() << dendl
;
1000 // Check all pgs and adjust kb_used to include all pending backfill data
1001 int backfill_adjusted
= 0;
1003 osd
->_get_pgs(&pgs
);
1004 for (auto p
: pgs
) {
1005 backfill_adjusted
+= p
->pg_stat_adjust(&new_stat
);
1007 if (backfill_adjusted
) {
1008 dout(20) << __func__
<< " backfill adjusted " << new_stat
<< dendl
;
1010 return ((float)new_stat
.statfs
.get_used()) / ((float)new_stat
.statfs
.total
);
1013 bool OSDService::check_osdmap_full(const set
<pg_shard_t
> &missing_on
)
1015 OSDMapRef osdmap
= get_osdmap();
1016 for (auto shard
: missing_on
) {
1017 if (osdmap
->get_state(shard
.osd
) & CEPH_OSD_FULL
)
1023 void OSDService::send_message_osd_cluster(int peer
, Message
*m
, epoch_t from_epoch
)
1025 OSDMapRef next_map
= get_nextmap_reserved();
1026 // service map is always newer/newest
1027 ceph_assert(from_epoch
<= next_map
->get_epoch());
1029 if (next_map
->is_down(peer
) ||
1030 next_map
->get_info(peer
).up_from
> from_epoch
) {
1032 release_map(next_map
);
1035 ConnectionRef peer_con
= osd
->cluster_messenger
->connect_to_osd(
1036 next_map
->get_cluster_addrs(peer
));
1037 share_map_peer(peer
, peer_con
.get(), next_map
);
1038 peer_con
->send_message(m
);
1039 release_map(next_map
);
1042 ConnectionRef
OSDService::get_con_osd_cluster(int peer
, epoch_t from_epoch
)
1044 OSDMapRef next_map
= get_nextmap_reserved();
1045 // service map is always newer/newest
1046 ceph_assert(from_epoch
<= next_map
->get_epoch());
1048 if (next_map
->is_down(peer
) ||
1049 next_map
->get_info(peer
).up_from
> from_epoch
) {
1050 release_map(next_map
);
1053 ConnectionRef con
= osd
->cluster_messenger
->connect_to_osd(
1054 next_map
->get_cluster_addrs(peer
));
1055 release_map(next_map
);
1059 pair
<ConnectionRef
,ConnectionRef
> OSDService::get_con_osd_hb(int peer
, epoch_t from_epoch
)
1061 OSDMapRef next_map
= get_nextmap_reserved();
1062 // service map is always newer/newest
1063 ceph_assert(from_epoch
<= next_map
->get_epoch());
1065 pair
<ConnectionRef
,ConnectionRef
> ret
;
1066 if (next_map
->is_down(peer
) ||
1067 next_map
->get_info(peer
).up_from
> from_epoch
) {
1068 release_map(next_map
);
1071 ret
.first
= osd
->hb_back_client_messenger
->connect_to_osd(
1072 next_map
->get_hb_back_addrs(peer
));
1073 ret
.second
= osd
->hb_front_client_messenger
->connect_to_osd(
1074 next_map
->get_hb_front_addrs(peer
));
1075 release_map(next_map
);
// Entity name this OSD uses on the cluster-internal messenger.
entity_name_t OSDService::get_cluster_msgr_name() const
{
  return cluster_messenger->get_myname();
}
1084 void OSDService::queue_want_pg_temp(pg_t pgid
,
1085 const vector
<int>& want
,
1088 std::lock_guard
l(pg_temp_lock
);
1089 auto p
= pg_temp_pending
.find(pgid
);
1090 if (p
== pg_temp_pending
.end() ||
1091 p
->second
.acting
!= want
||
1093 pg_temp_wanted
[pgid
] = {want
, forced
};
// Forget any pg_temp request for this PG — both the not-yet-sent
// (wanted) and already-sent (pending) entries.
void OSDService::remove_want_pg_temp(pg_t pgid)
{
  std::lock_guard l(pg_temp_lock);
  pg_temp_wanted.erase(pgid);
  pg_temp_pending.erase(pgid);
}
1104 void OSDService::_sent_pg_temp()
1106 #ifdef HAVE_STDLIB_MAP_SPLICING
1107 pg_temp_pending
.merge(pg_temp_wanted
);
1109 pg_temp_pending
.insert(make_move_iterator(begin(pg_temp_wanted
)),
1110 make_move_iterator(end(pg_temp_wanted
)));
1112 pg_temp_wanted
.clear();
1115 void OSDService::requeue_pg_temp()
1117 std::lock_guard
l(pg_temp_lock
);
1118 // wanted overrides pending. note that remove_want_pg_temp
1119 // clears the item out of both.
1120 unsigned old_wanted
= pg_temp_wanted
.size();
1121 unsigned old_pending
= pg_temp_pending
.size();
1123 pg_temp_wanted
.swap(pg_temp_pending
);
1124 dout(10) << __func__
<< " " << old_wanted
<< " + " << old_pending
<< " -> "
1125 << pg_temp_wanted
.size() << dendl
;
1128 std::ostream
& operator<<(std::ostream
& out
,
1129 const OSDService::pg_temp_t
& pg_temp
)
1131 out
<< pg_temp
.acting
;
1132 if (pg_temp
.forced
) {
1138 void OSDService::send_pg_temp()
1140 std::lock_guard
l(pg_temp_lock
);
1141 if (pg_temp_wanted
.empty())
1143 dout(10) << "send_pg_temp " << pg_temp_wanted
<< dendl
;
1144 MOSDPGTemp
*ms
[2] = {nullptr, nullptr};
1145 for (auto& [pgid
, pg_temp
] : pg_temp_wanted
) {
1146 auto& m
= ms
[pg_temp
.forced
];
1148 m
= new MOSDPGTemp(osdmap
->get_epoch());
1149 m
->forced
= pg_temp
.forced
;
1151 m
->pg_temp
.emplace(pgid
, pg_temp
.acting
);
1155 monc
->send_mon_message(m
);
1161 void OSDService::send_pg_created(pg_t pgid
)
1163 std::lock_guard
l(pg_created_lock
);
1164 dout(20) << __func__
<< dendl
;
1165 auto o
= get_osdmap();
1166 if (o
->require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
1167 pg_created
.insert(pgid
);
1168 monc
->send_mon_message(new MOSDPGCreated(pgid
));
1172 void OSDService::send_pg_created()
1174 std::lock_guard
l(pg_created_lock
);
1175 dout(20) << __func__
<< dendl
;
1176 auto o
= get_osdmap();
1177 if (o
->require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
1178 for (auto pgid
: pg_created
) {
1179 monc
->send_mon_message(new MOSDPGCreated(pgid
));
1184 void OSDService::prune_pg_created()
1186 std::lock_guard
l(pg_created_lock
);
1187 dout(20) << __func__
<< dendl
;
1188 auto o
= get_osdmap();
1189 auto i
= pg_created
.begin();
1190 while (i
!= pg_created
.end()) {
1191 auto p
= o
->get_pg_pool(i
->pool());
1192 if (!p
|| !p
->has_flag(pg_pool_t::FLAG_CREATING
)) {
1193 dout(20) << __func__
<< " pruning " << *i
<< dendl
;
1194 i
= pg_created
.erase(i
);
1196 dout(20) << __func__
<< " keeping " << *i
<< dendl
;
1203 // --------------------------------------
1206 epoch_t
OSDService::get_peer_epoch(int peer
)
1208 std::lock_guard
l(peer_map_epoch_lock
);
1209 map
<int,epoch_t
>::iterator p
= peer_map_epoch
.find(peer
);
1210 if (p
== peer_map_epoch
.end())
1215 epoch_t
OSDService::note_peer_epoch(int peer
, epoch_t e
)
1217 std::lock_guard
l(peer_map_epoch_lock
);
1218 map
<int,epoch_t
>::iterator p
= peer_map_epoch
.find(peer
);
1219 if (p
!= peer_map_epoch
.end()) {
1220 if (p
->second
< e
) {
1221 dout(10) << "note_peer_epoch osd." << peer
<< " has " << e
<< dendl
;
1224 dout(30) << "note_peer_epoch osd." << peer
<< " has " << p
->second
<< " >= " << e
<< dendl
;
1228 dout(10) << "note_peer_epoch osd." << peer
<< " now has " << e
<< dendl
;
1229 peer_map_epoch
[peer
] = e
;
1234 void OSDService::forget_peer_epoch(int peer
, epoch_t as_of
)
1236 std::lock_guard
l(peer_map_epoch_lock
);
1237 map
<int,epoch_t
>::iterator p
= peer_map_epoch
.find(peer
);
1238 if (p
!= peer_map_epoch
.end()) {
1239 if (p
->second
<= as_of
) {
1240 dout(10) << "forget_peer_epoch osd." << peer
<< " as_of " << as_of
1241 << " had " << p
->second
<< dendl
;
1242 peer_map_epoch
.erase(p
);
1244 dout(10) << "forget_peer_epoch osd." << peer
<< " as_of " << as_of
1245 << " has " << p
->second
<< " - not forgetting" << dendl
;
1250 bool OSDService::should_share_map(entity_name_t name
, Connection
*con
,
1251 epoch_t epoch
, const OSDMapRef
& osdmap
,
1252 const epoch_t
*sent_epoch_p
)
1254 dout(20) << "should_share_map "
1255 << name
<< " " << con
->get_peer_addr()
1256 << " " << epoch
<< dendl
;
1258 // does client have old map?
1259 if (name
.is_client()) {
1260 bool message_sendmap
= epoch
< osdmap
->get_epoch();
1261 if (message_sendmap
&& sent_epoch_p
) {
1262 dout(20) << "client session last_sent_epoch: "
1264 << " versus osdmap epoch " << osdmap
->get_epoch() << dendl
;
1265 if (*sent_epoch_p
< osdmap
->get_epoch()) {
1267 } // else we don't need to send it out again
1271 if (con
->get_messenger() == osd
->cluster_messenger
&&
1272 con
!= osd
->cluster_messenger
->get_loopback_connection() &&
1273 osdmap
->is_up(name
.num()) &&
1274 (osdmap
->get_cluster_addrs(name
.num()) == con
->get_peer_addrs() ||
1275 osdmap
->get_hb_back_addrs(name
.num()) == con
->get_peer_addrs())) {
1277 epoch_t has
= std::max(get_peer_epoch(name
.num()), epoch
);
1280 if (has
< osdmap
->get_epoch()) {
1281 dout(10) << name
<< " " << con
->get_peer_addr()
1282 << " has old map " << epoch
<< " < "
1283 << osdmap
->get_epoch() << dendl
;
1291 void OSDService::share_map(
1296 epoch_t
*sent_epoch_p
)
1298 dout(20) << "share_map "
1299 << name
<< " " << con
->get_peer_addr()
1300 << " " << epoch
<< dendl
;
1302 if (!osd
->is_active()) {
1303 /*It is safe not to proceed as OSD is not in healthy state*/
1307 bool want_shared
= should_share_map(name
, con
, epoch
,
1308 osdmap
, sent_epoch_p
);
1311 if (name
.is_client()) {
1312 dout(10) << name
<< " has old map " << epoch
1313 << " < " << osdmap
->get_epoch() << dendl
;
1314 // we know the Session is valid or we wouldn't be sending
1316 *sent_epoch_p
= osdmap
->get_epoch();
1318 send_incremental_map(epoch
, con
, osdmap
);
1319 } else if (con
->get_messenger() == osd
->cluster_messenger
&&
1320 osdmap
->is_up(name
.num()) &&
1321 (osdmap
->get_cluster_addrs(name
.num()) == con
->get_peer_addrs() ||
1322 osdmap
->get_hb_back_addrs(name
.num()) == con
->get_peer_addrs())) {
1323 dout(10) << name
<< " " << con
->get_peer_addrs()
1324 << " has old map " << epoch
<< " < "
1325 << osdmap
->get_epoch() << dendl
;
1326 note_peer_epoch(name
.num(), osdmap
->get_epoch());
1327 send_incremental_map(epoch
, con
, osdmap
);
1332 void OSDService::share_map_peer(int peer
, Connection
*con
, OSDMapRef map
)
1338 epoch_t pe
= get_peer_epoch(peer
);
1340 if (pe
< map
->get_epoch()) {
1341 send_incremental_map(pe
, con
, map
);
1342 note_peer_epoch(peer
, map
->get_epoch());
1344 dout(20) << "share_map_peer " << con
<< " already has epoch " << pe
<< dendl
;
1346 dout(20) << "share_map_peer " << con
<< " don't know epoch, doing nothing" << dendl
;
1347 // no idea about peer's epoch.
1348 // ??? send recent ???
1353 bool OSDService::can_inc_scrubs()
1355 bool can_inc
= false;
1356 std::lock_guard
l(sched_scrub_lock
);
1358 if (scrubs_local
+ scrubs_remote
< cct
->_conf
->osd_max_scrubs
) {
1359 dout(20) << __func__
<< " == true " << scrubs_local
<< " local + " << scrubs_remote
1360 << " remote < max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1363 dout(20) << __func__
<< " == false " << scrubs_local
<< " local + " << scrubs_remote
1364 << " remote >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1370 bool OSDService::inc_scrubs_local()
1372 bool result
= false;
1373 std::lock_guard l
{sched_scrub_lock
};
1374 if (scrubs_local
+ scrubs_remote
< cct
->_conf
->osd_max_scrubs
) {
1375 dout(20) << __func__
<< " " << scrubs_local
<< " -> " << (scrubs_local
+1)
1376 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", remote " << scrubs_remote
<< ")" << dendl
;
1380 dout(20) << __func__
<< " " << scrubs_local
<< " local + " << scrubs_remote
<< " remote >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1385 void OSDService::dec_scrubs_local()
1387 std::lock_guard l
{sched_scrub_lock
};
1388 dout(20) << __func__
<< " " << scrubs_local
<< " -> " << (scrubs_local
-1)
1389 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", remote " << scrubs_remote
<< ")" << dendl
;
1391 ceph_assert(scrubs_local
>= 0);
1394 bool OSDService::inc_scrubs_remote()
1396 bool result
= false;
1397 std::lock_guard l
{sched_scrub_lock
};
1398 if (scrubs_local
+ scrubs_remote
< cct
->_conf
->osd_max_scrubs
) {
1399 dout(20) << __func__
<< " " << scrubs_remote
<< " -> " << (scrubs_remote
+1)
1400 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", local " << scrubs_local
<< ")" << dendl
;
1404 dout(20) << __func__
<< " " << scrubs_local
<< " local + " << scrubs_remote
<< " remote >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1409 void OSDService::dec_scrubs_remote()
1411 std::lock_guard l
{sched_scrub_lock
};
1412 dout(20) << __func__
<< " " << scrubs_remote
<< " -> " << (scrubs_remote
-1)
1413 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", local " << scrubs_local
<< ")" << dendl
;
1415 ceph_assert(scrubs_remote
>= 0);
1418 void OSDService::dump_scrub_reservations(Formatter
*f
)
1420 std::lock_guard l
{sched_scrub_lock
};
1421 f
->dump_int("scrubs_local", scrubs_local
);
1422 f
->dump_int("scrubs_remote", scrubs_remote
);
1423 f
->dump_int("osd_max_scrubs", cct
->_conf
->osd_max_scrubs
);
1426 void OSDService::retrieve_epochs(epoch_t
*_boot_epoch
, epoch_t
*_up_epoch
,
1427 epoch_t
*_bind_epoch
) const
1429 std::lock_guard
l(epoch_lock
);
1431 *_boot_epoch
= boot_epoch
;
1433 *_up_epoch
= up_epoch
;
1435 *_bind_epoch
= bind_epoch
;
1438 void OSDService::set_epochs(const epoch_t
*_boot_epoch
, const epoch_t
*_up_epoch
,
1439 const epoch_t
*_bind_epoch
)
1441 std::lock_guard
l(epoch_lock
);
1443 ceph_assert(*_boot_epoch
== 0 || *_boot_epoch
>= boot_epoch
);
1444 boot_epoch
= *_boot_epoch
;
1447 ceph_assert(*_up_epoch
== 0 || *_up_epoch
>= up_epoch
);
1448 up_epoch
= *_up_epoch
;
1451 ceph_assert(*_bind_epoch
== 0 || *_bind_epoch
>= bind_epoch
);
1452 bind_epoch
= *_bind_epoch
;
1456 bool OSDService::prepare_to_stop()
1458 std::lock_guard
l(is_stopping_lock
);
1459 if (get_state() != NOT_STOPPING
)
1462 OSDMapRef osdmap
= get_osdmap();
1463 if (osdmap
&& osdmap
->is_up(whoami
)) {
1464 dout(0) << __func__
<< " telling mon we are shutting down" << dendl
;
1465 set_state(PREPARING_TO_STOP
);
1466 monc
->send_mon_message(
1470 osdmap
->get_addrs(whoami
),
1471 osdmap
->get_epoch(),
1474 utime_t now
= ceph_clock_now();
1476 timeout
.set_from_double(now
+ cct
->_conf
->osd_mon_shutdown_timeout
);
1477 while ((ceph_clock_now() < timeout
) &&
1478 (get_state() != STOPPING
)) {
1479 is_stopping_cond
.WaitUntil(is_stopping_lock
, timeout
);
1482 dout(0) << __func__
<< " starting shutdown" << dendl
;
1483 set_state(STOPPING
);
1487 void OSDService::got_stop_ack()
1489 std::lock_guard
l(is_stopping_lock
);
1490 if (get_state() == PREPARING_TO_STOP
) {
1491 dout(0) << __func__
<< " starting shutdown" << dendl
;
1492 set_state(STOPPING
);
1493 is_stopping_cond
.Signal();
1495 dout(10) << __func__
<< " ignoring msg" << dendl
;
1499 MOSDMap
*OSDService::build_incremental_map_msg(epoch_t since
, epoch_t to
,
1500 OSDSuperblock
& sblock
)
1502 MOSDMap
*m
= new MOSDMap(monc
->get_fsid(),
1503 osdmap
->get_encoding_features());
1504 m
->oldest_map
= max_oldest_map
;
1505 m
->newest_map
= sblock
.newest_map
;
1507 int max
= cct
->_conf
->osd_map_message_max
;
1508 ssize_t max_bytes
= cct
->_conf
->osd_map_message_max_bytes
;
1510 if (since
< m
->oldest_map
) {
1511 // we don't have the next map the target wants, so start with a
1514 dout(10) << __func__
<< " oldest map " << max_oldest_map
<< " > since "
1515 << since
<< ", starting with full map" << dendl
;
1516 since
= m
->oldest_map
;
1517 if (!get_map_bl(since
, bl
)) {
1518 derr
<< __func__
<< " missing full map " << since
<< dendl
;
1522 max_bytes
-= bl
.length();
1523 m
->maps
[since
].claim(bl
);
1525 for (epoch_t e
= since
+ 1; e
<= to
; ++e
) {
1527 if (get_inc_map_bl(e
, bl
)) {
1528 m
->incremental_maps
[e
].claim(bl
);
1530 derr
<< __func__
<< " missing incremental map " << e
<< dendl
;
1531 if (!get_map_bl(e
, bl
)) {
1532 derr
<< __func__
<< " also missing full map " << e
<< dendl
;
1535 m
->maps
[e
].claim(bl
);
1538 max_bytes
-= bl
.length();
1539 if (max
<= 0 || max_bytes
<= 0) {
1546 if (!m
->maps
.empty() ||
1547 !m
->incremental_maps
.empty()) {
1548 // send what we have so far
1553 if (get_inc_map_bl(m
->newest_map
, bl
)) {
1554 m
->incremental_maps
[m
->newest_map
].claim(bl
);
1556 derr
<< __func__
<< " unable to load latest map " << m
->newest_map
<< dendl
;
1557 if (!get_map_bl(m
->newest_map
, bl
)) {
1558 derr
<< __func__
<< " unable to load latest full map " << m
->newest_map
1562 m
->maps
[m
->newest_map
].claim(bl
);
1567 void OSDService::send_map(MOSDMap
*m
, Connection
*con
)
1569 con
->send_message(m
);
1572 void OSDService::send_incremental_map(epoch_t since
, Connection
*con
,
1575 epoch_t to
= osdmap
->get_epoch();
1576 dout(10) << "send_incremental_map " << since
<< " -> " << to
1577 << " to " << con
<< " " << con
->get_peer_addr() << dendl
;
1581 OSDSuperblock
sblock(get_superblock());
1582 if (since
< sblock
.oldest_map
) {
1583 // just send latest full map
1584 MOSDMap
*m
= new MOSDMap(monc
->get_fsid(),
1585 osdmap
->get_encoding_features());
1586 m
->oldest_map
= max_oldest_map
;
1587 m
->newest_map
= sblock
.newest_map
;
1588 get_map_bl(to
, m
->maps
[to
]);
1593 if (to
> since
&& (int64_t)(to
- since
) > cct
->_conf
->osd_map_share_max_epochs
) {
1594 dout(10) << " " << (to
- since
) << " > max " << cct
->_conf
->osd_map_share_max_epochs
1595 << ", only sending most recent" << dendl
;
1596 since
= to
- cct
->_conf
->osd_map_share_max_epochs
;
1599 m
= build_incremental_map_msg(since
, to
, sblock
);
1604 bool OSDService::_get_map_bl(epoch_t e
, bufferlist
& bl
)
1606 bool found
= map_bl_cache
.lookup(e
, &bl
);
1609 logger
->inc(l_osd_map_bl_cache_hit
);
1613 logger
->inc(l_osd_map_bl_cache_miss
);
1614 found
= store
->read(meta_ch
,
1615 OSD::get_osdmap_pobject_name(e
), 0, 0, bl
,
1616 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
) >= 0;
1623 bool OSDService::get_inc_map_bl(epoch_t e
, bufferlist
& bl
)
1625 std::lock_guard
l(map_cache_lock
);
1626 bool found
= map_bl_inc_cache
.lookup(e
, &bl
);
1629 logger
->inc(l_osd_map_bl_cache_hit
);
1633 logger
->inc(l_osd_map_bl_cache_miss
);
1634 found
= store
->read(meta_ch
,
1635 OSD::get_inc_osdmap_pobject_name(e
), 0, 0, bl
,
1636 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
) >= 0;
1638 _add_map_inc_bl(e
, bl
);
1643 void OSDService::_add_map_bl(epoch_t e
, bufferlist
& bl
)
1645 dout(10) << "add_map_bl " << e
<< " " << bl
.length() << " bytes" << dendl
;
1646 // cache a contiguous buffer
1647 if (bl
.get_num_buffers() > 1) {
1650 bl
.try_assign_to_mempool(mempool::mempool_osd_mapbl
);
1651 map_bl_cache
.add(e
, bl
);
1654 void OSDService::_add_map_inc_bl(epoch_t e
, bufferlist
& bl
)
1656 dout(10) << "add_map_inc_bl " << e
<< " " << bl
.length() << " bytes" << dendl
;
1657 // cache a contiguous buffer
1658 if (bl
.get_num_buffers() > 1) {
1661 bl
.try_assign_to_mempool(mempool::mempool_osd_mapbl
);
1662 map_bl_inc_cache
.add(e
, bl
);
1665 int OSDService::get_deleted_pool_pg_num(int64_t pool
)
1667 std::lock_guard
l(map_cache_lock
);
1668 auto p
= deleted_pool_pg_nums
.find(pool
);
1669 if (p
!= deleted_pool_pg_nums
.end()) {
1672 dout(20) << __func__
<< " " << pool
<< " loading" << dendl
;
1673 ghobject_t oid
= OSD::make_final_pool_info_oid(pool
);
1675 int r
= store
->read(meta_ch
, oid
, 0, 0, bl
);
1676 ceph_assert(r
>= 0);
1677 auto blp
= bl
.cbegin();
1680 deleted_pool_pg_nums
[pool
] = pi
.get_pg_num();
1681 dout(20) << __func__
<< " " << pool
<< " got " << pi
.get_pg_num() << dendl
;
1682 return pi
.get_pg_num();
1685 OSDMapRef
OSDService::_add_map(OSDMap
*o
)
1687 epoch_t e
= o
->get_epoch();
1689 if (cct
->_conf
->osd_map_dedup
) {
1690 // Dedup against an existing map at a nearby epoch
1691 OSDMapRef for_dedup
= map_cache
.lower_bound(e
);
1693 OSDMap::dedup(for_dedup
.get(), o
);
1697 OSDMapRef l
= map_cache
.add(e
, o
, &existed
);
1704 OSDMapRef
OSDService::try_get_map(epoch_t epoch
)
1706 std::lock_guard
l(map_cache_lock
);
1707 OSDMapRef retval
= map_cache
.lookup(epoch
);
1709 dout(30) << "get_map " << epoch
<< " -cached" << dendl
;
1711 logger
->inc(l_osd_map_cache_hit
);
1716 logger
->inc(l_osd_map_cache_miss
);
1717 epoch_t lb
= map_cache
.cached_key_lower_bound();
1719 dout(30) << "get_map " << epoch
<< " - miss, below lower bound" << dendl
;
1720 logger
->inc(l_osd_map_cache_miss_low
);
1721 logger
->inc(l_osd_map_cache_miss_low_avg
, lb
- epoch
);
1725 OSDMap
*map
= new OSDMap
;
1727 dout(20) << "get_map " << epoch
<< " - loading and decoding " << map
<< dendl
;
1729 if (!_get_map_bl(epoch
, bl
) || bl
.length() == 0) {
1730 derr
<< "failed to load OSD map for epoch " << epoch
<< ", got " << bl
.length() << " bytes" << dendl
;
1736 dout(20) << "get_map " << epoch
<< " - return initial " << map
<< dendl
;
1738 return _add_map(map
);
1744 void OSDService::reply_op_error(OpRequestRef op
, int err
)
1746 reply_op_error(op
, err
, eversion_t(), 0);
1749 void OSDService::reply_op_error(OpRequestRef op
, int err
, eversion_t v
,
1752 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
1753 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1755 flags
= m
->get_flags() & (CEPH_OSD_FLAG_ACK
|CEPH_OSD_FLAG_ONDISK
);
1757 MOSDOpReply
*reply
= new MOSDOpReply(m
, err
, osdmap
->get_epoch(), flags
, true);
1758 reply
->set_reply_versions(v
, uv
);
1759 m
->get_connection()->send_message(reply
);
1762 void OSDService::handle_misdirected_op(PG
*pg
, OpRequestRef op
)
1764 if (!cct
->_conf
->osd_debug_misdirected_ops
) {
1768 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
1769 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1771 ceph_assert(m
->get_map_epoch() >= pg
->get_history().same_primary_since
);
1773 if (pg
->is_ec_pg()) {
1775 * OSD recomputes op target based on current OSDMap. With an EC pg, we
1776 * can get this result:
1777 * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
1778 * [CRUSH_ITEM_NONE, 2, 3]/3
1779 * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
1781 * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
1783 * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
1786 * We can't compute the op target based on the sending map epoch due to
1787 * splitting. The simplest thing is to detect such cases here and drop
1788 * them without an error (the client will resend anyway).
1790 ceph_assert(m
->get_map_epoch() <= superblock
.newest_map
);
1791 OSDMapRef opmap
= try_get_map(m
->get_map_epoch());
1793 dout(7) << __func__
<< ": " << *pg
<< " no longer have map for "
1794 << m
->get_map_epoch() << ", dropping" << dendl
;
1797 pg_t _pgid
= m
->get_raw_pg();
1799 if ((m
->get_flags() & CEPH_OSD_FLAG_PGOP
) == 0)
1800 _pgid
= opmap
->raw_pg_to_pg(_pgid
);
1801 if (opmap
->get_primary_shard(_pgid
, &pgid
) &&
1802 pgid
.shard
!= pg
->pg_id
.shard
) {
1803 dout(7) << __func__
<< ": " << *pg
<< " primary changed since "
1804 << m
->get_map_epoch() << ", dropping" << dendl
;
1809 dout(7) << *pg
<< " misdirected op in " << m
->get_map_epoch() << dendl
;
1810 clog
->warn() << m
->get_source_inst() << " misdirected " << m
->get_reqid()
1811 << " pg " << m
->get_raw_pg()
1812 << " to osd." << whoami
1813 << " not " << pg
->get_acting()
1814 << " in e" << m
->get_map_epoch() << "/" << osdmap
->get_epoch();
1817 void OSDService::enqueue_back(OpQueueItem
&& qi
)
1819 osd
->op_shardedwq
.queue(std::move(qi
));
1822 void OSDService::enqueue_front(OpQueueItem
&& qi
)
1824 osd
->op_shardedwq
.queue_front(std::move(qi
));
1827 void OSDService::queue_recovery_context(
1829 GenContext
<ThreadPool::TPHandle
&> *c
)
1831 epoch_t e
= get_osdmap_epoch();
1834 unique_ptr
<OpQueueItem::OpQueueable
>(
1835 new PGRecoveryContext(pg
->get_pgid(), c
, e
)),
1836 cct
->_conf
->osd_recovery_cost
,
1837 cct
->_conf
->osd_recovery_priority
,
1843 void OSDService::queue_for_snap_trim(PG
*pg
)
1845 dout(10) << "queueing " << *pg
<< " for snaptrim" << dendl
;
1848 unique_ptr
<OpQueueItem::OpQueueable
>(
1849 new PGSnapTrim(pg
->get_pgid(), pg
->get_osdmap_epoch())),
1850 cct
->_conf
->osd_snap_trim_cost
,
1851 cct
->_conf
->osd_snap_trim_priority
,
1854 pg
->get_osdmap_epoch()));
1857 void OSDService::queue_for_scrub(PG
*pg
, bool with_high_priority
)
1859 unsigned scrub_queue_priority
= pg
->scrubber
.priority
;
1860 if (with_high_priority
&& scrub_queue_priority
< cct
->_conf
->osd_client_op_priority
) {
1861 scrub_queue_priority
= cct
->_conf
->osd_client_op_priority
;
1863 const auto epoch
= pg
->get_osdmap_epoch();
1866 unique_ptr
<OpQueueItem::OpQueueable
>(new PGScrub(pg
->get_pgid(), epoch
)),
1867 cct
->_conf
->osd_scrub_cost
,
1868 scrub_queue_priority
,
1874 void OSDService::queue_for_pg_delete(spg_t pgid
, epoch_t e
)
1876 dout(10) << __func__
<< " on " << pgid
<< " e " << e
<< dendl
;
1879 unique_ptr
<OpQueueItem::OpQueueable
>(
1880 new PGDelete(pgid
, e
)),
1881 cct
->_conf
->osd_pg_delete_cost
,
1882 cct
->_conf
->osd_pg_delete_priority
,
1888 bool OSDService::try_finish_pg_delete(PG
*pg
, unsigned old_pg_num
)
1890 return osd
->try_finish_pg_delete(pg
, old_pg_num
);
1895 void OSDService::set_ready_to_merge_source(PG
*pg
, eversion_t version
)
1897 std::lock_guard
l(merge_lock
);
1898 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1899 ready_to_merge_source
[pg
->pg_id
.pgid
] = version
;
1900 assert(not_ready_to_merge_source
.count(pg
->pg_id
.pgid
) == 0);
1901 _send_ready_to_merge();
1904 void OSDService::set_ready_to_merge_target(PG
*pg
,
1906 epoch_t last_epoch_started
,
1907 epoch_t last_epoch_clean
)
1909 std::lock_guard
l(merge_lock
);
1910 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1911 ready_to_merge_target
.insert(make_pair(pg
->pg_id
.pgid
,
1914 last_epoch_clean
)));
1915 assert(not_ready_to_merge_target
.count(pg
->pg_id
.pgid
) == 0);
1916 _send_ready_to_merge();
1919 void OSDService::set_not_ready_to_merge_source(pg_t source
)
1921 std::lock_guard
l(merge_lock
);
1922 dout(10) << __func__
<< " " << source
<< dendl
;
1923 not_ready_to_merge_source
.insert(source
);
1924 assert(ready_to_merge_source
.count(source
) == 0);
1925 _send_ready_to_merge();
1928 void OSDService::set_not_ready_to_merge_target(pg_t target
, pg_t source
)
1930 std::lock_guard
l(merge_lock
);
1931 dout(10) << __func__
<< " " << target
<< " source " << source
<< dendl
;
1932 not_ready_to_merge_target
[target
] = source
;
1933 assert(ready_to_merge_target
.count(target
) == 0);
1934 _send_ready_to_merge();
1937 void OSDService::send_ready_to_merge()
1939 std::lock_guard
l(merge_lock
);
1940 _send_ready_to_merge();
1943 void OSDService::_send_ready_to_merge()
1945 dout(20) << __func__
1946 << " ready_to_merge_source " << ready_to_merge_source
1947 << " not_ready_to_merge_source " << not_ready_to_merge_source
1948 << " ready_to_merge_target " << ready_to_merge_target
1949 << " not_ready_to_merge_target " << not_ready_to_merge_target
1950 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1952 for (auto src
: not_ready_to_merge_source
) {
1953 if (sent_ready_to_merge_source
.count(src
) == 0) {
1954 monc
->send_mon_message(new MOSDPGReadyToMerge(
1958 osdmap
->get_epoch()));
1959 sent_ready_to_merge_source
.insert(src
);
1962 for (auto p
: not_ready_to_merge_target
) {
1963 if (sent_ready_to_merge_source
.count(p
.second
) == 0) {
1964 monc
->send_mon_message(new MOSDPGReadyToMerge(
1968 osdmap
->get_epoch()));
1969 sent_ready_to_merge_source
.insert(p
.second
);
1972 for (auto src
: ready_to_merge_source
) {
1973 if (not_ready_to_merge_source
.count(src
.first
) ||
1974 not_ready_to_merge_target
.count(src
.first
.get_parent())) {
1977 auto p
= ready_to_merge_target
.find(src
.first
.get_parent());
1978 if (p
!= ready_to_merge_target
.end() &&
1979 sent_ready_to_merge_source
.count(src
.first
) == 0) {
1980 monc
->send_mon_message(new MOSDPGReadyToMerge(
1981 src
.first
, // source pgid
1982 src
.second
, // src version
1983 std::get
<0>(p
->second
), // target version
1984 std::get
<1>(p
->second
), // PG's last_epoch_started
1985 std::get
<2>(p
->second
), // PG's last_epoch_clean
1987 osdmap
->get_epoch()));
1988 sent_ready_to_merge_source
.insert(src
.first
);
1993 void OSDService::clear_ready_to_merge(PG
*pg
)
1995 std::lock_guard
l(merge_lock
);
1996 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1997 ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
1998 ready_to_merge_target
.erase(pg
->pg_id
.pgid
);
1999 not_ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
2000 not_ready_to_merge_target
.erase(pg
->pg_id
.pgid
);
2001 sent_ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
2004 void OSDService::clear_sent_ready_to_merge()
2006 std::lock_guard
l(merge_lock
);
2007 sent_ready_to_merge_source
.clear();
2010 void OSDService::prune_sent_ready_to_merge(OSDMapRef
& osdmap
)
2012 std::lock_guard
l(merge_lock
);
2013 auto i
= sent_ready_to_merge_source
.begin();
2014 while (i
!= sent_ready_to_merge_source
.end()) {
2015 if (!osdmap
->pg_exists(*i
)) {
2016 dout(10) << __func__
<< " " << *i
<< dendl
;
2017 i
= sent_ready_to_merge_source
.erase(i
);
2026 void OSDService::_queue_for_recovery(
2027 std::pair
<epoch_t
, PGRef
> p
,
2028 uint64_t reserved_pushes
)
2030 ceph_assert(recovery_lock
.is_locked_by_me());
2033 unique_ptr
<OpQueueItem::OpQueueable
>(
2035 p
.second
->get_pgid(), p
.first
, reserved_pushes
)),
2036 cct
->_conf
->osd_recovery_cost
,
2037 cct
->_conf
->osd_recovery_priority
,
2043 // ====================================================================
2047 #define dout_prefix *_dout
2049 // Commands shared between OSD's console and admin console:
2051 namespace osd_cmds
{
2053 int heap(CephContext
& cct
, const cmdmap_t
& cmdmap
, Formatter
& f
, std::ostream
& os
);
2055 }} // namespace ceph::osd_cmds
2057 int OSD::mkfs(CephContext
*cct
, ObjectStore
*store
, uuid_d fsid
, int whoami
)
2063 ObjectStore::CollectionHandle ch
;
2065 // if we are fed a uuid for this osd, use it.
2066 store
->set_fsid(cct
->_conf
->osd_uuid
);
2068 ret
= store
->mkfs();
2070 derr
<< "OSD::mkfs: ObjectStore::mkfs failed with error "
2071 << cpp_strerror(ret
) << dendl
;
2075 store
->set_cache_shards(1); // doesn't matter for mkfs!
2077 ret
= store
->mount();
2079 derr
<< "OSD::mkfs: couldn't mount ObjectStore: error "
2080 << cpp_strerror(ret
) << dendl
;
2084 ch
= store
->open_collection(coll_t::meta());
2086 ret
= store
->read(ch
, OSD_SUPERBLOCK_GOBJECT
, 0, 0, sbbl
);
2088 derr
<< "OSD::mkfs: have meta collection but no superblock" << dendl
;
2091 /* if we already have superblock, check content of superblock */
2092 dout(0) << " have superblock" << dendl
;
2093 auto p
= sbbl
.cbegin();
2095 if (whoami
!= sb
.whoami
) {
2096 derr
<< "provided osd id " << whoami
<< " != superblock's " << sb
.whoami
2101 if (fsid
!= sb
.cluster_fsid
) {
2102 derr
<< "provided cluster fsid " << fsid
2103 << " != superblock's " << sb
.cluster_fsid
<< dendl
;
2108 // create superblock
2109 sb
.cluster_fsid
= fsid
;
2110 sb
.osd_fsid
= store
->get_fsid();
2112 sb
.compat_features
= get_osd_initial_compat_set();
2117 ObjectStore::CollectionHandle ch
= store
->create_new_collection(
2119 ObjectStore::Transaction t
;
2120 t
.create_collection(coll_t::meta(), 0);
2121 t
.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT
, 0, bl
.length(), bl
);
2122 ret
= store
->queue_transaction(ch
, std::move(t
));
2124 derr
<< "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
2125 << "queue_transaction returned " << cpp_strerror(ret
) << dendl
;
2130 ret
= write_meta(cct
, store
, sb
.cluster_fsid
, sb
.osd_fsid
, whoami
);
2132 derr
<< "OSD::mkfs: failed to write fsid file: error "
2133 << cpp_strerror(ret
) << dendl
;
2147 int OSD::write_meta(CephContext
*cct
, ObjectStore
*store
, uuid_d
& cluster_fsid
, uuid_d
& osd_fsid
, int whoami
)
2152 snprintf(val
, sizeof(val
), "%s", CEPH_OSD_ONDISK_MAGIC
);
2153 r
= store
->write_meta("magic", val
);
2157 snprintf(val
, sizeof(val
), "%d", whoami
);
2158 r
= store
->write_meta("whoami", val
);
2162 cluster_fsid
.print(val
);
2163 r
= store
->write_meta("ceph_fsid", val
);
2167 string key
= cct
->_conf
.get_val
<string
>("key");
2169 r
= store
->write_meta("osd_key", key
);
2173 string keyfile
= cct
->_conf
.get_val
<string
>("keyfile");
2174 if (!keyfile
.empty()) {
2177 r
= keybl
.read_file(keyfile
.c_str(), &err
);
2179 derr
<< __func__
<< " failed to read keyfile " << keyfile
<< ": "
2180 << err
<< ": " << cpp_strerror(r
) << dendl
;
2183 r
= store
->write_meta("osd_key", keybl
.to_str());
2189 r
= store
->write_meta("ready", "ready");
2196 int OSD::peek_meta(ObjectStore
*store
,
2198 uuid_d
*cluster_fsid
,
2201 int *require_osd_release
)
2205 int r
= store
->read_meta("magic", &val
);
2210 r
= store
->read_meta("whoami", &val
);
2213 *whoami
= atoi(val
.c_str());
2215 r
= store
->read_meta("ceph_fsid", &val
);
2218 r
= cluster_fsid
->parse(val
.c_str());
2222 r
= store
->read_meta("fsid", &val
);
2224 *osd_fsid
= uuid_d();
2226 r
= osd_fsid
->parse(val
.c_str());
2231 r
= store
->read_meta("require_osd_release", &val
);
2233 *require_osd_release
= atoi(val
.c_str());
2241 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2245 OSD::OSD(CephContext
*cct_
, ObjectStore
*store_
,
2247 Messenger
*internal_messenger
,
2248 Messenger
*external_messenger
,
2249 Messenger
*hb_client_front
,
2250 Messenger
*hb_client_back
,
2251 Messenger
*hb_front_serverm
,
2252 Messenger
*hb_back_serverm
,
2253 Messenger
*osdc_messenger
,
2255 const std::string
&dev
, const std::string
&jdev
) :
2257 osd_lock("OSD::osd_lock"),
2258 tick_timer(cct
, osd_lock
),
2259 tick_timer_lock("OSD::tick_timer_lock"),
2260 tick_timer_without_osd_lock(cct
, tick_timer_lock
),
2261 gss_ktfile_client(cct
->_conf
.get_val
<std::string
>("gss_ktab_client_file")),
2262 cluster_messenger(internal_messenger
),
2263 client_messenger(external_messenger
),
2264 objecter_messenger(osdc_messenger
),
2266 mgrc(cct_
, client_messenger
),
2268 recoverystate_perf(NULL
),
2270 log_client(cct
, client_messenger
, &mc
->monmap
, LogClient::NO_FLAGS
),
2271 clog(log_client
.create_channel()),
2273 dev_path(dev
), journal_path(jdev
),
2274 store_is_rotational(store
->is_rotational()),
2275 trace_endpoint("0.0.0.0", 0, "osd"),
2277 m_osd_pg_epoch_max_lag_factor(cct
->_conf
.get_val
<double>(
2278 "osd_pg_epoch_max_lag_factor")),
2279 osd_compat(get_osd_compat_set()),
2280 osd_op_tp(cct
, "OSD::osd_op_tp", "tp_osd_tp",
2281 get_num_op_threads()),
2282 command_tp(cct
, "OSD::command_tp", "tp_osd_cmd", 1),
2283 session_waiting_lock("OSD::session_waiting_lock"),
2284 osdmap_subscribe_lock("OSD::osdmap_subscribe_lock"),
2285 heartbeat_lock("OSD::heartbeat_lock"),
2286 heartbeat_stop(false),
2287 heartbeat_need_update(true),
2288 hb_front_client_messenger(hb_client_front
),
2289 hb_back_client_messenger(hb_client_back
),
2290 hb_front_server_messenger(hb_front_serverm
),
2291 hb_back_server_messenger(hb_back_serverm
),
2293 heartbeat_thread(this),
2294 heartbeat_dispatcher(this),
2295 op_tracker(cct
, cct
->_conf
->osd_enable_op_tracker
,
2296 cct
->_conf
->osd_num_op_tracker_shard
),
2297 test_ops_hook(NULL
),
2298 op_queue(get_io_queue()),
2299 op_prio_cutoff(get_io_prio_cut()),
2302 cct
->_conf
->osd_op_thread_timeout
,
2303 cct
->_conf
->osd_op_thread_suicide_timeout
,
2305 map_lock("OSD::map_lock"),
2306 last_pg_create_epoch(0),
2307 mon_report_lock("OSD::mon_report_lock"),
2310 requested_full_first(0),
2311 requested_full_last(0),
2314 cct
->_conf
->osd_command_thread_timeout
,
2315 cct
->_conf
->osd_command_thread_suicide_timeout
,
2320 if (!gss_ktfile_client
.empty()) {
2321 // Assert we can export environment variable
2323 The default client keytab is used, if it is present and readable,
2324 to automatically obtain initial credentials for GSSAPI client
2325 applications. The principal name of the first entry in the client
2326 keytab is used by default when obtaining initial credentials.
2327 1. The KRB5_CLIENT_KTNAME environment variable.
2328 2. The default_client_keytab_name profile variable in [libdefaults].
2329 3. The hardcoded default, DEFCKTNAME.
2331 const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
2332 gss_ktfile_client
.c_str(), 1));
2333 ceph_assert(set_result
== 0);
2336 monc
->set_messenger(client_messenger
);
2337 op_tracker
.set_complaint_and_threshold(cct
->_conf
->osd_op_complaint_time
,
2338 cct
->_conf
->osd_op_log_threshold
);
2339 op_tracker
.set_history_size_and_duration(cct
->_conf
->osd_op_history_size
,
2340 cct
->_conf
->osd_op_history_duration
);
2341 op_tracker
.set_history_slow_op_size_and_threshold(cct
->_conf
->osd_op_history_slow_op_size
,
2342 cct
->_conf
->osd_op_history_slow_op_threshold
);
2344 std::stringstream ss
;
2345 ss
<< "osd." << whoami
;
2346 trace_endpoint
.copy_name(ss
.str());
2349 // initialize shards
2350 num_shards
= get_num_op_shards();
2351 for (uint32_t i
= 0; i
< num_shards
; i
++) {
2352 OSDShard
*one_shard
= new OSDShard(
2356 cct
->_conf
->osd_op_pq_max_tokens_per_priority
,
2357 cct
->_conf
->osd_op_pq_min_cost
,
2359 shards
.push_back(one_shard
);
2365 while (!shards
.empty()) {
2366 delete shards
.back();
2369 delete class_handler
;
2370 cct
->get_perfcounters_collection()->remove(recoverystate_perf
);
2371 cct
->get_perfcounters_collection()->remove(logger
);
2372 delete recoverystate_perf
;
2377 double OSD::get_tick_interval() const
2379 // vary +/- 5% to avoid scrub scheduling livelocks
2380 constexpr auto delta
= 0.05;
2381 return (OSD_TICK_INTERVAL
*
2382 ceph::util::generate_random_number(1.0 - delta
, 1.0 + delta
));
2385 void cls_initialize(ClassHandler
*ch
);
2387 void OSD::handle_signal(int signum
)
2389 ceph_assert(signum
== SIGINT
|| signum
== SIGTERM
);
2390 derr
<< "*** Got signal " << sig_str(signum
) << " ***" << dendl
;
2396 std::lock_guard
lock(osd_lock
);
2400 if (store
->test_mount_in_use()) {
2401 derr
<< "OSD::pre_init: object store '" << dev_path
<< "' is "
2402 << "currently in use. (Is ceph-osd already running?)" << dendl
;
2406 cct
->_conf
.add_observer(this);
2410 int OSD::set_numa_affinity()
2412 // storage numa node
2413 int store_node
= -1;
2414 store
->get_numa_node(&store_node
, nullptr, nullptr);
2415 if (store_node
>= 0) {
2416 dout(1) << __func__
<< " storage numa node " << store_node
<< dendl
;
2419 // check network numa node(s)
2420 int front_node
= -1, back_node
= -1;
2421 string front_iface
= pick_iface(
2423 client_messenger
->get_myaddrs().front().get_sockaddr_storage());
2424 string back_iface
= pick_iface(
2426 cluster_messenger
->get_myaddrs().front().get_sockaddr_storage());
2427 int r
= get_iface_numa_node(front_iface
, &front_node
);
2428 if (r
>= 0 && front_node
>= 0) {
2429 dout(1) << __func__
<< " public network " << front_iface
<< " numa node "
2430 << front_node
<< dendl
;
2431 r
= get_iface_numa_node(back_iface
, &back_node
);
2432 if (r
>= 0 && back_node
>= 0) {
2433 dout(1) << __func__
<< " cluster network " << back_iface
<< " numa node "
2434 << back_node
<< dendl
;
2435 if (front_node
== back_node
&&
2436 front_node
== store_node
) {
2437 dout(1) << " objectstore and network numa nodes all match" << dendl
;
2438 if (g_conf().get_val
<bool>("osd_numa_auto_affinity")) {
2439 numa_node
= front_node
;
2441 } else if (front_node
!= back_node
) {
2442 dout(1) << __func__
<< " public and cluster network numa nodes do not match"
2445 dout(1) << __func__
<< " objectstore and network numa nodes do not match"
2448 } else if (back_node
== -2) {
2449 dout(1) << __func__
<< " cluster network " << back_iface
2450 << " ports numa nodes do not match" << dendl
;
2452 derr
<< __func__
<< " unable to identify cluster interface '" << back_iface
2453 << "' numa node: " << cpp_strerror(r
) << dendl
;
2455 } else if (front_node
== -2) {
2456 dout(1) << __func__
<< " public network " << front_iface
2457 << " ports numa nodes do not match" << dendl
;
2459 derr
<< __func__
<< " unable to identify public interface '" << front_iface
2460 << "' numa node: " << cpp_strerror(r
) << dendl
;
2462 if (int node
= g_conf().get_val
<int64_t>("osd_numa_node"); node
>= 0) {
2463 // this takes precedence over the automagic logic above
2466 if (numa_node
>= 0) {
2467 int r
= get_numa_node_cpu_set(numa_node
, &numa_cpu_set_size
, &numa_cpu_set
);
2469 dout(1) << __func__
<< " unable to determine numa node " << numa_node
2470 << " CPUs" << dendl
;
2473 dout(1) << __func__
<< " setting numa affinity to node " << numa_node
2475 << cpu_set_to_str_list(numa_cpu_set_size
, &numa_cpu_set
)
2477 r
= set_cpu_affinity_all_threads(numa_cpu_set_size
, &numa_cpu_set
);
2480 derr
<< __func__
<< " failed to set numa affinity: " << cpp_strerror(r
)
2486 dout(1) << __func__
<< " not setting numa affinity" << dendl
;
2493 class OSDSocketHook
: public AdminSocketHook
{
2496 explicit OSDSocketHook(OSD
*o
) : osd(o
) {}
2497 bool call(std::string_view admin_command
, const cmdmap_t
& cmdmap
,
2498 std::string_view format
, bufferlist
& out
) override
{
2502 r
= osd
->asok_command(admin_command
, cmdmap
, format
, ss
);
2503 } catch (const bad_cmd_get
& e
) {
2512 std::set
<int64_t> OSD::get_mapped_pools()
2514 std::set
<int64_t> pools
;
2515 std::vector
<spg_t
> pgids
;
2517 for (const auto &pgid
: pgids
) {
2518 pools
.insert(pgid
.pool());
2523 bool OSD::asok_command(std::string_view admin_command
, const cmdmap_t
& cmdmap
,
2524 std::string_view format
, ostream
& ss
)
2526 Formatter
*f
= Formatter::create(format
, "json-pretty", "json-pretty");
2527 if (admin_command
== "status") {
2528 f
->open_object_section("status");
2529 f
->dump_stream("cluster_fsid") << superblock
.cluster_fsid
;
2530 f
->dump_stream("osd_fsid") << superblock
.osd_fsid
;
2531 f
->dump_unsigned("whoami", superblock
.whoami
);
2532 f
->dump_string("state", get_state_name(get_state()));
2533 f
->dump_unsigned("oldest_map", superblock
.oldest_map
);
2534 f
->dump_unsigned("newest_map", superblock
.newest_map
);
2535 f
->dump_unsigned("num_pgs", num_pgs
);
2537 } else if (admin_command
== "flush_journal") {
2538 store
->flush_journal();
2539 } else if (admin_command
== "dump_ops_in_flight" ||
2540 admin_command
== "ops" ||
2541 admin_command
== "dump_blocked_ops" ||
2542 admin_command
== "dump_historic_ops" ||
2543 admin_command
== "dump_historic_ops_by_duration" ||
2544 admin_command
== "dump_historic_slow_ops") {
2546 const string error_str
= "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2547 even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2548 will start to track new ops received afterwards.";
2550 set
<string
> filters
;
2551 vector
<string
> filter_str
;
2552 if (cmd_getval(cct
, cmdmap
, "filterstr", filter_str
)) {
2553 copy(filter_str
.begin(), filter_str
.end(),
2554 inserter(filters
, filters
.end()));
2557 if (admin_command
== "dump_ops_in_flight" ||
2558 admin_command
== "ops") {
2559 if (!op_tracker
.dump_ops_in_flight(f
, false, filters
)) {
2563 if (admin_command
== "dump_blocked_ops") {
2564 if (!op_tracker
.dump_ops_in_flight(f
, true, filters
)) {
2568 if (admin_command
== "dump_historic_ops") {
2569 if (!op_tracker
.dump_historic_ops(f
, false, filters
)) {
2573 if (admin_command
== "dump_historic_ops_by_duration") {
2574 if (!op_tracker
.dump_historic_ops(f
, true, filters
)) {
2578 if (admin_command
== "dump_historic_slow_ops") {
2579 if (!op_tracker
.dump_historic_slow_ops(f
, filters
)) {
2583 } else if (admin_command
== "dump_op_pq_state") {
2584 f
->open_object_section("pq");
2585 op_shardedwq
.dump(f
);
2587 } else if (admin_command
== "dump_blacklist") {
2588 list
<pair
<entity_addr_t
,utime_t
> > bl
;
2589 OSDMapRef curmap
= service
.get_osdmap();
2591 f
->open_array_section("blacklist");
2592 curmap
->get_blacklist(&bl
);
2593 for (list
<pair
<entity_addr_t
,utime_t
> >::iterator it
= bl
.begin();
2594 it
!= bl
.end(); ++it
) {
2595 f
->open_object_section("entry");
2596 f
->open_object_section("entity_addr_t");
2598 f
->close_section(); //entity_addr_t
2599 it
->second
.localtime(f
->dump_stream("expire_time"));
2600 f
->close_section(); //entry
2602 f
->close_section(); //blacklist
2603 } else if (admin_command
== "dump_watchers") {
2604 list
<obj_watch_item_t
> watchers
;
2608 for (auto& pg
: pgs
) {
2609 list
<obj_watch_item_t
> pg_watchers
;
2610 pg
->get_watchers(&pg_watchers
);
2611 watchers
.splice(watchers
.end(), pg_watchers
);
2614 f
->open_array_section("watchers");
2615 for (list
<obj_watch_item_t
>::iterator it
= watchers
.begin();
2616 it
!= watchers
.end(); ++it
) {
2618 f
->open_object_section("watch");
2620 f
->dump_string("namespace", it
->obj
.nspace
);
2621 f
->dump_string("object", it
->obj
.oid
.name
);
2623 f
->open_object_section("entity_name");
2624 it
->wi
.name
.dump(f
);
2625 f
->close_section(); //entity_name_t
2627 f
->dump_unsigned("cookie", it
->wi
.cookie
);
2628 f
->dump_unsigned("timeout", it
->wi
.timeout_seconds
);
2630 f
->open_object_section("entity_addr_t");
2631 it
->wi
.addr
.dump(f
);
2632 f
->close_section(); //entity_addr_t
2634 f
->close_section(); //watch
2637 f
->close_section(); //watchers
2638 } else if (admin_command
== "dump_recovery_reservations") {
2639 f
->open_object_section("reservations");
2640 f
->open_object_section("local_reservations");
2641 service
.local_reserver
.dump(f
);
2643 f
->open_object_section("remote_reservations");
2644 service
.remote_reserver
.dump(f
);
2647 } else if (admin_command
== "dump_scrub_reservations") {
2648 f
->open_object_section("scrub_reservations");
2649 service
.dump_scrub_reservations(f
);
2651 } else if (admin_command
== "get_latest_osdmap") {
2652 get_latest_osdmap();
2653 } else if (admin_command
== "heap") {
2654 auto result
= ceph::osd_cmds::heap(*cct
, cmdmap
, *f
, ss
);
2656 // Note: Failed heap profile commands won't necessarily trigger an error:
2657 f
->open_object_section("result");
2658 f
->dump_string("error", cpp_strerror(result
));
2659 f
->dump_bool("success", result
>= 0);
2661 } else if (admin_command
== "set_heap_property") {
2665 bool success
= false;
2666 if (!cmd_getval(cct
, cmdmap
, "property", property
)) {
2667 error
= "unable to get property";
2669 } else if (!cmd_getval(cct
, cmdmap
, "value", value
)) {
2670 error
= "unable to get value";
2672 } else if (value
< 0) {
2673 error
= "negative value not allowed";
2675 } else if (!ceph_heap_set_numeric_property(property
.c_str(), (size_t)value
)) {
2676 error
= "invalid property";
2681 f
->open_object_section("result");
2682 f
->dump_string("error", error
);
2683 f
->dump_bool("success", success
);
2685 } else if (admin_command
== "get_heap_property") {
2689 bool success
= false;
2690 if (!cmd_getval(cct
, cmdmap
, "property", property
)) {
2691 error
= "unable to get property";
2693 } else if (!ceph_heap_get_numeric_property(property
.c_str(), &value
)) {
2694 error
= "invalid property";
2699 f
->open_object_section("result");
2700 f
->dump_string("error", error
);
2701 f
->dump_bool("success", success
);
2702 f
->dump_int("value", value
);
2704 } else if (admin_command
== "dump_objectstore_kv_stats") {
2705 store
->get_db_statistics(f
);
2706 } else if (admin_command
== "dump_scrubs") {
2707 service
.dumps_scrub(f
);
2708 } else if (admin_command
== "calc_objectstore_db_histogram") {
2709 store
->generate_db_histogram(f
);
2710 } else if (admin_command
== "flush_store_cache") {
2711 store
->flush_cache(&ss
);
2712 } else if (admin_command
== "dump_pgstate_history") {
2713 f
->open_object_section("pgstate_history");
2716 for (auto& pg
: pgs
) {
2717 f
->dump_stream("pg") << pg
->pg_id
;
2718 pg
->dump_pgstate_history(f
);
2721 } else if (admin_command
== "compact") {
2722 dout(1) << "triggering manual compaction" << dendl
;
2723 auto start
= ceph::coarse_mono_clock::now();
2725 auto end
= ceph::coarse_mono_clock::now();
2726 double duration
= std::chrono::duration
<double>(end
-start
).count();
2727 dout(1) << "finished manual compaction in "
2729 << " seconds" << dendl
;
2730 f
->open_object_section("compact_result");
2731 f
->dump_float("elapsed_time", duration
);
2733 } else if (admin_command
== "get_mapped_pools") {
2734 f
->open_array_section("mapped_pools");
2735 set
<int64_t> poollist
= get_mapped_pools();
2736 for (auto pool
: poollist
) {
2737 f
->dump_int("pool_id", pool
);
2740 } else if (admin_command
== "smart") {
2742 cmd_getval(cct
, cmdmap
, "devid", devid
);
2743 probe_smart(devid
, ss
);
2744 } else if (admin_command
== "list_devices") {
2745 set
<string
> devnames
;
2746 store
->get_devices(&devnames
);
2747 f
->open_object_section("list_devices");
2748 for (auto dev
: devnames
) {
2749 if (dev
.find("dm-") == 0) {
2752 f
->dump_string("device", "/dev/" + dev
);
2755 } else if (admin_command
== "send_beacon") {
2757 send_beacon(ceph::coarse_mono_clock::now());
2759 } else if (admin_command
== "dump_osd_network") {
2761 if (!(cmd_getval(cct
, cmdmap
, "value", value
))) {
2762 // Convert milliseconds to microseconds
2763 value
= static_cast<int64_t>(g_conf().get_val
<double>("mon_warn_on_slow_ping_time")) * 1000;
2765 double ratio
= g_conf().get_val
<double>("mon_warn_on_slow_ping_ratio");
2766 value
= g_conf().get_val
<int64_t>("osd_heartbeat_grace");
2767 value
*= 1000000 * ratio
; // Seconds of grace to microseconds at ratio
2770 // Convert user input to microseconds
2773 if (value
< 0) value
= 0;
2775 struct osd_ping_time_t
{
2779 std::array
<uint32_t,3> times
;
2780 std::array
<uint32_t,3> min
;
2781 std::array
<uint32_t,3> max
;
2783 uint32_t last_update
;
2785 bool operator<(const osd_ping_time_t
& rhs
) const {
2786 if (pingtime
< rhs
.pingtime
)
2788 if (pingtime
> rhs
.pingtime
)
2798 set
<osd_ping_time_t
> sorted
;
2799 // Get pingtimes under lock and not on the stack
2800 map
<int, osd_stat_t::Interfaces
> *pingtimes
= new map
<int, osd_stat_t::Interfaces
>;
2801 service
.get_hb_pingtime(pingtimes
);
2802 for (auto j
: *pingtimes
) {
2803 if (j
.second
.last_update
== 0)
2805 osd_ping_time_t item
;
2806 item
.pingtime
= std::max(j
.second
.back_pingtime
[0], j
.second
.back_pingtime
[1]);
2807 item
.pingtime
= std::max(item
.pingtime
, j
.second
.back_pingtime
[2]);
2808 if (item
.pingtime
>= value
) {
2810 item
.times
[0] = j
.second
.back_pingtime
[0];
2811 item
.times
[1] = j
.second
.back_pingtime
[1];
2812 item
.times
[2] = j
.second
.back_pingtime
[2];
2813 item
.min
[0] = j
.second
.back_min
[0];
2814 item
.min
[1] = j
.second
.back_min
[1];
2815 item
.min
[2] = j
.second
.back_min
[2];
2816 item
.max
[0] = j
.second
.back_max
[0];
2817 item
.max
[1] = j
.second
.back_max
[1];
2818 item
.max
[2] = j
.second
.back_max
[2];
2819 item
.last
= j
.second
.back_last
;
2821 item
.last_update
= j
.second
.last_update
;
2822 sorted
.emplace(item
);
2824 if (j
.second
.front_last
== 0)
2826 item
.pingtime
= std::max(j
.second
.front_pingtime
[0], j
.second
.front_pingtime
[1]);
2827 item
.pingtime
= std::max(item
.pingtime
, j
.second
.front_pingtime
[2]);
2828 if (item
.pingtime
>= value
) {
2830 item
.times
[0] = j
.second
.front_pingtime
[0];
2831 item
.times
[1] = j
.second
.front_pingtime
[1];
2832 item
.times
[2] = j
.second
.front_pingtime
[2];
2833 item
.min
[0] = j
.second
.front_min
[0];
2834 item
.min
[1] = j
.second
.front_min
[1];
2835 item
.min
[2] = j
.second
.front_min
[2];
2836 item
.max
[0] = j
.second
.front_max
[0];
2837 item
.max
[1] = j
.second
.front_max
[1];
2838 item
.max
[2] = j
.second
.front_max
[2];
2839 item
.last
= j
.second
.front_last
;
2840 item
.last_update
= j
.second
.last_update
;
2842 sorted
.emplace(item
);
2847 // Network ping times (1min 5min 15min)
2848 f
->open_object_section("network_ping_times");
2849 f
->dump_int("threshold", value
/ 1000);
2850 f
->open_array_section("entries");
2851 for (auto &sitem
: boost::adaptors::reverse(sorted
)) {
2852 ceph_assert(sitem
.pingtime
>= value
);
2853 f
->open_object_section("entry");
2855 const time_t lu(sitem
.last_update
);
2857 string
lustr(ctime_r(&lu
, buffer
));
2858 lustr
.pop_back(); // Remove trailing \n
2859 auto stale
= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_stale");
2860 f
->dump_string("last update", lustr
);
2861 f
->dump_bool("stale", ceph_clock_now().sec() - sitem
.last_update
> stale
);
2862 f
->dump_int("from osd", whoami
);
2863 f
->dump_int("to osd", sitem
.to
);
2864 f
->dump_string("interface", (sitem
.back
? "back" : "front"));
2865 f
->open_object_section("average");
2866 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.times
[0],3).c_str());
2867 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.times
[1],3).c_str());
2868 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.times
[2],3).c_str());
2869 f
->close_section(); // average
2870 f
->open_object_section("min");
2871 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.max
[0],3).c_str());
2872 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.max
[1],3).c_str());
2873 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.max
[2],3).c_str());
2874 f
->close_section(); // min
2875 f
->open_object_section("max");
2876 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.max
[0],3).c_str());
2877 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.max
[1],3).c_str());
2878 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.max
[2],3).c_str());
2879 f
->close_section(); // max
2880 f
->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem
.last
,3).c_str());
2881 f
->close_section(); // entry
2883 f
->close_section(); // entries
2884 f
->close_section(); // network_ping_times
2886 ceph_abort_msg("broken asok registration");
2893 class TestOpsSocketHook
: public AdminSocketHook
{
2894 OSDService
*service
;
2897 TestOpsSocketHook(OSDService
*s
, ObjectStore
*st
) : service(s
), store(st
) {}
2898 bool call(std::string_view command
, const cmdmap_t
& cmdmap
,
2899 std::string_view format
, bufferlist
& out
) override
{
2902 test_ops(service
, store
, command
, cmdmap
, ss
);
2903 } catch (const bad_cmd_get
& e
) {
2909 void test_ops(OSDService
*service
, ObjectStore
*store
,
2910 std::string_view command
, const cmdmap_t
& cmdmap
, ostream
&ss
);
2914 class OSD::C_Tick
: public Context
{
2917 explicit C_Tick(OSD
*o
) : osd(o
) {}
2918 void finish(int r
) override
{
2923 class OSD::C_Tick_WithoutOSDLock
: public Context
{
2926 explicit C_Tick_WithoutOSDLock(OSD
*o
) : osd(o
) {}
2927 void finish(int r
) override
{
2928 osd
->tick_without_osd_lock();
2932 int OSD::enable_disable_fuse(bool stop
)
2936 string mntpath
= cct
->_conf
->osd_data
+ "/fuse";
2937 if (fuse_store
&& (stop
|| !cct
->_conf
->osd_objectstore_fuse
)) {
2938 dout(1) << __func__
<< " disabling" << dendl
;
2942 r
= ::rmdir(mntpath
.c_str());
2945 derr
<< __func__
<< " failed to rmdir " << mntpath
<< ": "
2946 << cpp_strerror(r
) << dendl
;
2951 if (!fuse_store
&& cct
->_conf
->osd_objectstore_fuse
) {
2952 dout(1) << __func__
<< " enabling" << dendl
;
2953 r
= ::mkdir(mntpath
.c_str(), 0700);
2956 if (r
< 0 && r
!= -EEXIST
) {
2957 derr
<< __func__
<< " unable to create " << mntpath
<< ": "
2958 << cpp_strerror(r
) << dendl
;
2961 fuse_store
= new FuseStore(store
, mntpath
);
2962 r
= fuse_store
->start();
2964 derr
<< __func__
<< " unable to start fuse: " << cpp_strerror(r
) << dendl
;
2970 #endif // HAVE_LIBFUSE
2974 int OSD::get_num_op_shards()
2976 if (cct
->_conf
->osd_op_num_shards
)
2977 return cct
->_conf
->osd_op_num_shards
;
2978 if (store_is_rotational
)
2979 return cct
->_conf
->osd_op_num_shards_hdd
;
2981 return cct
->_conf
->osd_op_num_shards_ssd
;
2984 int OSD::get_num_op_threads()
2986 if (cct
->_conf
->osd_op_num_threads_per_shard
)
2987 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard
;
2988 if (store_is_rotational
)
2989 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard_hdd
;
2991 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard_ssd
;
2994 float OSD::get_osd_recovery_sleep()
2996 if (cct
->_conf
->osd_recovery_sleep
)
2997 return cct
->_conf
->osd_recovery_sleep
;
2998 if (!store_is_rotational
&& !journal_is_rotational
)
2999 return cct
->_conf
->osd_recovery_sleep_ssd
;
3000 else if (store_is_rotational
&& !journal_is_rotational
)
3001 return cct
->_conf
.get_val
<double>("osd_recovery_sleep_hybrid");
3003 return cct
->_conf
->osd_recovery_sleep_hdd
;
3006 float OSD::get_osd_delete_sleep()
3008 float osd_delete_sleep
= cct
->_conf
.get_val
<double>("osd_delete_sleep");
3009 if (osd_delete_sleep
> 0)
3010 return osd_delete_sleep
;
3011 if (!store_is_rotational
&& !journal_is_rotational
)
3012 return cct
->_conf
.get_val
<double>("osd_delete_sleep_ssd");
3013 if (store_is_rotational
&& !journal_is_rotational
)
3014 return cct
->_conf
.get_val
<double>("osd_delete_sleep_hybrid");
3015 return cct
->_conf
.get_val
<double>("osd_delete_sleep_hdd");
3018 float OSD::get_osd_snap_trim_sleep()
3020 float osd_snap_trim_sleep
= cct
->_conf
.get_val
<double>("osd_snap_trim_sleep");
3021 if (osd_snap_trim_sleep
> 0)
3022 return osd_snap_trim_sleep
;
3023 if (!store_is_rotational
&& !journal_is_rotational
)
3024 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_ssd");
3025 if (store_is_rotational
&& !journal_is_rotational
)
3026 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_hybrid");
3027 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_hdd");
3032 CompatSet initial
, diff
;
3033 std::lock_guard
lock(osd_lock
);
3038 tick_timer_without_osd_lock
.init();
3039 service
.recovery_request_timer
.init();
3040 service
.sleep_timer
.init();
3042 boot_finisher
.start();
3046 store
->read_meta("require_osd_release", &val
);
3047 last_require_osd_release
= atoi(val
.c_str());
3051 dout(2) << "init " << dev_path
3052 << " (looks like " << (store_is_rotational
? "hdd" : "ssd") << ")"
3054 dout(2) << "journal " << journal_path
<< dendl
;
3055 ceph_assert(store
); // call pre_init() first!
3057 store
->set_cache_shards(get_num_op_shards());
3059 int r
= store
->mount();
3061 derr
<< "OSD:init: unable to mount object store" << dendl
;
3064 journal_is_rotational
= store
->is_journal_rotational();
3065 dout(2) << "journal looks like " << (journal_is_rotational
? "hdd" : "ssd")
3068 enable_disable_fuse(false);
3070 dout(2) << "boot" << dendl
;
3072 service
.meta_ch
= store
->open_collection(coll_t::meta());
3074 // initialize the daily loadavg with current 15min loadavg
3076 if (getloadavg(loadavgs
, 3) == 3) {
3077 daily_loadavg
= loadavgs
[2];
3079 derr
<< "OSD::init() : couldn't read loadavgs\n" << dendl
;
3080 daily_loadavg
= 1.0;
3083 int rotating_auth_attempts
= 0;
3084 auto rotating_auth_timeout
=
3085 g_conf().get_val
<int64_t>("rotating_keys_bootstrap_timeout");
3087 // sanity check long object name handling
3090 l
.oid
.name
= string(cct
->_conf
->osd_max_object_name_len
, 'n');
3091 l
.set_key(string(cct
->_conf
->osd_max_object_name_len
, 'k'));
3092 l
.nspace
= string(cct
->_conf
->osd_max_object_namespace_len
, 's');
3093 r
= store
->validate_hobject_key(l
);
3095 derr
<< "backend (" << store
->get_type() << ") is unable to support max "
3096 << "object name[space] len" << dendl
;
3097 derr
<< " osd max object name len = "
3098 << cct
->_conf
->osd_max_object_name_len
<< dendl
;
3099 derr
<< " osd max object namespace len = "
3100 << cct
->_conf
->osd_max_object_namespace_len
<< dendl
;
3101 derr
<< cpp_strerror(r
) << dendl
;
3102 if (cct
->_conf
->osd_check_max_object_name_len_on_startup
) {
3105 derr
<< "osd_check_max_object_name_len_on_startup = false, starting anyway"
3108 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl
;
3113 r
= read_superblock();
3115 derr
<< "OSD::init() : unable to read osd superblock" << dendl
;
3120 if (osd_compat
.compare(superblock
.compat_features
) < 0) {
3121 derr
<< "The disk uses features unsupported by the executable." << dendl
;
3122 derr
<< " ondisk features " << superblock
.compat_features
<< dendl
;
3123 derr
<< " daemon features " << osd_compat
<< dendl
;
3125 if (osd_compat
.writeable(superblock
.compat_features
)) {
3126 CompatSet diff
= osd_compat
.unsupported(superblock
.compat_features
);
3127 derr
<< "it is still writeable, though. Missing features: " << diff
<< dendl
;
3132 CompatSet diff
= osd_compat
.unsupported(superblock
.compat_features
);
3133 derr
<< "Cannot write to disk! Missing features: " << diff
<< dendl
;
3139 assert_warn(whoami
== superblock
.whoami
);
3140 if (whoami
!= superblock
.whoami
) {
3141 derr
<< "OSD::init: superblock says osd"
3142 << superblock
.whoami
<< " but I am osd." << whoami
<< dendl
;
3147 // load up "current" osdmap
3148 assert_warn(!osdmap
);
3150 derr
<< "OSD::init: unable to read current osdmap" << dendl
;
3154 osdmap
= get_map(superblock
.current_epoch
);
3156 // make sure we don't have legacy pgs deleting
3159 int r
= store
->list_collections(ls
);
3160 ceph_assert(r
>= 0);
3163 if (c
.is_pg(&pgid
) &&
3164 !osdmap
->have_pg_pool(pgid
.pool())) {
3165 ghobject_t oid
= make_final_pool_info_oid(pgid
.pool());
3166 if (!store
->exists(service
.meta_ch
, oid
)) {
3167 derr
<< __func__
<< " missing pg_pool_t for deleted pool "
3168 << pgid
.pool() << " for pg " << pgid
3169 << "; please downgrade to luminous and allow "
3170 << "pg deletion to complete before upgrading" << dendl
;
3177 initial
= get_osd_initial_compat_set();
3178 diff
= superblock
.compat_features
.unsupported(initial
);
3179 if (superblock
.compat_features
.merge(initial
)) {
3180 // We need to persist the new compat_set before we
3182 dout(5) << "Upgrading superblock adding: " << diff
<< dendl
;
3183 ObjectStore::Transaction t
;
3184 write_superblock(t
);
3185 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3190 // make sure snap mapper object exists
3191 if (!store
->exists(service
.meta_ch
, OSD::make_snapmapper_oid())) {
3192 dout(10) << "init creating/touching snapmapper object" << dendl
;
3193 ObjectStore::Transaction t
;
3194 t
.touch(coll_t::meta(), OSD::make_snapmapper_oid());
3195 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3200 class_handler
= new ClassHandler(cct
);
3201 cls_initialize(class_handler
);
3203 if (cct
->_conf
->osd_open_classes_on_start
) {
3204 int r
= class_handler
->open_all_classes();
3206 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r
) << dendl
;
3209 check_osdmap_features();
3211 create_recoverystate_perf();
3214 epoch_t bind_epoch
= osdmap
->get_epoch();
3215 service
.set_epochs(NULL
, NULL
, &bind_epoch
);
3218 clear_temp_objects();
3220 // initialize osdmap references in sharded wq
3221 for (auto& shard
: shards
) {
3222 std::lock_guard
l(shard
->osdmap_lock
);
3223 shard
->shard_osdmap
= osdmap
;
3226 // load up pgs (as they previously existed)
3229 dout(2) << "superblock: I am osd." << superblock
.whoami
<< dendl
;
3230 dout(0) << "using " << op_queue
<< " op queue with priority op cut off at " <<
3231 op_prio_cutoff
<< "." << dendl
;
3237 struct store_statfs_t stbuf
;
3238 osd_alert_list_t alerts
;
3239 int r
= store
->statfs(&stbuf
, &alerts
);
3240 ceph_assert(r
== 0);
3241 service
.set_statfs(stbuf
, alerts
);
3244 // client_messenger auth_client is already set up by monc.
3245 for (auto m
: { cluster_messenger
,
3247 hb_front_client_messenger
,
3248 hb_back_client_messenger
,
3249 hb_front_server_messenger
,
3250 hb_back_server_messenger
} ) {
3251 m
->set_auth_client(monc
);
3253 for (auto m
: { client_messenger
,
3255 hb_front_server_messenger
,
3256 hb_back_server_messenger
}) {
3257 m
->set_auth_server(monc
);
3259 monc
->set_handle_authentication_dispatcher(this);
3261 monc
->set_want_keys(CEPH_ENTITY_TYPE_MON
| CEPH_ENTITY_TYPE_OSD
3262 | CEPH_ENTITY_TYPE_MGR
);
3267 mgrc
.set_pgstats_cb([this](){ return collect_pg_stats(); });
3268 mgrc
.set_perf_metric_query_cb(
3269 [this](const std::map
<OSDPerfMetricQuery
, OSDPerfMetricLimits
> &queries
) {
3270 set_perf_queries(queries
);
3272 [this](std::map
<OSDPerfMetricQuery
, OSDPerfMetricReport
> *reports
) {
3273 get_perf_reports(reports
);
3277 // tell monc about log_client so it will know about mon session resets
3278 monc
->set_log_client(&log_client
);
3279 update_log_config();
3282 client_messenger
->add_dispatcher_tail(&mgrc
);
3283 client_messenger
->add_dispatcher_tail(this);
3284 cluster_messenger
->add_dispatcher_head(this);
3286 hb_front_client_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3287 hb_back_client_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3288 hb_front_server_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3289 hb_back_server_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3291 objecter_messenger
->add_dispatcher_head(service
.objecter
);
3294 service
.publish_map(osdmap
);
3295 service
.publish_superblock(superblock
);
3296 service
.max_oldest_map
= superblock
.oldest_map
;
3298 for (auto& shard
: shards
) {
3299 // put PGs in a temporary set because we may modify pg_slots
3300 // unordered_map below.
3302 for (auto& i
: shard
->pg_slots
) {
3303 PGRef pg
= i
.second
->pg
;
3309 for (auto pg
: pgs
) {
3311 set
<pair
<spg_t
,epoch_t
>> new_children
;
3312 set
<pair
<spg_t
,epoch_t
>> merge_pgs
;
3313 service
.identify_splits_and_merges(pg
->get_osdmap(), osdmap
, pg
->pg_id
,
3314 &new_children
, &merge_pgs
);
3315 if (!new_children
.empty()) {
3316 for (auto shard
: shards
) {
3317 shard
->prime_splits(osdmap
, &new_children
);
3319 assert(new_children
.empty());
3321 if (!merge_pgs
.empty()) {
3322 for (auto shard
: shards
) {
3323 shard
->prime_merges(osdmap
, &merge_pgs
);
3325 assert(merge_pgs
.empty());
3334 // start the heartbeat
3335 heartbeat_thread
.create("osd_srv_heartbt");
3338 tick_timer
.add_event_after(get_tick_interval(),
3341 std::lock_guard
l(tick_timer_lock
);
3342 tick_timer_without_osd_lock
.add_event_after(get_tick_interval(),
3343 new C_Tick_WithoutOSDLock(this));
3348 r
= monc
->authenticate();
3350 derr
<< __func__
<< " authentication failed: " << cpp_strerror(r
)
3355 while (monc
->wait_auth_rotating(rotating_auth_timeout
) < 0) {
3356 derr
<< "unable to obtain rotating service keys; retrying" << dendl
;
3357 ++rotating_auth_attempts
;
3358 if (rotating_auth_attempts
> g_conf()->max_rotating_auth_attempts
) {
3359 derr
<< __func__
<< " wait_auth_rotating timed out" << dendl
;
3364 r
= update_crush_device_class();
3366 derr
<< __func__
<< " unable to update_crush_device_class: "
3367 << cpp_strerror(r
) << dendl
;
3371 r
= update_crush_location();
3373 derr
<< __func__
<< " unable to update_crush_location: "
3374 << cpp_strerror(r
) << dendl
;
3382 // start objecter *after* we have authenticated, so that we don't ignore
3383 // the OSDMaps it requests.
3384 service
.final_init();
3388 dout(10) << "ensuring pgs have consumed prior maps" << dendl
;
3391 dout(0) << "done with init, starting boot process" << dendl
;
3393 // subscribe to any pg creations
3394 monc
->sub_want("osd_pg_creates", last_pg_create_epoch
, 0);
3396 // MgrClient needs this (it doesn't have MonClient reference itself)
3397 monc
->sub_want("mgrmap", 0, 0);
3399 // we don't need to ask for an osdmap here; objecter will
3400 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
3409 enable_disable_fuse(true);
3416 void OSD::final_init()
3418 AdminSocket
*admin_socket
= cct
->get_admin_socket();
3419 asok_hook
= new OSDSocketHook(this);
3420 int r
= admin_socket
->register_command("status", "status", asok_hook
,
3421 "high-level status of OSD");
3422 ceph_assert(r
== 0);
3423 r
= admin_socket
->register_command("flush_journal", "flush_journal",
3425 "flush the journal to permanent store");
3426 ceph_assert(r
== 0);
3427 r
= admin_socket
->register_command("dump_ops_in_flight",
3428 "dump_ops_in_flight " \
3429 "name=filterstr,type=CephString,n=N,req=false",
3431 "show the ops currently in flight");
3432 ceph_assert(r
== 0);
3433 r
= admin_socket
->register_command("ops",
3435 "name=filterstr,type=CephString,n=N,req=false",
3437 "show the ops currently in flight");
3438 ceph_assert(r
== 0);
3439 r
= admin_socket
->register_command("dump_blocked_ops",
3440 "dump_blocked_ops " \
3441 "name=filterstr,type=CephString,n=N,req=false",
3443 "show the blocked ops currently in flight");
3444 ceph_assert(r
== 0);
3445 r
= admin_socket
->register_command("dump_historic_ops",
3446 "dump_historic_ops " \
3447 "name=filterstr,type=CephString,n=N,req=false",
3450 ceph_assert(r
== 0);
3451 r
= admin_socket
->register_command("dump_historic_slow_ops",
3452 "dump_historic_slow_ops " \
3453 "name=filterstr,type=CephString,n=N,req=false",
3455 "show slowest recent ops");
3456 ceph_assert(r
== 0);
3457 r
= admin_socket
->register_command("dump_historic_ops_by_duration",
3458 "dump_historic_ops_by_duration " \
3459 "name=filterstr,type=CephString,n=N,req=false",
3461 "show slowest recent ops, sorted by duration");
3462 ceph_assert(r
== 0);
3463 r
= admin_socket
->register_command("dump_op_pq_state", "dump_op_pq_state",
3465 "dump op priority queue state");
3466 ceph_assert(r
== 0);
3467 r
= admin_socket
->register_command("dump_blacklist", "dump_blacklist",
3469 "dump blacklisted clients and times");
3470 ceph_assert(r
== 0);
3471 r
= admin_socket
->register_command("dump_watchers", "dump_watchers",
3473 "show clients which have active watches,"
3474 " and on which objects");
3475 ceph_assert(r
== 0);
3476 r
= admin_socket
->register_command("dump_recovery_reservations", "dump_recovery_reservations",
3478 "show recovery reservations");
3479 ceph_assert(r
== 0);
3480 r
= admin_socket
->register_command("dump_scrub_reservations", "dump_scrub_reservations",
3482 "show scrub reservations");
3483 ceph_assert(r
== 0);
3484 r
= admin_socket
->register_command("get_latest_osdmap", "get_latest_osdmap",
3486 "force osd to update the latest map from "
3488 ceph_assert(r
== 0);
3490 r
= admin_socket
->register_command( "heap",
3492 "name=heapcmd,type=CephString " \
3493 "name=value,type=CephString,req=false",
3495 "show heap usage info (available only if "
3496 "compiled with tcmalloc)");
3497 ceph_assert(r
== 0);
3499 r
= admin_socket
->register_command("set_heap_property",
3500 "set_heap_property " \
3501 "name=property,type=CephString " \
3502 "name=value,type=CephInt",
3504 "update malloc extension heap property");
3505 ceph_assert(r
== 0);
3507 r
= admin_socket
->register_command("get_heap_property",
3508 "get_heap_property " \
3509 "name=property,type=CephString",
3511 "get malloc extension heap property");
3512 ceph_assert(r
== 0);
3514 r
= admin_socket
->register_command("dump_objectstore_kv_stats",
3515 "dump_objectstore_kv_stats",
3517 "print statistics of kvdb which used by bluestore");
3518 ceph_assert(r
== 0);
3520 r
= admin_socket
->register_command("dump_scrubs",
3523 "print scheduled scrubs");
3524 ceph_assert(r
== 0);
3526 r
= admin_socket
->register_command("calc_objectstore_db_histogram",
3527 "calc_objectstore_db_histogram",
3529 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
3530 ceph_assert(r
== 0);
3532 r
= admin_socket
->register_command("flush_store_cache",
3533 "flush_store_cache",
3535 "Flush bluestore internal cache");
3536 ceph_assert(r
== 0);
3537 r
= admin_socket
->register_command("dump_pgstate_history", "dump_pgstate_history",
3539 "show recent state history");
3540 ceph_assert(r
== 0);
3542 r
= admin_socket
->register_command("compact", "compact",
3544 "Commpact object store's omap."
3545 " WARNING: Compaction probably slows your requests");
3546 ceph_assert(r
== 0);
3548 r
= admin_socket
->register_command("get_mapped_pools", "get_mapped_pools",
3550 "dump pools whose PG(s) are mapped to this OSD.");
3552 ceph_assert(r
== 0);
3554 r
= admin_socket
->register_command("smart", "smart name=devid,type=CephString,req=False",
3556 "probe OSD devices for SMART data.");
3558 ceph_assert(r
== 0);
3560 r
= admin_socket
->register_command("list_devices", "list_devices",
3562 "list OSD devices.");
3563 r
= admin_socket
->register_command("send_beacon", "send_beacon",
3565 "send OSD beacon to mon immediately");
3567 r
= admin_socket
->register_command("dump_osd_network", "dump_osd_network name=value,type=CephInt,req=false", asok_hook
,
3568 "Dump osd heartbeat network ping times");
3569 ceph_assert(r
== 0);
3571 test_ops_hook
= new TestOpsSocketHook(&(this->service
), this->store
);
3572 // Note: pools are CephString instead of CephPoolname because
3573 // these commands traditionally support both pool names and numbers
3574 r
= admin_socket
->register_command(
3577 "name=pool,type=CephString " \
3578 "name=objname,type=CephObjectname " \
3579 "name=key,type=CephString "\
3580 "name=val,type=CephString",
3583 ceph_assert(r
== 0);
3584 r
= admin_socket
->register_command(
3587 "name=pool,type=CephString " \
3588 "name=objname,type=CephObjectname " \
3589 "name=key,type=CephString",
3592 ceph_assert(r
== 0);
3593 r
= admin_socket
->register_command(
3596 "name=pool,type=CephString " \
3597 "name=objname,type=CephObjectname " \
3598 "name=header,type=CephString",
3601 ceph_assert(r
== 0);
3603 r
= admin_socket
->register_command(
3606 "name=pool,type=CephString " \
3607 "name=objname,type=CephObjectname",
3609 "output entire object map");
3610 ceph_assert(r
== 0);
3612 r
= admin_socket
->register_command(
3615 "name=pool,type=CephString " \
3616 "name=objname,type=CephObjectname " \
3617 "name=len,type=CephInt",
3619 "truncate object to length");
3620 ceph_assert(r
== 0);
3622 r
= admin_socket
->register_command(
3625 "name=pool,type=CephString " \
3626 "name=objname,type=CephObjectname " \
3627 "name=shardid,type=CephInt,req=false,range=0|255",
3629 "inject data error to an object");
3630 ceph_assert(r
== 0);
3632 r
= admin_socket
->register_command(
3635 "name=pool,type=CephString " \
3636 "name=objname,type=CephObjectname " \
3637 "name=shardid,type=CephInt,req=false,range=0|255",
3639 "inject metadata error to an object");
3640 ceph_assert(r
== 0);
3641 r
= admin_socket
->register_command(
3642 "set_recovery_delay",
3643 "set_recovery_delay " \
3644 "name=utime,type=CephInt,req=false",
3646 "Delay osd recovery by specified seconds");
3647 ceph_assert(r
== 0);
3648 r
= admin_socket
->register_command(
3651 "name=pgid,type=CephString " \
3652 "name=time,type=CephInt,req=false",
3654 "Trigger a scheduled scrub ");
3655 ceph_assert(r
== 0);
3656 r
= admin_socket
->register_command(
3657 "trigger_deep_scrub",
3658 "trigger_deep_scrub " \
3659 "name=pgid,type=CephString " \
3660 "name=time,type=CephInt,req=false",
3662 "Trigger a scheduled deep scrub ");
3663 ceph_assert(r
== 0);
3664 r
= admin_socket
->register_command(
3667 "name=type,type=CephString,req=false " \
3668 "name=count,type=CephInt,req=false ",
3670 "Inject a full disk (optional count times)");
3671 ceph_assert(r
== 0);
3674 void OSD::create_logger()
3676 dout(10) << "create_logger" << dendl
;
3678 PerfCountersBuilder
osd_plb(cct
, "osd", l_osd_first
, l_osd_last
);
3680 // Latency axis configuration for op histograms, values are in nanoseconds
3681 PerfHistogramCommon::axis_config_d op_hist_x_axis_config
{
3683 PerfHistogramCommon::SCALE_LOG2
, ///< Latency in logarithmic scale
3685 100000, ///< Quantization unit is 100usec
3686 32, ///< Enough to cover much longer than slow requests
3689 // Op size axis configuration for op histograms, values are in bytes
3690 PerfHistogramCommon::axis_config_d op_hist_y_axis_config
{
3691 "Request size (bytes)",
3692 PerfHistogramCommon::SCALE_LOG2
, ///< Request size in logarithmic scale
3694 512, ///< Quantization unit is 512 bytes
3695 32, ///< Enough to cover requests larger than GB
3699 // All the basic OSD operation stats are to be considered useful
3700 osd_plb
.set_prio_default(PerfCountersBuilder::PRIO_USEFUL
);
3703 l_osd_op_wip
, "op_wip",
3704 "Replication operations currently being processed (primary)");
3705 osd_plb
.add_u64_counter(
3707 "Client operations",
3708 "ops", PerfCountersBuilder::PRIO_CRITICAL
);
3709 osd_plb
.add_u64_counter(
3710 l_osd_op_inb
, "op_in_bytes",
3711 "Client operations total write size",
3712 "wr", PerfCountersBuilder::PRIO_INTERESTING
, unit_t(UNIT_BYTES
));
3713 osd_plb
.add_u64_counter(
3714 l_osd_op_outb
, "op_out_bytes",
3715 "Client operations total read size",
3716 "rd", PerfCountersBuilder::PRIO_INTERESTING
, unit_t(UNIT_BYTES
));
3717 osd_plb
.add_time_avg(
3718 l_osd_op_lat
, "op_latency",
3719 "Latency of client operations (including queue time)",
3721 osd_plb
.add_time_avg(
3722 l_osd_op_process_lat
, "op_process_latency",
3723 "Latency of client operations (excluding queue time)");
3724 osd_plb
.add_time_avg(
3725 l_osd_op_prepare_lat
, "op_prepare_latency",
3726 "Latency of client operations (excluding queue time and wait for finished)");
3728 osd_plb
.add_u64_counter(
3729 l_osd_op_r
, "op_r", "Client read operations");
3730 osd_plb
.add_u64_counter(
3731 l_osd_op_r_outb
, "op_r_out_bytes", "Client data read", NULL
, PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
3732 osd_plb
.add_time_avg(
3733 l_osd_op_r_lat
, "op_r_latency",
3734 "Latency of read operation (including queue time)");
3735 osd_plb
.add_u64_counter_histogram(
3736 l_osd_op_r_lat_outb_hist
, "op_r_latency_out_bytes_histogram",
3737 op_hist_x_axis_config
, op_hist_y_axis_config
,
3738 "Histogram of operation latency (including queue time) + data read");
3739 osd_plb
.add_time_avg(
3740 l_osd_op_r_process_lat
, "op_r_process_latency",
3741 "Latency of read operation (excluding queue time)");
3742 osd_plb
.add_time_avg(
3743 l_osd_op_r_prepare_lat
, "op_r_prepare_latency",
3744 "Latency of read operations (excluding queue time and wait for finished)");
3745 osd_plb
.add_u64_counter(
3746 l_osd_op_w
, "op_w", "Client write operations");
3747 osd_plb
.add_u64_counter(
3748 l_osd_op_w_inb
, "op_w_in_bytes", "Client data written");
3749 osd_plb
.add_time_avg(
3750 l_osd_op_w_lat
, "op_w_latency",
3751 "Latency of write operation (including queue time)");
3752 osd_plb
.add_u64_counter_histogram(
3753 l_osd_op_w_lat_inb_hist
, "op_w_latency_in_bytes_histogram",
3754 op_hist_x_axis_config
, op_hist_y_axis_config
,
3755 "Histogram of operation latency (including queue time) + data written");
3756 osd_plb
.add_time_avg(
3757 l_osd_op_w_process_lat
, "op_w_process_latency",
3758 "Latency of write operation (excluding queue time)");
3759 osd_plb
.add_time_avg(
3760 l_osd_op_w_prepare_lat
, "op_w_prepare_latency",
3761 "Latency of write operations (excluding queue time and wait for finished)");
3762 osd_plb
.add_u64_counter(
3763 l_osd_op_rw
, "op_rw",
3764 "Client read-modify-write operations");
3765 osd_plb
.add_u64_counter(
3766 l_osd_op_rw_inb
, "op_rw_in_bytes",
3767 "Client read-modify-write operations write in", NULL
, PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
3768 osd_plb
.add_u64_counter(
3769 l_osd_op_rw_outb
,"op_rw_out_bytes",
3770 "Client read-modify-write operations read out ", NULL
, PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
3771 osd_plb
.add_time_avg(
3772 l_osd_op_rw_lat
, "op_rw_latency",
3773 "Latency of read-modify-write operation (including queue time)");
3774 osd_plb
.add_u64_counter_histogram(
3775 l_osd_op_rw_lat_inb_hist
, "op_rw_latency_in_bytes_histogram",
3776 op_hist_x_axis_config
, op_hist_y_axis_config
,
3777 "Histogram of rw operation latency (including queue time) + data written");
3778 osd_plb
.add_u64_counter_histogram(
3779 l_osd_op_rw_lat_outb_hist
, "op_rw_latency_out_bytes_histogram",
3780 op_hist_x_axis_config
, op_hist_y_axis_config
,
3781 "Histogram of rw operation latency (including queue time) + data read");
3782 osd_plb
.add_time_avg(
3783 l_osd_op_rw_process_lat
, "op_rw_process_latency",
3784 "Latency of read-modify-write operation (excluding queue time)");
3785 osd_plb
.add_time_avg(
3786 l_osd_op_rw_prepare_lat
, "op_rw_prepare_latency",
3787 "Latency of read-modify-write operations (excluding queue time and wait for finished)");
3789 // Now we move on to some more obscure stats, revert to assuming things
3790 // are low priority unless otherwise specified.
3791 osd_plb
.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY
);
3793 osd_plb
.add_time_avg(l_osd_op_before_queue_op_lat
, "op_before_queue_op_lat",
3794 "Latency of IO before calling queue(before really queue into ShardedOpWq)"); // client io before queue op_wq latency
3795 osd_plb
.add_time_avg(l_osd_op_before_dequeue_op_lat
, "op_before_dequeue_op_lat",
3796 "Latency of IO before calling dequeue_op(already dequeued and get PG lock)"); // client io before dequeue_op latency
3798 osd_plb
.add_u64_counter(
3799 l_osd_sop
, "subop", "Suboperations");
3800 osd_plb
.add_u64_counter(
3801 l_osd_sop_inb
, "subop_in_bytes", "Suboperations total size", NULL
, 0, unit_t(UNIT_BYTES
));
3802 osd_plb
.add_time_avg(l_osd_sop_lat
, "subop_latency", "Suboperations latency");
3804 osd_plb
.add_u64_counter(l_osd_sop_w
, "subop_w", "Replicated writes");
3805 osd_plb
.add_u64_counter(
3806 l_osd_sop_w_inb
, "subop_w_in_bytes", "Replicated written data size", NULL
, 0, unit_t(UNIT_BYTES
));
3807 osd_plb
.add_time_avg(
3808 l_osd_sop_w_lat
, "subop_w_latency", "Replicated writes latency");
3809 osd_plb
.add_u64_counter(
3810 l_osd_sop_pull
, "subop_pull", "Suboperations pull requests");
3811 osd_plb
.add_time_avg(
3812 l_osd_sop_pull_lat
, "subop_pull_latency", "Suboperations pull latency");
3813 osd_plb
.add_u64_counter(
3814 l_osd_sop_push
, "subop_push", "Suboperations push messages");
3815 osd_plb
.add_u64_counter(
3816 l_osd_sop_push_inb
, "subop_push_in_bytes", "Suboperations pushed size", NULL
, 0, unit_t(UNIT_BYTES
));
3817 osd_plb
.add_time_avg(
3818 l_osd_sop_push_lat
, "subop_push_latency", "Suboperations push latency");
3820 osd_plb
.add_u64_counter(l_osd_pull
, "pull", "Pull requests sent");
3821 osd_plb
.add_u64_counter(l_osd_push
, "push", "Push messages sent");
3822 osd_plb
.add_u64_counter(l_osd_push_outb
, "push_out_bytes", "Pushed size", NULL
, 0, unit_t(UNIT_BYTES
));
3824 osd_plb
.add_u64_counter(
3825 l_osd_rop
, "recovery_ops",
3826 "Started recovery operations",
3827 "rop", PerfCountersBuilder::PRIO_INTERESTING
);
3829 osd_plb
.add_u64_counter(
3830 l_osd_rbytes
, "recovery_bytes",
3832 "rbt", PerfCountersBuilder::PRIO_INTERESTING
);
3834 osd_plb
.add_u64(l_osd_loadavg
, "loadavg", "CPU load");
3836 l_osd_cached_crc
, "cached_crc", "Total number getting crc from crc_cache");
3838 l_osd_cached_crc_adjusted
, "cached_crc_adjusted",
3839 "Total number getting crc from crc_cache with adjusting");
3840 osd_plb
.add_u64(l_osd_missed_crc
, "missed_crc",
3841 "Total number of crc cache misses");
3843 osd_plb
.add_u64(l_osd_pg
, "numpg", "Placement groups",
3844 "pgs", PerfCountersBuilder::PRIO_USEFUL
);
3846 l_osd_pg_primary
, "numpg_primary",
3847 "Placement groups for which this osd is primary");
3849 l_osd_pg_replica
, "numpg_replica",
3850 "Placement groups for which this osd is replica");
3852 l_osd_pg_stray
, "numpg_stray",
3853 "Placement groups ready to be deleted from this osd");
3855 l_osd_pg_removing
, "numpg_removing",
3856 "Placement groups queued for local deletion", "pgsr",
3857 PerfCountersBuilder::PRIO_USEFUL
);
3859 l_osd_hb_to
, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");
3860 osd_plb
.add_u64_counter(l_osd_map
, "map_messages", "OSD map messages");
3861 osd_plb
.add_u64_counter(l_osd_mape
, "map_message_epochs", "OSD map epochs");
3862 osd_plb
.add_u64_counter(
3863 l_osd_mape_dup
, "map_message_epoch_dups", "OSD map duplicates");
3864 osd_plb
.add_u64_counter(
3865 l_osd_waiting_for_map
, "messages_delayed_for_map",
3866 "Operations waiting for OSD map");
3868 osd_plb
.add_u64_counter(
3869 l_osd_map_cache_hit
, "osd_map_cache_hit", "osdmap cache hit");
3870 osd_plb
.add_u64_counter(
3871 l_osd_map_cache_miss
, "osd_map_cache_miss", "osdmap cache miss");
3872 osd_plb
.add_u64_counter(
3873 l_osd_map_cache_miss_low
, "osd_map_cache_miss_low",
3874 "osdmap cache miss below cache lower bound");
3875 osd_plb
.add_u64_avg(
3876 l_osd_map_cache_miss_low_avg
, "osd_map_cache_miss_low_avg",
3877 "osdmap cache miss, avg distance below cache lower bound");
3878 osd_plb
.add_u64_counter(
3879 l_osd_map_bl_cache_hit
, "osd_map_bl_cache_hit",
3880 "OSDMap buffer cache hits");
3881 osd_plb
.add_u64_counter(
3882 l_osd_map_bl_cache_miss
, "osd_map_bl_cache_miss",
3883 "OSDMap buffer cache misses");
3886 l_osd_stat_bytes
, "stat_bytes", "OSD size", "size",
3887 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
3889 l_osd_stat_bytes_used
, "stat_bytes_used", "Used space", "used",
3890 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
3891 osd_plb
.add_u64(l_osd_stat_bytes_avail
, "stat_bytes_avail", "Available space", NULL
, 0, unit_t(UNIT_BYTES
));
3893 osd_plb
.add_u64_counter(
3894 l_osd_copyfrom
, "copyfrom", "Rados \"copy-from\" operations");
3896 osd_plb
.add_u64_counter(l_osd_tier_promote
, "tier_promote", "Tier promotions");
3897 osd_plb
.add_u64_counter(l_osd_tier_flush
, "tier_flush", "Tier flushes");
3898 osd_plb
.add_u64_counter(
3899 l_osd_tier_flush_fail
, "tier_flush_fail", "Failed tier flushes");
3900 osd_plb
.add_u64_counter(
3901 l_osd_tier_try_flush
, "tier_try_flush", "Tier flush attempts");
3902 osd_plb
.add_u64_counter(
3903 l_osd_tier_try_flush_fail
, "tier_try_flush_fail",
3904 "Failed tier flush attempts");
3905 osd_plb
.add_u64_counter(
3906 l_osd_tier_evict
, "tier_evict", "Tier evictions");
3907 osd_plb
.add_u64_counter(
3908 l_osd_tier_whiteout
, "tier_whiteout", "Tier whiteouts");
3909 osd_plb
.add_u64_counter(
3910 l_osd_tier_dirty
, "tier_dirty", "Dirty tier flag set");
3911 osd_plb
.add_u64_counter(
3912 l_osd_tier_clean
, "tier_clean", "Dirty tier flag cleaned");
3913 osd_plb
.add_u64_counter(
3914 l_osd_tier_delay
, "tier_delay", "Tier delays (agent waiting)");
3915 osd_plb
.add_u64_counter(
3916 l_osd_tier_proxy_read
, "tier_proxy_read", "Tier proxy reads");
3917 osd_plb
.add_u64_counter(
3918 l_osd_tier_proxy_write
, "tier_proxy_write", "Tier proxy writes");
3920 osd_plb
.add_u64_counter(
3921 l_osd_agent_wake
, "agent_wake", "Tiering agent wake up");
3922 osd_plb
.add_u64_counter(
3923 l_osd_agent_skip
, "agent_skip", "Objects skipped by agent");
3924 osd_plb
.add_u64_counter(
3925 l_osd_agent_flush
, "agent_flush", "Tiering agent flushes");
3926 osd_plb
.add_u64_counter(
3927 l_osd_agent_evict
, "agent_evict", "Tiering agent evictions");
3929 osd_plb
.add_u64_counter(
3930 l_osd_object_ctx_cache_hit
, "object_ctx_cache_hit", "Object context cache hits");
3931 osd_plb
.add_u64_counter(
3932 l_osd_object_ctx_cache_total
, "object_ctx_cache_total", "Object context cache lookups");
3934 osd_plb
.add_u64_counter(l_osd_op_cache_hit
, "op_cache_hit");
3935 osd_plb
.add_time_avg(
3936 l_osd_tier_flush_lat
, "osd_tier_flush_lat", "Object flush latency");
3937 osd_plb
.add_time_avg(
3938 l_osd_tier_promote_lat
, "osd_tier_promote_lat", "Object promote latency");
3939 osd_plb
.add_time_avg(
3940 l_osd_tier_r_lat
, "osd_tier_r_lat", "Object proxy read latency");
3942 osd_plb
.add_u64_counter(
3943 l_osd_pg_info
, "osd_pg_info", "PG updated its info (using any method)");
3944 osd_plb
.add_u64_counter(
3945 l_osd_pg_fastinfo
, "osd_pg_fastinfo",
3946 "PG updated its info using fastinfo attr");
3947 osd_plb
.add_u64_counter(
3948 l_osd_pg_biginfo
, "osd_pg_biginfo", "PG updated its biginfo attr");
3950 logger
= osd_plb
.create_perf_counters();
3951 cct
->get_perfcounters_collection()->add(logger
);
3954 void OSD::create_recoverystate_perf()
3956 dout(10) << "create_recoverystate_perf" << dendl
;
3958 PerfCountersBuilder
rs_perf(cct
, "recoverystate_perf", rs_first
, rs_last
);
3960 rs_perf
.add_time_avg(rs_initial_latency
, "initial_latency", "Initial recovery state latency");
3961 rs_perf
.add_time_avg(rs_started_latency
, "started_latency", "Started recovery state latency");
3962 rs_perf
.add_time_avg(rs_reset_latency
, "reset_latency", "Reset recovery state latency");
3963 rs_perf
.add_time_avg(rs_start_latency
, "start_latency", "Start recovery state latency");
3964 rs_perf
.add_time_avg(rs_primary_latency
, "primary_latency", "Primary recovery state latency");
3965 rs_perf
.add_time_avg(rs_peering_latency
, "peering_latency", "Peering recovery state latency");
3966 rs_perf
.add_time_avg(rs_backfilling_latency
, "backfilling_latency", "Backfilling recovery state latency");
3967 rs_perf
.add_time_avg(rs_waitremotebackfillreserved_latency
, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency");
3968 rs_perf
.add_time_avg(rs_waitlocalbackfillreserved_latency
, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency");
3969 rs_perf
.add_time_avg(rs_notbackfilling_latency
, "notbackfilling_latency", "Notbackfilling recovery state latency");
3970 rs_perf
.add_time_avg(rs_repnotrecovering_latency
, "repnotrecovering_latency", "Repnotrecovering recovery state latency");
3971 rs_perf
.add_time_avg(rs_repwaitrecoveryreserved_latency
, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency");
3972 rs_perf
.add_time_avg(rs_repwaitbackfillreserved_latency
, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency");
3973 rs_perf
.add_time_avg(rs_reprecovering_latency
, "reprecovering_latency", "RepRecovering recovery state latency");
3974 rs_perf
.add_time_avg(rs_activating_latency
, "activating_latency", "Activating recovery state latency");
3975 rs_perf
.add_time_avg(rs_waitlocalrecoveryreserved_latency
, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency");
3976 rs_perf
.add_time_avg(rs_waitremoterecoveryreserved_latency
, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency");
3977 rs_perf
.add_time_avg(rs_recovering_latency
, "recovering_latency", "Recovering recovery state latency");
3978 rs_perf
.add_time_avg(rs_recovered_latency
, "recovered_latency", "Recovered recovery state latency");
3979 rs_perf
.add_time_avg(rs_clean_latency
, "clean_latency", "Clean recovery state latency");
3980 rs_perf
.add_time_avg(rs_active_latency
, "active_latency", "Active recovery state latency");
3981 rs_perf
.add_time_avg(rs_replicaactive_latency
, "replicaactive_latency", "Replicaactive recovery state latency");
3982 rs_perf
.add_time_avg(rs_stray_latency
, "stray_latency", "Stray recovery state latency");
3983 rs_perf
.add_time_avg(rs_getinfo_latency
, "getinfo_latency", "Getinfo recovery state latency");
3984 rs_perf
.add_time_avg(rs_getlog_latency
, "getlog_latency", "Getlog recovery state latency");
3985 rs_perf
.add_time_avg(rs_waitactingchange_latency
, "waitactingchange_latency", "Waitactingchange recovery state latency");
3986 rs_perf
.add_time_avg(rs_incomplete_latency
, "incomplete_latency", "Incomplete recovery state latency");
3987 rs_perf
.add_time_avg(rs_down_latency
, "down_latency", "Down recovery state latency");
3988 rs_perf
.add_time_avg(rs_getmissing_latency
, "getmissing_latency", "Getmissing recovery state latency");
3989 rs_perf
.add_time_avg(rs_waitupthru_latency
, "waitupthru_latency", "Waitupthru recovery state latency");
3990 rs_perf
.add_time_avg(rs_notrecovering_latency
, "notrecovering_latency", "Notrecovering recovery state latency");
3992 recoverystate_perf
= rs_perf
.create_perf_counters();
3993 cct
->get_perfcounters_collection()->add(recoverystate_perf
);
3998 if (cct
->_conf
->osd_fast_shutdown
) {
3999 derr
<< "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl
;
4004 if (!service
.prepare_to_stop())
4005 return 0; // already shutting down
4007 if (is_stopping()) {
4011 dout(0) << "shutdown" << dendl
;
4013 set_state(STATE_STOPPING
);
4016 if (cct
->_conf
.get_val
<bool>("osd_debug_shutdown")) {
4017 cct
->_conf
.set_val("debug_osd", "100");
4018 cct
->_conf
.set_val("debug_journal", "100");
4019 cct
->_conf
.set_val("debug_filestore", "100");
4020 cct
->_conf
.set_val("debug_bluestore", "100");
4021 cct
->_conf
.set_val("debug_ms", "100");
4022 cct
->_conf
.apply_changes(nullptr);
4025 // stop MgrClient earlier as it's more like an internal consumer of OSD
4028 service
.start_shutdown();
4030 // stop sending work to pgs. this just prevents any new work in _process
4031 // from racing with on_shutdown and potentially entering the pg after.
4032 op_shardedwq
.drain();
4038 for (auto pg
: pgs
) {
4043 // drain op queue again (in case PGs requeued something)
4044 op_shardedwq
.drain();
4046 finished
.clear(); // zap waiters (bleh, this is messy)
4047 waiting_for_osdmap
.clear();
4050 // unregister commands
4051 cct
->get_admin_socket()->unregister_commands(asok_hook
);
4055 cct
->get_admin_socket()->unregister_commands(test_ops_hook
);
4056 delete test_ops_hook
;
4057 test_ops_hook
= NULL
;
4061 heartbeat_lock
.Lock();
4062 heartbeat_stop
= true;
4063 heartbeat_cond
.Signal();
4064 heartbeat_lock
.Unlock();
4065 heartbeat_thread
.join();
4069 dout(10) << "op sharded tp stopped" << dendl
;
4073 dout(10) << "command tp stopped" << dendl
;
4075 dout(10) << "stopping agent" << dendl
;
4076 service
.agent_stop();
4078 boot_finisher
.wait_for_empty();
4082 boot_finisher
.stop();
4083 reset_heartbeat_peers(true);
4085 tick_timer
.shutdown();
4088 std::lock_guard
l(tick_timer_lock
);
4089 tick_timer_without_osd_lock
.shutdown();
4092 // note unmount epoch
4093 dout(10) << "noting clean unmount in epoch " << osdmap
->get_epoch() << dendl
;
4094 superblock
.mounted
= service
.get_boot_epoch();
4095 superblock
.clean_thru
= osdmap
->get_epoch();
4096 ObjectStore::Transaction t
;
4097 write_superblock(t
);
4098 int r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
4100 derr
<< "OSD::shutdown: error writing superblock: "
4101 << cpp_strerror(r
) << dendl
;
4105 service
.shutdown_reserver();
4108 #ifdef PG_DEBUG_REFS
4109 service
.dump_live_pgids();
4113 _get_pgs(&pgs
, true);
4117 for (auto& pg
: pgs
) {
4118 if (pg
->is_deleted()) {
4121 dout(20) << " kicking pg " << pg
<< dendl
;
4123 if (pg
->get_num_ref() != 1) {
4124 derr
<< "pgid " << pg
->get_pgid() << " has ref count of "
4125 << pg
->get_num_ref() << dendl
;
4126 #ifdef PG_DEBUG_REFS
4127 pg
->dump_live_ids();
4129 if (cct
->_conf
->osd_shutdown_pgref_assert
) {
4137 #ifdef PG_DEBUG_REFS
4138 service
.dump_live_pgids();
4142 cct
->_conf
.remove_observer(this);
4145 service
.meta_ch
.reset();
4147 dout(10) << "syncing store" << dendl
;
4148 enable_disable_fuse(true);
4150 if (cct
->_conf
->osd_journal_flush_on_shutdown
) {
4151 dout(10) << "flushing journal" << dendl
;
4152 store
->flush_journal();
4158 map_lock
.get_write();
4159 osdmap
= OSDMapRef();
4160 map_lock
.put_write();
4162 for (auto s
: shards
) {
4163 std::lock_guard
l(s
->osdmap_lock
);
4164 s
->shard_osdmap
= OSDMapRef();
4168 std::lock_guard
lock(osd_lock
);
4172 dout(10) << "Store synced" << dendl
;
4174 op_tracker
.on_shutdown();
4176 class_handler
->shutdown();
4177 client_messenger
->shutdown();
4178 cluster_messenger
->shutdown();
4179 hb_front_client_messenger
->shutdown();
4180 hb_back_client_messenger
->shutdown();
4181 objecter_messenger
->shutdown();
4182 hb_front_server_messenger
->shutdown();
4183 hb_back_server_messenger
->shutdown();
4188 int OSD::mon_cmd_maybe_osd_create(string
&cmd
)
4190 bool created
= false;
4192 dout(10) << __func__
<< " cmd: " << cmd
<< dendl
;
4193 vector
<string
> vcmd
{cmd
};
4197 monc
->start_mon_command(vcmd
, inbl
, NULL
, &outs
, &w
);
4200 if (r
== -ENOENT
&& !created
) {
4201 string newcmd
= "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami
)
4202 + ", \"uuid\": \"" + stringify(superblock
.osd_fsid
) + "\"}";
4203 vector
<string
> vnewcmd
{newcmd
};
4207 monc
->start_mon_command(vnewcmd
, inbl
, NULL
, &outs
, &w
);
4210 derr
<< __func__
<< " fail: osd does not exist and created failed: "
4211 << cpp_strerror(r
) << dendl
;
4217 derr
<< __func__
<< " fail: '" << outs
<< "': " << cpp_strerror(r
) << dendl
;
4226 int OSD::update_crush_location()
4228 if (!cct
->_conf
->osd_crush_update_on_start
) {
4229 dout(10) << __func__
<< " osd_crush_update_on_start = false" << dendl
;
4234 if (cct
->_conf
->osd_crush_initial_weight
>= 0) {
4235 snprintf(weight
, sizeof(weight
), "%.4lf", cct
->_conf
->osd_crush_initial_weight
);
4237 struct store_statfs_t st
;
4238 osd_alert_list_t alerts
;
4239 int r
= store
->statfs(&st
, &alerts
);
4241 derr
<< "statfs: " << cpp_strerror(r
) << dendl
;
4244 snprintf(weight
, sizeof(weight
), "%.4lf",
4247 double(1ull << 40 /* TB */)));
4250 std::multimap
<string
,string
> loc
= cct
->crush_location
.get_location();
4251 dout(10) << __func__
<< " crush location is " << loc
<< dendl
;
4254 string("{\"prefix\": \"osd crush create-or-move\", ") +
4255 string("\"id\": ") + stringify(whoami
) + string(", ") +
4256 string("\"weight\":") + weight
+ string(", ") +
4257 string("\"args\": [");
4258 for (multimap
<string
,string
>::iterator p
= loc
.begin(); p
!= loc
.end(); ++p
) {
4259 if (p
!= loc
.begin())
4261 cmd
+= "\"" + p
->first
+ "=" + p
->second
+ "\"";
4265 return mon_cmd_maybe_osd_create(cmd
);
4268 int OSD::update_crush_device_class()
4270 if (!cct
->_conf
->osd_class_update_on_start
) {
4271 dout(10) << __func__
<< " osd_class_update_on_start = false" << dendl
;
4275 string device_class
;
4276 int r
= store
->read_meta("crush_device_class", &device_class
);
4277 if (r
< 0 || device_class
.empty()) {
4278 device_class
= store
->get_default_device_class();
4281 if (device_class
.empty()) {
4282 dout(20) << __func__
<< " no device class stored locally" << dendl
;
4287 string("{\"prefix\": \"osd crush set-device-class\", ") +
4288 string("\"class\": \"") + device_class
+ string("\", ") +
4289 string("\"ids\": [\"") + stringify(whoami
) + string("\"]}");
4291 r
= mon_cmd_maybe_osd_create(cmd
);
4293 // good, already bound to a device-class
4300 void OSD::write_superblock(ObjectStore::Transaction
& t
)
4302 dout(10) << "write_superblock " << superblock
<< dendl
;
4304 //hack: at minimum it's using the baseline feature set
4305 if (!superblock
.compat_features
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE
))
4306 superblock
.compat_features
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE
);
4309 encode(superblock
, bl
);
4310 t
.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT
, 0, bl
.length(), bl
);
4313 int OSD::read_superblock()
4316 int r
= store
->read(service
.meta_ch
, OSD_SUPERBLOCK_GOBJECT
, 0, 0, bl
);
4320 auto p
= bl
.cbegin();
4321 decode(superblock
, p
);
4323 dout(10) << "read_superblock " << superblock
<< dendl
;
4328 void OSD::clear_temp_objects()
4330 dout(10) << __func__
<< dendl
;
4332 store
->list_collections(ls
);
4333 for (vector
<coll_t
>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
4335 if (!p
->is_pg(&pgid
))
4338 // list temp objects
4339 dout(20) << " clearing temps in " << *p
<< " pgid " << pgid
<< dendl
;
4341 vector
<ghobject_t
> temps
;
4344 vector
<ghobject_t
> objects
;
4345 auto ch
= store
->open_collection(*p
);
4347 store
->collection_list(ch
, next
, ghobject_t::get_max(),
4348 store
->get_ideal_list_max(),
4350 if (objects
.empty())
4352 vector
<ghobject_t
>::iterator q
;
4353 for (q
= objects
.begin(); q
!= objects
.end(); ++q
) {
4354 // Hammer set pool for temps to -1, so check for clean-up
4355 if (q
->hobj
.is_temp() || (q
->hobj
.pool
== -1)) {
4356 temps
.push_back(*q
);
4361 // If we saw a non-temp object and hit the break above we can
4362 // break out of the while loop too.
4363 if (q
!= objects
.end())
4366 if (!temps
.empty()) {
4367 ObjectStore::Transaction t
;
4369 for (vector
<ghobject_t
>::iterator q
= temps
.begin(); q
!= temps
.end(); ++q
) {
4370 dout(20) << " removing " << *p
<< " object " << *q
<< dendl
;
4372 if (++removed
> cct
->_conf
->osd_target_transaction_size
) {
4373 store
->queue_transaction(service
.meta_ch
, std::move(t
));
4374 t
= ObjectStore::Transaction();
4379 store
->queue_transaction(service
.meta_ch
, std::move(t
));
4385 void OSD::recursive_remove_collection(CephContext
* cct
,
4386 ObjectStore
*store
, spg_t pgid
,
4392 make_snapmapper_oid());
4394 ObjectStore::CollectionHandle ch
= store
->open_collection(tmp
);
4395 ObjectStore::Transaction t
;
4396 SnapMapper
mapper(cct
, &driver
, 0, 0, 0, pgid
.shard
);
4399 int max
= cct
->_conf
->osd_target_transaction_size
;
4400 vector
<ghobject_t
> objects
;
4401 objects
.reserve(max
);
4404 store
->collection_list(ch
, next
, ghobject_t::get_max(),
4405 max
, &objects
, &next
);
4406 generic_dout(10) << __func__
<< " " << objects
<< dendl
;
4407 if (objects
.empty())
4409 for (auto& p
: objects
) {
4410 OSDriver::OSTransaction
_t(driver
.get_transaction(&t
));
4411 int r
= mapper
.remove_oid(p
.hobj
, &_t
);
4412 if (r
!= 0 && r
!= -ENOENT
)
4416 int r
= store
->queue_transaction(ch
, std::move(t
));
4417 ceph_assert(r
== 0);
4418 t
= ObjectStore::Transaction();
4420 t
.remove_collection(tmp
);
4421 int r
= store
->queue_transaction(ch
, std::move(t
));
4422 ceph_assert(r
== 0);
4425 if (!ch
->flush_commit(&waiter
)) {
4431 // ======================================================
4435 OSDMapRef createmap
,
4438 dout(10) << __func__
<< " " << pgid
<< dendl
;
4440 map
<string
,string
> ec_profile
;
4442 if (createmap
->have_pg_pool(pgid
.pool())) {
4443 pi
= *createmap
->get_pg_pool(pgid
.pool());
4444 name
= createmap
->get_pool_name(pgid
.pool());
4445 if (pi
.is_erasure()) {
4446 ec_profile
= createmap
->get_erasure_code_profile(pi
.erasure_code_profile
);
4449 // pool was deleted; grab final pg_pool_t off disk.
4450 ghobject_t oid
= make_final_pool_info_oid(pgid
.pool());
4452 int r
= store
->read(service
.meta_ch
, oid
, 0, 0, bl
);
4454 derr
<< __func__
<< " missing pool " << pgid
.pool() << " tombstone"
4458 ceph_assert(r
>= 0);
4459 auto p
= bl
.cbegin();
4462 if (p
.end()) { // dev release v13.0.2 did not include ec_profile
4463 derr
<< __func__
<< " missing ec_profile from pool " << pgid
.pool()
4464 << " tombstone" << dendl
;
4467 decode(ec_profile
, p
);
4469 PGPool
pool(cct
, createmap
, pgid
.pool(), pi
, name
);
4471 if (pi
.type
== pg_pool_t::TYPE_REPLICATED
||
4472 pi
.type
== pg_pool_t::TYPE_ERASURE
)
4473 pg
= new PrimaryLogPG(&service
, createmap
, pool
, ec_profile
, pgid
);
4479 void OSD::_get_pgs(vector
<PGRef
> *v
, bool clear_too
)
4482 v
->reserve(get_num_pgs());
4483 for (auto& s
: shards
) {
4484 std::lock_guard
l(s
->shard_lock
);
4485 for (auto& j
: s
->pg_slots
) {
4487 !j
.second
->pg
->is_deleted()) {
4488 v
->push_back(j
.second
->pg
);
4490 s
->_detach_pg(j
.second
.get());
4497 void OSD::_get_pgids(vector
<spg_t
> *v
)
4500 v
->reserve(get_num_pgs());
4501 for (auto& s
: shards
) {
4502 std::lock_guard
l(s
->shard_lock
);
4503 for (auto& j
: s
->pg_slots
) {
4505 !j
.second
->pg
->is_deleted()) {
4506 v
->push_back(j
.first
);
4512 void OSD::register_pg(PGRef pg
)
4514 spg_t pgid
= pg
->get_pgid();
4515 uint32_t shard_index
= pgid
.hash_to_shard(num_shards
);
4516 auto sdata
= shards
[shard_index
];
4517 std::lock_guard
l(sdata
->shard_lock
);
4518 auto r
= sdata
->pg_slots
.emplace(pgid
, make_unique
<OSDShardPGSlot
>());
4519 ceph_assert(r
.second
);
4520 auto *slot
= r
.first
->second
.get();
4521 dout(20) << __func__
<< " " << pgid
<< " " << pg
<< dendl
;
4522 sdata
->_attach_pg(slot
, pg
.get());
4525 bool OSD::try_finish_pg_delete(PG
*pg
, unsigned old_pg_num
)
4527 auto sdata
= pg
->osd_shard
;
4530 std::lock_guard
l(sdata
->shard_lock
);
4531 auto p
= sdata
->pg_slots
.find(pg
->pg_id
);
4532 if (p
== sdata
->pg_slots
.end() ||
4534 dout(20) << __func__
<< " " << pg
->pg_id
<< " not found" << dendl
;
4537 if (p
->second
->waiting_for_merge_epoch
) {
4538 dout(20) << __func__
<< " " << pg
->pg_id
<< " waiting for merge" << dendl
;
4541 dout(20) << __func__
<< " " << pg
->pg_id
<< " " << pg
<< dendl
;
4542 sdata
->_detach_pg(p
->second
.get());
4545 for (auto shard
: shards
) {
4546 shard
->unprime_split_children(pg
->pg_id
, old_pg_num
);
4549 // update pg count now since we might not get an osdmap any time soon.
4550 if (pg
->is_primary())
4551 service
.logger
->dec(l_osd_pg_primary
);
4552 else if (pg
->is_replica())
4553 service
.logger
->dec(l_osd_pg_replica
);
4555 service
.logger
->dec(l_osd_pg_stray
);
4560 PGRef
OSD::_lookup_pg(spg_t pgid
)
4562 uint32_t shard_index
= pgid
.hash_to_shard(num_shards
);
4563 auto sdata
= shards
[shard_index
];
4564 std::lock_guard
l(sdata
->shard_lock
);
4565 auto p
= sdata
->pg_slots
.find(pgid
);
4566 if (p
== sdata
->pg_slots
.end()) {
4569 return p
->second
->pg
;
4572 PGRef
OSD::_lookup_lock_pg(spg_t pgid
)
4574 PGRef pg
= _lookup_pg(pgid
);
4579 if (!pg
->is_deleted()) {
4586 PGRef
OSD::lookup_lock_pg(spg_t pgid
)
4588 return _lookup_lock_pg(pgid
);
4591 void OSD::load_pgs()
4593 ceph_assert(osd_lock
.is_locked());
4594 dout(0) << "load_pgs" << dendl
;
4597 auto pghist
= make_pg_num_history_oid();
4599 int r
= store
->read(service
.meta_ch
, pghist
, 0, 0, bl
, 0);
4600 if (r
>= 0 && bl
.length() > 0) {
4601 auto p
= bl
.cbegin();
4602 decode(pg_num_history
, p
);
4604 dout(20) << __func__
<< " pg_num_history " << pg_num_history
<< dendl
;
4608 int r
= store
->list_collections(ls
);
4610 derr
<< "failed to list pgs: " << cpp_strerror(-r
) << dendl
;
4614 for (vector
<coll_t
>::iterator it
= ls
.begin();
4618 if (it
->is_temp(&pgid
) ||
4619 (it
->is_pg(&pgid
) && PG::_has_removal_flag(store
, pgid
))) {
4620 dout(10) << "load_pgs " << *it
4621 << " removing, legacy or flagged for removal pg" << dendl
;
4622 recursive_remove_collection(cct
, store
, pgid
, *it
);
4626 if (!it
->is_pg(&pgid
)) {
4627 dout(10) << "load_pgs ignoring unrecognized " << *it
<< dendl
;
4631 dout(10) << "pgid " << pgid
<< " coll " << coll_t(pgid
) << dendl
;
4632 epoch_t map_epoch
= 0;
4633 int r
= PG::peek_map_epoch(store
, pgid
, &map_epoch
);
4635 derr
<< __func__
<< " unable to peek at " << pgid
<< " metadata, skipping"
4641 if (map_epoch
> 0) {
4642 OSDMapRef pgosdmap
= service
.try_get_map(map_epoch
);
4644 if (!osdmap
->have_pg_pool(pgid
.pool())) {
4645 derr
<< __func__
<< ": could not find map for epoch " << map_epoch
4646 << " on pg " << pgid
<< ", but the pool is not present in the "
4647 << "current map, so this is probably a result of bug 10617. "
4648 << "Skipping the pg for now, you can use ceph-objectstore-tool "
4649 << "to clean it up later." << dendl
;
4652 derr
<< __func__
<< ": have pgid " << pgid
<< " at epoch "
4653 << map_epoch
<< ", but missing map. Crashing."
4655 ceph_abort_msg("Missing map in load_pgs");
4658 pg
= _make_pg(pgosdmap
, pgid
);
4660 pg
= _make_pg(osdmap
, pgid
);
4663 recursive_remove_collection(cct
, store
, pgid
, *it
);
4667 // there can be no waiters here, so we don't call _wake_pg_slot
4670 pg
->ch
= store
->open_collection(pg
->coll
);
4672 // read pg state, log
4673 pg
->read_state(store
);
4676 dout(10) << "load_pgs " << *it
<< " deleting dne" << dendl
;
4679 recursive_remove_collection(cct
, store
, pgid
, *it
);
4683 uint32_t shard_index
= pgid
.hash_to_shard(shards
.size());
4684 assert(NULL
!= shards
[shard_index
]);
4685 store
->set_collection_commit_queue(pg
->coll
, &(shards
[shard_index
]->context_queue
));
4688 pg
->reg_next_scrub();
4690 dout(10) << __func__
<< " loaded " << *pg
<< dendl
;
4696 dout(0) << __func__
<< " opened " << num
<< " pgs" << dendl
;
4700 PGRef
OSD::handle_pg_create_info(const OSDMapRef
& osdmap
,
4701 const PGCreateInfo
*info
)
4703 spg_t pgid
= info
->pgid
;
4705 if (maybe_wait_for_max_pg(osdmap
, pgid
, info
->by_mon
)) {
4706 dout(10) << __func__
<< " hit max pg, dropping" << dendl
;
4710 PG::RecoveryCtx rctx
= create_context();
4712 OSDMapRef startmap
= get_map(info
->epoch
);
4715 int64_t pool_id
= pgid
.pgid
.pool();
4716 const pg_pool_t
*pool
= osdmap
->get_pg_pool(pool_id
);
4718 dout(10) << __func__
<< " ignoring " << pgid
<< ", pool dne" << dendl
;
4721 if (osdmap
->require_osd_release
>= CEPH_RELEASE_NAUTILUS
&&
4722 !pool
->has_flag(pg_pool_t::FLAG_CREATING
)) {
4723 // this ensures we do not process old creating messages after the
4724 // pool's initial pgs have been created (and pg are subsequently
4725 // allowed to split or merge).
4726 dout(20) << __func__
<< " dropping " << pgid
4727 << "create, pool does not have CREATING flag set" << dendl
;
4732 int up_primary
, acting_primary
;
4733 vector
<int> up
, acting
;
4734 startmap
->pg_to_up_acting_osds(
4735 pgid
.pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
4737 const pg_pool_t
* pp
= startmap
->get_pg_pool(pgid
.pool());
4738 if (pp
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
) &&
4739 store
->get_type() != "bluestore") {
4740 clog
->warn() << "pg " << pgid
4741 << " is at risk of silent data corruption: "
4742 << "the pool allows ec overwrites but is not stored in "
4743 << "bluestore, so deep scrubbing will not detect bitrot";
4745 PG::_create(*rctx
.transaction
, pgid
, pgid
.get_split_bits(pp
->get_pg_num()));
4746 PG::_init(*rctx
.transaction
, pgid
, pp
);
4748 int role
= startmap
->calc_pg_role(whoami
, acting
, acting
.size());
4749 if (!pp
->is_replicated() && role
!= pgid
.shard
) {
4753 PGRef pg
= _make_pg(startmap
, pgid
);
4754 pg
->ch
= store
->create_new_collection(pg
->coll
);
4757 uint32_t shard_index
= pgid
.hash_to_shard(shards
.size());
4758 assert(NULL
!= shards
[shard_index
]);
4759 store
->set_collection_commit_queue(pg
->coll
, &(shards
[shard_index
]->context_queue
));
4764 // we are holding the shard lock
4765 ceph_assert(!pg
->is_deleted());
4774 info
->past_intervals
,
4778 pg
->init_collection_pool_opts();
4780 if (pg
->is_primary()) {
4781 Mutex::Locker
locker(m_perf_queries_lock
);
4782 pg
->set_dynamic_perf_stats_queries(m_perf_queries
);
4785 pg
->handle_initialize(&rctx
);
4786 pg
->handle_activate_map(&rctx
);
4788 dispatch_context(rctx
, pg
.get(), osdmap
, nullptr);
4790 dout(10) << __func__
<< " new pg " << *pg
<< dendl
;
4794 bool OSD::maybe_wait_for_max_pg(const OSDMapRef
& osdmap
,
4798 const auto max_pgs_per_osd
=
4799 (cct
->_conf
.get_val
<uint64_t>("mon_max_pg_per_osd") *
4800 cct
->_conf
.get_val
<double>("osd_max_pg_per_osd_hard_ratio"));
4802 if (num_pgs
< max_pgs_per_osd
) {
4806 std::lock_guard
l(pending_creates_lock
);
4807 if (is_mon_create
) {
4808 pending_creates_from_mon
++;
4810 bool is_primary
= osdmap
->get_pg_acting_rank(pgid
.pgid
, whoami
) == 0;
4811 pending_creates_from_osd
.emplace(pgid
.pgid
, is_primary
);
4813 dout(1) << __func__
<< " withhold creation of pg " << pgid
4814 << ": " << num_pgs
<< " >= "<< max_pgs_per_osd
<< dendl
;
// to re-trigger a peering, we have to twiddle the pg mapping a little bit,
// see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
// to up set if pg_temp is empty. so an empty pg_temp won't work.
static std::vector<int32_t> twiddle(const std::vector<int>& acting) {
  if (acting.size() > 1) {
    // shrink to just the first acting osd
    return {acting[0]};
  } else {
    // pad with a nonexistent osd so the mapping differs from the up set
    std::vector<int32_t> twiddled(acting.begin(), acting.end());
    twiddled.push_back(-1);
    return twiddled;
  }
}
4831 void OSD::resume_creating_pg()
4833 bool do_sub_pg_creates
= false;
4834 bool have_pending_creates
= false;
4836 const auto max_pgs_per_osd
=
4837 (cct
->_conf
.get_val
<uint64_t>("mon_max_pg_per_osd") *
4838 cct
->_conf
.get_val
<double>("osd_max_pg_per_osd_hard_ratio"));
4839 if (max_pgs_per_osd
<= num_pgs
) {
4840 // this could happen if admin decreases this setting before a PG is removed
4843 unsigned spare_pgs
= max_pgs_per_osd
- num_pgs
;
4844 std::lock_guard
l(pending_creates_lock
);
4845 if (pending_creates_from_mon
> 0) {
4846 dout(20) << __func__
<< " pending_creates_from_mon "
4847 << pending_creates_from_mon
<< dendl
;
4848 do_sub_pg_creates
= true;
4849 if (pending_creates_from_mon
>= spare_pgs
) {
4850 spare_pgs
= pending_creates_from_mon
= 0;
4852 spare_pgs
-= pending_creates_from_mon
;
4853 pending_creates_from_mon
= 0;
4856 auto pg
= pending_creates_from_osd
.cbegin();
4857 while (spare_pgs
> 0 && pg
!= pending_creates_from_osd
.cend()) {
4858 dout(20) << __func__
<< " pg " << pg
->first
<< dendl
;
4860 osdmap
->pg_to_up_acting_osds(pg
->first
, nullptr, nullptr, &acting
, nullptr);
4861 service
.queue_want_pg_temp(pg
->first
, twiddle(acting
), true);
4862 pg
= pending_creates_from_osd
.erase(pg
);
4863 do_sub_pg_creates
= true;
4866 have_pending_creates
= (pending_creates_from_mon
> 0 ||
4867 !pending_creates_from_osd
.empty());
4870 bool do_renew_subs
= false;
4871 if (do_sub_pg_creates
) {
4872 if (monc
->sub_want("osd_pg_creates", last_pg_create_epoch
, 0)) {
4873 dout(4) << __func__
<< ": resolicit pg creates from mon since "
4874 << last_pg_create_epoch
<< dendl
;
4875 do_renew_subs
= true;
4878 version_t start
= osdmap
->get_epoch() + 1;
4879 if (have_pending_creates
) {
4880 // don't miss any new osdmap deleting PGs
4881 if (monc
->sub_want("osdmap", start
, 0)) {
4882 dout(4) << __func__
<< ": resolicit osdmap from mon since "
4884 do_renew_subs
= true;
4886 } else if (do_sub_pg_creates
) {
4887 // no need to subscribe the osdmap continuously anymore
4888 // once the pgtemp and/or mon_subscribe(pg_creates) is sent
4889 if (monc
->sub_want_increment("osdmap", start
, CEPH_SUBSCRIBE_ONETIME
)) {
4890 dout(4) << __func__
<< ": re-subscribe osdmap(onetime) since "
4892 do_renew_subs
= true;
4896 if (do_renew_subs
) {
4900 service
.send_pg_temp();
4903 void OSD::build_initial_pg_history(
4906 utime_t created_stamp
,
4910 dout(10) << __func__
<< " " << pgid
<< " created " << created
<< dendl
;
4911 h
->epoch_created
= created
;
4912 h
->epoch_pool_created
= created
;
4913 h
->same_interval_since
= created
;
4914 h
->same_up_since
= created
;
4915 h
->same_primary_since
= created
;
4916 h
->last_scrub_stamp
= created_stamp
;
4917 h
->last_deep_scrub_stamp
= created_stamp
;
4918 h
->last_clean_scrub_stamp
= created_stamp
;
4920 OSDMapRef lastmap
= service
.get_map(created
);
4921 int up_primary
, acting_primary
;
4922 vector
<int> up
, acting
;
4923 lastmap
->pg_to_up_acting_osds(
4924 pgid
.pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
4926 ostringstream debug
;
4927 for (epoch_t e
= created
+ 1; e
<= osdmap
->get_epoch(); ++e
) {
4928 OSDMapRef osdmap
= service
.get_map(e
);
4929 int new_up_primary
, new_acting_primary
;
4930 vector
<int> new_up
, new_acting
;
4931 osdmap
->pg_to_up_acting_osds(
4932 pgid
.pgid
, &new_up
, &new_up_primary
, &new_acting
, &new_acting_primary
);
4934 // this is a bit imprecise, but sufficient?
4935 struct min_size_predicate_t
: public IsPGRecoverablePredicate
{
4936 const pg_pool_t
*pi
;
4937 bool operator()(const set
<pg_shard_t
> &have
) const {
4938 return have
.size() >= pi
->min_size
;
4940 explicit min_size_predicate_t(const pg_pool_t
*i
) : pi(i
) {}
4941 } min_size_predicate(osdmap
->get_pg_pool(pgid
.pgid
.pool()));
4943 bool new_interval
= PastIntervals::check_new_interval(
4950 h
->same_interval_since
,
4951 h
->last_epoch_clean
,
4955 &min_size_predicate
,
4959 h
->same_interval_since
= e
;
4961 h
->same_up_since
= e
;
4963 if (acting_primary
!= new_acting_primary
) {
4964 h
->same_primary_since
= e
;
4966 if (pgid
.pgid
.is_split(lastmap
->get_pg_num(pgid
.pgid
.pool()),
4967 osdmap
->get_pg_num(pgid
.pgid
.pool()),
4969 h
->last_epoch_split
= e
;
4972 acting
= new_acting
;
4973 up_primary
= new_up_primary
;
4974 acting_primary
= new_acting_primary
;
4978 dout(20) << __func__
<< " " << debug
.str() << dendl
;
4979 dout(10) << __func__
<< " " << *h
<< " " << *pi
4980 << " [" << (pi
->empty() ? pair
<epoch_t
,epoch_t
>(0,0) :
4981 pi
->get_bounds()) << ")"
4985 void OSD::_add_heartbeat_peer(int p
)
4991 map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.find(p
);
4992 if (i
== heartbeat_peers
.end()) {
4993 pair
<ConnectionRef
,ConnectionRef
> cons
= service
.get_con_osd_hb(p
, osdmap
->get_epoch());
4996 hi
= &heartbeat_peers
[p
];
4998 RefCountedPtr s
{new HeartbeatSession
{p
}, false};
4999 hi
->hb_interval_start
= ceph_clock_now();
5000 hi
->con_back
= cons
.first
.get();
5001 hi
->con_back
->set_priv(s
);
5003 hi
->con_front
= cons
.second
.get();
5004 hi
->con_front
->set_priv(s
);
5005 dout(10) << "_add_heartbeat_peer: new peer osd." << p
5006 << " " << hi
->con_back
->get_peer_addr()
5007 << " " << hi
->con_front
->get_peer_addr()
5010 hi
->con_front
.reset(NULL
);
5011 dout(10) << "_add_heartbeat_peer: new peer osd." << p
5012 << " " << hi
->con_back
->get_peer_addr()
5018 hi
->epoch
= osdmap
->get_epoch();
5021 void OSD::_remove_heartbeat_peer(int n
)
5023 map
<int,HeartbeatInfo
>::iterator q
= heartbeat_peers
.find(n
);
5024 ceph_assert(q
!= heartbeat_peers
.end());
5025 dout(20) << " removing heartbeat peer osd." << n
5026 << " " << q
->second
.con_back
->get_peer_addr()
5027 << " " << (q
->second
.con_front
? q
->second
.con_front
->get_peer_addr() : entity_addr_t())
5029 q
->second
.con_back
->mark_down();
5030 if (q
->second
.con_front
) {
5031 q
->second
.con_front
->mark_down();
5033 heartbeat_peers
.erase(q
);
5036 void OSD::need_heartbeat_peer_update()
5040 dout(20) << "need_heartbeat_peer_update" << dendl
;
5041 heartbeat_set_peers_need_update();
5044 void OSD::maybe_update_heartbeat_peers()
5046 ceph_assert(osd_lock
.is_locked());
5048 if (is_waiting_for_healthy() || is_active()) {
5049 utime_t now
= ceph_clock_now();
5050 if (last_heartbeat_resample
== utime_t()) {
5051 last_heartbeat_resample
= now
;
5052 heartbeat_set_peers_need_update();
5053 } else if (!heartbeat_peers_need_update()) {
5054 utime_t dur
= now
- last_heartbeat_resample
;
5055 if (dur
> cct
->_conf
->osd_heartbeat_grace
) {
5056 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur
<< " seconds" << dendl
;
5057 heartbeat_set_peers_need_update();
5058 last_heartbeat_resample
= now
;
5059 // automatically clean up any stale heartbeat peers
5060 // if we are unhealthy, then clean all
5061 reset_heartbeat_peers(is_waiting_for_healthy());
5066 if (!heartbeat_peers_need_update())
5068 heartbeat_clear_peers_need_update();
5070 std::lock_guard
l(heartbeat_lock
);
5072 dout(10) << "maybe_update_heartbeat_peers updating" << dendl
;
5075 // build heartbeat from set
5079 for (auto& pg
: pgs
) {
5080 pg
->with_heartbeat_peers([&](int peer
) {
5081 if (osdmap
->is_up(peer
)) {
5082 _add_heartbeat_peer(peer
);
5088 // include next and previous up osds to ensure we have a fully-connected set
5089 set
<int> want
, extras
;
5090 const int next
= osdmap
->get_next_up_osd_after(whoami
);
5093 int prev
= osdmap
->get_previous_up_osd_before(whoami
);
5094 if (prev
>= 0 && prev
!= next
)
5097 // make sure we have at least **min_down** osds coming from different
5098 // subtree level (e.g., hosts) for fast failure detection.
5099 auto min_down
= cct
->_conf
.get_val
<uint64_t>("mon_osd_min_down_reporters");
5100 auto subtree
= cct
->_conf
.get_val
<string
>("mon_osd_reporter_subtree_level");
5101 osdmap
->get_random_up_osds_by_subtree(
5102 whoami
, subtree
, min_down
, want
, &want
);
5104 for (set
<int>::iterator p
= want
.begin(); p
!= want
.end(); ++p
) {
5105 dout(10) << " adding neighbor peer osd." << *p
<< dendl
;
5107 _add_heartbeat_peer(*p
);
5110 // remove down peers; enumerate extras
5111 map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5112 while (p
!= heartbeat_peers
.end()) {
5113 if (!osdmap
->is_up(p
->first
)) {
5116 _remove_heartbeat_peer(o
);
5119 if (p
->second
.epoch
< osdmap
->get_epoch()) {
5120 extras
.insert(p
->first
);
5126 for (int n
= next
; n
>= 0; ) {
5127 if ((int)heartbeat_peers
.size() >= cct
->_conf
->osd_heartbeat_min_peers
)
5129 if (!extras
.count(n
) && !want
.count(n
) && n
!= whoami
) {
5130 dout(10) << " adding random peer osd." << n
<< dendl
;
5132 _add_heartbeat_peer(n
);
5134 n
= osdmap
->get_next_up_osd_after(n
);
5136 break; // came full circle; stop
5140 for (set
<int>::iterator p
= extras
.begin();
5141 (int)heartbeat_peers
.size() > cct
->_conf
->osd_heartbeat_min_peers
&& p
!= extras
.end();
5145 _remove_heartbeat_peer(*p
);
5148 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers
.size() << " peers, extras " << extras
<< dendl
;
5151 void OSD::reset_heartbeat_peers(bool all
)
5153 ceph_assert(osd_lock
.is_locked());
5154 dout(10) << "reset_heartbeat_peers" << dendl
;
5155 utime_t stale
= ceph_clock_now();
5156 stale
-= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_stale");
5157 std::lock_guard
l(heartbeat_lock
);
5158 for (auto it
= heartbeat_peers
.begin(); it
!= heartbeat_peers
.end();) {
5159 HeartbeatInfo
& hi
= it
->second
;
5160 if (all
|| hi
.is_stale(stale
)) {
5161 hi
.con_back
->mark_down();
5163 hi
.con_front
->mark_down();
5165 // stop sending failure_report to mon too
5166 failure_queue
.erase(it
->first
);
5167 heartbeat_peers
.erase(it
++);
5174 void OSD::handle_osd_ping(MOSDPing
*m
)
5176 if (superblock
.cluster_fsid
!= m
->fsid
) {
5177 dout(20) << "handle_osd_ping from " << m
->get_source_inst()
5178 << " bad fsid " << m
->fsid
<< " != " << superblock
.cluster_fsid
<< dendl
;
5183 int from
= m
->get_source().num();
5185 heartbeat_lock
.Lock();
5186 if (is_stopping()) {
5187 heartbeat_lock
.Unlock();
5192 OSDMapRef curmap
= service
.get_osdmap();
5194 heartbeat_lock
.Unlock();
5201 case MOSDPing::PING
:
5203 if (cct
->_conf
->osd_debug_drop_ping_probability
> 0) {
5204 auto heartbeat_drop
= debug_heartbeat_drops_remaining
.find(from
);
5205 if (heartbeat_drop
!= debug_heartbeat_drops_remaining
.end()) {
5206 if (heartbeat_drop
->second
== 0) {
5207 debug_heartbeat_drops_remaining
.erase(heartbeat_drop
);
5209 --heartbeat_drop
->second
;
5210 dout(5) << "Dropping heartbeat from " << from
5211 << ", " << heartbeat_drop
->second
5212 << " remaining to drop" << dendl
;
5215 } else if (cct
->_conf
->osd_debug_drop_ping_probability
>
5216 ((((double)(rand()%100))/100.0))) {
5218 debug_heartbeat_drops_remaining
.insert(std::make_pair(from
,
5219 cct
->_conf
->osd_debug_drop_ping_duration
)).first
;
5220 dout(5) << "Dropping heartbeat from " << from
5221 << ", " << heartbeat_drop
->second
5222 << " remaining to drop" << dendl
;
5227 if (!cct
->get_heartbeat_map()->is_healthy()) {
5228 dout(10) << "internal heartbeat not healthy, dropping ping request" << dendl
;
5232 Message
*r
= new MOSDPing(monc
->get_fsid(),
5233 curmap
->get_epoch(),
5234 MOSDPing::PING_REPLY
, m
->stamp
,
5235 cct
->_conf
->osd_heartbeat_min_size
);
5236 m
->get_connection()->send_message(r
);
5238 if (curmap
->is_up(from
)) {
5239 service
.note_peer_epoch(from
, m
->map_epoch
);
5241 ConnectionRef con
= service
.get_con_osd_cluster(from
, curmap
->get_epoch());
5243 service
.share_map_peer(from
, con
.get());
5246 } else if (!curmap
->exists(from
) ||
5247 curmap
->get_down_at(from
) > m
->map_epoch
) {
5248 // tell them they have died
5249 Message
*r
= new MOSDPing(monc
->get_fsid(),
5250 curmap
->get_epoch(),
5253 cct
->_conf
->osd_heartbeat_min_size
);
5254 m
->get_connection()->send_message(r
);
5259 case MOSDPing::PING_REPLY
:
5261 map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.find(from
);
5262 if (i
!= heartbeat_peers
.end()) {
5263 auto acked
= i
->second
.ping_history
.find(m
->stamp
);
5264 if (acked
!= i
->second
.ping_history
.end()) {
5265 utime_t now
= ceph_clock_now();
5266 int &unacknowledged
= acked
->second
.second
;
5267 if (m
->get_connection() == i
->second
.con_back
) {
5268 dout(25) << "handle_osd_ping got reply from osd." << from
5269 << " first_tx " << i
->second
.first_tx
5270 << " last_tx " << i
->second
.last_tx
5271 << " last_rx_back " << i
->second
.last_rx_back
<< " -> " << now
5272 << " last_rx_front " << i
->second
.last_rx_front
5274 i
->second
.last_rx_back
= now
;
5275 ceph_assert(unacknowledged
> 0);
5277 // if there is no front con, set both stamps.
5278 if (i
->second
.con_front
== NULL
) {
5279 i
->second
.last_rx_front
= now
;
5280 ceph_assert(unacknowledged
> 0);
5283 } else if (m
->get_connection() == i
->second
.con_front
) {
5284 dout(25) << "handle_osd_ping got reply from osd." << from
5285 << " first_tx " << i
->second
.first_tx
5286 << " last_tx " << i
->second
.last_tx
5287 << " last_rx_back " << i
->second
.last_rx_back
5288 << " last_rx_front " << i
->second
.last_rx_front
<< " -> " << now
5290 i
->second
.last_rx_front
= now
;
5291 ceph_assert(unacknowledged
> 0);
5295 if (unacknowledged
== 0) {
5296 // succeeded in getting all replies
5297 dout(25) << "handle_osd_ping got all replies from osd." << from
5298 << " , erase pending ping(sent at " << m
->stamp
<< ")"
5299 << " and older pending ping(s)"
5302 #define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
5303 ++i
->second
.hb_average_count
;
5304 uint32_t back_pingtime
= ROUND_S_TO_USEC(i
->second
.last_rx_back
- m
->stamp
);
5305 i
->second
.hb_total_back
+= back_pingtime
;
5306 if (back_pingtime
< i
->second
.hb_min_back
)
5307 i
->second
.hb_min_back
= back_pingtime
;
5308 if (back_pingtime
> i
->second
.hb_max_back
)
5309 i
->second
.hb_max_back
= back_pingtime
;
5310 uint32_t front_pingtime
= ROUND_S_TO_USEC(i
->second
.last_rx_front
- m
->stamp
);
5311 i
->second
.hb_total_front
+= front_pingtime
;
5312 if (front_pingtime
< i
->second
.hb_min_front
)
5313 i
->second
.hb_min_front
= front_pingtime
;
5314 if (front_pingtime
> i
->second
.hb_max_front
)
5315 i
->second
.hb_max_front
= front_pingtime
;
5317 ceph_assert(i
->second
.hb_interval_start
!= utime_t());
5318 if (i
->second
.hb_interval_start
== utime_t())
5319 i
->second
.hb_interval_start
= now
;
5320 int64_t hb_avg_time_period
= 60;
5321 if (cct
->_conf
.get_val
<int64_t>("debug_heartbeat_testing_span")) {
5322 hb_avg_time_period
= cct
->_conf
.get_val
<int64_t>("debug_heartbeat_testing_span");
5324 if (now
- i
->second
.hb_interval_start
>= utime_t(hb_avg_time_period
, 0)) {
5325 uint32_t back_avg
= i
->second
.hb_total_back
/ i
->second
.hb_average_count
;
5326 uint32_t back_min
= i
->second
.hb_min_back
;
5327 uint32_t back_max
= i
->second
.hb_max_back
;
5328 uint32_t front_avg
= i
->second
.hb_total_front
/ i
->second
.hb_average_count
;
5329 uint32_t front_min
= i
->second
.hb_min_front
;
5330 uint32_t front_max
= i
->second
.hb_max_front
;
5332 // Reset for new interval
5333 i
->second
.hb_average_count
= 0;
5334 i
->second
.hb_interval_start
= now
;
5335 i
->second
.hb_total_back
= i
->second
.hb_max_back
= 0;
5336 i
->second
.hb_min_back
= UINT_MAX
;
5337 i
->second
.hb_total_front
= i
->second
.hb_max_front
= 0;
5338 i
->second
.hb_min_front
= UINT_MAX
;
5340 // Record per osd interace ping times
5341 // Based on osd_heartbeat_interval ignoring that it is randomly short than this interval
5342 if (i
->second
.hb_back_pingtime
.size() == 0) {
5343 ceph_assert(i
->second
.hb_front_pingtime
.size() == 0);
5344 for (unsigned k
= 0 ; k
< hb_vector_size
; ++k
) {
5345 i
->second
.hb_back_pingtime
.push_back(back_avg
);
5346 i
->second
.hb_back_min
.push_back(back_min
);
5347 i
->second
.hb_back_max
.push_back(back_max
);
5348 i
->second
.hb_front_pingtime
.push_back(front_avg
);
5349 i
->second
.hb_front_min
.push_back(front_min
);
5350 i
->second
.hb_front_max
.push_back(front_max
);
5351 ++i
->second
.hb_index
;
5354 int index
= i
->second
.hb_index
& (hb_vector_size
- 1);
5355 i
->second
.hb_back_pingtime
[index
] = back_avg
;
5356 i
->second
.hb_back_min
[index
] = back_min
;
5357 i
->second
.hb_back_max
[index
] = back_max
;
5358 i
->second
.hb_front_pingtime
[index
] = front_avg
;
5359 i
->second
.hb_front_min
[index
] = front_min
;
5360 i
->second
.hb_front_max
[index
] = front_max
;
5361 ++i
->second
.hb_index
;
5365 std::lock_guard
l(service
.stat_lock
);
5366 service
.osd_stat
.hb_pingtime
[from
].last_update
= now
.sec();
5367 service
.osd_stat
.hb_pingtime
[from
].back_last
= back_pingtime
;
5370 uint32_t min
= UINT_MAX
;
5374 uint32_t size
= (uint32_t)i
->second
.hb_back_pingtime
.size();
5375 for (int32_t k
= size
- 1 ; k
>= 0; --k
) {
5377 int index
= (i
->second
.hb_index
+ k
) % size
;
5378 total
+= i
->second
.hb_back_pingtime
[index
];
5379 if (i
->second
.hb_back_min
[index
] < min
)
5380 min
= i
->second
.hb_back_min
[index
];
5381 if (i
->second
.hb_back_max
[index
] > max
)
5382 max
= i
->second
.hb_back_max
[index
];
5383 if (count
== 1 || count
== 5 || count
== 15) {
5384 service
.osd_stat
.hb_pingtime
[from
].back_pingtime
[which
] = total
/ count
;
5385 service
.osd_stat
.hb_pingtime
[from
].back_min
[which
] = min
;
5386 service
.osd_stat
.hb_pingtime
[from
].back_max
[which
] = max
;
5393 if (i
->second
.con_front
!= NULL
) {
5394 service
.osd_stat
.hb_pingtime
[from
].front_last
= front_pingtime
;
5401 for (int32_t k
= size
- 1 ; k
>= 0; --k
) {
5403 int index
= (i
->second
.hb_index
+ k
) % size
;
5404 total
+= i
->second
.hb_front_pingtime
[index
];
5405 if (i
->second
.hb_front_min
[index
] < min
)
5406 min
= i
->second
.hb_front_min
[index
];
5407 if (i
->second
.hb_front_max
[index
] > max
)
5408 max
= i
->second
.hb_front_max
[index
];
5409 if (count
== 1 || count
== 5 || count
== 15) {
5410 service
.osd_stat
.hb_pingtime
[from
].front_pingtime
[which
] = total
/ count
;
5411 service
.osd_stat
.hb_pingtime
[from
].front_min
[which
] = min
;
5412 service
.osd_stat
.hb_pingtime
[from
].front_max
[which
] = max
;
5421 std::lock_guard
l(service
.stat_lock
);
5422 service
.osd_stat
.hb_pingtime
[from
].back_last
= back_pingtime
;
5423 if (i
->second
.con_front
!= NULL
)
5424 service
.osd_stat
.hb_pingtime
[from
].front_last
= front_pingtime
;
5426 i
->second
.ping_history
.erase(i
->second
.ping_history
.begin(), ++acked
);
5429 if (i
->second
.is_healthy(now
)) {
5430 // Cancel false reports
5431 auto failure_queue_entry
= failure_queue
.find(from
);
5432 if (failure_queue_entry
!= failure_queue
.end()) {
5433 dout(10) << "handle_osd_ping canceling queued "
5434 << "failure report for osd." << from
<< dendl
;
5435 failure_queue
.erase(failure_queue_entry
);
5438 auto failure_pending_entry
= failure_pending
.find(from
);
5439 if (failure_pending_entry
!= failure_pending
.end()) {
5440 dout(10) << "handle_osd_ping canceling in-flight "
5441 << "failure report for osd." << from
<< dendl
;
5442 send_still_alive(curmap
->get_epoch(),
5444 failure_pending_entry
->second
.second
);
5445 failure_pending
.erase(failure_pending_entry
);
5449 // old replies, deprecated by newly sent pings.
5450 dout(10) << "handle_osd_ping no pending ping(sent at " << m
->stamp
5451 << ") is found, treat as covered by newly sent pings "
5458 curmap
->is_up(from
)) {
5459 service
.note_peer_epoch(from
, m
->map_epoch
);
5461 ConnectionRef con
= service
.get_con_osd_cluster(from
, curmap
->get_epoch());
5463 service
.share_map_peer(from
, con
.get());
5470 case MOSDPing::YOU_DIED
:
5471 dout(10) << "handle_osd_ping " << m
->get_source_inst()
5472 << " says i am down in " << m
->map_epoch
<< dendl
;
5473 osdmap_subscribe(curmap
->get_epoch()+1, false);
5477 heartbeat_lock
.Unlock();
5481 void OSD::heartbeat_entry()
5483 std::lock_guard
l(heartbeat_lock
);
5486 while (!heartbeat_stop
) {
5490 if (cct
->_conf
.get_val
<bool>("debug_disable_randomized_ping")) {
5491 wait
= (float)cct
->_conf
->osd_heartbeat_interval
;
5493 wait
= .5 + ((float)(rand() % 10)/10.0) * (float)cct
->_conf
->osd_heartbeat_interval
;
5496 w
.set_from_double(wait
);
5497 dout(30) << "heartbeat_entry sleeping for " << wait
<< dendl
;
5498 heartbeat_cond
.WaitInterval(heartbeat_lock
, w
);
5501 dout(30) << "heartbeat_entry woke up" << dendl
;
5505 void OSD::heartbeat_check()
5507 ceph_assert(heartbeat_lock
.is_locked());
5508 utime_t now
= ceph_clock_now();
5510 // check for incoming heartbeats (move me elsewhere?)
5511 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5512 p
!= heartbeat_peers
.end();
5515 if (p
->second
.first_tx
== utime_t()) {
5516 dout(25) << "heartbeat_check we haven't sent ping to osd." << p
->first
5517 << " yet, skipping" << dendl
;
5521 dout(25) << "heartbeat_check osd." << p
->first
5522 << " first_tx " << p
->second
.first_tx
5523 << " last_tx " << p
->second
.last_tx
5524 << " last_rx_back " << p
->second
.last_rx_back
5525 << " last_rx_front " << p
->second
.last_rx_front
5527 if (p
->second
.is_unhealthy(now
)) {
5528 utime_t oldest_deadline
= p
->second
.ping_history
.begin()->second
.first
;
5529 if (p
->second
.last_rx_back
== utime_t() ||
5530 p
->second
.last_rx_front
== utime_t()) {
5531 derr
<< "heartbeat_check: no reply from "
5532 << p
->second
.con_front
->get_peer_addr().get_sockaddr()
5533 << " osd." << p
->first
5534 << " ever on either front or back, first ping sent "
5535 << p
->second
.first_tx
5536 << " (oldest deadline " << oldest_deadline
<< ")"
5539 failure_queue
[p
->first
] = p
->second
.first_tx
;
5541 derr
<< "heartbeat_check: no reply from "
5542 << p
->second
.con_front
->get_peer_addr().get_sockaddr()
5543 << " osd." << p
->first
<< " since back " << p
->second
.last_rx_back
5544 << " front " << p
->second
.last_rx_front
5545 << " (oldest deadline " << oldest_deadline
<< ")"
5548 failure_queue
[p
->first
] = std::min(p
->second
.last_rx_back
, p
->second
.last_rx_front
);
5554 void OSD::heartbeat()
5556 ceph_assert(heartbeat_lock
.is_locked_by_me());
5557 dout(30) << "heartbeat" << dendl
;
5561 int hb_interval
= cct
->_conf
->osd_heartbeat_interval
;
5562 int n_samples
= 86400;
5563 if (hb_interval
> 1) {
5564 n_samples
/= hb_interval
;
5569 if (getloadavg(loadavgs
, 1) == 1) {
5570 logger
->set(l_osd_loadavg
, 100 * loadavgs
[0]);
5571 daily_loadavg
= (daily_loadavg
* (n_samples
- 1) + loadavgs
[0]) / n_samples
;
5572 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg
<< dendl
;
5575 dout(30) << "heartbeat checking stats" << dendl
;
5577 // refresh peer list and osd stats
5578 vector
<int> hb_peers
;
5579 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5580 p
!= heartbeat_peers
.end();
5582 hb_peers
.push_back(p
->first
);
5584 auto new_stat
= service
.set_osd_stat(hb_peers
, get_num_pgs());
5585 dout(5) << __func__
<< " " << new_stat
<< dendl
;
5586 ceph_assert(new_stat
.statfs
.total
);
5589 float ratio
= service
.compute_adjusted_ratio(new_stat
, &pratio
);
5591 service
.check_full_status(ratio
, pratio
);
5593 utime_t now
= ceph_clock_now();
5594 utime_t deadline
= now
;
5595 deadline
+= cct
->_conf
->osd_heartbeat_grace
;
5598 for (map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.begin();
5599 i
!= heartbeat_peers
.end();
5601 int peer
= i
->first
;
5602 i
->second
.last_tx
= now
;
5603 if (i
->second
.first_tx
== utime_t())
5604 i
->second
.first_tx
= now
;
5605 i
->second
.ping_history
[now
] = make_pair(deadline
,
5606 HeartbeatInfo::HEARTBEAT_MAX_CONN
);
5607 if (i
->second
.hb_interval_start
== utime_t())
5608 i
->second
.hb_interval_start
= now
;
5609 dout(30) << "heartbeat sending ping to osd." << peer
<< dendl
;
5610 i
->second
.con_back
->send_message(new MOSDPing(monc
->get_fsid(),
5611 service
.get_osdmap_epoch(),
5612 MOSDPing::PING
, now
,
5613 cct
->_conf
->osd_heartbeat_min_size
));
5615 if (i
->second
.con_front
)
5616 i
->second
.con_front
->send_message(new MOSDPing(monc
->get_fsid(),
5617 service
.get_osdmap_epoch(),
5618 MOSDPing::PING
, now
,
5619 cct
->_conf
->osd_heartbeat_min_size
));
5622 logger
->set(l_osd_hb_to
, heartbeat_peers
.size());
5624 // hmm.. am i all alone?
5625 dout(30) << "heartbeat lonely?" << dendl
;
5626 if (heartbeat_peers
.empty()) {
5627 if (now
- last_mon_heartbeat
> cct
->_conf
->osd_mon_heartbeat_interval
&& is_active()) {
5628 last_mon_heartbeat
= now
;
5629 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl
;
5630 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
5634 dout(30) << "heartbeat done" << dendl
;
5637 bool OSD::heartbeat_reset(Connection
*con
)
5639 std::lock_guard
l(heartbeat_lock
);
5640 auto s
= con
->get_priv();
5641 con
->set_priv(nullptr);
5643 if (is_stopping()) {
5646 auto heartbeat_session
= static_cast<HeartbeatSession
*>(s
.get());
5647 auto p
= heartbeat_peers
.find(heartbeat_session
->peer
);
5648 if (p
!= heartbeat_peers
.end() &&
5649 (p
->second
.con_back
== con
||
5650 p
->second
.con_front
== con
)) {
5651 dout(10) << "heartbeat_reset failed hb con " << con
<< " for osd." << p
->second
.peer
5652 << ", reopening" << dendl
;
5653 if (con
!= p
->second
.con_back
) {
5654 p
->second
.con_back
->mark_down();
5656 p
->second
.con_back
.reset(NULL
);
5657 if (p
->second
.con_front
&& con
!= p
->second
.con_front
) {
5658 p
->second
.con_front
->mark_down();
5660 p
->second
.con_front
.reset(NULL
);
5661 pair
<ConnectionRef
,ConnectionRef
> newcon
= service
.get_con_osd_hb(p
->second
.peer
, p
->second
.epoch
);
5663 p
->second
.con_back
= newcon
.first
.get();
5664 p
->second
.con_back
->set_priv(s
);
5665 if (newcon
.second
) {
5666 p
->second
.con_front
= newcon
.second
.get();
5667 p
->second
.con_front
->set_priv(s
);
5669 p
->second
.ping_history
.clear();
5671 dout(10) << "heartbeat_reset failed hb con " << con
<< " for osd." << p
->second
.peer
5672 << ", raced with osdmap update, closing out peer" << dendl
;
5673 heartbeat_peers
.erase(p
);
5676 dout(10) << "heartbeat_reset closing (old) failed hb con " << con
<< dendl
;
5684 // =========================================
5688 ceph_assert(osd_lock
.is_locked());
5689 dout(10) << "tick" << dendl
;
5691 if (is_active() || is_waiting_for_healthy()) {
5692 maybe_update_heartbeat_peers();
5695 if (is_waiting_for_healthy()) {
5699 if (is_waiting_for_healthy() || is_booting()) {
5700 std::lock_guard
l(heartbeat_lock
);
5701 utime_t now
= ceph_clock_now();
5702 if (now
- last_mon_heartbeat
> cct
->_conf
->osd_mon_heartbeat_interval
) {
5703 last_mon_heartbeat
= now
;
5704 dout(1) << __func__
<< " checking mon for new map" << dendl
;
5705 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
5711 tick_timer
.add_event_after(get_tick_interval(), new C_Tick(this));
5714 void OSD::tick_without_osd_lock()
5716 ceph_assert(tick_timer_lock
.is_locked());
5717 dout(10) << "tick_without_osd_lock" << dendl
;
5719 logger
->set(l_osd_cached_crc
, buffer::get_cached_crc());
5720 logger
->set(l_osd_cached_crc_adjusted
, buffer::get_cached_crc_adjusted());
5721 logger
->set(l_osd_missed_crc
, buffer::get_missed_crc());
5723 // refresh osd stats
5724 struct store_statfs_t stbuf
;
5725 osd_alert_list_t alerts
;
5726 int r
= store
->statfs(&stbuf
, &alerts
);
5727 ceph_assert(r
== 0);
5728 service
.set_statfs(stbuf
, alerts
);
5730 // osd_lock is not being held, which means the OSD state
5731 // might change when doing the monitor report
5732 if (is_active() || is_waiting_for_healthy()) {
5733 heartbeat_lock
.Lock();
5735 heartbeat_lock
.Unlock();
5737 map_lock
.get_read();
5738 std::lock_guard
l(mon_report_lock
);
5741 utime_t now
= ceph_clock_now();
5742 if (service
.need_fullness_update() ||
5743 now
- last_mon_report
> cct
->_conf
->osd_mon_report_interval
) {
5744 last_mon_report
= now
;
5748 map_lock
.put_read();
5750 epoch_t max_waiting_epoch
= 0;
5751 for (auto s
: shards
) {
5752 max_waiting_epoch
= std::max(max_waiting_epoch
,
5753 s
->get_max_waiting_epoch());
5755 if (max_waiting_epoch
> get_osdmap()->get_epoch()) {
5756 dout(20) << __func__
<< " max_waiting_epoch " << max_waiting_epoch
5757 << ", requesting new map" << dendl
;
5758 osdmap_subscribe(superblock
.newest_map
+ 1, false);
5763 if (!scrub_random_backoff()) {
5766 service
.promote_throttle_recalibrate();
5767 resume_creating_pg();
5768 bool need_send_beacon
= false;
5769 const auto now
= ceph::coarse_mono_clock::now();
5771 // borrow lec lock to pretect last_sent_beacon from changing
5772 std::lock_guard l
{min_last_epoch_clean_lock
};
5773 const auto elapsed
= now
- last_sent_beacon
;
5774 if (chrono::duration_cast
<chrono::seconds
>(elapsed
).count() >
5775 cct
->_conf
->osd_beacon_report_interval
) {
5776 need_send_beacon
= true;
5779 if (need_send_beacon
) {
5784 mgrc
.update_daemon_health(get_health_metrics());
5785 service
.kick_recovery_queue();
5786 tick_timer_without_osd_lock
.add_event_after(get_tick_interval(),
5787 new C_Tick_WithoutOSDLock(this));
5791 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5792 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
5793 // setomapheader <pool-id> [namespace/]<obj-name> <header>
5794 // getomap <pool> [namespace/]<obj-name>
5795 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
5796 // injectmdataerr [namespace/]<obj-name> [shardid]
5797 // injectdataerr [namespace/]<obj-name> [shardid]
5799 // set_recovery_delay [utime]
5800 void TestOpsSocketHook::test_ops(OSDService
*service
, ObjectStore
*store
,
5801 std::string_view command
,
5802 const cmdmap_t
& cmdmap
, ostream
&ss
)
5805 //Support changing the omap on a single osd by using the Admin Socket to
5806 //directly request the osd make a change.
5807 if (command
== "setomapval" || command
== "rmomapkey" ||
5808 command
== "setomapheader" || command
== "getomap" ||
5809 command
== "truncobj" || command
== "injectmdataerr" ||
5810 command
== "injectdataerr"
5814 OSDMapRef curmap
= service
->get_osdmap();
5819 cmd_getval(service
->cct
, cmdmap
, "pool", poolstr
);
5820 pool
= curmap
->lookup_pg_pool_name(poolstr
);
5821 //If we can't find it by name then maybe id specified
5822 if (pool
< 0 && isdigit(poolstr
[0]))
5823 pool
= atoll(poolstr
.c_str());
5825 ss
<< "Invalid pool '" << poolstr
<< "''";
5829 string objname
, nspace
;
5830 cmd_getval(service
->cct
, cmdmap
, "objname", objname
);
5831 std::size_t found
= objname
.find_first_of('/');
5832 if (found
!= string::npos
) {
5833 nspace
= objname
.substr(0, found
);
5834 objname
= objname
.substr(found
+1);
5836 object_locator_t
oloc(pool
, nspace
);
5837 r
= curmap
->object_locator_to_pg(object_t(objname
), oloc
, rawpg
);
5840 ss
<< "Invalid namespace/objname";
5845 cmd_getval(service
->cct
, cmdmap
, "shardid", shardid
, int64_t(shard_id_t::NO_SHARD
));
5846 hobject_t
obj(object_t(objname
), string(""), CEPH_NOSNAP
, rawpg
.ps(), pool
, nspace
);
5847 ghobject_t
gobj(obj
, ghobject_t::NO_GEN
, shard_id_t(uint8_t(shardid
)));
5848 spg_t
pgid(curmap
->raw_pg_to_pg(rawpg
), shard_id_t(shardid
));
5849 if (curmap
->pg_is_ec(rawpg
)) {
5850 if ((command
!= "injectdataerr") && (command
!= "injectmdataerr")) {
5851 ss
<< "Must not call on ec pool, except injectdataerr or injectmdataerr";
5856 ObjectStore::Transaction t
;
5858 if (command
== "setomapval") {
5859 map
<string
, bufferlist
> newattrs
;
5862 cmd_getval(service
->cct
, cmdmap
, "key", key
);
5863 cmd_getval(service
->cct
, cmdmap
, "val", valstr
);
5866 newattrs
[key
] = val
;
5867 t
.omap_setkeys(coll_t(pgid
), ghobject_t(obj
), newattrs
);
5868 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
5870 ss
<< "error=" << r
;
5873 } else if (command
== "rmomapkey") {
5876 cmd_getval(service
->cct
, cmdmap
, "key", key
);
5879 t
.omap_rmkeys(coll_t(pgid
), ghobject_t(obj
), keys
);
5880 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
5882 ss
<< "error=" << r
;
5885 } else if (command
== "setomapheader") {
5886 bufferlist newheader
;
5889 cmd_getval(service
->cct
, cmdmap
, "header", headerstr
);
5890 newheader
.append(headerstr
);
5891 t
.omap_setheader(coll_t(pgid
), ghobject_t(obj
), newheader
);
5892 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
5894 ss
<< "error=" << r
;
5897 } else if (command
== "getomap") {
5898 //Debug: Output entire omap
5900 map
<string
, bufferlist
> keyvals
;
5901 auto ch
= store
->open_collection(coll_t(pgid
));
5903 ss
<< "unable to open collection for " << pgid
;
5906 r
= store
->omap_get(ch
, ghobject_t(obj
), &hdrbl
, &keyvals
);
5908 ss
<< "header=" << string(hdrbl
.c_str(), hdrbl
.length());
5909 for (map
<string
, bufferlist
>::iterator it
= keyvals
.begin();
5910 it
!= keyvals
.end(); ++it
)
5911 ss
<< " key=" << (*it
).first
<< " val="
5912 << string((*it
).second
.c_str(), (*it
).second
.length());
5914 ss
<< "error=" << r
;
5917 } else if (command
== "truncobj") {
5919 cmd_getval(service
->cct
, cmdmap
, "len", trunclen
);
5920 t
.truncate(coll_t(pgid
), ghobject_t(obj
), trunclen
);
5921 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
5923 ss
<< "error=" << r
;
5926 } else if (command
== "injectdataerr") {
5927 store
->inject_data_error(gobj
);
5929 } else if (command
== "injectmdataerr") {
5930 store
->inject_mdata_error(gobj
);
5935 if (command
== "set_recovery_delay") {
5937 cmd_getval(service
->cct
, cmdmap
, "utime", delay
, (int64_t)0);
5940 int r
= service
->cct
->_conf
.set_val("osd_recovery_delay_start",
5943 ss
<< "set_recovery_delay: error setting "
5944 << "osd_recovery_delay_start to '" << delay
<< "': error "
5948 service
->cct
->_conf
.apply_changes(nullptr);
5949 ss
<< "set_recovery_delay: set osd_recovery_delay_start "
5950 << "to " << service
->cct
->_conf
->osd_recovery_delay_start
;
5953 if (command
== "trigger_scrub" || command
== "trigger_deep_scrub") {
5955 bool deep
= (command
== "trigger_deep_scrub");
5956 OSDMapRef curmap
= service
->get_osdmap();
5960 cmd_getval(service
->cct
, cmdmap
, "pgid", pgidstr
);
5961 if (!pgid
.parse(pgidstr
.c_str())) {
5962 ss
<< "Invalid pgid specified";
5967 cmd_getval(service
->cct
, cmdmap
, "time", time
, (int64_t)0);
5969 PGRef pg
= service
->osd
->_lookup_lock_pg(pgid
);
5970 if (pg
== nullptr) {
5971 ss
<< "Can't find pg " << pgid
;
5975 if (pg
->is_primary()) {
5976 pg
->unreg_next_scrub();
5977 const pg_pool_t
*p
= curmap
->get_pg_pool(pgid
.pool());
5978 double pool_scrub_max_interval
= 0;
5979 double scrub_max_interval
;
5981 p
->opts
.get(pool_opts_t::DEEP_SCRUB_INTERVAL
, &pool_scrub_max_interval
);
5982 scrub_max_interval
= pool_scrub_max_interval
> 0 ?
5983 pool_scrub_max_interval
: g_conf()->osd_deep_scrub_interval
;
5985 p
->opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &pool_scrub_max_interval
);
5986 scrub_max_interval
= pool_scrub_max_interval
> 0 ?
5987 pool_scrub_max_interval
: g_conf()->osd_scrub_max_interval
;
5989 // Instead of marking must_scrub force a schedule scrub
5990 utime_t stamp
= ceph_clock_now();
5992 stamp
-= scrub_max_interval
;
5994 stamp
-= (float)time
;
5995 stamp
-= 100.0; // push back last scrub more for good measure
5997 pg
->set_last_deep_scrub_stamp(stamp
);
5999 pg
->set_last_scrub_stamp(stamp
);
6001 pg
->reg_next_scrub();
6002 pg
->publish_stats_to_osd();
6003 ss
<< "ok - set" << (deep
? " deep" : "" ) << " stamp " << stamp
;
6005 ss
<< "Not primary";
6010 if (command
== "injectfull") {
6013 OSDService::s_names state
;
6014 cmd_getval(service
->cct
, cmdmap
, "type", type
, string("full"));
6015 cmd_getval(service
->cct
, cmdmap
, "count", count
, (int64_t)-1);
6016 if (type
== "none" || count
== 0) {
6020 state
= service
->get_full_state(type
);
6021 if (state
== OSDService::s_names::INVALID
) {
6022 ss
<< "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
6025 service
->set_injectfull(state
, count
);
6028 ss
<< "Internal error - command=" << command
;
6031 // =========================================
6033 void OSD::ms_handle_connect(Connection
*con
)
6035 dout(10) << __func__
<< " con " << con
<< dendl
;
6036 if (con
->get_peer_type() == CEPH_ENTITY_TYPE_MON
) {
6037 std::lock_guard
l(osd_lock
);
6040 dout(10) << __func__
<< " on mon" << dendl
;
6044 } else if (is_booting()) {
6045 _send_boot(); // resend boot message
6047 map_lock
.get_read();
6048 std::lock_guard
l2(mon_report_lock
);
6050 utime_t now
= ceph_clock_now();
6051 last_mon_report
= now
;
6053 // resend everything, it's a new session
6056 service
.requeue_pg_temp();
6057 service
.clear_sent_ready_to_merge();
6058 service
.send_pg_temp();
6059 service
.send_ready_to_merge();
6060 service
.send_pg_created();
6064 map_lock
.put_read();
6066 send_beacon(ceph::coarse_mono_clock::now());
6070 // full map requests may happen while active or pre-boot
6071 if (requested_full_first
) {
6072 rerequest_full_maps();
6077 void OSD::ms_handle_fast_connect(Connection
*con
)
6079 if (con
->get_peer_type() != CEPH_ENTITY_TYPE_MON
&&
6080 con
->get_peer_type() != CEPH_ENTITY_TYPE_MGR
) {
6081 auto priv
= con
->get_priv();
6082 auto s
= static_cast<Session
*>(priv
.get());
6084 s
= new Session
{cct
, con
};
6085 con
->set_priv(RefCountedPtr
{s
, false});
6086 dout(10) << " new session (outgoing) " << s
<< " con=" << s
->con
6087 << " addr=" << s
->con
->get_peer_addr() << dendl
;
6088 // we don't connect to clients
6089 ceph_assert(con
->get_peer_type() == CEPH_ENTITY_TYPE_OSD
);
6090 s
->entity_name
.set_type(CEPH_ENTITY_TYPE_OSD
);
6095 void OSD::ms_handle_fast_accept(Connection
*con
)
6097 if (con
->get_peer_type() != CEPH_ENTITY_TYPE_MON
&&
6098 con
->get_peer_type() != CEPH_ENTITY_TYPE_MGR
) {
6099 auto priv
= con
->get_priv();
6100 auto s
= static_cast<Session
*>(priv
.get());
6102 s
= new Session
{cct
, con
};
6103 con
->set_priv(RefCountedPtr
{s
, false});
6104 dout(10) << "new session (incoming)" << s
<< " con=" << con
6105 << " addr=" << con
->get_peer_addr()
6106 << " must have raced with connect" << dendl
;
6107 ceph_assert(con
->get_peer_type() == CEPH_ENTITY_TYPE_OSD
);
6108 s
->entity_name
.set_type(CEPH_ENTITY_TYPE_OSD
);
6113 bool OSD::ms_handle_reset(Connection
*con
)
6115 auto s
= con
->get_priv();
6116 auto session
= static_cast<Session
*>(s
.get());
6117 dout(2) << "ms_handle_reset con " << con
<< " session " << session
<< dendl
;
6120 session
->wstate
.reset(con
);
6121 session
->con
->set_priv(nullptr);
6122 session
->con
.reset(); // break con <-> session ref cycle
6123 // note that we break session->con *before* the session_handle_reset
6124 // cleanup below. this avoids a race between us and
6125 // PG::add_backoff, Session::check_backoff, etc.
6126 session_handle_reset(SessionRef
{session
});
6130 bool OSD::ms_handle_refused(Connection
*con
)
6132 if (!cct
->_conf
->osd_fast_fail_on_connection_refused
)
6135 auto priv
= con
->get_priv();
6136 auto session
= static_cast<Session
*>(priv
.get());
6137 dout(2) << "ms_handle_refused con " << con
<< " session " << session
<< dendl
;
6140 int type
= con
->get_peer_type();
6141 // handle only OSD failures here
6142 if (monc
&& (type
== CEPH_ENTITY_TYPE_OSD
)) {
6143 OSDMapRef osdmap
= get_osdmap();
6145 int id
= osdmap
->identify_osd_on_all_channels(con
->get_peer_addr());
6146 if (id
>= 0 && osdmap
->is_up(id
)) {
6147 // I'm cheating mon heartbeat grace logic, because we know it's not going
6148 // to respawn alone. +1 so we won't hit any boundary case.
6149 monc
->send_mon_message(
6153 osdmap
->get_addrs(id
),
6154 cct
->_conf
->osd_heartbeat_grace
+ 1,
6155 osdmap
->get_epoch(),
6156 MOSDFailure::FLAG_IMMEDIATE
| MOSDFailure::FLAG_FAILED
6164 struct C_OSD_GetVersion
: public Context
{
6166 uint64_t oldest
, newest
;
6167 explicit C_OSD_GetVersion(OSD
*o
) : osd(o
), oldest(0), newest(0) {}
6168 void finish(int r
) override
{
6170 osd
->_got_mon_epochs(oldest
, newest
);
6174 void OSD::start_boot()
6176 if (!_is_healthy()) {
6177 // if we are not healthy, do not mark ourselves up (yet)
6178 dout(1) << "not healthy; waiting to boot" << dendl
;
6179 if (!is_waiting_for_healthy())
6180 start_waiting_for_healthy();
6181 // send pings sooner rather than later
6185 dout(1) << __func__
<< dendl
;
6186 set_state(STATE_PREBOOT
);
6187 dout(10) << "start_boot - have maps " << superblock
.oldest_map
6188 << ".." << superblock
.newest_map
<< dendl
;
6189 C_OSD_GetVersion
*c
= new C_OSD_GetVersion(this);
6190 monc
->get_version("osdmap", &c
->newest
, &c
->oldest
, c
);
6193 void OSD::_got_mon_epochs(epoch_t oldest
, epoch_t newest
)
6195 std::lock_guard
l(osd_lock
);
6197 _preboot(oldest
, newest
);
6201 void OSD::_preboot(epoch_t oldest
, epoch_t newest
)
6203 ceph_assert(is_preboot());
6204 dout(10) << __func__
<< " _preboot mon has osdmaps "
6205 << oldest
<< ".." << newest
<< dendl
;
6207 // ensure our local fullness awareness is accurate
6209 std::lock_guard
l(heartbeat_lock
);
6213 // if our map within recent history, try to add ourselves to the osdmap.
6214 if (osdmap
->get_epoch() == 0) {
6215 derr
<< "waiting for initial osdmap" << dendl
;
6216 } else if (osdmap
->is_destroyed(whoami
)) {
6217 derr
<< "osdmap says I am destroyed" << dendl
;
6218 // provide a small margin so we don't livelock seeing if we
6219 // un-destroyed ourselves.
6220 if (osdmap
->get_epoch() > newest
- 1) {
6223 } else if (osdmap
->is_noup(whoami
)) {
6224 derr
<< "osdmap NOUP flag is set, waiting for it to clear" << dendl
;
6225 } else if (!osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
6226 derr
<< "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
6228 } else if (osdmap
->require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
6229 derr
<< "osdmap require_osd_release < luminous; please upgrade to luminous"
6231 } else if (service
.need_fullness_update()) {
6232 derr
<< "osdmap fullness state needs update" << dendl
;
6234 } else if (osdmap
->get_epoch() >= oldest
- 1 &&
6235 osdmap
->get_epoch() + cct
->_conf
->osd_map_message_max
> newest
) {
6237 // wait for pgs to fully catch up in a different thread, since
6238 // this thread might be required for splitting and merging PGs to
6240 boot_finisher
.queue(
6241 new FunctionContext(
6243 std::lock_guard
l(osd_lock
);
6245 dout(10) << __func__
<< " waiting for peering work to drain"
6248 for (auto shard
: shards
) {
6249 shard
->wait_min_pg_epoch(osdmap
->get_epoch());
6260 // get all the latest maps
6261 if (osdmap
->get_epoch() + 1 >= oldest
)
6262 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
6264 osdmap_subscribe(oldest
- 1, true);
6267 void OSD::send_full_update()
6269 if (!service
.need_fullness_update())
6272 if (service
.is_full()) {
6273 state
= CEPH_OSD_FULL
;
6274 } else if (service
.is_backfillfull()) {
6275 state
= CEPH_OSD_BACKFILLFULL
;
6276 } else if (service
.is_nearfull()) {
6277 state
= CEPH_OSD_NEARFULL
;
6280 OSDMap::calc_state_set(state
, s
);
6281 dout(10) << __func__
<< " want state " << s
<< dendl
;
6282 monc
->send_mon_message(new MOSDFull(osdmap
->get_epoch(), state
));
6285 void OSD::start_waiting_for_healthy()
6287 dout(1) << "start_waiting_for_healthy" << dendl
;
6288 set_state(STATE_WAITING_FOR_HEALTHY
);
6289 last_heartbeat_resample
= utime_t();
6291 // subscribe to osdmap updates, in case our peers really are known to be dead
6292 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
6295 bool OSD::_is_healthy()
6297 if (!cct
->get_heartbeat_map()->is_healthy()) {
6298 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl
;
6302 if (is_waiting_for_healthy()) {
6303 utime_t now
= ceph_clock_now();
6304 utime_t grace
= utime_t(cct
->_conf
->osd_max_markdown_period
, 0);
6305 while (!osd_markdown_log
.empty() &&
6306 osd_markdown_log
.front() + grace
< now
)
6307 osd_markdown_log
.pop_front();
6308 if (osd_markdown_log
.size() <= 1) {
6309 dout(5) << __func__
<< " first time marked as down,"
6310 << " try reboot unconditionally" << dendl
;
6313 std::lock_guard
l(heartbeat_lock
);
6314 int num
= 0, up
= 0;
6315 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
6316 p
!= heartbeat_peers
.end();
6318 if (p
->second
.is_healthy(now
))
6322 if ((float)up
< (float)num
* cct
->_conf
->osd_heartbeat_min_healthy_ratio
) {
6323 dout(1) << "is_healthy false -- only " << up
<< "/" << num
<< " up peers (less than "
6324 << int(cct
->_conf
->osd_heartbeat_min_healthy_ratio
* 100.0) << "%)" << dendl
;
6332 void OSD::_send_boot()
6334 dout(10) << "_send_boot" << dendl
;
6335 Connection
*local_connection
=
6336 cluster_messenger
->get_loopback_connection().get();
6337 entity_addrvec_t client_addrs
= client_messenger
->get_myaddrs();
6338 entity_addrvec_t cluster_addrs
= cluster_messenger
->get_myaddrs();
6339 entity_addrvec_t hb_back_addrs
= hb_back_server_messenger
->get_myaddrs();
6340 entity_addrvec_t hb_front_addrs
= hb_front_server_messenger
->get_myaddrs();
6342 dout(20) << " initial client_addrs " << client_addrs
6343 << ", cluster_addrs " << cluster_addrs
6344 << ", hb_back_addrs " << hb_back_addrs
6345 << ", hb_front_addrs " << hb_front_addrs
6347 if (cluster_messenger
->set_addr_unknowns(client_addrs
)) {
6348 dout(10) << " assuming cluster_addrs match client_addrs "
6349 << client_addrs
<< dendl
;
6350 cluster_addrs
= cluster_messenger
->get_myaddrs();
6352 if (auto session
= local_connection
->get_priv(); !session
) {
6353 cluster_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6356 local_connection
= hb_back_server_messenger
->get_loopback_connection().get();
6357 if (hb_back_server_messenger
->set_addr_unknowns(cluster_addrs
)) {
6358 dout(10) << " assuming hb_back_addrs match cluster_addrs "
6359 << cluster_addrs
<< dendl
;
6360 hb_back_addrs
= hb_back_server_messenger
->get_myaddrs();
6362 if (auto session
= local_connection
->get_priv(); !session
) {
6363 hb_back_server_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6366 local_connection
= hb_front_server_messenger
->get_loopback_connection().get();
6367 if (hb_front_server_messenger
->set_addr_unknowns(client_addrs
)) {
6368 dout(10) << " assuming hb_front_addrs match client_addrs "
6369 << client_addrs
<< dendl
;
6370 hb_front_addrs
= hb_front_server_messenger
->get_myaddrs();
6372 if (auto session
= local_connection
->get_priv(); !session
) {
6373 hb_front_server_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6376 // we now know what our front and back addrs will be, and we are
6377 // about to tell the mon what our metadata (including numa bindings)
6378 // are, so now is a good time!
6379 set_numa_affinity();
6381 MOSDBoot
*mboot
= new MOSDBoot(
6382 superblock
, get_osdmap_epoch(), service
.get_boot_epoch(),
6383 hb_back_addrs
, hb_front_addrs
, cluster_addrs
,
6385 dout(10) << " final client_addrs " << client_addrs
6386 << ", cluster_addrs " << cluster_addrs
6387 << ", hb_back_addrs " << hb_back_addrs
6388 << ", hb_front_addrs " << hb_front_addrs
6390 _collect_metadata(&mboot
->metadata
);
6391 monc
->send_mon_message(mboot
);
6392 set_state(STATE_BOOTING
);
6395 void OSD::_collect_metadata(map
<string
,string
> *pm
)
6398 (*pm
)["osd_data"] = dev_path
;
6399 if (store
->get_type() == "filestore") {
6400 // not applicable for bluestore
6401 (*pm
)["osd_journal"] = journal_path
;
6403 (*pm
)["front_addr"] = stringify(client_messenger
->get_myaddrs());
6404 (*pm
)["back_addr"] = stringify(cluster_messenger
->get_myaddrs());
6405 (*pm
)["hb_front_addr"] = stringify(hb_front_server_messenger
->get_myaddrs());
6406 (*pm
)["hb_back_addr"] = stringify(hb_back_server_messenger
->get_myaddrs());
6409 (*pm
)["osd_objectstore"] = store
->get_type();
6410 (*pm
)["rotational"] = store_is_rotational
? "1" : "0";
6411 (*pm
)["journal_rotational"] = journal_is_rotational
? "1" : "0";
6412 (*pm
)["default_device_class"] = store
->get_default_device_class();
6413 store
->collect_metadata(pm
);
6415 collect_sys_info(pm
, cct
);
6417 (*pm
)["front_iface"] = pick_iface(
6419 client_messenger
->get_myaddrs().front().get_sockaddr_storage());
6420 (*pm
)["back_iface"] = pick_iface(
6422 cluster_messenger
->get_myaddrs().front().get_sockaddr_storage());
6428 set
<string
> unknown
;
6429 for (auto nm
: { "front_iface", "back_iface" }) {
6430 if (!(*pm
)[nm
].size()) {
6435 int r
= get_iface_numa_node((*pm
)[nm
], &n
);
6437 unknown
.insert((*pm
)[nm
]);
6445 if (unknown
.size()) {
6446 (*pm
)["network_numa_unknown_ifaces"] = stringify(unknown
);
6448 if (!nodes
.empty()) {
6449 (*pm
)["network_numa_nodes"] = stringify(nodes
);
6451 if (node
>= 0 && nodes
.size() == 1 && unknown
.empty()) {
6452 (*pm
)["network_numa_node"] = stringify(node
);
6456 if (numa_node
>= 0) {
6457 (*pm
)["numa_node"] = stringify(numa_node
);
6458 (*pm
)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size
,
6462 set
<string
> devnames
;
6463 store
->get_devices(&devnames
);
6464 (*pm
)["devices"] = stringify(devnames
);
6466 for (auto& dev
: devnames
) {
6468 string id
= get_device_id(dev
, &err
);
6470 if (!devids
.empty()) {
6473 devids
+= dev
+ "=" + id
;
6475 dout(10) << __func__
<< " no unique device id for " << dev
<< ": "
6479 (*pm
)["device_ids"] = devids
;
6481 dout(10) << __func__
<< " " << *pm
<< dendl
;
6484 void OSD::queue_want_up_thru(epoch_t want
)
6486 map_lock
.get_read();
6487 epoch_t cur
= osdmap
->get_up_thru(whoami
);
6488 std::lock_guard
l(mon_report_lock
);
6489 if (want
> up_thru_wanted
) {
6490 dout(10) << "queue_want_up_thru now " << want
<< " (was " << up_thru_wanted
<< ")"
6491 << ", currently " << cur
6493 up_thru_wanted
= want
;
6496 dout(10) << "queue_want_up_thru want " << want
<< " <= queued " << up_thru_wanted
6497 << ", currently " << cur
6500 map_lock
.put_read();
6503 void OSD::send_alive()
6505 ceph_assert(mon_report_lock
.is_locked());
6506 if (!osdmap
->exists(whoami
))
6508 epoch_t up_thru
= osdmap
->get_up_thru(whoami
);
6509 dout(10) << "send_alive up_thru currently " << up_thru
<< " want " << up_thru_wanted
<< dendl
;
6510 if (up_thru_wanted
> up_thru
) {
6511 dout(10) << "send_alive want " << up_thru_wanted
<< dendl
;
6512 monc
->send_mon_message(new MOSDAlive(osdmap
->get_epoch(), up_thru_wanted
));
6516 void OSD::request_full_map(epoch_t first
, epoch_t last
)
6518 dout(10) << __func__
<< " " << first
<< ".." << last
6519 << ", previously requested "
6520 << requested_full_first
<< ".." << requested_full_last
<< dendl
;
6521 ceph_assert(osd_lock
.is_locked());
6522 ceph_assert(first
> 0 && last
> 0);
6523 ceph_assert(first
<= last
);
6524 ceph_assert(first
>= requested_full_first
); // we shouldn't ever ask for older maps
6525 if (requested_full_first
== 0) {
6527 requested_full_first
= first
;
6528 requested_full_last
= last
;
6529 } else if (last
<= requested_full_last
) {
6533 // additional request
6534 first
= requested_full_last
+ 1;
6535 requested_full_last
= last
;
6537 MMonGetOSDMap
*req
= new MMonGetOSDMap
;
6538 req
->request_full(first
, last
);
6539 monc
->send_mon_message(req
);
6542 void OSD::got_full_map(epoch_t e
)
6544 ceph_assert(requested_full_first
<= requested_full_last
);
6545 ceph_assert(osd_lock
.is_locked());
6546 if (requested_full_first
== 0) {
6547 dout(20) << __func__
<< " " << e
<< ", nothing requested" << dendl
;
6550 if (e
< requested_full_first
) {
6551 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6552 << ".." << requested_full_last
6553 << ", ignoring" << dendl
;
6556 if (e
>= requested_full_last
) {
6557 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6558 << ".." << requested_full_last
<< ", resetting" << dendl
;
6559 requested_full_first
= requested_full_last
= 0;
6563 requested_full_first
= e
+ 1;
6565 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6566 << ".." << requested_full_last
6567 << ", still need more" << dendl
;
6570 void OSD::requeue_failures()
6572 std::lock_guard
l(heartbeat_lock
);
6573 unsigned old_queue
= failure_queue
.size();
6574 unsigned old_pending
= failure_pending
.size();
6575 for (auto p
= failure_pending
.begin(); p
!= failure_pending
.end(); ) {
6576 failure_queue
[p
->first
] = p
->second
.first
;
6577 failure_pending
.erase(p
++);
6579 dout(10) << __func__
<< " " << old_queue
<< " + " << old_pending
<< " -> "
6580 << failure_queue
.size() << dendl
;
6583 void OSD::send_failures()
6585 ceph_assert(map_lock
.is_locked());
6586 ceph_assert(mon_report_lock
.is_locked());
6587 std::lock_guard
l(heartbeat_lock
);
6588 utime_t now
= ceph_clock_now();
6589 while (!failure_queue
.empty()) {
6590 int osd
= failure_queue
.begin()->first
;
6591 if (!failure_pending
.count(osd
)) {
6592 int failed_for
= (int)(double)(now
- failure_queue
.begin()->second
);
6593 monc
->send_mon_message(
6597 osdmap
->get_addrs(osd
),
6599 osdmap
->get_epoch()));
6600 failure_pending
[osd
] = make_pair(failure_queue
.begin()->second
,
6601 osdmap
->get_addrs(osd
));
6603 failure_queue
.erase(osd
);
6607 void OSD::send_still_alive(epoch_t epoch
, int osd
, const entity_addrvec_t
&addrs
)
6609 MOSDFailure
*m
= new MOSDFailure(monc
->get_fsid(), osd
, addrs
, 0, epoch
,
6610 MOSDFailure::FLAG_ALIVE
);
6611 monc
->send_mon_message(m
);
6614 void OSD::cancel_pending_failures()
6616 std::lock_guard
l(heartbeat_lock
);
6617 auto it
= failure_pending
.begin();
6618 while (it
!= failure_pending
.end()) {
6619 dout(10) << __func__
<< " canceling in-flight failure report for osd."
6620 << it
->first
<< dendl
;
6621 send_still_alive(osdmap
->get_epoch(), it
->first
, it
->second
.second
);
6622 failure_pending
.erase(it
++);
6626 void OSD::send_beacon(const ceph::coarse_mono_clock::time_point
& now
)
6628 const auto& monmap
= monc
->monmap
;
6629 // send beacon to mon even if we are just connected, and the monmap is not
6630 // initialized yet by then.
6631 if (monmap
.epoch
> 0 &&
6632 monmap
.get_required_features().contains_all(
6633 ceph::features::mon::FEATURE_LUMINOUS
)) {
6634 dout(20) << __func__
<< " sending" << dendl
;
6635 MOSDBeacon
* beacon
= nullptr;
6637 std::lock_guard l
{min_last_epoch_clean_lock
};
6638 beacon
= new MOSDBeacon(osdmap
->get_epoch(), min_last_epoch_clean
);
6639 beacon
->pgs
= min_last_epoch_clean_pgs
;
6640 last_sent_beacon
= now
;
6642 monc
->send_mon_message(beacon
);
6644 dout(20) << __func__
<< " not sending" << dendl
;
6648 void OSD::handle_command(MMonCommand
*m
)
6650 if (!require_mon_peer(m
)) {
6655 Command
*c
= new Command(m
->cmd
, m
->get_tid(), m
->get_data(), NULL
);
6656 command_wq
.queue(c
);
6660 void OSD::handle_command(MCommand
*m
)
6662 ConnectionRef con
= m
->get_connection();
6663 auto priv
= con
->get_priv();
6664 auto session
= static_cast<Session
*>(priv
.get());
6666 con
->send_message(new MCommandReply(m
, -EPERM
));
6671 OSDCap
& caps
= session
->caps
;
6674 if (!caps
.allow_all() || m
->get_source().is_mon()) {
6675 con
->send_message(new MCommandReply(m
, -EPERM
));
6680 Command
*c
= new Command(m
->cmd
, m
->get_tid(), m
->get_data(), con
.get());
6681 command_wq
.queue(c
);
6691 } osd_commands
[] = {
6693 #define COMMAND(parsesig, helptext, module, perm) \
6694 {parsesig, helptext, module, perm},
6696 // yes, these are really pg commands, but there's a limit to how
6697 // much work it's worth. The OSD returns all of them. Make this
6698 // form (pg <pgid> <cmd>) valid only for the cli.
6699 // Rest uses "tell <pgid> <cmd>"
6702 "name=pgid,type=CephPgid " \
6703 "name=cmd,type=CephChoices,strings=query", \
6704 "show details of a specific pg", "osd", "r")
6706 "name=pgid,type=CephPgid " \
6707 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
6708 "name=mulcmd,type=CephChoices,strings=revert|delete", \
6709 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
6712 "name=pgid,type=CephPgid " \
6713 "name=cmd,type=CephChoices,strings=list_unfound " \
6714 "name=offset,type=CephString,req=false",
6715 "list unfound objects on this pg, perhaps starting at an offset given in JSON",
6718 // new form: tell <pgid> <cmd> for both cli and rest
6721 "show details of a specific pg", "osd", "r")
6722 COMMAND("mark_unfound_lost " \
6723 "name=mulcmd,type=CephChoices,strings=revert|delete", \
6724 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
6726 COMMAND("list_unfound " \
6727 "name=offset,type=CephString,req=false",
6728 "list unfound objects on this pg, perhaps starting at an offset given in JSON",
6730 COMMAND("perf histogram dump "
6731 "name=logger,type=CephString,req=false "
6732 "name=counter,type=CephString,req=false",
6733 "Get histogram data",
6736 // tell <osd.n> commands. Validation of osd.n must be special-cased in client
6737 COMMAND("version", "report version of OSD", "osd", "r")
6738 COMMAND("get_command_descriptions", "list commands descriptions", "osd", "r")
6739 COMMAND("injectargs " \
6740 "name=injected_args,type=CephString,n=N",
6741 "inject configuration arguments into running OSD",
6743 COMMAND("config set " \
6744 "name=key,type=CephString name=value,type=CephString",
6745 "Set a configuration option at runtime (not persistent)",
6747 COMMAND("config get " \
6748 "name=key,type=CephString",
6749 "Get a configuration option at runtime",
6751 COMMAND("config unset " \
6752 "name=key,type=CephString",
6753 "Unset a configuration option at runtime (not persistent)",
6755 COMMAND("cluster_log " \
6756 "name=level,type=CephChoices,strings=error,warning,info,debug " \
6757 "name=message,type=CephString,n=N",
6758 "log a message to the cluster log",
6761 "name=count,type=CephInt,req=false " \
6762 "name=size,type=CephInt,req=false " \
6763 "name=object_size,type=CephInt,req=false " \
6764 "name=object_num,type=CephInt,req=false ", \
6765 "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
6766 "(default count=1G default size=4MB). Results in log.",
6768 COMMAND("flush_pg_stats", "flush pg stats", "osd", "rw")
6770 "name=heapcmd,type=CephChoices,strings="\
6771 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
6772 "name=value,type=CephString,req=false",
6773 "show heap usage info (available only if compiled with tcmalloc)",
6775 COMMAND("debug dump_missing " \
6776 "name=filename,type=CephFilepath",
6777 "dump missing objects to a named file", "osd", "r")
6778 COMMAND("debug kick_recovery_wq " \
6779 "name=delay,type=CephInt,range=0",
6780 "set osd_recovery_delay_start to <val>", "osd", "rw")
6781 COMMAND("cpu_profiler " \
6782 "name=arg,type=CephChoices,strings=status|flush",
6783 "run cpu profiling on daemon", "osd", "rw")
6784 COMMAND("dump_pg_recovery_stats", "dump pg recovery statistics",
6786 COMMAND("reset_pg_recovery_stats", "reset pg recovery statistics",
6789 "compact object store's omap. "
6790 "WARNING: Compaction probably slows your requests",
6792 COMMAND("smart name=devid,type=CephString,req=False",
6793 "runs smartctl on this osd devices. ",
6795 COMMAND("cache drop",
6796 "Drop all OSD caches",
6798 COMMAND("cache status",
6799 "Get OSD caches statistics",
6801 COMMAND("send_beacon",
6802 "Send OSD beacon to mon immediately",
6806 void OSD::do_command(
6807 Connection
*con
, ceph_tid_t tid
, vector
<string
>& cmd
, bufferlist
& data
)
6809 dout(20) << "do_command tid " << tid
<< " " << cmd
<< dendl
;
6812 stringstream ss
, ds
;
6816 ss
<< "no command given";
6819 if (!cmdmap_from_json(cmd
, &cmdmap
, ss
)) {
6825 r
= _do_command(con
, cmdmap
, tid
, data
, odata
, ss
, ds
);
6826 } catch (const bad_cmd_get
& e
) {
6834 string rs
= ss
.str();
6836 dout(0) << "do_command r=" << r
<< " " << rs
<< dendl
;
6839 MCommandReply
*reply
= new MCommandReply(r
, rs
);
6840 reply
->set_tid(tid
);
6841 reply
->set_data(odata
);
6842 con
->send_message(reply
);
6847 class unlock_guard
{
6850 explicit unlock_guard(Mutex
& mutex
)
6855 unlock_guard(unlock_guard
&) = delete;
6862 int OSD::_do_command(
6863 Connection
*con
, cmdmap_t
& cmdmap
, ceph_tid_t tid
, bufferlist
& data
,
6864 bufferlist
& odata
, stringstream
& ss
, stringstream
& ds
)
6870 boost::scoped_ptr
<Formatter
> f
;
6872 cmd_getval(cct
, cmdmap
, "prefix", prefix
);
6874 if (prefix
== "get_command_descriptions") {
6876 JSONFormatter
*f
= new JSONFormatter();
6877 f
->open_object_section("command_descriptions");
6878 for (OSDCommand
*cp
= osd_commands
;
6879 cp
< &osd_commands
[std::size(osd_commands
)]; cp
++) {
6881 ostringstream secname
;
6882 secname
<< "cmd" << setfill('0') << std::setw(3) << cmdnum
;
6883 dump_cmddesc_to_json(f
, con
->get_features(),
6884 secname
.str(), cp
->cmdstring
, cp
->helpstring
,
6885 cp
->module
, cp
->perm
, 0);
6888 f
->close_section(); // command_descriptions
6895 cmd_getval(cct
, cmdmap
, "format", format
);
6896 f
.reset(Formatter::create(format
));
6898 if (prefix
== "version") {
6900 f
->open_object_section("version");
6901 f
->dump_string("version", pretty_version_to_str());
6905 ds
<< pretty_version_to_str();
6909 else if (prefix
== "injectargs") {
6910 vector
<string
> argsvec
;
6911 cmd_getval(cct
, cmdmap
, "injected_args", argsvec
);
6913 if (argsvec
.empty()) {
6915 ss
<< "ignoring empty injectargs";
6918 string args
= argsvec
.front();
6919 for (vector
<string
>::iterator a
= ++argsvec
.begin(); a
!= argsvec
.end(); ++a
)
6921 unlock_guard unlock
{osd_lock
};
6922 r
= cct
->_conf
.injectargs(args
, &ss
);
6924 else if (prefix
== "config set") {
6927 cmd_getval(cct
, cmdmap
, "key", key
);
6928 cmd_getval(cct
, cmdmap
, "value", val
);
6929 unlock_guard unlock
{osd_lock
};
6930 r
= cct
->_conf
.set_val(key
, val
, &ss
);
6932 cct
->_conf
.apply_changes(nullptr);
6935 else if (prefix
== "config get") {
6937 cmd_getval(cct
, cmdmap
, "key", key
);
6938 unlock_guard unlock
{osd_lock
};
6940 r
= cct
->_conf
.get_val(key
, &val
);
6945 else if (prefix
== "config unset") {
6947 cmd_getval(cct
, cmdmap
, "key", key
);
6948 unlock_guard unlock
{osd_lock
};
6949 r
= cct
->_conf
.rm_val(key
);
6951 cct
->_conf
.apply_changes(nullptr);
6954 r
= 0; // make command idempotent
6957 else if (prefix
== "cluster_log") {
6959 cmd_getval(cct
, cmdmap
, "message", msg
);
6962 ss
<< "ignoring empty log message";
6965 string message
= msg
.front();
6966 for (vector
<string
>::iterator a
= ++msg
.begin(); a
!= msg
.end(); ++a
)
6967 message
+= " " + *a
;
6969 cmd_getval(cct
, cmdmap
, "level", lvl
);
6970 clog_type level
= string_to_clog_type(lvl
);
6973 ss
<< "unknown level '" << lvl
<< "'";
6976 clog
->do_log(level
, message
);
6979 // either 'pg <pgid> <command>' or
6980 // 'tell <pgid>' (which comes in without any of that prefix)?
6982 else if (prefix
== "pg" ||
6983 prefix
== "query" ||
6984 prefix
== "mark_unfound_lost" ||
6985 prefix
== "list_unfound"
6989 if (!cmd_getval(cct
, cmdmap
, "pgid", pgidstr
)) {
6990 ss
<< "no pgid specified";
6992 } else if (!pgid
.parse(pgidstr
.c_str())) {
6993 ss
<< "couldn't parse pgid '" << pgidstr
<< "'";
6998 if (osdmap
->get_primary_shard(pgid
, &pcand
) &&
6999 (pg
= _lookup_lock_pg(pcand
))) {
7000 if (pg
->is_primary()) {
7001 // simulate pg <pgid> cmd= for pg->do-command
7003 cmd_putval(cct
, cmdmap
, "cmd", prefix
);
7005 r
= pg
->do_command(cmdmap
, ss
, data
, odata
, con
, tid
);
7006 } catch (const bad_cmd_get
& e
) {
7013 // don't reply, pg will do so async
7017 ss
<< "not primary for pgid " << pgid
;
7019 // send them the latest diff to ensure they realize the mapping
7021 service
.send_incremental_map(osdmap
->get_epoch() - 1, con
, osdmap
);
7023 // do not reply; they will get newer maps and realize they
7030 ss
<< "i don't have pgid " << pgid
;
7036 else if (prefix
== "bench") {
7039 int64_t osize
, onum
;
7040 // default count 1G, size 4MB
7041 cmd_getval(cct
, cmdmap
, "count", count
, (int64_t)1 << 30);
7042 cmd_getval(cct
, cmdmap
, "size", bsize
, (int64_t)4 << 20);
7043 cmd_getval(cct
, cmdmap
, "object_size", osize
, (int64_t)0);
7044 cmd_getval(cct
, cmdmap
, "object_num", onum
, (int64_t)0);
7046 uint32_t duration
= cct
->_conf
->osd_bench_duration
;
7048 if (bsize
> (int64_t) cct
->_conf
->osd_bench_max_block_size
) {
7049 // let us limit the block size because the next checks rely on it
7050 // having a sane value. If we allow any block size to be set things
7051 // can still go sideways.
7052 ss
<< "block 'size' values are capped at "
7053 << byte_u_t(cct
->_conf
->osd_bench_max_block_size
) << ". If you wish to use"
7054 << " a higher value, please adjust 'osd_bench_max_block_size'";
7057 } else if (bsize
< (int64_t) (1 << 20)) {
7058 // entering the realm of small block sizes.
7059 // limit the count to a sane value, assuming a configurable amount of
7060 // IOPS and duration, so that the OSD doesn't get hung up on this,
7061 // preventing timeouts from going off
7063 bsize
* duration
* cct
->_conf
->osd_bench_small_size_max_iops
;
7064 if (count
> max_count
) {
7065 ss
<< "'count' values greater than " << max_count
7066 << " for a block size of " << byte_u_t(bsize
) << ", assuming "
7067 << cct
->_conf
->osd_bench_small_size_max_iops
<< " IOPS,"
7068 << " for " << duration
<< " seconds,"
7069 << " can cause ill effects on osd. "
7070 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
7071 << " value if you wish to use a higher 'count'.";
7076 // 1MB block sizes are big enough so that we get more stuff done.
7077 // However, to avoid the osd from getting hung on this and having
7078 // timers being triggered, we are going to limit the count assuming
7079 // a configurable throughput and duration.
7080 // NOTE: max_count is the total amount of bytes that we believe we
7081 // will be able to write during 'duration' for the given
7082 // throughput. The block size hardly impacts this unless it's
7083 // way too big. Given we already check how big the block size
7084 // is, it's safe to assume everything will check out.
7086 cct
->_conf
->osd_bench_large_size_max_throughput
* duration
;
7087 if (count
> max_count
) {
7088 ss
<< "'count' values greater than " << max_count
7089 << " for a block size of " << byte_u_t(bsize
) << ", assuming "
7090 << byte_u_t(cct
->_conf
->osd_bench_large_size_max_throughput
) << "/s,"
7091 << " for " << duration
<< " seconds,"
7092 << " can cause ill effects on osd. "
7093 << " Please adjust 'osd_bench_large_size_max_throughput'"
7094 << " with a higher value if you wish to use a higher 'count'.";
7100 if (osize
&& bsize
> osize
)
7103 dout(1) << " bench count " << count
7104 << " bsize " << byte_u_t(bsize
) << dendl
;
7106 ObjectStore::Transaction cleanupt
;
7108 if (osize
&& onum
) {
7110 bufferptr
bp(osize
);
7112 bl
.push_back(std::move(bp
));
7113 bl
.rebuild_page_aligned();
7114 for (int i
=0; i
<onum
; ++i
) {
7116 snprintf(nm
, sizeof(nm
), "disk_bw_test_%d", i
);
7118 hobject_t
soid(sobject_t(oid
, 0));
7119 ObjectStore::Transaction t
;
7120 t
.write(coll_t(), ghobject_t(soid
), 0, osize
, bl
);
7121 store
->queue_transaction(service
.meta_ch
, std::move(t
), NULL
);
7122 cleanupt
.remove(coll_t(), ghobject_t(soid
));
7127 bufferptr
bp(bsize
);
7129 bl
.push_back(std::move(bp
));
7130 bl
.rebuild_page_aligned();
7134 if (!service
.meta_ch
->flush_commit(&waiter
)) {
7139 utime_t start
= ceph_clock_now();
7140 for (int64_t pos
= 0; pos
< count
; pos
+= bsize
) {
7142 unsigned offset
= 0;
7143 if (onum
&& osize
) {
7144 snprintf(nm
, sizeof(nm
), "disk_bw_test_%d", (int)(rand() % onum
));
7145 offset
= rand() % (osize
/ bsize
) * bsize
;
7147 snprintf(nm
, sizeof(nm
), "disk_bw_test_%lld", (long long)pos
);
7150 hobject_t
soid(sobject_t(oid
, 0));
7151 ObjectStore::Transaction t
;
7152 t
.write(coll_t::meta(), ghobject_t(soid
), offset
, bsize
, bl
);
7153 store
->queue_transaction(service
.meta_ch
, std::move(t
), NULL
);
7154 if (!onum
|| !osize
)
7155 cleanupt
.remove(coll_t::meta(), ghobject_t(soid
));
7160 if (!service
.meta_ch
->flush_commit(&waiter
)) {
7164 utime_t end
= ceph_clock_now();
7167 store
->queue_transaction(service
.meta_ch
, std::move(cleanupt
), NULL
);
7170 if (!service
.meta_ch
->flush_commit(&waiter
)) {
7175 double elapsed
= end
- start
;
7176 double rate
= count
/ elapsed
;
7177 double iops
= rate
/ bsize
;
7179 f
->open_object_section("osd_bench_results");
7180 f
->dump_int("bytes_written", count
);
7181 f
->dump_int("blocksize", bsize
);
7182 f
->dump_float("elapsed_sec", elapsed
);
7183 f
->dump_float("bytes_per_sec", rate
);
7184 f
->dump_float("iops", iops
);
7188 ds
<< "bench: wrote " << byte_u_t(count
)
7189 << " in blocks of " << byte_u_t(bsize
) << " in "
7190 << elapsed
<< " sec at " << byte_u_t(rate
) << "/sec "
7191 << si_u_t(iops
) << " IOPS";
7195 else if (prefix
== "flush_pg_stats") {
7196 mgrc
.send_pgstats();
7197 ds
<< service
.get_osd_stat_seq() << "\n";
7200 else if (prefix
== "heap") {
7201 r
= ceph::osd_cmds::heap(*cct
, cmdmap
, *f
, ds
);
7204 else if (prefix
== "debug dump_missing") {
7206 f
.reset(new JSONFormatter(true));
7208 f
->open_array_section("pgs");
7211 for (auto& pg
: pgs
) {
7212 string s
= stringify(pg
->pg_id
);
7213 f
->open_array_section(s
.c_str());
7215 pg
->dump_missing(f
.get());
7222 else if (prefix
== "debug kick_recovery_wq") {
7224 cmd_getval(cct
, cmdmap
, "delay", delay
);
7227 unlock_guard unlock
{osd_lock
};
7228 r
= cct
->_conf
.set_val("osd_recovery_delay_start", oss
.str().c_str());
7230 ss
<< "kick_recovery_wq: error setting "
7231 << "osd_recovery_delay_start to '" << delay
<< "': error "
7235 cct
->_conf
.apply_changes(nullptr);
7236 ss
<< "kicking recovery queue. set osd_recovery_delay_start "
7237 << "to " << cct
->_conf
->osd_recovery_delay_start
;
7240 else if (prefix
== "cpu_profiler") {
7242 cmd_getval(cct
, cmdmap
, "arg", arg
);
7243 vector
<string
> argvec
;
7244 get_str_vec(arg
, argvec
);
7245 cpu_profiler_handle_command(argvec
, ds
);
7248 else if (prefix
== "dump_pg_recovery_stats") {
7251 pg_recovery_stats
.dump_formatted(f
.get());
7254 pg_recovery_stats
.dump(s
);
7255 ds
<< "dump pg recovery stats: " << s
.str();
7259 else if (prefix
== "reset_pg_recovery_stats") {
7260 ss
<< "reset pg recovery stats";
7261 pg_recovery_stats
.reset();
7264 else if (prefix
== "perf histogram dump") {
7266 std::string counter
;
7267 cmd_getval(cct
, cmdmap
, "logger", logger
);
7268 cmd_getval(cct
, cmdmap
, "counter", counter
);
7270 cct
->get_perfcounters_collection()->dump_formatted_histograms(
7271 f
.get(), false, logger
, counter
);
7276 else if (prefix
== "compact") {
7277 dout(1) << "triggering manual compaction" << dendl
;
7278 auto start
= ceph::coarse_mono_clock::now();
7280 auto end
= ceph::coarse_mono_clock::now();
7281 double duration
= std::chrono::duration
<double>(end
-start
).count();
7282 dout(1) << "finished manual compaction in "
7284 << " seconds" << dendl
;
7285 ss
<< "compacted omap in " << duration
<< " seconds";
7288 else if (prefix
== "smart") {
7290 cmd_getval(cct
, cmdmap
, "devid", devid
);
7291 probe_smart(devid
, ds
);
7294 else if (prefix
== "cache drop") {
7295 dout(20) << "clearing all caches" << dendl
;
7296 // Clear the objectstore's cache - onode and buffer for Bluestore,
7297 // system's pagecache for Filestore
7298 r
= store
->flush_cache(&ss
);
7300 ds
<< "Error flushing objectstore cache: " << cpp_strerror(r
);
7303 // Clear the objectcontext cache (per PG)
7306 for (auto& pg
: pgs
) {
7311 else if (prefix
== "cache status") {
7312 int obj_ctx_count
= 0;
7315 for (auto& pg
: pgs
) {
7316 obj_ctx_count
+= pg
->get_cache_obj_count();
7319 f
->open_object_section("cache_status");
7320 f
->dump_int("object_ctx", obj_ctx_count
);
7321 store
->dump_cache_stats(f
.get());
7325 ds
<< "object_ctx: " << obj_ctx_count
;
7326 store
->dump_cache_stats(ds
);
7329 else if (prefix
== "send_beacon") {
7331 send_beacon(ceph::coarse_mono_clock::now());
7334 ss
<< "unrecognized command '" << prefix
<< "'";
7342 void OSD::probe_smart(const string
& only_devid
, ostream
& ss
)
7344 set
<string
> devnames
;
7345 store
->get_devices(&devnames
);
7346 uint64_t smart_timeout
= cct
->_conf
.get_val
<uint64_t>(
7347 "osd_smart_report_timeout");
7349 // == typedef std::map<std::string, mValue> mObject;
7350 json_spirit::mObject json_map
;
7352 for (auto dev
: devnames
) {
7353 // smartctl works only on physical devices; filter out any logical device
7354 if (dev
.find("dm-") == 0) {
7359 string devid
= get_device_id(dev
, &err
);
7360 if (devid
.size() == 0) {
7361 dout(10) << __func__
<< " no unique id for dev " << dev
<< " ("
7362 << err
<< "), skipping" << dendl
;
7365 if (only_devid
.size() && devid
!= only_devid
) {
7369 json_spirit::mValue smart_json
;
7370 if (block_device_get_metrics(dev
, smart_timeout
,
7372 dout(10) << "block_device_get_metrics failed for /dev/" << dev
<< dendl
;
7375 json_map
[devid
] = smart_json
;
7377 json_spirit::write(json_map
, ss
, json_spirit::pretty_print
);
7380 bool OSD::heartbeat_dispatch(Message
*m
)
7382 dout(30) << "heartbeat_dispatch " << m
<< dendl
;
7383 switch (m
->get_type()) {
7386 dout(10) << "ping from " << m
->get_source_inst() << dendl
;
7391 handle_osd_ping(static_cast<MOSDPing
*>(m
));
7395 dout(0) << "dropping unexpected message " << *m
<< " from " << m
->get_source_inst() << dendl
;
7402 bool OSD::ms_dispatch(Message
*m
)
7404 dout(20) << "OSD::ms_dispatch: " << *m
<< dendl
;
7405 if (m
->get_type() == MSG_OSD_MARK_ME_DOWN
) {
7406 service
.got_stop_ack();
7414 if (is_stopping()) {
7428 void OSD::maybe_share_map(
7433 if (!op
->check_send_map
) {
7436 epoch_t last_sent_epoch
= 0;
7438 session
->sent_epoch_lock
.lock();
7439 last_sent_epoch
= session
->last_sent_epoch
;
7440 session
->sent_epoch_lock
.unlock();
7442 // assume the peer has the newer of the op's sent_epoch and what
7443 // we think we sent them.
7444 epoch_t from
= std::max(last_sent_epoch
, op
->sent_epoch
);
7446 const Message
*m
= op
->get_req();
7449 m
->get_connection().get(),
7452 session
? &last_sent_epoch
: NULL
);
7454 session
->sent_epoch_lock
.lock();
7455 if (session
->last_sent_epoch
< last_sent_epoch
) {
7456 session
->last_sent_epoch
= last_sent_epoch
;
7458 session
->sent_epoch_lock
.unlock();
7460 op
->check_send_map
= false;
7463 void OSD::dispatch_session_waiting(SessionRef session
, OSDMapRef osdmap
)
7465 ceph_assert(session
->session_dispatch_lock
.is_locked());
7467 auto i
= session
->waiting_on_map
.begin();
7468 while (i
!= session
->waiting_on_map
.end()) {
7469 OpRequestRef op
= &(*i
);
7470 ceph_assert(ms_can_fast_dispatch(op
->get_req()));
7471 const MOSDFastDispatchOp
*m
= static_cast<const MOSDFastDispatchOp
*>(
7473 if (m
->get_min_epoch() > osdmap
->get_epoch()) {
7476 session
->waiting_on_map
.erase(i
++);
7480 if (m
->get_type() == CEPH_MSG_OSD_OP
) {
7481 pg_t actual_pgid
= osdmap
->raw_pg_to_pg(
7482 static_cast<const MOSDOp
*>(m
)->get_pg());
7483 if (!osdmap
->get_primary_shard(actual_pgid
, &pgid
)) {
7487 pgid
= m
->get_spg();
7489 enqueue_op(pgid
, std::move(op
), m
->get_map_epoch());
7492 if (session
->waiting_on_map
.empty()) {
7493 clear_session_waiting_on_map(session
);
7495 register_session_waiting_on_map(session
);
7499 void OSD::ms_fast_dispatch(Message
*m
)
7502 if (service
.is_stopping()) {
7508 switch (m
->get_type()) {
7510 dout(10) << "ping from " << m
->get_source() << dendl
;
7513 case MSG_MON_COMMAND
:
7514 handle_command(static_cast<MMonCommand
*>(m
));
7516 case MSG_OSD_FORCE_RECOVERY
:
7517 handle_fast_force_recovery(static_cast<MOSDForceRecovery
*>(m
));
7519 case MSG_OSD_SCRUB2
:
7520 handle_fast_scrub(static_cast<MOSDScrub2
*>(m
));
7523 case MSG_OSD_PG_CREATE2
:
7524 return handle_fast_pg_create(static_cast<MOSDPGCreate2
*>(m
));
7525 case MSG_OSD_PG_QUERY
:
7526 return handle_fast_pg_query(static_cast<MOSDPGQuery
*>(m
));
7527 case MSG_OSD_PG_NOTIFY
:
7528 return handle_fast_pg_notify(static_cast<MOSDPGNotify
*>(m
));
7529 case MSG_OSD_PG_INFO
:
7530 return handle_fast_pg_info(static_cast<MOSDPGInfo
*>(m
));
7531 case MSG_OSD_PG_REMOVE
:
7532 return handle_fast_pg_remove(static_cast<MOSDPGRemove
*>(m
));
7534 // these are single-pg messages that handle themselves
7535 case MSG_OSD_PG_LOG
:
7536 case MSG_OSD_PG_TRIM
:
7537 case MSG_OSD_BACKFILL_RESERVE
:
7538 case MSG_OSD_RECOVERY_RESERVE
:
7540 MOSDPeeringOp
*pm
= static_cast<MOSDPeeringOp
*>(m
);
7541 if (require_osd_peer(pm
)) {
7542 enqueue_peering_evt(
7544 PGPeeringEventRef(pm
->get_event()));
7551 OpRequestRef op
= op_tracker
.create_request
<OpRequest
, Message
*>(m
);
7554 osd_reqid_t reqid
= op
->get_reqid();
7556 tracepoint(osd
, ms_fast_dispatch
, reqid
.name
._type
,
7557 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
7561 op
->osd_trace
.init("osd op", &trace_endpoint
, &m
->trace
);
7563 // note sender epoch, min req's epoch
7564 op
->sent_epoch
= static_cast<MOSDFastDispatchOp
*>(m
)->get_map_epoch();
7565 op
->min_epoch
= static_cast<MOSDFastDispatchOp
*>(m
)->get_min_epoch();
7566 ceph_assert(op
->min_epoch
<= op
->sent_epoch
); // sanity check!
7568 service
.maybe_inject_dispatch_delay();
7570 if (m
->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT
) ||
7571 m
->get_type() != CEPH_MSG_OSD_OP
) {
7572 // queue it directly
7574 static_cast<MOSDFastDispatchOp
*>(m
)->get_spg(),
7576 static_cast<MOSDFastDispatchOp
*>(m
)->get_map_epoch());
7578 // legacy client, and this is an MOSDOp (the *only* fast dispatch
7579 // message that didn't have an explicit spg_t); we need to map
7580 // them to an spg_t while preserving delivery order.
7581 auto priv
= m
->get_connection()->get_priv();
7582 if (auto session
= static_cast<Session
*>(priv
.get()); session
) {
7583 std::lock_guard l
{session
->session_dispatch_lock
};
7585 session
->waiting_on_map
.push_back(*op
);
7586 OSDMapRef nextmap
= service
.get_nextmap_reserved();
7587 dispatch_session_waiting(session
, nextmap
);
7588 service
.release_map(nextmap
);
7591 OID_EVENT_TRACE_WITH_MSG(m
, "MS_FAST_DISPATCH_END", false);
7594 bool OSD::ms_get_authorizer(int dest_type
, AuthAuthorizer
**authorizer
)
7596 dout(10) << "OSD::ms_get_authorizer type=" << ceph_entity_type_name(dest_type
) << dendl
;
7598 if (is_stopping()) {
7599 dout(10) << __func__
<< " bailing, we are shutting down" << dendl
;
7603 if (dest_type
== CEPH_ENTITY_TYPE_MON
)
7606 *authorizer
= monc
->build_authorizer(dest_type
);
7607 return *authorizer
!= NULL
;
7610 KeyStore
*OSD::ms_get_auth1_authorizer_keystore()
7612 return monc
->rotating_secrets
.get();
7615 int OSD::ms_handle_authentication(Connection
*con
)
7618 auto priv
= con
->get_priv();
7619 Session
*s
= static_cast<Session
*>(priv
.get());
7621 s
= new Session(cct
, con
);
7622 con
->set_priv(RefCountedPtr
{s
, false});
7623 s
->entity_name
= con
->get_peer_entity_name();
7624 dout(10) << __func__
<< " new session " << s
<< " con " << s
->con
7625 << " entity " << s
->entity_name
7626 << " addr " << con
->get_peer_addrs() << dendl
;
7628 dout(10) << __func__
<< " existing session " << s
<< " con " << s
->con
7629 << " entity " << s
->entity_name
7630 << " addr " << con
->get_peer_addrs() << dendl
;
7633 AuthCapsInfo
&caps_info
= con
->get_peer_caps_info();
7634 if (caps_info
.allow_all
)
7635 s
->caps
.set_allow_all();
7637 if (caps_info
.caps
.length() > 0) {
7638 bufferlist::const_iterator p
= caps_info
.caps
.cbegin();
7643 catch (buffer::error
& e
) {
7644 dout(10) << __func__
<< " session " << s
<< " " << s
->entity_name
7645 << " failed to decode caps string" << dendl
;
7649 bool success
= s
->caps
.parse(str
);
7651 dout(10) << __func__
<< " session " << s
7652 << " " << s
->entity_name
7653 << " has caps " << s
->caps
<< " '" << str
<< "'" << dendl
;
7656 dout(10) << __func__
<< " session " << s
<< " " << s
->entity_name
7657 << " failed to parse caps '" << str
<< "'" << dendl
;
7665 void OSD::do_waiters()
7667 ceph_assert(osd_lock
.is_locked());
7669 dout(10) << "do_waiters -- start" << dendl
;
7670 while (!finished
.empty()) {
7671 OpRequestRef next
= finished
.front();
7672 finished
.pop_front();
7675 dout(10) << "do_waiters -- finish" << dendl
;
7678 void OSD::dispatch_op(OpRequestRef op
)
7680 switch (op
->get_req()->get_type()) {
7682 case MSG_OSD_PG_CREATE
:
7683 handle_pg_create(op
);
7688 void OSD::_dispatch(Message
*m
)
7690 ceph_assert(osd_lock
.is_locked());
7691 dout(20) << "_dispatch " << m
<< " " << *m
<< dendl
;
7693 switch (m
->get_type()) {
7694 // -- don't need OSDMap --
7696 // map and replication
7697 case CEPH_MSG_OSD_MAP
:
7698 handle_osd_map(static_cast<MOSDMap
*>(m
));
7703 handle_scrub(static_cast<MOSDScrub
*>(m
));
7707 handle_command(static_cast<MCommand
*>(m
));
7710 // -- need OSDMap --
7712 case MSG_OSD_PG_CREATE
:
7714 OpRequestRef op
= op_tracker
.create_request
<OpRequest
, Message
*>(m
);
7716 op
->osd_trace
.init("osd op", &trace_endpoint
, &m
->trace
);
7717 // no map? starting up?
7719 dout(7) << "no OSDMap, not booted" << dendl
;
7720 logger
->inc(l_osd_waiting_for_map
);
7721 waiting_for_osdmap
.push_back(op
);
7722 op
->mark_delayed("no osdmap");
7732 // remove me post-nautilus
7733 void OSD::handle_scrub(MOSDScrub
*m
)
7735 dout(10) << "handle_scrub " << *m
<< dendl
;
7736 if (!require_mon_or_mgr_peer(m
)) {
7740 if (m
->fsid
!= monc
->get_fsid()) {
7741 dout(0) << "handle_scrub fsid " << m
->fsid
<< " != " << monc
->get_fsid()
7750 if (!m
->scrub_pgs
.empty()) {
7752 for (auto pgid
: m
->scrub_pgs
) {
7754 if (osdmap
->get_primary_shard(pgid
, &pcand
) &&
7755 std::find(spgs
.begin(), spgs
.end(), pcand
) != spgs
.end()) {
7762 for (auto pgid
: spgs
) {
7763 enqueue_peering_evt(
7766 std::make_shared
<PGPeeringEvent
>(
7769 PG::RequestScrub(m
->deep
, m
->repair
))));
7775 void OSD::handle_fast_scrub(MOSDScrub2
*m
)
7777 dout(10) << __func__
<< " " << *m
<< dendl
;
7778 if (!require_mon_or_mgr_peer(m
)) {
7782 if (m
->fsid
!= monc
->get_fsid()) {
7783 dout(0) << __func__
<< " fsid " << m
->fsid
<< " != " << monc
->get_fsid()
7788 for (auto pgid
: m
->scrub_pgs
) {
7789 enqueue_peering_evt(
7792 std::make_shared
<PGPeeringEvent
>(
7795 PG::RequestScrub(m
->deep
, m
->repair
))));
7800 bool OSD::scrub_random_backoff()
7802 bool coin_flip
= (rand() / (double)RAND_MAX
>=
7803 cct
->_conf
->osd_scrub_backoff_ratio
);
7805 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl
;
7811 OSDService::ScrubJob::ScrubJob(CephContext
* cct
,
7812 const spg_t
& pg
, const utime_t
& timestamp
,
7813 double pool_scrub_min_interval
,
7814 double pool_scrub_max_interval
, bool must
)
7817 sched_time(timestamp
),
7820 // if not explicitly requested, postpone the scrub with a random delay
7822 double scrub_min_interval
= pool_scrub_min_interval
> 0 ?
7823 pool_scrub_min_interval
: cct
->_conf
->osd_scrub_min_interval
;
7824 double scrub_max_interval
= pool_scrub_max_interval
> 0 ?
7825 pool_scrub_max_interval
: cct
->_conf
->osd_scrub_max_interval
;
7827 sched_time
+= scrub_min_interval
;
7828 double r
= rand() / (double)RAND_MAX
;
7830 scrub_min_interval
* cct
->_conf
->osd_scrub_interval_randomize_ratio
* r
;
7831 if (scrub_max_interval
== 0) {
7832 deadline
= utime_t();
7834 deadline
+= scrub_max_interval
;
7840 bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob
& rhs
) const {
7841 if (sched_time
< rhs
.sched_time
)
7843 if (sched_time
> rhs
.sched_time
)
7845 return pgid
< rhs
.pgid
;
7848 bool OSD::scrub_time_permit(utime_t now
)
7851 time_t tt
= now
.sec();
7852 localtime_r(&tt
, &bdt
);
7854 bool day_permit
= false;
7855 if (cct
->_conf
->osd_scrub_begin_week_day
< cct
->_conf
->osd_scrub_end_week_day
) {
7856 if (bdt
.tm_wday
>= cct
->_conf
->osd_scrub_begin_week_day
&& bdt
.tm_wday
< cct
->_conf
->osd_scrub_end_week_day
) {
7860 if (bdt
.tm_wday
>= cct
->_conf
->osd_scrub_begin_week_day
|| bdt
.tm_wday
< cct
->_conf
->osd_scrub_end_week_day
) {
7866 dout(20) << __func__
<< " should run between week day " << cct
->_conf
->osd_scrub_begin_week_day
7867 << " - " << cct
->_conf
->osd_scrub_end_week_day
7868 << " now " << bdt
.tm_wday
<< " = no" << dendl
;
7872 bool time_permit
= false;
7873 if (cct
->_conf
->osd_scrub_begin_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7874 if (bdt
.tm_hour
>= cct
->_conf
->osd_scrub_begin_hour
&& bdt
.tm_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7878 if (bdt
.tm_hour
>= cct
->_conf
->osd_scrub_begin_hour
|| bdt
.tm_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7883 dout(20) << __func__
<< " should run between " << cct
->_conf
->osd_scrub_begin_hour
7884 << " - " << cct
->_conf
->osd_scrub_end_hour
7885 << " now " << bdt
.tm_hour
<< " = no" << dendl
;
7887 dout(20) << __func__
<< " should run between " << cct
->_conf
->osd_scrub_begin_hour
7888 << " - " << cct
->_conf
->osd_scrub_end_hour
7889 << " now " << bdt
.tm_hour
<< " = yes" << dendl
;
7894 bool OSD::scrub_load_below_threshold()
7897 if (getloadavg(loadavgs
, 3) != 3) {
7898 dout(10) << __func__
<< " couldn't read loadavgs\n" << dendl
;
7902 // allow scrub if below configured threshold
7903 long cpus
= sysconf(_SC_NPROCESSORS_ONLN
);
7904 double loadavg_per_cpu
= cpus
> 0 ? loadavgs
[0] / cpus
: loadavgs
[0];
7905 if (loadavg_per_cpu
< cct
->_conf
->osd_scrub_load_threshold
) {
7906 dout(20) << __func__
<< " loadavg per cpu " << loadavg_per_cpu
7907 << " < max " << cct
->_conf
->osd_scrub_load_threshold
7908 << " = yes" << dendl
;
7912 // allow scrub if below daily avg and currently decreasing
7913 if (loadavgs
[0] < daily_loadavg
&& loadavgs
[0] < loadavgs
[2]) {
7914 dout(20) << __func__
<< " loadavg " << loadavgs
[0]
7915 << " < daily_loadavg " << daily_loadavg
7916 << " and < 15m avg " << loadavgs
[2]
7917 << " = yes" << dendl
;
7921 dout(20) << __func__
<< " loadavg " << loadavgs
[0]
7922 << " >= max " << cct
->_conf
->osd_scrub_load_threshold
7923 << " and ( >= daily_loadavg " << daily_loadavg
7924 << " or >= 15m avg " << loadavgs
[2]
7925 << ") = no" << dendl
;
7929 void OSD::sched_scrub()
7931 // if not permitted, fail fast
7932 if (!service
.can_inc_scrubs()) {
7935 bool allow_requested_repair_only
= false;
7936 if (service
.is_recovery_active()) {
7937 if (!cct
->_conf
->osd_scrub_during_recovery
&& cct
->_conf
->osd_repair_during_recovery
) {
7938 dout(10) << __func__
7939 << " will only schedule explicitly requested repair due to active recovery"
7941 allow_requested_repair_only
= true;
7942 } else if (!cct
->_conf
->osd_scrub_during_recovery
&& !cct
->_conf
->osd_repair_during_recovery
) {
7943 dout(20) << __func__
<< " not scheduling scrubs due to active recovery" << dendl
;
7948 utime_t now
= ceph_clock_now();
7949 bool time_permit
= scrub_time_permit(now
);
7950 bool load_is_low
= scrub_load_below_threshold();
7951 dout(20) << "sched_scrub load_is_low=" << (int)load_is_low
<< dendl
;
7953 OSDService::ScrubJob scrub
;
7954 if (service
.first_scrub_stamp(&scrub
)) {
7956 dout(30) << "sched_scrub examine " << scrub
.pgid
<< " at " << scrub
.sched_time
<< dendl
;
7958 if (scrub
.sched_time
> now
) {
7959 // save ourselves some effort
7960 dout(10) << "sched_scrub " << scrub
.pgid
<< " scheduled at " << scrub
.sched_time
7961 << " > " << now
<< dendl
;
7965 if ((scrub
.deadline
.is_zero() || scrub
.deadline
>= now
) && !(time_permit
&& load_is_low
)) {
7966 dout(10) << __func__
<< " not scheduling scrub for " << scrub
.pgid
<< " due to "
7967 << (!time_permit
? "time not permit" : "high load") << dendl
;
7971 PGRef pg
= _lookup_lock_pg(scrub
.pgid
);
7974 // This has already started, so go on to the next scrub job
7975 if (pg
->scrubber
.active
) {
7977 dout(30) << __func__
<< ": already in progress pgid " << scrub
.pgid
<< dendl
;
7980 // Skip other kinds of scrubing if only explicitly requested repairing is allowed
7981 if (allow_requested_repair_only
&& !pg
->scrubber
.must_repair
) {
7983 dout(10) << __func__
<< " skip " << scrub
.pgid
7984 << " because repairing is not explicitly requested on it"
7988 // If it is reserving, let it resolve before going to the next scrub job
7989 if (pg
->scrubber
.local_reserved
&& !pg
->scrubber
.active
) {
7991 dout(30) << __func__
<< ": reserve in progress pgid " << scrub
.pgid
<< dendl
;
7994 dout(10) << "sched_scrub scrubbing " << scrub
.pgid
<< " at " << scrub
.sched_time
7995 << (pg
->get_must_scrub() ? ", explicitly requested" :
7996 (load_is_low
? ", load_is_low" : " deadline < now"))
7998 if (pg
->sched_scrub()) {
8003 } while (service
.next_scrub_stamp(scrub
, &scrub
));
8005 dout(20) << "sched_scrub done" << dendl
;
8008 void OSD::resched_all_scrubs()
8010 dout(10) << __func__
<< ": start" << dendl
;
8011 OSDService::ScrubJob scrub
;
8012 if (service
.first_scrub_stamp(&scrub
)) {
8014 dout(20) << __func__
<< ": examine " << scrub
.pgid
<< dendl
;
8016 PGRef pg
= _lookup_lock_pg(scrub
.pgid
);
8019 if (!pg
->scrubber
.must_scrub
&& !pg
->scrubber
.need_auto
) {
8020 dout(20) << __func__
<< ": reschedule " << scrub
.pgid
<< dendl
;
8021 pg
->on_info_history_change();
8024 } while (service
.next_scrub_stamp(scrub
, &scrub
));
8026 dout(10) << __func__
<< ": done" << dendl
;
8029 MPGStats
* OSD::collect_pg_stats()
8031 // This implementation unconditionally sends every is_primary PG's
8032 // stats every time we're called. This has equivalent cost to the
8033 // previous implementation's worst case where all PGs are busy and
8034 // their stats are always enqueued for sending.
8035 RWLock::RLocker
l(map_lock
);
8037 utime_t had_for
= ceph_clock_now() - had_map_since
;
8038 osd_stat_t cur_stat
= service
.get_osd_stat();
8039 cur_stat
.os_perf_stat
= store
->get_cur_stats();
8041 auto m
= new MPGStats(monc
->get_fsid(), osdmap
->get_epoch(), had_for
);
8042 m
->osd_stat
= cur_stat
;
8044 std::lock_guard lec
{min_last_epoch_clean_lock
};
8045 min_last_epoch_clean
= osdmap
->get_epoch();
8046 min_last_epoch_clean_pgs
.clear();
8048 std::set
<int64_t> pool_set
;
8051 for (auto& pg
: pgs
) {
8052 auto pool
= pg
->pg_id
.pgid
.pool();
8053 pool_set
.emplace((int64_t)pool
);
8054 if (!pg
->is_primary()) {
8057 pg
->get_pg_stats([&](const pg_stat_t
& s
, epoch_t lec
) {
8058 m
->pg_stat
[pg
->pg_id
.pgid
] = s
;
8059 min_last_epoch_clean
= min(min_last_epoch_clean
, lec
);
8060 min_last_epoch_clean_pgs
.push_back(pg
->pg_id
.pgid
);
8064 bool per_pool_stats
= false;
8065 for (auto p
: pool_set
) {
8066 int r
= store
->pool_statfs(p
, &st
);
8067 if (r
== -ENOTSUP
) {
8071 m
->pool_stat
[p
] = st
;
8072 per_pool_stats
= true;
8076 // indicate whether we are reporting per-pool stats
8077 m
->osd_stat
.num_osds
= 1;
8078 m
->osd_stat
.num_per_pool_osds
= per_pool_stats
? 1 : 0;
8083 vector
<DaemonHealthMetric
> OSD::get_health_metrics()
8085 vector
<DaemonHealthMetric
> metrics
;
8087 utime_t oldest_secs
;
8088 const utime_t now
= ceph_clock_now();
8090 too_old
-= cct
->_conf
.get_val
<double>("osd_op_complaint_time");
8092 TrackedOpRef oldest_op
;
8093 auto count_slow_ops
= [&](TrackedOp
& op
) {
8094 if (op
.get_initiated() < too_old
) {
8095 lgeneric_subdout(cct
,osd
,20) << "slow op " << op
.get_desc()
8097 << op
.get_initiated() << dendl
;
8099 if (!oldest_op
|| op
.get_initiated() < oldest_op
->get_initiated()) {
8107 if (op_tracker
.visit_ops_in_flight(&oldest_secs
, count_slow_ops
)) {
8109 derr
<< __func__
<< " reporting " << slow
<< " slow ops, oldest is "
8110 << oldest_op
->get_desc() << dendl
;
8112 metrics
.emplace_back(daemon_metric::SLOW_OPS
, slow
, oldest_secs
);
8114 // no news is not good news.
8115 metrics
.emplace_back(daemon_metric::SLOW_OPS
, 0, 0);
8119 std::lock_guard
l(pending_creates_lock
);
8120 auto n_primaries
= pending_creates_from_mon
;
8121 for (const auto& create
: pending_creates_from_osd
) {
8122 if (create
.second
) {
8126 metrics
.emplace_back(daemon_metric::PENDING_CREATING_PGS
, n_primaries
);
8131 // =====================================================
8134 void OSD::wait_for_new_map(OpRequestRef op
)
8137 if (waiting_for_osdmap
.empty()) {
8138 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
8141 logger
->inc(l_osd_waiting_for_map
);
8142 waiting_for_osdmap
.push_back(op
);
8143 op
->mark_delayed("wait for new map");
8148 * assimilate new OSDMap(s). scan pgs, etc.
8151 void OSD::note_down_osd(int peer
)
8153 ceph_assert(osd_lock
.is_locked());
8154 cluster_messenger
->mark_down_addrs(osdmap
->get_cluster_addrs(peer
));
8156 heartbeat_lock
.Lock();
8157 failure_queue
.erase(peer
);
8158 failure_pending
.erase(peer
);
8159 map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.find(peer
);
8160 if (p
!= heartbeat_peers
.end()) {
8161 p
->second
.con_back
->mark_down();
8162 if (p
->second
.con_front
) {
8163 p
->second
.con_front
->mark_down();
8165 heartbeat_peers
.erase(p
);
8167 heartbeat_lock
.Unlock();
8170 void OSD::note_up_osd(int peer
)
8172 service
.forget_peer_epoch(peer
, osdmap
->get_epoch() - 1);
8173 heartbeat_set_peers_need_update();
8176 struct C_OnMapCommit
: public Context
{
8178 epoch_t first
, last
;
8180 C_OnMapCommit(OSD
*o
, epoch_t f
, epoch_t l
, MOSDMap
*m
)
8181 : osd(o
), first(f
), last(l
), msg(m
) {}
8182 void finish(int r
) override
{
8183 osd
->_committed_osd_maps(first
, last
, msg
);
8188 void OSD::osdmap_subscribe(version_t epoch
, bool force_request
)
8190 std::lock_guard
l(osdmap_subscribe_lock
);
8191 if (latest_subscribed_epoch
>= epoch
&& !force_request
)
8194 latest_subscribed_epoch
= std::max
<uint64_t>(epoch
, latest_subscribed_epoch
);
8196 if (monc
->sub_want_increment("osdmap", epoch
, CEPH_SUBSCRIBE_ONETIME
) ||
8202 void OSD::trim_maps(epoch_t oldest
, int nreceived
, bool skip_maps
)
8204 epoch_t min
= std::min(oldest
, service
.map_cache
.cached_key_lower_bound());
8205 if (min
<= superblock
.oldest_map
)
8209 ObjectStore::Transaction t
;
8210 for (epoch_t e
= superblock
.oldest_map
; e
< min
; ++e
) {
8211 dout(20) << " removing old osdmap epoch " << e
<< dendl
;
8212 t
.remove(coll_t::meta(), get_osdmap_pobject_name(e
));
8213 t
.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e
));
8214 superblock
.oldest_map
= e
+ 1;
8216 if (num
>= cct
->_conf
->osd_target_transaction_size
&& num
>= nreceived
) {
8217 service
.publish_superblock(superblock
);
8218 write_superblock(t
);
8219 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
8220 ceph_assert(tr
== 0);
8223 // skip_maps leaves us with a range of old maps if we fail to remove all
8224 // of them before moving superblock.oldest_map forward to the first map
8225 // in the incoming MOSDMap msg. so we should continue removing them in
8226 // this case, even we could do huge series of delete transactions all at
8233 service
.publish_superblock(superblock
);
8234 write_superblock(t
);
8235 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
8236 ceph_assert(tr
== 0);
8238 // we should not remove the cached maps
8239 ceph_assert(min
<= service
.map_cache
.cached_key_lower_bound());
8242 void OSD::handle_osd_map(MOSDMap
*m
)
8244 // wait for pgs to catch up
8246 // we extend the map cache pins to accomodate pgs slow to consume maps
8247 // for some period, until we hit the max_lag_factor bound, at which point
8248 // we block here to stop injesting more maps than they are able to keep
8250 epoch_t max_lag
= cct
->_conf
->osd_map_cache_size
*
8251 m_osd_pg_epoch_max_lag_factor
;
8252 ceph_assert(max_lag
> 0);
8253 epoch_t osd_min
= 0;
8254 for (auto shard
: shards
) {
8255 epoch_t min
= shard
->get_min_pg_epoch();
8256 if (osd_min
== 0 || min
< osd_min
) {
8261 osdmap
->get_epoch() > max_lag
&&
8262 osdmap
->get_epoch() - max_lag
> osd_min
) {
8263 epoch_t need
= osdmap
->get_epoch() - max_lag
;
8264 dout(10) << __func__
<< " waiting for pgs to catch up (need " << need
8265 << " max_lag " << max_lag
<< ")" << dendl
;
8266 for (auto shard
: shards
) {
8267 epoch_t min
= shard
->get_min_pg_epoch();
8269 dout(10) << __func__
<< " waiting for pgs to consume " << need
8270 << " (shard " << shard
->shard_id
<< " min " << min
8271 << ", map cache is " << cct
->_conf
->osd_map_cache_size
8272 << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
8274 unlock_guard unlock
{osd_lock
};
8275 shard
->wait_min_pg_epoch(need
);
8281 ceph_assert(osd_lock
.is_locked());
8282 map
<epoch_t
,OSDMapRef
> added_maps
;
8283 map
<epoch_t
,bufferlist
> added_maps_bl
;
8284 if (m
->fsid
!= monc
->get_fsid()) {
8285 dout(0) << "handle_osd_map fsid " << m
->fsid
<< " != "
8286 << monc
->get_fsid() << dendl
;
8290 if (is_initializing()) {
8291 dout(0) << "ignoring osdmap until we have initialized" << dendl
;
8296 auto priv
= m
->get_connection()->get_priv();
8297 if (auto session
= static_cast<Session
*>(priv
.get());
8298 session
&& !(session
->entity_name
.is_mon() ||
8299 session
->entity_name
.is_osd())) {
8301 dout(10) << "got osd map from Session " << session
8302 << " which we can't take maps from (not a mon or osd)" << dendl
;
8307 // share with the objecter
8309 service
.objecter
->handle_osd_map(m
);
8311 epoch_t first
= m
->get_first();
8312 epoch_t last
= m
->get_last();
8313 dout(3) << "handle_osd_map epochs [" << first
<< "," << last
<< "], i have "
8314 << superblock
.newest_map
8315 << ", src has [" << m
->oldest_map
<< "," << m
->newest_map
<< "]"
8318 logger
->inc(l_osd_map
);
8319 logger
->inc(l_osd_mape
, last
- first
+ 1);
8320 if (first
<= superblock
.newest_map
)
8321 logger
->inc(l_osd_mape_dup
, superblock
.newest_map
- first
+ 1);
8322 if (service
.max_oldest_map
< m
->oldest_map
) {
8323 service
.max_oldest_map
= m
->oldest_map
;
8324 ceph_assert(service
.max_oldest_map
>= superblock
.oldest_map
);
8327 // make sure there is something new, here, before we bother flushing
8328 // the queues and such
8329 if (last
<= superblock
.newest_map
) {
8330 dout(10) << " no new maps here, dropping" << dendl
;
8336 bool skip_maps
= false;
8337 if (first
> superblock
.newest_map
+ 1) {
8338 dout(10) << "handle_osd_map message skips epochs "
8339 << superblock
.newest_map
+ 1 << ".." << (first
-1) << dendl
;
8340 if (m
->oldest_map
<= superblock
.newest_map
+ 1) {
8341 osdmap_subscribe(superblock
.newest_map
+ 1, false);
8345 // always try to get the full range of maps--as many as we can. this
8346 // 1- is good to have
8347 // 2- is at present the only way to ensure that we get a *full* map as
8349 if (m
->oldest_map
< first
) {
8350 osdmap_subscribe(m
->oldest_map
- 1, true);
8357 ObjectStore::Transaction t
;
8358 uint64_t txn_size
= 0;
8360 // store new maps: queue for disk and put in the osdmap cache
8361 epoch_t start
= std::max(superblock
.newest_map
+ 1, first
);
8362 for (epoch_t e
= start
; e
<= last
; e
++) {
8363 if (txn_size
>= t
.get_num_bytes()) {
8364 derr
<< __func__
<< " transaction size overflowed" << dendl
;
8365 ceph_assert(txn_size
< t
.get_num_bytes());
8367 txn_size
= t
.get_num_bytes();
8368 map
<epoch_t
,bufferlist
>::iterator p
;
8369 p
= m
->maps
.find(e
);
8370 if (p
!= m
->maps
.end()) {
8371 dout(10) << "handle_osd_map got full map for epoch " << e
<< dendl
;
8372 OSDMap
*o
= new OSDMap
;
8373 bufferlist
& bl
= p
->second
;
8377 ghobject_t fulloid
= get_osdmap_pobject_name(e
);
8378 t
.write(coll_t::meta(), fulloid
, 0, bl
.length(), bl
);
8379 added_maps
[e
] = add_map(o
);
8380 added_maps_bl
[e
] = bl
;
8385 p
= m
->incremental_maps
.find(e
);
8386 if (p
!= m
->incremental_maps
.end()) {
8387 dout(10) << "handle_osd_map got inc map for epoch " << e
<< dendl
;
8388 bufferlist
& bl
= p
->second
;
8389 ghobject_t oid
= get_inc_osdmap_pobject_name(e
);
8390 t
.write(coll_t::meta(), oid
, 0, bl
.length(), bl
);
8392 OSDMap
*o
= new OSDMap
;
8395 bool got
= get_map_bl(e
- 1, obl
);
8397 auto p
= added_maps_bl
.find(e
- 1);
8398 ceph_assert(p
!= added_maps_bl
.end());
8404 OSDMap::Incremental inc
;
8405 auto p
= bl
.cbegin();
8408 if (o
->apply_incremental(inc
) < 0) {
8409 derr
<< "ERROR: bad fsid? i have " << osdmap
->get_fsid() << " and inc has " << inc
.fsid
<< dendl
;
8410 ceph_abort_msg("bad fsid");
8414 o
->encode(fbl
, inc
.encode_features
| CEPH_FEATURE_RESERVED
);
8416 bool injected_failure
= false;
8417 if (cct
->_conf
->osd_inject_bad_map_crc_probability
> 0 &&
8418 (rand() % 10000) < cct
->_conf
->osd_inject_bad_map_crc_probability
*10000.0) {
8419 derr
<< __func__
<< " injecting map crc failure" << dendl
;
8420 injected_failure
= true;
8423 if ((inc
.have_crc
&& o
->get_crc() != inc
.full_crc
) || injected_failure
) {
8424 dout(2) << "got incremental " << e
8425 << " but failed to encode full with correct crc; requesting"
8427 clog
->warn() << "failed to encode map e" << e
<< " with expected crc";
8428 dout(20) << "my encoded map was:\n";
8429 fbl
.hexdump(*_dout
);
8432 request_full_map(e
, last
);
8438 ghobject_t fulloid
= get_osdmap_pobject_name(e
);
8439 t
.write(coll_t::meta(), fulloid
, 0, fbl
.length(), fbl
);
8440 added_maps
[e
] = add_map(o
);
8441 added_maps_bl
[e
] = fbl
;
8445 ceph_abort_msg("MOSDMap lied about what maps it had?");
8448 // even if this map isn't from a mon, we may have satisfied our subscription
8449 monc
->sub_got("osdmap", last
);
8451 if (!m
->maps
.empty() && requested_full_first
) {
8452 dout(10) << __func__
<< " still missing full maps " << requested_full_first
8453 << ".." << requested_full_last
<< dendl
;
8454 rerequest_full_maps();
8457 if (superblock
.oldest_map
) {
8458 // make sure we at least keep pace with incoming maps
8459 trim_maps(m
->oldest_map
, last
- first
+ 1, skip_maps
);
8460 pg_num_history
.prune(superblock
.oldest_map
);
8463 if (!superblock
.oldest_map
|| skip_maps
)
8464 superblock
.oldest_map
= first
;
8465 superblock
.newest_map
= last
;
8466 superblock
.current_epoch
= last
;
8468 // note in the superblock that we were clean thru the prior epoch
8469 epoch_t boot_epoch
= service
.get_boot_epoch();
8470 if (boot_epoch
&& boot_epoch
>= superblock
.mounted
) {
8471 superblock
.mounted
= boot_epoch
;
8472 superblock
.clean_thru
= last
;
8475 // check for pg_num changes and deleted pools
8477 for (auto& i
: added_maps
) {
8479 if (!(lastmap
= service
.try_get_map(i
.first
- 1))) {
8480 dout(10) << __func__
<< " can't get previous map " << i
.first
- 1
8481 << " probably first start of this osd" << dendl
;
8485 ceph_assert(lastmap
->get_epoch() + 1 == i
.second
->get_epoch());
8486 for (auto& j
: lastmap
->get_pools()) {
8487 if (!i
.second
->have_pg_pool(j
.first
)) {
8488 pg_num_history
.log_pool_delete(i
.first
, j
.first
);
8489 dout(10) << __func__
<< " recording final pg_pool_t for pool "
8490 << j
.first
<< dendl
;
8491 // this information is needed by _make_pg() if have to restart before
8492 // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
8493 ghobject_t obj
= make_final_pool_info_oid(j
.first
);
8495 encode(j
.second
, bl
, CEPH_FEATURES_ALL
);
8496 string name
= lastmap
->get_pool_name(j
.first
);
8498 map
<string
,string
> profile
;
8499 if (lastmap
->get_pg_pool(j
.first
)->is_erasure()) {
8500 profile
= lastmap
->get_erasure_code_profile(
8501 lastmap
->get_pg_pool(j
.first
)->erasure_code_profile
);
8503 encode(profile
, bl
);
8504 t
.write(coll_t::meta(), obj
, 0, bl
.length(), bl
);
8505 service
.store_deleted_pool_pg_num(j
.first
, j
.second
.get_pg_num());
8506 } else if (unsigned new_pg_num
= i
.second
->get_pg_num(j
.first
);
8507 new_pg_num
!= j
.second
.get_pg_num()) {
8508 dout(10) << __func__
<< " recording pool " << j
.first
<< " pg_num "
8509 << j
.second
.get_pg_num() << " -> " << new_pg_num
<< dendl
;
8510 pg_num_history
.log_pg_num_change(i
.first
, j
.first
, new_pg_num
);
8513 for (auto& j
: i
.second
->get_pools()) {
8514 if (!lastmap
->have_pg_pool(j
.first
)) {
8515 dout(10) << __func__
<< " recording new pool " << j
.first
<< " pg_num "
8516 << j
.second
.get_pg_num() << dendl
;
8517 pg_num_history
.log_pg_num_change(i
.first
, j
.first
,
8518 j
.second
.get_pg_num());
8523 pg_num_history
.epoch
= last
;
8526 ::encode(pg_num_history
, bl
);
8527 t
.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl
.length(), bl
);
8528 dout(20) << __func__
<< " pg_num_history " << pg_num_history
<< dendl
;
8531 // superblock and commit
8532 write_superblock(t
);
8533 t
.register_on_commit(new C_OnMapCommit(this, start
, last
, m
));
8534 store
->queue_transaction(
8537 service
.publish_superblock(superblock
);
8540 void OSD::_committed_osd_maps(epoch_t first
, epoch_t last
, MOSDMap
*m
)
8542 dout(10) << __func__
<< " " << first
<< ".." << last
<< dendl
;
8543 if (is_stopping()) {
8544 dout(10) << __func__
<< " bailing, we are shutting down" << dendl
;
8547 std::lock_guard
l(osd_lock
);
8548 if (is_stopping()) {
8549 dout(10) << __func__
<< " bailing, we are shutting down" << dendl
;
8552 map_lock
.get_write();
8554 bool do_shutdown
= false;
8555 bool do_restart
= false;
8556 bool network_error
= false;
8558 // advance through the new maps
8559 for (epoch_t cur
= first
; cur
<= last
; cur
++) {
8560 dout(10) << " advance to epoch " << cur
8561 << " (<= last " << last
8562 << " <= newest_map " << superblock
.newest_map
8565 OSDMapRef newmap
= get_map(cur
);
8566 ceph_assert(newmap
); // we just cached it above!
8568 // start blacklisting messages sent to peers that go down.
8569 service
.pre_publish_map(newmap
);
8571 // kill connections to newly down osds
8572 bool waited_for_reservations
= false;
8574 osdmap
->get_all_osds(old
);
8575 for (set
<int>::iterator p
= old
.begin(); p
!= old
.end(); ++p
) {
8577 osdmap
->is_up(*p
) && // in old map
8578 newmap
->is_down(*p
)) { // but not the new one
8579 if (!waited_for_reservations
) {
8580 service
.await_reserved_maps();
8581 waited_for_reservations
= true;
8584 } else if (*p
!= whoami
&&
8585 osdmap
->is_down(*p
) &&
8586 newmap
->is_up(*p
)) {
8591 if (osdmap
->is_noup(whoami
) != newmap
->is_noup(whoami
)) {
8592 dout(10) << __func__
<< " NOUP flag changed in " << newmap
->get_epoch()
8595 // this captures the case where we sent the boot message while
8596 // NOUP was being set on the mon and our boot request was
8597 // dropped, and then later it is cleared. it imperfectly
8598 // handles the case where our original boot message was not
8599 // dropped and we restart even though we might have booted, but
8600 // that is harmless (boot will just take slightly longer).
8608 service
.retrieve_epochs(&boot_epoch
, &up_epoch
, NULL
);
8610 osdmap
->is_up(whoami
) &&
8611 osdmap
->get_addrs(whoami
) == client_messenger
->get_myaddrs()) {
8612 up_epoch
= osdmap
->get_epoch();
8613 dout(10) << "up_epoch is " << up_epoch
<< dendl
;
8615 boot_epoch
= osdmap
->get_epoch();
8616 dout(10) << "boot_epoch is " << boot_epoch
<< dendl
;
8618 service
.set_epochs(&boot_epoch
, &up_epoch
, NULL
);
8622 had_map_since
= ceph_clock_now();
8624 epoch_t _bind_epoch
= service
.get_bind_epoch();
8625 if (osdmap
->is_up(whoami
) &&
8626 osdmap
->get_addrs(whoami
).legacy_equals(
8627 client_messenger
->get_myaddrs()) &&
8628 _bind_epoch
< osdmap
->get_up_from(whoami
)) {
8631 dout(1) << "state: booting -> active" << dendl
;
8632 set_state(STATE_ACTIVE
);
8635 // set incarnation so that osd_reqid_t's we generate for our
8636 // objecter requests are unique across restarts.
8637 service
.objecter
->set_client_incarnation(osdmap
->get_epoch());
8638 cancel_pending_failures();
8642 if (osdmap
->get_epoch() > 0 &&
8644 if (!osdmap
->exists(whoami
)) {
8645 dout(0) << "map says i do not exist. shutting down." << dendl
;
8646 do_shutdown
= true; // don't call shutdown() while we have
8647 // everything paused
8648 } else if (!osdmap
->is_up(whoami
) ||
8649 !osdmap
->get_addrs(whoami
).legacy_equals(
8650 client_messenger
->get_myaddrs()) ||
8651 !osdmap
->get_cluster_addrs(whoami
).legacy_equals(
8652 cluster_messenger
->get_myaddrs()) ||
8653 !osdmap
->get_hb_back_addrs(whoami
).legacy_equals(
8654 hb_back_server_messenger
->get_myaddrs()) ||
8655 !osdmap
->get_hb_front_addrs(whoami
).legacy_equals(
8656 hb_front_server_messenger
->get_myaddrs())) {
8657 if (!osdmap
->is_up(whoami
)) {
8658 if (service
.is_preparing_to_stop() || service
.is_stopping()) {
8659 service
.got_stop_ack();
8661 clog
->warn() << "Monitor daemon marked osd." << whoami
<< " down, "
8662 "but it is still running";
8663 clog
->debug() << "map e" << osdmap
->get_epoch()
8664 << " wrongly marked me down at e"
8665 << osdmap
->get_down_at(whoami
);
8667 } else if (!osdmap
->get_addrs(whoami
).legacy_equals(
8668 client_messenger
->get_myaddrs())) {
8669 clog
->error() << "map e" << osdmap
->get_epoch()
8670 << " had wrong client addr (" << osdmap
->get_addrs(whoami
)
8671 << " != my " << client_messenger
->get_myaddrs() << ")";
8672 } else if (!osdmap
->get_cluster_addrs(whoami
).legacy_equals(
8673 cluster_messenger
->get_myaddrs())) {
8674 clog
->error() << "map e" << osdmap
->get_epoch()
8675 << " had wrong cluster addr ("
8676 << osdmap
->get_cluster_addrs(whoami
)
8677 << " != my " << cluster_messenger
->get_myaddrs() << ")";
8678 } else if (!osdmap
->get_hb_back_addrs(whoami
).legacy_equals(
8679 hb_back_server_messenger
->get_myaddrs())) {
8680 clog
->error() << "map e" << osdmap
->get_epoch()
8681 << " had wrong heartbeat back addr ("
8682 << osdmap
->get_hb_back_addrs(whoami
)
8683 << " != my " << hb_back_server_messenger
->get_myaddrs()
8685 } else if (!osdmap
->get_hb_front_addrs(whoami
).legacy_equals(
8686 hb_front_server_messenger
->get_myaddrs())) {
8687 clog
->error() << "map e" << osdmap
->get_epoch()
8688 << " had wrong heartbeat front addr ("
8689 << osdmap
->get_hb_front_addrs(whoami
)
8690 << " != my " << hb_front_server_messenger
->get_myaddrs()
8694 if (!service
.is_stopping()) {
8695 epoch_t up_epoch
= 0;
8696 epoch_t bind_epoch
= osdmap
->get_epoch();
8697 service
.set_epochs(NULL
,&up_epoch
, &bind_epoch
);
8701 utime_t now
= ceph_clock_now();
8702 utime_t grace
= utime_t(cct
->_conf
->osd_max_markdown_period
, 0);
8703 osd_markdown_log
.push_back(now
);
8704 //clear all out-of-date log
8705 while (!osd_markdown_log
.empty() &&
8706 osd_markdown_log
.front() + grace
< now
)
8707 osd_markdown_log
.pop_front();
8708 if ((int)osd_markdown_log
.size() > cct
->_conf
->osd_max_markdown_count
) {
8709 dout(0) << __func__
<< " marked down "
8710 << osd_markdown_log
.size()
8711 << " > osd_max_markdown_count "
8712 << cct
->_conf
->osd_max_markdown_count
8713 << " in last " << grace
<< " seconds, shutting down"
8719 start_waiting_for_healthy();
8721 set
<int> avoid_ports
;
8722 #if defined(__FreeBSD__)
8723 // prevent FreeBSD from grabbing the client_messenger port during
8724 // rebinding. In which case a cluster_meesneger will connect also
8726 client_messenger
->get_myaddrs().get_ports(&avoid_ports
);
8728 cluster_messenger
->get_myaddrs().get_ports(&avoid_ports
);
8729 hb_back_server_messenger
->get_myaddrs().get_ports(&avoid_ports
);
8730 hb_front_server_messenger
->get_myaddrs().get_ports(&avoid_ports
);
8732 int r
= cluster_messenger
->rebind(avoid_ports
);
8734 do_shutdown
= true; // FIXME: do_restart?
8735 network_error
= true;
8736 dout(0) << __func__
<< " marked down:"
8737 << " rebind cluster_messenger failed" << dendl
;
8740 r
= hb_back_server_messenger
->rebind(avoid_ports
);
8742 do_shutdown
= true; // FIXME: do_restart?
8743 network_error
= true;
8744 dout(0) << __func__
<< " marked down:"
8745 << " rebind hb_back_server_messenger failed" << dendl
;
8748 r
= hb_front_server_messenger
->rebind(avoid_ports
);
8750 do_shutdown
= true; // FIXME: do_restart?
8751 network_error
= true;
8752 dout(0) << __func__
<< " marked down:"
8753 << " rebind hb_front_server_messenger failed" << dendl
;
8756 hb_front_client_messenger
->mark_down_all();
8757 hb_back_client_messenger
->mark_down_all();
8759 reset_heartbeat_peers(true);
8764 map_lock
.put_write();
8766 check_osdmap_features();
8771 if (is_active() || is_waiting_for_healthy())
8772 maybe_update_heartbeat_peers();
8779 if (network_error
) {
8780 cancel_pending_failures();
8782 // trigger shutdown in a different thread
8783 dout(0) << __func__
<< " shutdown OSD via async signal" << dendl
;
8784 queue_async_signal(SIGINT
);
8786 else if (m
->newest_map
&& m
->newest_map
> last
) {
8787 dout(10) << " msg say newest map is " << m
->newest_map
8788 << ", requesting more" << dendl
;
8789 osdmap_subscribe(osdmap
->get_epoch()+1, false);
8791 else if (is_preboot()) {
8792 if (m
->get_source().is_mon())
8793 _preboot(m
->oldest_map
, m
->newest_map
);
8797 else if (do_restart
)
8802 void OSD::check_osdmap_features()
8804 // adjust required feature bits?
8806 // we have to be a bit careful here, because we are accessing the
8807 // Policy structures without taking any lock. in particular, only
8808 // modify integer values that can safely be read by a racing CPU.
8809 // since we are only accessing existing Policy structures a their
8810 // current memory location, and setting or clearing bits in integer
8811 // fields, and we are the only writer, this is not a problem.
8814 Messenger::Policy p
= client_messenger
->get_default_policy();
8816 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_CLIENT
, &mask
);
8817 if ((p
.features_required
& mask
) != features
) {
8818 dout(0) << "crush map has features " << features
8819 << ", adjusting msgr requires for clients" << dendl
;
8820 p
.features_required
= (p
.features_required
& ~mask
) | features
;
8821 client_messenger
->set_default_policy(p
);
8825 Messenger::Policy p
= client_messenger
->get_policy(entity_name_t::TYPE_MON
);
8827 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_MON
, &mask
);
8828 if ((p
.features_required
& mask
) != features
) {
8829 dout(0) << "crush map has features " << features
8830 << " was " << p
.features_required
8831 << ", adjusting msgr requires for mons" << dendl
;
8832 p
.features_required
= (p
.features_required
& ~mask
) | features
;
8833 client_messenger
->set_policy(entity_name_t::TYPE_MON
, p
);
8837 Messenger::Policy p
= cluster_messenger
->get_policy(entity_name_t::TYPE_OSD
);
8839 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_OSD
, &mask
);
8841 if ((p
.features_required
& mask
) != features
) {
8842 dout(0) << "crush map has features " << features
8843 << ", adjusting msgr requires for osds" << dendl
;
8844 p
.features_required
= (p
.features_required
& ~mask
) | features
;
8845 cluster_messenger
->set_policy(entity_name_t::TYPE_OSD
, p
);
8848 if (!superblock
.compat_features
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
)) {
8849 dout(0) << __func__
<< " enabling on-disk ERASURE CODES compat feature" << dendl
;
8850 superblock
.compat_features
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
);
8851 ObjectStore::Transaction t
;
8852 write_superblock(t
);
8853 int err
= store
->queue_transaction(service
.meta_ch
, std::move(t
), NULL
);
8854 ceph_assert(err
== 0);
8858 if (osdmap
->require_osd_release
< CEPH_RELEASE_NAUTILUS
) {
8859 heartbeat_dispatcher
.ms_set_require_authorizer(false);
8862 if (osdmap
->require_osd_release
!= last_require_osd_release
) {
8863 dout(1) << __func__
<< " require_osd_release " << last_require_osd_release
8864 << " -> " << to_string(osdmap
->require_osd_release
) << dendl
;
8865 store
->write_meta("require_osd_release",
8866 stringify((int)osdmap
->require_osd_release
));
8867 last_require_osd_release
= osdmap
->require_osd_release
;
8871 struct C_FinishSplits
: public Context
{
8874 C_FinishSplits(OSD
*osd
, const set
<PGRef
> &in
)
8875 : osd(osd
), pgs(in
) {}
8876 void finish(int r
) override
{
8877 osd
->_finish_splits(pgs
);
8881 void OSD::_finish_splits(set
<PGRef
>& pgs
)
8883 dout(10) << __func__
<< " " << pgs
<< dendl
;
8886 PG::RecoveryCtx rctx
= create_context();
8887 for (set
<PGRef
>::iterator i
= pgs
.begin();
8893 dout(10) << __func__
<< " " << *pg
<< dendl
;
8894 epoch_t e
= pg
->get_osdmap_epoch();
8895 pg
->handle_initialize(&rctx
);
8896 pg
->queue_null(e
, e
);
8897 dispatch_context_transaction(rctx
, pg
);
8900 unsigned shard_index
= pg
->pg_id
.hash_to_shard(num_shards
);
8901 shards
[shard_index
]->register_and_wake_split_child(pg
);
8904 dispatch_context(rctx
, 0, service
.get_osdmap());
8907 bool OSD::add_merge_waiter(OSDMapRef nextmap
, spg_t target
, PGRef src
,
8910 std::lock_guard
l(merge_lock
);
8911 auto& p
= merge_waiters
[nextmap
->get_epoch()][target
];
8912 p
[src
->pg_id
] = src
;
8913 dout(10) << __func__
<< " added merge_waiter " << src
->pg_id
8914 << " for " << target
<< ", have " << p
.size() << "/" << need
8916 return p
.size() == need
;
8919 bool OSD::advance_pg(
8922 ThreadPool::TPHandle
&handle
,
8923 PG::RecoveryCtx
*rctx
)
8925 if (osd_epoch
<= pg
->get_osdmap_epoch()) {
8928 ceph_assert(pg
->is_locked());
8929 OSDMapRef lastmap
= pg
->get_osdmap();
8930 ceph_assert(lastmap
->get_epoch() < osd_epoch
);
8931 set
<PGRef
> new_pgs
; // any split children
8934 unsigned old_pg_num
= lastmap
->have_pg_pool(pg
->pg_id
.pool()) ?
8935 lastmap
->get_pg_num(pg
->pg_id
.pool()) : 0;
8936 for (epoch_t next_epoch
= pg
->get_osdmap_epoch() + 1;
8937 next_epoch
<= osd_epoch
;
8939 OSDMapRef nextmap
= service
.try_get_map(next_epoch
);
8941 dout(20) << __func__
<< " missing map " << next_epoch
<< dendl
;
8945 unsigned new_pg_num
=
8946 (old_pg_num
&& nextmap
->have_pg_pool(pg
->pg_id
.pool())) ?
8947 nextmap
->get_pg_num(pg
->pg_id
.pool()) : 0;
8948 if (old_pg_num
&& new_pg_num
&& old_pg_num
!= new_pg_num
) {
8950 if (nextmap
->have_pg_pool(pg
->pg_id
.pool())) {
8952 if (pg
->pg_id
.is_merge_source(
8956 // we are merge source
8957 PGRef spg
= pg
; // carry a ref
8958 dout(1) << __func__
<< " " << pg
->pg_id
8959 << " is merge source, target is " << parent
8961 pg
->write_if_dirty(rctx
);
8962 dispatch_context_transaction(*rctx
, pg
, &handle
);
8964 // release backoffs explicitly, since the on_shutdown path
8965 // aggressively tears down backoff state.
8966 if (pg
->is_primary()) {
8967 pg
->release_pg_backoffs();
8970 OSDShard
*sdata
= pg
->osd_shard
;
8972 std::lock_guard
l(sdata
->shard_lock
);
8974 sdata
->_detach_pg(pg
->pg_slot
);
8975 // update pg count now since we might not get an osdmap
8977 if (pg
->is_primary())
8978 logger
->dec(l_osd_pg_primary
);
8979 else if (pg
->is_replica())
8980 logger
->dec(l_osd_pg_replica
);
8982 logger
->dec(l_osd_pg_stray
);
8987 set
<spg_t
> children
;
8988 parent
.is_split(new_pg_num
, old_pg_num
, &children
);
8989 if (add_merge_waiter(nextmap
, parent
, pg
, children
.size())) {
8990 enqueue_peering_evt(
8993 std::make_shared
<PGPeeringEvent
>(
8994 nextmap
->get_epoch(),
8995 nextmap
->get_epoch(),
9000 } else if (pg
->pg_id
.is_merge_target(old_pg_num
, new_pg_num
)) {
9001 // we are merge target
9002 set
<spg_t
> children
;
9003 pg
->pg_id
.is_split(new_pg_num
, old_pg_num
, &children
);
9004 dout(20) << __func__
<< " " << pg
->pg_id
9005 << " is merge target, sources are " << children
9007 map
<spg_t
,PGRef
> sources
;
9009 std::lock_guard
l(merge_lock
);
9010 auto& s
= merge_waiters
[nextmap
->get_epoch()][pg
->pg_id
];
9011 unsigned need
= children
.size();
9012 dout(20) << __func__
<< " have " << s
.size() << "/"
9014 if (s
.size() == need
) {
9016 merge_waiters
[nextmap
->get_epoch()].erase(pg
->pg_id
);
9017 if (merge_waiters
[nextmap
->get_epoch()].empty()) {
9018 merge_waiters
.erase(nextmap
->get_epoch());
9022 if (!sources
.empty()) {
9023 unsigned new_pg_num
= nextmap
->get_pg_num(pg
->pg_id
.pool());
9024 unsigned split_bits
= pg
->pg_id
.get_split_bits(new_pg_num
);
9025 dout(1) << __func__
<< " merging " << pg
->pg_id
<< dendl
;
9027 sources
, rctx
, split_bits
,
9028 nextmap
->get_pg_pool(
9029 pg
->pg_id
.pool())->last_pg_merge_meta
);
9030 pg
->pg_slot
->waiting_for_merge_epoch
= 0;
9032 dout(20) << __func__
<< " not ready to merge yet" << dendl
;
9033 pg
->write_if_dirty(rctx
);
9035 // kick source(s) to get them ready
9036 for (auto& i
: children
) {
9037 dout(20) << __func__
<< " kicking source " << i
<< dendl
;
9038 enqueue_peering_evt(
9041 std::make_shared
<PGPeeringEvent
>(
9042 nextmap
->get_epoch(),
9043 nextmap
->get_epoch(),
9053 vector
<int> newup
, newacting
;
9054 int up_primary
, acting_primary
;
9055 nextmap
->pg_to_up_acting_osds(
9057 &newup
, &up_primary
,
9058 &newacting
, &acting_primary
);
9059 pg
->handle_advance_map(
9060 nextmap
, lastmap
, newup
, up_primary
,
9061 newacting
, acting_primary
, rctx
);
9063 auto oldpool
= lastmap
->get_pools().find(pg
->pg_id
.pool());
9064 auto newpool
= nextmap
->get_pools().find(pg
->pg_id
.pool());
9065 if (oldpool
!= lastmap
->get_pools().end()
9066 && newpool
!= nextmap
->get_pools().end()) {
9067 dout(20) << __func__
9068 << " new pool opts " << newpool
->second
.opts
9069 << " old pool opts " << oldpool
->second
.opts
9072 double old_min_interval
= 0, new_min_interval
= 0;
9073 oldpool
->second
.opts
.get(pool_opts_t::SCRUB_MIN_INTERVAL
, &old_min_interval
);
9074 newpool
->second
.opts
.get(pool_opts_t::SCRUB_MIN_INTERVAL
, &new_min_interval
);
9076 double old_max_interval
= 0, new_max_interval
= 0;
9077 oldpool
->second
.opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &old_max_interval
);
9078 newpool
->second
.opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &new_max_interval
);
9080 // Assume if an interval is change from set to unset or vice versa the actual config
9081 // is different. Keep it simple even if it is possible to call resched_all_scrub()
9083 if (old_min_interval
!= new_min_interval
|| old_max_interval
!= new_max_interval
) {
9084 pg
->on_info_history_change();
9088 if (new_pg_num
&& old_pg_num
!= new_pg_num
) {
9090 set
<spg_t
> children
;
9091 if (pg
->pg_id
.is_split(
9096 pg
, children
, &new_pgs
, lastmap
, nextmap
,
9102 old_pg_num
= new_pg_num
;
9103 handle
.reset_tp_timeout();
9105 pg
->handle_activate_map(rctx
);
9109 if (!new_pgs
.empty()) {
9110 rctx
->transaction
->register_on_applied(new C_FinishSplits(this, new_pgs
));
9115 void OSD::consume_map()
9117 ceph_assert(osd_lock
.is_locked());
9118 dout(7) << "consume_map version " << osdmap
->get_epoch() << dendl
;
9120 /** make sure the cluster is speaking in SORTBITWISE, because we don't
9121 * speak the older sorting version any more. Be careful not to force
9122 * a shutdown if we are merely processing old maps, though.
9124 if (!osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
) && is_active()) {
9125 derr
<< __func__
<< " SORTBITWISE flag is not set" << dendl
;
9129 service
.pre_publish_map(osdmap
);
9130 service
.await_reserved_maps();
9131 service
.publish_map(osdmap
);
9133 // prime splits and merges
9134 set
<pair
<spg_t
,epoch_t
>> newly_split
; // splits, and when
9135 set
<pair
<spg_t
,epoch_t
>> merge_pgs
; // merge participants, and when
9136 for (auto& shard
: shards
) {
9137 shard
->identify_splits_and_merges(osdmap
, &newly_split
, &merge_pgs
);
9139 if (!newly_split
.empty()) {
9140 for (auto& shard
: shards
) {
9141 shard
->prime_splits(osdmap
, &newly_split
);
9143 ceph_assert(newly_split
.empty());
9146 // prune sent_ready_to_merge
9147 service
.prune_sent_ready_to_merge(osdmap
);
9149 // FIXME, maybe: We could race against an incoming peering message
9150 // that instantiates a merge PG after identify_merges() below and
9151 // never set up its peer to complete the merge. An OSD restart
9152 // would clear it up. This is a hard race to resolve,
9153 // extraordinarily rare (we only merge PGs that are stable and
9154 // clean, so it'd have to be an imported PG to an OSD with a
9155 // slightly stale OSDMap...), so I'm ignoring it for now. We plan to
9156 // replace all of this with a seastar-based code soon anyway.
9157 if (!merge_pgs
.empty()) {
9158 // mark the pgs we already have, or create new and empty merge
9159 // participants for those we are missing. do this all under the
9160 // shard lock so we don't have to worry about racing pg creates
9162 for (auto& shard
: shards
) {
9163 shard
->prime_merges(osdmap
, &merge_pgs
);
9165 ceph_assert(merge_pgs
.empty());
9168 service
.prune_pg_created();
9170 unsigned pushes_to_free
= 0;
9171 for (auto& shard
: shards
) {
9172 shard
->consume_map(osdmap
, &pushes_to_free
);
9175 vector
<spg_t
> pgids
;
9178 // count (FIXME, probably during seastar rewrite)
9179 int num_pg_primary
= 0, num_pg_replica
= 0, num_pg_stray
= 0;
9182 for (auto& pg
: pgs
) {
9183 // FIXME (probably during seastar rewrite): this is lockless and
9184 // racy, but we don't want to take pg lock here.
9185 if (pg
->is_primary())
9187 else if (pg
->is_replica())
9194 // FIXME (as part of seastar rewrite): move to OSDShard
9195 std::lock_guard
l(pending_creates_lock
);
9196 for (auto pg
= pending_creates_from_osd
.begin();
9197 pg
!= pending_creates_from_osd
.end();) {
9198 if (osdmap
->get_pg_acting_rank(pg
->first
, whoami
) < 0) {
9199 dout(10) << __func__
<< " pg " << pg
->first
<< " doesn't map here, "
9200 << "discarding pending_create_from_osd" << dendl
;
9201 pg
= pending_creates_from_osd
.erase(pg
);
9208 service
.maybe_inject_dispatch_delay();
9210 dispatch_sessions_waiting_on_map();
9212 service
.maybe_inject_dispatch_delay();
9214 service
.release_reserved_pushes(pushes_to_free
);
9216 // queue null events to push maps down to individual PGs
9217 for (auto pgid
: pgids
) {
9218 enqueue_peering_evt(
9221 std::make_shared
<PGPeeringEvent
>(
9222 osdmap
->get_epoch(),
9223 osdmap
->get_epoch(),
9226 logger
->set(l_osd_pg
, pgids
.size());
9227 logger
->set(l_osd_pg_primary
, num_pg_primary
);
9228 logger
->set(l_osd_pg_replica
, num_pg_replica
);
9229 logger
->set(l_osd_pg_stray
, num_pg_stray
);
9232 void OSD::activate_map()
9234 ceph_assert(osd_lock
.is_locked());
9236 dout(7) << "activate_map version " << osdmap
->get_epoch() << dendl
;
9238 if (osdmap
->test_flag(CEPH_OSDMAP_FULL
)) {
9239 dout(10) << " osdmap flagged full, doing onetime osdmap subscribe" << dendl
;
9240 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
9244 if (osdmap
->test_flag(CEPH_OSDMAP_NORECOVER
)) {
9245 if (!service
.recovery_is_paused()) {
9246 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl
;
9247 service
.pause_recovery();
9250 if (service
.recovery_is_paused()) {
9251 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl
;
9252 service
.unpause_recovery();
9256 service
.activate_map();
9259 take_waiters(waiting_for_osdmap
);
9262 bool OSD::require_mon_peer(const Message
*m
)
9264 if (!m
->get_connection()->peer_is_mon()) {
9265 dout(0) << "require_mon_peer received from non-mon "
9266 << m
->get_connection()->get_peer_addr()
9267 << " " << *m
<< dendl
;
9273 bool OSD::require_mon_or_mgr_peer(const Message
*m
)
9275 if (!m
->get_connection()->peer_is_mon() &&
9276 !m
->get_connection()->peer_is_mgr()) {
9277 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
9278 << m
->get_connection()->get_peer_addr()
9279 << " " << *m
<< dendl
;
9285 bool OSD::require_osd_peer(const Message
*m
)
9287 if (!m
->get_connection()->peer_is_osd()) {
9288 dout(0) << "require_osd_peer received from non-osd "
9289 << m
->get_connection()->get_peer_addr()
9290 << " " << *m
<< dendl
;
9296 bool OSD::require_self_aliveness(const Message
*m
, epoch_t epoch
)
9298 epoch_t up_epoch
= service
.get_up_epoch();
9299 if (epoch
< up_epoch
) {
9300 dout(7) << "from pre-up epoch " << epoch
<< " < " << up_epoch
<< dendl
;
9305 dout(7) << "still in boot state, dropping message " << *m
<< dendl
;
9312 bool OSD::require_same_peer_instance(const Message
*m
, OSDMapRef
& map
,
9313 bool is_fast_dispatch
)
9315 int from
= m
->get_source().num();
9317 if (map
->is_down(from
) ||
9318 (map
->get_cluster_addrs(from
) != m
->get_source_addrs())) {
9319 dout(5) << "from dead osd." << from
<< ", marking down, "
9320 << " msg was " << m
->get_source_inst().addr
9322 << (map
->is_up(from
) ?
9323 map
->get_cluster_addrs(from
) : entity_addrvec_t())
9325 ConnectionRef con
= m
->get_connection();
9327 auto priv
= con
->get_priv();
9328 if (auto s
= static_cast<Session
*>(priv
.get()); s
) {
9329 if (!is_fast_dispatch
)
9330 s
->session_dispatch_lock
.Lock();
9331 clear_session_waiting_on_map(s
);
9332 con
->set_priv(nullptr); // break ref <-> session cycle, if any
9334 if (!is_fast_dispatch
)
9335 s
->session_dispatch_lock
.Unlock();
9344 * require that we have same (or newer) map, and that
9345 * the source is the pg primary.
9347 bool OSD::require_same_or_newer_map(OpRequestRef
& op
, epoch_t epoch
,
9348 bool is_fast_dispatch
)
9350 const Message
*m
= op
->get_req();
9351 dout(15) << "require_same_or_newer_map " << epoch
9352 << " (i am " << osdmap
->get_epoch() << ") " << m
<< dendl
;
9354 ceph_assert(osd_lock
.is_locked());
9356 // do they have a newer map?
9357 if (epoch
> osdmap
->get_epoch()) {
9358 dout(7) << "waiting for newer map epoch " << epoch
9359 << " > my " << osdmap
->get_epoch() << " with " << m
<< dendl
;
9360 wait_for_new_map(op
);
9364 if (!require_self_aliveness(op
->get_req(), epoch
)) {
9368 // ok, our map is same or newer.. do they still exist?
9369 if (m
->get_connection()->get_messenger() == cluster_messenger
&&
9370 !require_same_peer_instance(op
->get_req(), osdmap
, is_fast_dispatch
)) {
9381 // ----------------------------------------
9384 void OSD::split_pgs(
9386 const set
<spg_t
> &childpgids
, set
<PGRef
> *out_pgs
,
9389 PG::RecoveryCtx
*rctx
)
9391 unsigned pg_num
= nextmap
->get_pg_num(parent
->pg_id
.pool());
9392 parent
->update_snap_mapper_bits(parent
->get_pgid().get_split_bits(pg_num
));
9394 vector
<object_stat_sum_t
> updated_stats
;
9395 parent
->start_split_stats(childpgids
, &updated_stats
);
9397 vector
<object_stat_sum_t
>::iterator stat_iter
= updated_stats
.begin();
9398 for (set
<spg_t
>::const_iterator i
= childpgids
.begin();
9399 i
!= childpgids
.end();
9401 ceph_assert(stat_iter
!= updated_stats
.end());
9402 dout(10) << __func__
<< " splitting " << *parent
<< " into " << *i
<< dendl
;
9403 PG
* child
= _make_pg(nextmap
, *i
);
9405 out_pgs
->insert(child
);
9406 child
->ch
= store
->create_new_collection(child
->coll
);
9409 uint32_t shard_index
= i
->hash_to_shard(shards
.size());
9410 assert(NULL
!= shards
[shard_index
]);
9411 store
->set_collection_commit_queue(child
->coll
, &(shards
[shard_index
]->context_queue
));
9414 unsigned split_bits
= i
->get_split_bits(pg_num
);
9415 dout(10) << " pg_num is " << pg_num
9416 << ", m_seed " << i
->ps()
9417 << ", split_bits is " << split_bits
<< dendl
;
9418 parent
->split_colls(
9422 &child
->get_pool().info
,
9429 child
->init_collection_pool_opts();
9431 child
->finish_split_stats(*stat_iter
, rctx
->transaction
);
9434 ceph_assert(stat_iter
!= updated_stats
.end());
9435 parent
->finish_split_stats(*stat_iter
, rctx
->transaction
);
9441 void OSD::handle_pg_create(OpRequestRef op
)
9443 const MOSDPGCreate
*m
= static_cast<const MOSDPGCreate
*>(op
->get_req());
9444 ceph_assert(m
->get_type() == MSG_OSD_PG_CREATE
);
9446 dout(10) << "handle_pg_create " << *m
<< dendl
;
9448 if (!require_mon_peer(op
->get_req())) {
9452 if (!require_same_or_newer_map(op
, m
->epoch
, false))
9457 map
<pg_t
,utime_t
>::const_iterator ci
= m
->ctimes
.begin();
9458 for (map
<pg_t
,pg_create_t
>::const_iterator p
= m
->mkpg
.begin();
9461 ceph_assert(ci
!= m
->ctimes
.end() && ci
->first
== p
->first
);
9462 epoch_t created
= p
->second
.created
;
9463 if (p
->second
.split_bits
) // Skip split pgs
9467 if (!osdmap
->have_pg_pool(on
.pool())) {
9468 dout(20) << "ignoring pg on deleted pool " << on
<< dendl
;
9472 dout(20) << "mkpg " << on
<< " e" << created
<< "@" << ci
->second
<< dendl
;
9474 // is it still ours?
9475 vector
<int> up
, acting
;
9476 int up_primary
= -1;
9477 int acting_primary
= -1;
9478 osdmap
->pg_to_up_acting_osds(on
, &up
, &up_primary
, &acting
, &acting_primary
);
9479 int role
= osdmap
->calc_pg_role(whoami
, acting
, acting
.size());
9481 if (acting_primary
!= whoami
) {
9482 dout(10) << "mkpg " << on
<< " not acting_primary (" << acting_primary
9483 << "), my role=" << role
<< ", skipping" << dendl
;
9488 bool mapped
= osdmap
->get_primary_shard(on
, &pgid
);
9489 ceph_assert(mapped
);
9492 pg_history_t history
;
9493 build_initial_pg_history(pgid
, created
, ci
->second
, &history
, &pi
);
9495 // The mon won't resend unless the primary changed, so we ignore
9496 // same_interval_since. We'll pass this history with the current
9497 // epoch as the event.
9498 if (history
.same_primary_since
> m
->epoch
) {
9499 dout(10) << __func__
<< ": got obsolete pg create on pgid "
9500 << pgid
<< " from epoch " << m
->epoch
9501 << ", primary changed in " << history
.same_primary_since
9505 enqueue_peering_evt(
9508 std::make_shared
<PGPeeringEvent
>(
9509 osdmap
->get_epoch(),
9510 osdmap
->get_epoch(),
9515 osdmap
->get_epoch(),
9523 std::lock_guard
l(pending_creates_lock
);
9524 if (pending_creates_from_mon
== 0) {
9525 last_pg_create_epoch
= m
->epoch
;
9529 maybe_update_heartbeat_peers();
9533 // ----------------------------------------
9534 // peering and recovery
9536 PG::RecoveryCtx
OSD::create_context()
9538 ObjectStore::Transaction
*t
= new ObjectStore::Transaction
;
9539 map
<int, map
<spg_t
,pg_query_t
> > *query_map
=
9540 new map
<int, map
<spg_t
, pg_query_t
> >;
9541 map
<int,vector
<pair
<pg_notify_t
, PastIntervals
> > > *notify_list
=
9542 new map
<int, vector
<pair
<pg_notify_t
, PastIntervals
> > >;
9543 map
<int,vector
<pair
<pg_notify_t
, PastIntervals
> > > *info_map
=
9544 new map
<int,vector
<pair
<pg_notify_t
, PastIntervals
> > >;
9545 PG::RecoveryCtx
rctx(query_map
, info_map
, notify_list
, t
);
9549 void OSD::dispatch_context_transaction(PG::RecoveryCtx
&ctx
, PG
*pg
,
9550 ThreadPool::TPHandle
*handle
)
9552 if (!ctx
.transaction
->empty() || ctx
.transaction
->has_contexts()) {
9553 int tr
= store
->queue_transaction(
9555 std::move(*ctx
.transaction
), TrackedOpRef(), handle
);
9556 ceph_assert(tr
== 0);
9557 delete (ctx
.transaction
);
9558 ctx
.transaction
= new ObjectStore::Transaction
;
9562 void OSD::dispatch_context(PG::RecoveryCtx
&ctx
, PG
*pg
, OSDMapRef curmap
,
9563 ThreadPool::TPHandle
*handle
)
9565 if (!service
.get_osdmap()->is_up(whoami
)) {
9566 dout(20) << __func__
<< " not up in osdmap" << dendl
;
9567 } else if (!is_active()) {
9568 dout(20) << __func__
<< " not active" << dendl
;
9570 do_notifies(*ctx
.notify_list
, curmap
);
9571 do_queries(*ctx
.query_map
, curmap
);
9572 do_infos(*ctx
.info_map
, curmap
);
9574 if ((!ctx
.transaction
->empty() || ctx
.transaction
->has_contexts()) && pg
) {
9575 int tr
= store
->queue_transaction(
9577 std::move(*ctx
.transaction
), TrackedOpRef(),
9579 ceph_assert(tr
== 0);
9581 delete ctx
.notify_list
;
9582 delete ctx
.query_map
;
9583 delete ctx
.info_map
;
9584 delete ctx
.transaction
;
9587 void OSD::discard_context(PG::RecoveryCtx
& ctx
)
9589 delete ctx
.notify_list
;
9590 delete ctx
.query_map
;
9591 delete ctx
.info_map
;
9592 delete ctx
.transaction
;
9597 * Send an MOSDPGNotify to a primary, with a list of PGs that I have
9598 * content for, and they are primary for.
9601 void OSD::do_notifies(
9602 map
<int,vector
<pair
<pg_notify_t
,PastIntervals
> > >& notify_list
,
9606 vector
<pair
<pg_notify_t
,PastIntervals
> > >::iterator it
=
9607 notify_list
.begin();
9608 it
!= notify_list
.end();
9610 if (!curmap
->is_up(it
->first
)) {
9611 dout(20) << __func__
<< " skipping down osd." << it
->first
<< dendl
;
9614 ConnectionRef con
= service
.get_con_osd_cluster(
9615 it
->first
, curmap
->get_epoch());
9617 dout(20) << __func__
<< " skipping osd." << it
->first
9618 << " (NULL con)" << dendl
;
9621 service
.share_map_peer(it
->first
, con
.get(), curmap
);
9622 dout(7) << __func__
<< " osd." << it
->first
9623 << " on " << it
->second
.size() << " PGs" << dendl
;
9624 MOSDPGNotify
*m
= new MOSDPGNotify(curmap
->get_epoch(),
9626 con
->send_message(m
);
9632 * send out pending queries for info | summaries
9634 void OSD::do_queries(map
<int, map
<spg_t
,pg_query_t
> >& query_map
,
9637 for (map
<int, map
<spg_t
,pg_query_t
> >::iterator pit
= query_map
.begin();
9638 pit
!= query_map
.end();
9640 if (!curmap
->is_up(pit
->first
)) {
9641 dout(20) << __func__
<< " skipping down osd." << pit
->first
<< dendl
;
9644 int who
= pit
->first
;
9645 ConnectionRef con
= service
.get_con_osd_cluster(who
, curmap
->get_epoch());
9647 dout(20) << __func__
<< " skipping osd." << who
9648 << " (NULL con)" << dendl
;
9651 service
.share_map_peer(who
, con
.get(), curmap
);
9652 dout(7) << __func__
<< " querying osd." << who
9653 << " on " << pit
->second
.size() << " PGs" << dendl
;
9654 MOSDPGQuery
*m
= new MOSDPGQuery(curmap
->get_epoch(), pit
->second
);
9655 con
->send_message(m
);
9660 void OSD::do_infos(map
<int,
9661 vector
<pair
<pg_notify_t
, PastIntervals
> > >& info_map
,
9665 vector
<pair
<pg_notify_t
, PastIntervals
> > >::iterator p
=
9667 p
!= info_map
.end();
9669 if (!curmap
->is_up(p
->first
)) {
9670 dout(20) << __func__
<< " skipping down osd." << p
->first
<< dendl
;
9673 for (vector
<pair
<pg_notify_t
,PastIntervals
> >::iterator i
= p
->second
.begin();
9674 i
!= p
->second
.end();
9676 dout(20) << __func__
<< " sending info " << i
->first
.info
9677 << " to shard " << p
->first
<< dendl
;
9679 ConnectionRef con
= service
.get_con_osd_cluster(
9680 p
->first
, curmap
->get_epoch());
9682 dout(20) << __func__
<< " skipping osd." << p
->first
9683 << " (NULL con)" << dendl
;
9686 service
.share_map_peer(p
->first
, con
.get(), curmap
);
9687 MOSDPGInfo
*m
= new MOSDPGInfo(curmap
->get_epoch());
9688 m
->pg_list
= p
->second
;
9689 con
->send_message(m
);
9694 void OSD::handle_fast_pg_create(MOSDPGCreate2
*m
)
9696 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9697 if (!require_mon_peer(m
)) {
9701 for (auto& p
: m
->pgs
) {
9702 spg_t pgid
= p
.first
;
9703 epoch_t created
= p
.second
.first
;
9704 utime_t created_stamp
= p
.second
.second
;
9705 dout(20) << __func__
<< " " << pgid
<< " e" << created
9706 << "@" << created_stamp
<< dendl
;
9708 h
.epoch_created
= created
;
9709 h
.epoch_pool_created
= created
;
9710 h
.same_up_since
= created
;
9711 h
.same_interval_since
= created
;
9712 h
.same_primary_since
= created
;
9713 h
.last_scrub_stamp
= created_stamp
;
9714 h
.last_deep_scrub_stamp
= created_stamp
;
9715 h
.last_clean_scrub_stamp
= created_stamp
;
9717 enqueue_peering_evt(
9720 std::make_shared
<PGPeeringEvent
>(
9735 std::lock_guard
l(pending_creates_lock
);
9736 if (pending_creates_from_mon
== 0) {
9737 last_pg_create_epoch
= m
->epoch
;
9744 void OSD::handle_fast_pg_query(MOSDPGQuery
*m
)
9746 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9747 if (!require_osd_peer(m
)) {
9751 int from
= m
->get_source().num();
9752 for (auto& p
: m
->pg_list
) {
9753 enqueue_peering_evt(
9756 std::make_shared
<PGPeeringEvent
>(
9757 p
.second
.epoch_sent
, p
.second
.epoch_sent
,
9760 pg_shard_t(from
, p
.second
.from
),
9762 p
.second
.epoch_sent
),
9769 void OSD::handle_fast_pg_notify(MOSDPGNotify
* m
)
9771 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9772 if (!require_osd_peer(m
)) {
9776 int from
= m
->get_source().num();
9777 for (auto& p
: m
->get_pg_list()) {
9778 spg_t
pgid(p
.first
.info
.pgid
.pgid
, p
.first
.to
);
9779 enqueue_peering_evt(
9782 std::make_shared
<PGPeeringEvent
>(
9784 p
.first
.query_epoch
,
9786 pgid
, pg_shard_t(from
, p
.first
.from
),
9788 m
->get_connection()->get_features(),
9793 p
.first
.query_epoch
,
9794 p
.first
.info
.history
,
9802 void OSD::handle_fast_pg_info(MOSDPGInfo
* m
)
9804 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9805 if (!require_osd_peer(m
)) {
9809 int from
= m
->get_source().num();
9810 for (auto& p
: m
->pg_list
) {
9811 enqueue_peering_evt(
9812 spg_t(p
.first
.info
.pgid
.pgid
, p
.first
.to
),
9814 std::make_shared
<PGPeeringEvent
>(
9815 p
.first
.epoch_sent
, p
.first
.query_epoch
,
9817 pg_shard_t(from
, p
.first
.from
),
9819 p
.first
.epoch_sent
)))
9825 void OSD::handle_fast_pg_remove(MOSDPGRemove
*m
)
9827 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9828 if (!require_osd_peer(m
)) {
9832 for (auto& pgid
: m
->pg_list
) {
9833 enqueue_peering_evt(
9836 std::make_shared
<PGPeeringEvent
>(
9837 m
->get_epoch(), m
->get_epoch(),
9838 PG::DeleteStart())));
9843 void OSD::handle_fast_force_recovery(MOSDForceRecovery
*m
)
9845 dout(10) << __func__
<< " " << *m
<< dendl
;
9846 if (!require_mon_or_mgr_peer(m
)) {
9850 epoch_t epoch
= get_osdmap_epoch();
9851 for (auto pgid
: m
->forced_pgs
) {
9852 if (m
->options
& OFR_BACKFILL
) {
9853 if (m
->options
& OFR_CANCEL
) {
9854 enqueue_peering_evt(
9857 std::make_shared
<PGPeeringEvent
>(
9859 PG::UnsetForceBackfill())));
9861 enqueue_peering_evt(
9864 std::make_shared
<PGPeeringEvent
>(
9866 PG::SetForceBackfill())));
9868 } else if (m
->options
& OFR_RECOVERY
) {
9869 if (m
->options
& OFR_CANCEL
) {
9870 enqueue_peering_evt(
9873 std::make_shared
<PGPeeringEvent
>(
9875 PG::UnsetForceRecovery())));
9877 enqueue_peering_evt(
9880 std::make_shared
<PGPeeringEvent
>(
9882 PG::SetForceRecovery())));
9889 void OSD::handle_pg_query_nopg(const MQuery
& q
)
9891 spg_t pgid
= q
.pgid
;
9892 dout(10) << __func__
<< " " << pgid
<< dendl
;
9894 OSDMapRef osdmap
= get_osdmap();
9895 if (!osdmap
->have_pg_pool(pgid
.pool()))
9898 dout(10) << " pg " << pgid
<< " dne" << dendl
;
9899 pg_info_t
empty(spg_t(pgid
.pgid
, q
.query
.to
));
9900 ConnectionRef con
= service
.get_con_osd_cluster(q
.from
.osd
, osdmap
->get_epoch());
9903 if (q
.query
.type
== pg_query_t::LOG
||
9904 q
.query
.type
== pg_query_t::FULLLOG
) {
9906 q
.query
.from
, q
.query
.to
,
9907 osdmap
->get_epoch(), empty
,
9908 q
.query
.epoch_sent
);
9910 vector
<pair
<pg_notify_t
,PastIntervals
>> ls
;
9914 q
.query
.from
, q
.query
.to
,
9916 osdmap
->get_epoch(),
9919 m
= new MOSDPGNotify(osdmap
->get_epoch(), ls
);
9921 service
.share_map_peer(q
.from
.osd
, con
.get(), osdmap
);
9922 con
->send_message(m
);
9927 // =========================================================
9930 void OSDService::_maybe_queue_recovery() {
9931 ceph_assert(recovery_lock
.is_locked_by_me());
9932 uint64_t available_pushes
;
9933 while (!awaiting_throttle
.empty() &&
9934 _recover_now(&available_pushes
)) {
9935 uint64_t to_start
= std::min(
9937 cct
->_conf
->osd_recovery_max_single_start
);
9938 _queue_for_recovery(awaiting_throttle
.front(), to_start
);
9939 awaiting_throttle
.pop_front();
9940 dout(10) << __func__
<< " starting " << to_start
9941 << ", recovery_ops_reserved " << recovery_ops_reserved
9942 << " -> " << (recovery_ops_reserved
+ to_start
) << dendl
;
9943 recovery_ops_reserved
+= to_start
;
9947 bool OSDService::_recover_now(uint64_t *available_pushes
)
9949 if (available_pushes
)
9950 *available_pushes
= 0;
9952 if (ceph_clock_now() < defer_recovery_until
) {
9953 dout(15) << __func__
<< " defer until " << defer_recovery_until
<< dendl
;
9957 if (recovery_paused
) {
9958 dout(15) << __func__
<< " paused" << dendl
;
9962 uint64_t max
= cct
->_conf
->osd_recovery_max_active
;
9963 if (max
<= recovery_ops_active
+ recovery_ops_reserved
) {
9964 dout(15) << __func__
<< " active " << recovery_ops_active
9965 << " + reserved " << recovery_ops_reserved
9966 << " >= max " << max
<< dendl
;
9970 if (available_pushes
)
9971 *available_pushes
= max
- recovery_ops_active
- recovery_ops_reserved
;
9976 void OSD::do_recovery(
9977 PG
*pg
, epoch_t queued
, uint64_t reserved_pushes
,
9978 ThreadPool::TPHandle
&handle
)
9980 uint64_t started
= 0;
9983 * When the value of osd_recovery_sleep is set greater than zero, recovery
9984 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9985 * recovery event's schedule time. This is done by adding a
9986 * recovery_requeue_callback event, which re-queues the recovery op using
9987 * queue_recovery_after_sleep.
9989 float recovery_sleep
= get_osd_recovery_sleep();
9991 std::lock_guard
l(service
.sleep_lock
);
9992 if (recovery_sleep
> 0 && service
.recovery_needs_sleep
) {
9994 auto recovery_requeue_callback
= new FunctionContext([this, pgref
, queued
, reserved_pushes
](int r
) {
9995 dout(20) << "do_recovery wake up at "
9997 << ", re-queuing recovery" << dendl
;
9998 std::lock_guard
l(service
.sleep_lock
);
9999 service
.recovery_needs_sleep
= false;
10000 service
.queue_recovery_after_sleep(pgref
.get(), queued
, reserved_pushes
);
10003 // This is true for the first recovery op and when the previous recovery op
10004 // has been scheduled in the past. The next recovery op is scheduled after
10005 // completing the sleep from now.
10006 if (service
.recovery_schedule_time
< ceph_clock_now()) {
10007 service
.recovery_schedule_time
= ceph_clock_now();
10009 service
.recovery_schedule_time
+= recovery_sleep
;
10010 service
.sleep_timer
.add_event_at(service
.recovery_schedule_time
,
10011 recovery_requeue_callback
);
10012 dout(20) << "Recovery event scheduled at "
10013 << service
.recovery_schedule_time
<< dendl
;
10020 std::lock_guard
l(service
.sleep_lock
);
10021 service
.recovery_needs_sleep
= true;
10024 if (pg
->pg_has_reset_since(queued
)) {
10028 dout(10) << "do_recovery starting " << reserved_pushes
<< " " << *pg
<< dendl
;
10029 #ifdef DEBUG_RECOVERY_OIDS
10030 dout(20) << " active was " << service
.recovery_oids
[pg
->pg_id
] << dendl
;
10033 bool do_unfound
= pg
->start_recovery_ops(reserved_pushes
, handle
, &started
);
10034 dout(10) << "do_recovery started " << started
<< "/" << reserved_pushes
10035 << " on " << *pg
<< dendl
;
10038 PG::RecoveryCtx rctx
= create_context();
10039 rctx
.handle
= &handle
;
10040 pg
->find_unfound(queued
, &rctx
);
10041 dispatch_context(rctx
, pg
, pg
->get_osdmap());
10046 ceph_assert(started
<= reserved_pushes
);
10047 service
.release_reserved_pushes(reserved_pushes
);
10050 void OSDService::start_recovery_op(PG
*pg
, const hobject_t
& soid
)
10052 std::lock_guard
l(recovery_lock
);
10053 dout(10) << "start_recovery_op " << *pg
<< " " << soid
10054 << " (" << recovery_ops_active
<< "/"
10055 << cct
->_conf
->osd_recovery_max_active
<< " rops)"
10057 recovery_ops_active
++;
10059 #ifdef DEBUG_RECOVERY_OIDS
10060 dout(20) << " active was " << recovery_oids
[pg
->pg_id
] << dendl
;
10061 ceph_assert(recovery_oids
[pg
->pg_id
].count(soid
) == 0);
10062 recovery_oids
[pg
->pg_id
].insert(soid
);
10066 void OSDService::finish_recovery_op(PG
*pg
, const hobject_t
& soid
, bool dequeue
)
10068 std::lock_guard
l(recovery_lock
);
10069 dout(10) << "finish_recovery_op " << *pg
<< " " << soid
10070 << " dequeue=" << dequeue
10071 << " (" << recovery_ops_active
<< "/" << cct
->_conf
->osd_recovery_max_active
<< " rops)"
10075 ceph_assert(recovery_ops_active
> 0);
10076 recovery_ops_active
--;
10078 #ifdef DEBUG_RECOVERY_OIDS
10079 dout(20) << " active oids was " << recovery_oids
[pg
->pg_id
] << dendl
;
10080 ceph_assert(recovery_oids
[pg
->pg_id
].count(soid
));
10081 recovery_oids
[pg
->pg_id
].erase(soid
);
10084 _maybe_queue_recovery();
10087 bool OSDService::is_recovery_active()
10089 if (cct
->_conf
->osd_debug_pretend_recovery_active
) {
10092 return local_reserver
.has_reservation() || remote_reserver
.has_reservation();
10095 void OSDService::release_reserved_pushes(uint64_t pushes
)
10097 std::lock_guard
l(recovery_lock
);
10098 dout(10) << __func__
<< "(" << pushes
<< "), recovery_ops_reserved "
10099 << recovery_ops_reserved
<< " -> " << (recovery_ops_reserved
-pushes
)
10101 ceph_assert(recovery_ops_reserved
>= pushes
);
10102 recovery_ops_reserved
-= pushes
;
10103 _maybe_queue_recovery();
10106 // =========================================================
10109 bool OSD::op_is_discardable(const MOSDOp
*op
)
10111 // drop client request if they are not connected and can't get the
10113 if (!op
->get_connection()->is_connected()) {
10119 void OSD::enqueue_op(spg_t pg
, OpRequestRef
&& op
, epoch_t epoch
)
10121 const utime_t stamp
= op
->get_req()->get_recv_stamp();
10122 const utime_t latency
= ceph_clock_now() - stamp
;
10123 const unsigned priority
= op
->get_req()->get_priority();
10124 const int cost
= op
->get_req()->get_cost();
10125 const uint64_t owner
= op
->get_req()->get_source().num();
10127 dout(15) << "enqueue_op " << op
<< " prio " << priority
10128 << " cost " << cost
10129 << " latency " << latency
10130 << " epoch " << epoch
10131 << " " << *(op
->get_req()) << dendl
;
10132 op
->osd_trace
.event("enqueue op");
10133 op
->osd_trace
.keyval("priority", priority
);
10134 op
->osd_trace
.keyval("cost", cost
);
10135 op
->mark_queued_for_pg();
10136 logger
->tinc(l_osd_op_before_queue_op_lat
, latency
);
10137 op_shardedwq
.queue(
10139 unique_ptr
<OpQueueItem::OpQueueable
>(new PGOpItem(pg
, std::move(op
))),
10140 cost
, priority
, stamp
, owner
, epoch
));
10143 void OSD::enqueue_peering_evt(spg_t pgid
, PGPeeringEventRef evt
)
10145 dout(15) << __func__
<< " " << pgid
<< " " << evt
->get_desc() << dendl
;
10146 op_shardedwq
.queue(
10148 unique_ptr
<OpQueueItem::OpQueueable
>(new PGPeeringItem(pgid
, evt
)),
10150 cct
->_conf
->osd_peering_op_priority
,
10153 evt
->get_epoch_sent()));
10156 void OSD::enqueue_peering_evt_front(spg_t pgid
, PGPeeringEventRef evt
)
10158 dout(15) << __func__
<< " " << pgid
<< " " << evt
->get_desc() << dendl
;
10159 op_shardedwq
.queue_front(
10161 unique_ptr
<OpQueueItem::OpQueueable
>(new PGPeeringItem(pgid
, evt
)),
10163 cct
->_conf
->osd_peering_op_priority
,
10166 evt
->get_epoch_sent()));
10170 * NOTE: dequeue called in worker thread, with pg lock
10172 void OSD::dequeue_op(
10173 PGRef pg
, OpRequestRef op
,
10174 ThreadPool::TPHandle
&handle
)
10177 OID_EVENT_TRACE_WITH_MSG(op
->get_req(), "DEQUEUE_OP_BEGIN", false);
10179 utime_t now
= ceph_clock_now();
10180 op
->set_dequeued_time(now
);
10181 utime_t latency
= now
- op
->get_req()->get_recv_stamp();
10182 dout(10) << "dequeue_op " << op
<< " prio " << op
->get_req()->get_priority()
10183 << " cost " << op
->get_req()->get_cost()
10184 << " latency " << latency
10185 << " " << *(op
->get_req())
10186 << " pg " << *pg
<< dendl
;
10188 logger
->tinc(l_osd_op_before_dequeue_op_lat
, latency
);
10190 auto priv
= op
->get_req()->get_connection()->get_priv();
10191 if (auto session
= static_cast<Session
*>(priv
.get()); session
) {
10192 maybe_share_map(session
, op
, pg
->get_osdmap());
10195 if (pg
->is_deleting())
10198 op
->mark_reached_pg();
10199 op
->osd_trace
.event("dequeue_op");
10201 pg
->do_request(op
, handle
);
10204 dout(10) << "dequeue_op " << op
<< " finish" << dendl
;
10205 OID_EVENT_TRACE_WITH_MSG(op
->get_req(), "DEQUEUE_OP_END", false);
// Deliver a peering event to a PG (or handle it pg-less), advancing the PG
// to the shard's current osdmap epoch first, then dispatching the resulting
// RecoveryCtx (transactions, notifies, queries).
// NOTE(review): extraction-damaged — the leading parameters (presumably the
// OSDShard* sdata and PGRef pg referenced below) were lost; tokens verbatim.
10209 void OSD::dequeue_peering_evt(
10212 PGPeeringEventRef evt
,
10213 ThreadPool::TPHandle
& handle
)
10215 PG::RecoveryCtx rctx
= create_context();
10216 auto curmap
= sdata
->get_osdmap();
10217 epoch_t need_up_thru
= 0, same_interval_since
= 0;
// pg-less path: only MQuery events can be handled without a PG.
10219 if (const MQuery
*q
= dynamic_cast<const MQuery
*>(evt
->evt
.get())) {
10220 handle_pg_query_nopg(*q
);
10222 derr
<< __func__
<< " unrecognized pg-less event " << evt
->get_desc() << dendl
;
// Normal path: catch the PG up to curmap, then feed it the event.
10225 } else if (advance_pg(curmap
->get_epoch(), pg
, handle
, &rctx
)) {
10226 pg
->do_peering_event(evt
, &rctx
);
10227 if (pg
->is_deleted()) {
10228 // do not dispatch rctx; the final _delete_some already did it.
10229 discard_context(rctx
);
10233 dispatch_context_transaction(rctx
, pg
, &handle
);
10234 need_up_thru
= pg
->get_need_up_thru();
10235 same_interval_since
= pg
->get_same_interval_since();
// Request an up_thru bump from the mon if peering needs it.
10239 if (need_up_thru
) {
10240 queue_want_up_thru(same_interval_since
);
10242 dispatch_context(rctx
, pg
, curmap
, &handle
);
10244 service
.send_pg_temp();
// Drive incremental PG deletion by synthesizing a DeleteSome peering event
// and funneling it through dequeue_peering_evt.
// NOTE(review): extraction-damaged — several argument lines of the
// make_shared<PGPeeringEvent>(...) call are missing; tokens verbatim.
10247 void OSD::dequeue_delete(
10251 ThreadPool::TPHandle
& handle
)
10253 dequeue_peering_evt(
10257 std::make_shared
<PGPeeringEvent
>(
10259 PG::DeleteSome())),
10265 // --------------------------------
// List of config option names this observer wants change notifications for
// (consumed by handle_conf_change below).
// NOTE(review): extraction-damaged — the tail of KEYS (and presumably its
// NULL terminator and the return statement) is missing; tokens verbatim.
10267 const char** OSD::get_tracked_conf_keys() const
10269 static const char* KEYS
[] = {
10270 "osd_max_backfills",
10271 "osd_min_recovery_priority",
10272 "osd_max_trimming_pgs",
10273 "osd_op_complaint_time",
10274 "osd_op_log_threshold",
10275 "osd_op_history_size",
10276 "osd_op_history_duration",
10277 "osd_op_history_slow_op_size",
10278 "osd_op_history_slow_op_threshold",
10279 "osd_enable_op_tracker",
10280 "osd_map_cache_size",
10281 "osd_pg_epoch_max_lag_factor",
10282 "osd_pg_epoch_persisted_max_stale",
10283 // clog & admin clog
10284 "clog_to_monitors",
10286 "clog_to_syslog_facility",
10287 "clog_to_syslog_level",
10288 "osd_objectstore_fuse",
10290 "clog_to_graylog_host",
10291 "clog_to_graylog_port",
10294 "osd_recovery_delay_start",
10295 "osd_client_message_size_cap",
10296 "osd_client_message_cap",
10297 "osd_heartbeat_min_size",
10298 "osd_heartbeat_interval",
10299 "osd_scrub_min_interval",
10300 "osd_scrub_max_interval",
// Config-observer callback: apply runtime changes for each tracked key
// (see get_tracked_conf_keys).  Runs under osd_lock.
// NOTE(review): extraction-damaged region; code tokens kept verbatim,
// comments only added.
10306 void OSD::handle_conf_change(const ConfigProxy
& conf
,
10307 const std::set
<std::string
> &changed
)
10309 Mutex::Locker
l(osd_lock
);
// Backfill reservation limits (both local and remote reservers).
10310 if (changed
.count("osd_max_backfills")) {
10311 service
.local_reserver
.set_max(cct
->_conf
->osd_max_backfills
);
10312 service
.remote_reserver
.set_max(cct
->_conf
->osd_max_backfills
);
10314 if (changed
.count("osd_min_recovery_priority")) {
10315 service
.local_reserver
.set_min_priority(cct
->_conf
->osd_min_recovery_priority
);
10316 service
.remote_reserver
.set_min_priority(cct
->_conf
->osd_min_recovery_priority
);
10318 if (changed
.count("osd_max_trimming_pgs")) {
10319 service
.snap_reserver
.set_max(cct
->_conf
->osd_max_trimming_pgs
);
// Op-tracker tunables.
10321 if (changed
.count("osd_op_complaint_time") ||
10322 changed
.count("osd_op_log_threshold")) {
10323 op_tracker
.set_complaint_and_threshold(cct
->_conf
->osd_op_complaint_time
,
10324 cct
->_conf
->osd_op_log_threshold
);
10326 if (changed
.count("osd_op_history_size") ||
10327 changed
.count("osd_op_history_duration")) {
10328 op_tracker
.set_history_size_and_duration(cct
->_conf
->osd_op_history_size
,
10329 cct
->_conf
->osd_op_history_duration
);
10331 if (changed
.count("osd_op_history_slow_op_size") ||
10332 changed
.count("osd_op_history_slow_op_threshold")) {
10333 op_tracker
.set_history_slow_op_size_and_threshold(cct
->_conf
->osd_op_history_slow_op_size
,
10334 cct
->_conf
->osd_op_history_slow_op_threshold
);
10336 if (changed
.count("osd_enable_op_tracker")) {
10337 op_tracker
.set_tracking(cct
->_conf
->osd_enable_op_tracker
);
// Resize all three osdmap caches (full maps, full-map bufferlists,
// incremental-map bufferlists) in lockstep.
10339 if (changed
.count("osd_map_cache_size")) {
10340 service
.map_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
10341 service
.map_bl_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
10342 service
.map_bl_inc_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
// Any cluster-log routing option -> rebuild log config.
10344 if (changed
.count("clog_to_monitors") ||
10345 changed
.count("clog_to_syslog") ||
10346 changed
.count("clog_to_syslog_level") ||
10347 changed
.count("clog_to_syslog_facility") ||
10348 changed
.count("clog_to_graylog") ||
10349 changed
.count("clog_to_graylog_host") ||
10350 changed
.count("clog_to_graylog_port") ||
10351 changed
.count("host") ||
10352 changed
.count("fsid")) {
10353 update_log_config();
10355 if (changed
.count("osd_pg_epoch_max_lag_factor")) {
10356 m_osd_pg_epoch_max_lag_factor
= conf
.get_val
<double>(
10357 "osd_pg_epoch_max_lag_factor");
10360 #ifdef HAVE_LIBFUSE
10361 if (changed
.count("osd_objectstore_fuse")) {
// Toggle the objectstore FUSE mount (argument false presumably tears it
// down before re-evaluating the option — confirm against enable_disable_fuse).
10363 enable_disable_fuse(false);
10368 if (changed
.count("osd_recovery_delay_start")) {
10369 service
.defer_recovery(cct
->_conf
->osd_recovery_delay_start
);
10370 service
.kick_recovery_queue();
// Client message-count throttle: only raise/update a live throttler
// (newval > 0 guard avoids disabling via a zero max).
10373 if (changed
.count("osd_client_message_cap")) {
10374 uint64_t newval
= cct
->_conf
->osd_client_message_cap
;
10375 Messenger::Policy pol
= client_messenger
->get_policy(entity_name_t::TYPE_CLIENT
);
10376 if (pol
.throttler_messages
&& newval
> 0) {
10377 pol
.throttler_messages
->reset_max(newval
);
// Same for the byte-size throttle.
10380 if (changed
.count("osd_client_message_size_cap")) {
10381 uint64_t newval
= cct
->_conf
->osd_client_message_size_cap
;
10382 Messenger::Policy pol
= client_messenger
->get_policy(entity_name_t::TYPE_CLIENT
);
10383 if (pol
.throttler_bytes
&& newval
> 0) {
10384 pol
.throttler_bytes
->reset_max(newval
);
// Scrub interval changes reschedule every PG's scrub.
10388 if (changed
.count("osd_scrub_min_interval") ||
10389 changed
.count("osd_scrub_max_interval")) {
10390 resched_all_scrubs();
10391 dout(0) << __func__
<< ": scrub interval change" << dendl
;
// Re-parse the clog_* / host / fsid options and push the resulting routing
// maps into the cluster-log client.
// NOTE(review): extraction-damaged — trailing arguments of both calls and
// the success/failure control flow are missing; tokens verbatim.
10396 void OSD::update_log_config()
10398 map
<string
,string
> log_to_monitors
;
10399 map
<string
,string
> log_to_syslog
;
10400 map
<string
,string
> log_channel
;
10401 map
<string
,string
> log_prio
;
10402 map
<string
,string
> log_to_graylog
;
10403 map
<string
,string
> log_to_graylog_host
;
10404 map
<string
,string
> log_to_graylog_port
;
// Parse current option values into the per-channel maps ...
10408 if (parse_log_client_options(cct
, log_to_monitors
, log_to_syslog
,
10409 log_channel
, log_prio
, log_to_graylog
,
10410 log_to_graylog_host
, log_to_graylog_port
,
// ... then apply them to the LogClient.
10412 clog
->update_config(log_to_monitors
, log_to_syslog
,
10413 log_channel
, log_prio
, log_to_graylog
,
10414 log_to_graylog_host
, log_to_graylog_port
,
10416 derr
<< "log_to_monitors " << log_to_monitors
<< dendl
;
// Sanity-check related config values at startup; warns (via clog) when the
// osdmap cache is too small relative to how stale persisted PG epochs may be.
10419 void OSD::check_config()
10421 // some sanity checks
10422 if (cct
->_conf
->osd_map_cache_size
<= (int)cct
->_conf
->osd_pg_epoch_persisted_max_stale
+ 2) {
10423 clog
->warn() << "osd_map_cache_size (" << cct
->_conf
->osd_map_cache_size
<< ")"
10424 << " is not > osd_pg_epoch_persisted_max_stale ("
10425 << cct
->_conf
->osd_pg_epoch_persisted_max_stale
<< ")";
10429 // --------------------------------
// Block until the objecter has fetched the latest osdmap from the monitors.
// NOTE(review): the declaration/wait of `cond` is lost to extraction.
10431 void OSD::get_latest_osdmap()
10433 dout(10) << __func__
<< " -- start" << dendl
;
10436 service
.objecter
->wait_for_latest_osdmap(&cond
);
10439 dout(10) << __func__
<< " -- finish" << dendl
;
10442 // --------------------------------
// Classify an incoming MOSDOp: walk its sub-ops and set the op's rmw flags
// (read/write/cache/pgop, rwordered, class read/write, skip-promote /
// skip-handle-cache hints) based on op codes and called class methods.
// NOTE(review): extraction-damaged region — many flag-setting statements,
// returns and braces are missing between the visible conditions; all code
// tokens are kept verbatim, comments only added.
10444 int OSD::init_op_flags(OpRequestRef
& op
)
10446 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
10447 vector
<OSDOp
>::const_iterator iter
;
10449 // client flags have no bearing on whether an op is a read, write, etc.
10452 if (m
->has_flag(CEPH_OSD_FLAG_RWORDERED
)) {
10453 op
->set_force_rwordered();
10456 // set bits based on op codes, called methods.
10457 for (iter
= m
->ops
.begin(); iter
!= m
->ops
.end(); ++iter
) {
10458 if ((iter
->op
.op
== CEPH_OSD_OP_WATCH
&&
10459 iter
->op
.watch
.op
== CEPH_OSD_WATCH_OP_PING
)) {
10460 /* This a bit odd. PING isn't actually a write. It can't
10461 * result in an update to the object_info. PINGs also aren't
10462 * resent, so there's no reason to write out a log entry.
10464 * However, we pipeline them behind writes, so let's force
10465 * the write_ordered flag.
10467 op
->set_force_rwordered();
// Modify / read / cache mode bits (the set_* calls were lost in extraction).
10469 if (ceph_osd_op_mode_modify(iter
->op
.op
))
10472 if (ceph_osd_op_mode_read(iter
->op
.op
))
10475 // set READ flag if there are src_oids
10476 if (iter
->soid
.oid
.name
.length())
10479 // set PGOP flag if there are PG ops
10480 if (ceph_osd_op_type_pg(iter
->op
.op
))
10483 if (ceph_osd_op_mode_cache(iter
->op
.op
))
10486 // check for ec base pool
10487 int64_t poolid
= m
->get_pg().pool();
10488 const pg_pool_t
*pool
= osdmap
->get_pg_pool(poolid
);
10489 if (pool
&& pool
->is_tier()) {
10490 const pg_pool_t
*base_pool
= osdmap
->get_pg_pool(pool
->tier_of
);
10491 if (base_pool
&& base_pool
->require_rollback()) {
// Whitelist of ops that are safe to proxy/promote against an EC
// (rollback-requiring) base pool; anything else falls into the
// (missing) block below.
10492 if ((iter
->op
.op
!= CEPH_OSD_OP_READ
) &&
10493 (iter
->op
.op
!= CEPH_OSD_OP_CHECKSUM
) &&
10494 (iter
->op
.op
!= CEPH_OSD_OP_CMPEXT
) &&
10495 (iter
->op
.op
!= CEPH_OSD_OP_STAT
) &&
10496 (iter
->op
.op
!= CEPH_OSD_OP_ISDIRTY
) &&
10497 (iter
->op
.op
!= CEPH_OSD_OP_UNDIRTY
) &&
10498 (iter
->op
.op
!= CEPH_OSD_OP_GETXATTR
) &&
10499 (iter
->op
.op
!= CEPH_OSD_OP_GETXATTRS
) &&
10500 (iter
->op
.op
!= CEPH_OSD_OP_CMPXATTR
) &&
10501 (iter
->op
.op
!= CEPH_OSD_OP_ASSERT_VER
) &&
10502 (iter
->op
.op
!= CEPH_OSD_OP_LIST_WATCHERS
) &&
10503 (iter
->op
.op
!= CEPH_OSD_OP_LIST_SNAPS
) &&
10504 (iter
->op
.op
!= CEPH_OSD_OP_SETALLOCHINT
) &&
10505 (iter
->op
.op
!= CEPH_OSD_OP_WRITEFULL
) &&
10506 (iter
->op
.op
!= CEPH_OSD_OP_ROLLBACK
) &&
10507 (iter
->op
.op
!= CEPH_OSD_OP_CREATE
) &&
10508 (iter
->op
.op
!= CEPH_OSD_OP_DELETE
) &&
10509 (iter
->op
.op
!= CEPH_OSD_OP_SETXATTR
) &&
10510 (iter
->op
.op
!= CEPH_OSD_OP_RMXATTR
) &&
10511 (iter
->op
.op
!= CEPH_OSD_OP_STARTSYNC
) &&
10512 (iter
->op
.op
!= CEPH_OSD_OP_COPY_GET
) &&
10513 (iter
->op
.op
!= CEPH_OSD_OP_COPY_FROM
)) {
// Per-op special cases.
10519 switch (iter
->op
.op
) {
10520 case CEPH_OSD_OP_CALL
:
// Object-class call: decode class/method names from indata and
// consult the class handler for the method's RD/WR/PROMOTE flags.
10522 bufferlist::iterator bp
= const_cast<bufferlist
&>(iter
->indata
).begin();
10523 int is_write
, is_read
;
10524 string cname
, mname
;
10525 bp
.copy(iter
->op
.cls
.class_len
, cname
);
10526 bp
.copy(iter
->op
.cls
.method_len
, mname
);
10528 ClassHandler::ClassData
*cls
;
10529 int r
= class_handler
->open_class(cname
, &cls
);
10531 derr
<< "class " << cname
<< " open got " << cpp_strerror(r
) << dendl
;
10534 else if (r
!= -EPERM
) // propagate permission errors
10538 int flags
= cls
->get_method_flags(mname
.c_str());
10540 if (flags
== -ENOENT
)
10546 is_read
= flags
& CLS_METHOD_RD
;
10547 is_write
= flags
& CLS_METHOD_WR
;
10548 bool is_promote
= flags
& CLS_METHOD_PROMOTE
;
10550 dout(10) << "class " << cname
<< " method " << mname
<< " "
10551 << "flags=" << (is_read
? "r" : "")
10552 << (is_write
? "w" : "")
10553 << (is_promote
? "p" : "")
10556 op
->set_class_read();
10558 op
->set_class_write();
10561 op
->add_class(std::move(cname
), std::move(mname
), is_read
, is_write
,
10566 case CEPH_OSD_OP_WATCH
:
10567 // force the read bit for watch since it is depends on previous
10568 // watch state (and may return early if the watch exists) or, in
10569 // the case of ping, is simply a read op.
10572 case CEPH_OSD_OP_NOTIFY
:
10573 case CEPH_OSD_OP_NOTIFY_ACK
:
10579 case CEPH_OSD_OP_DELETE
:
10580 // if we get a delete with FAILOK we can skip handle cache. without
10581 // FAILOK we still need to promote (or do something smarter) to
10582 // determine whether to return ENOENT or 0.
10583 if (iter
== m
->ops
.begin() &&
10584 iter
->op
.flags
== CEPH_OSD_OP_FLAG_FAILOK
) {
10585 op
->set_skip_handle_cache();
10587 // skip promotion when proxying a delete op
10588 if (m
->ops
.size() == 1) {
10589 op
->set_skip_promote();
10593 case CEPH_OSD_OP_CACHE_TRY_FLUSH
:
10594 case CEPH_OSD_OP_CACHE_FLUSH
:
10595 case CEPH_OSD_OP_CACHE_EVICT
:
10596 // If try_flush/flush/evict is the only op, can skip handle cache.
10597 if (m
->ops
.size() == 1) {
10598 op
->set_skip_handle_cache();
10602 case CEPH_OSD_OP_READ
:
10603 case CEPH_OSD_OP_SYNC_READ
:
10604 case CEPH_OSD_OP_SPARSE_READ
:
10605 case CEPH_OSD_OP_CHECKSUM
:
10606 case CEPH_OSD_OP_WRITEFULL
:
// A lone read/writefull with NOCACHE/DONTNEED fadvise hints should
// not trigger cache-tier promotion.
10607 if (m
->ops
.size() == 1 &&
10608 (iter
->op
.flags
& CEPH_OSD_OP_FLAG_FADVISE_NOCACHE
||
10609 iter
->op
.flags
& CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
)) {
10610 op
->set_skip_promote();
10614 // force promotion when pin an object in cache tier
10615 case CEPH_OSD_OP_CACHE_PIN
:
// If no rmw flags were derived, the op is malformed (error return
// lost in extraction).
10624 if (op
->rmw_flags
== 0)
// Install the mgr-provided dynamic perf metric queries: filter out queries
// with an empty key descriptor (unsupported), store the supported set and
// the limits, then push the query set to every PG.
10630 void OSD::set_perf_queries(
10631 const std::map
<OSDPerfMetricQuery
, OSDPerfMetricLimits
> &queries
) {
10632 dout(10) << "setting " << queries
.size() << " queries" << dendl
;
10634 std::list
<OSDPerfMetricQuery
> supported_queries
;
10635 for (auto &it
: queries
) {
10636 auto &query
= it
.first
;
10637 if (!query
.key_descriptor
.empty()) {
10638 supported_queries
.push_back(query
);
10641 if (supported_queries
.size() < queries
.size()) {
10642 dout(1) << queries
.size() - supported_queries
.size()
10643 << " unsupported queries" << dendl
;
// Publish under the perf-queries lock.
10647 Mutex::Locker
locker(m_perf_queries_lock
);
10648 m_perf_queries
= supported_queries
;
10649 m_perf_limits
= queries
;
// Fan out to PGs (the call filling `pgs` was lost in extraction).
10652 std::vector
<PGRef
> pgs
;
10654 for (auto& pg
: pgs
) {
10656 pg
->set_dynamic_perf_stats_queries(supported_queries
);
// Collect dynamic perf stats from every PG and aggregate them into the
// per-query reports handed back to the mgr client.
10661 void OSD::get_perf_reports(
10662 std::map
<OSDPerfMetricQuery
, OSDPerfMetricReport
> *reports
) {
10663 std::vector
<PGRef
> pgs
;
10665 DynamicPerfStats dps
;
10666 for (auto& pg
: pgs
) {
10667 // m_perf_queries can be modified only in set_perf_queries by mgr client
10668 // request, and it is protected by by mgr client's lock, which is held
10669 // when set_perf_queries/get_perf_reports are called, so we may not hold
10670 // m_perf_queries_lock here.
10671 DynamicPerfStats
pg_dps(m_perf_queries
);
10673 pg
->get_dynamic_perf_stats(&pg_dps
);
// Merge of pg_dps into dps was lost in extraction; dps feeds the reports.
10677 dps
.add_to_reports(m_perf_limits
, reports
);
10678 dout(20) << "reports for " << reports
->size() << " queries" << dendl
;
10681 // =============================================================
10683 #undef dout_context
10684 #define dout_context cct
10686 #define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
// Bind a PG to its shard slot: set back-pointers, bump the OSD PG count,
// and index the slot by the PG's current osdmap epoch.  Caller holds
// shard_lock (leading-underscore convention in this file).
10688 void OSDShard::_attach_pg(OSDShardPGSlot
*slot
, PG
*pg
)
10690 dout(10) << pg
->pg_id
<< " " << pg
<< dendl
;
10692 pg
->osd_shard
= this;
10693 pg
->pg_slot
= slot
;
10694 osd
->inc_num_pgs();
10696 slot
->epoch
= pg
->get_osdmap_epoch();
10697 pg_slots_by_epoch
.insert(*slot
);
// Unbind the PG from its slot: clear back-pointers, drop the OSD PG count,
// remove from the epoch index, and wake any wait_min_pg_epoch() waiters
// (the minimum epoch may have just risen).  Caller holds shard_lock.
10700 void OSDShard::_detach_pg(OSDShardPGSlot
*slot
)
10702 dout(10) << slot
->pg
->pg_id
<< " " << slot
->pg
<< dendl
;
10703 slot
->pg
->osd_shard
= nullptr;
10704 slot
->pg
->pg_slot
= nullptr;
10705 slot
->pg
= nullptr;
10706 osd
->dec_num_pgs();
10708 pg_slots_by_epoch
.erase(pg_slots_by_epoch
.iterator_to(*slot
));
10710 if (waiting_for_min_pg_epoch
) {
10711 min_pg_epoch_cond
.notify_all();
// Move a slot to epoch e in the intrusive by-epoch set (erase, update,
// re-insert) and wake min-epoch waiters.  Takes shard_lock itself.
10715 void OSDShard::update_pg_epoch(OSDShardPGSlot
*slot
, epoch_t e
)
10717 std::lock_guard
l(shard_lock
);
10718 dout(30) << "min was " << pg_slots_by_epoch
.begin()->epoch
10719 << " on " << pg_slots_by_epoch
.begin()->pg
->pg_id
<< dendl
;
// Must erase before mutating the key the intrusive set is ordered by
// (the slot->epoch = e assignment line is lost to extraction).
10720 pg_slots_by_epoch
.erase(pg_slots_by_epoch
.iterator_to(*slot
));
10721 dout(20) << slot
->pg
->pg_id
<< " " << slot
->epoch
<< " -> " << e
<< dendl
;
10723 pg_slots_by_epoch
.insert(*slot
);
10724 dout(30) << "min is now " << pg_slots_by_epoch
.begin()->epoch
10725 << " on " << pg_slots_by_epoch
.begin()->pg
->pg_id
<< dendl
;
10726 if (waiting_for_min_pg_epoch
) {
10727 min_pg_epoch_cond
.notify_all();
// Return the smallest osdmap epoch among this shard's PG slots (the
// empty-set and return lines are lost to extraction).
10731 epoch_t
OSDShard::get_min_pg_epoch()
10733 std::lock_guard
l(shard_lock
);
10734 auto p
= pg_slots_by_epoch
.begin();
10735 if (p
== pg_slots_by_epoch
.end()) {
// Block until every PG on this shard has advanced to at least `need`
// (condition-variable wait with predicate; counter tells notifiers that
// someone is waiting).
10741 void OSDShard::wait_min_pg_epoch(epoch_t need
)
10743 std::unique_lock l
{shard_lock
};
10744 ++waiting_for_min_pg_epoch
;
10745 min_pg_epoch_cond
.wait(l
, [need
, this] {
// Predicate: done when no slots remain or the minimum epoch reached need
// (the `return true/false` lines are lost to extraction).
10746 if (pg_slots_by_epoch
.empty()) {
10748 } else if (pg_slots_by_epoch
.begin()->epoch
>= need
) {
10751 dout(10) << need
<< " waiting on "
10752 << pg_slots_by_epoch
.begin()->epoch
<< dendl
;
10756 --waiting_for_min_pg_epoch
;
// Return the largest epoch any slot's waiting_peering queue is waiting for
// (r's declaration and the return are lost to extraction).
10759 epoch_t
OSDShard::get_max_waiting_epoch()
10761 std::lock_guard
l(shard_lock
);
10763 for (auto& i
: pg_slots
) {
10764 if (!i
.second
->waiting_peering
.empty()) {
// waiting_peering is keyed by epoch; rbegin() is its maximum key.
10765 r
= std::max(r
, i
.second
->waiting_peering
.rbegin()->first
);
// Install a new osdmap on this shard, then walk every PG slot: requeue
// peering items that became runnable, drop stale queued items for PGs that
// no longer map here (accumulating their reserved pushes into
// *pushes_to_free), and prune empty slots.  Finally wake the shard worker.
// NOTE(review): extraction-damaged — several continue/brace lines are
// missing; code tokens kept verbatim.
10771 void OSDShard::consume_map(
10772 OSDMapRef
& new_osdmap
,
10773 unsigned *pushes_to_free
)
10775 std::lock_guard
l(shard_lock
);
10776 OSDMapRef old_osdmap
;
// Swap in the new map under the (inner) osdmap_lock.
10778 std::lock_guard
l(osdmap_lock
);
10779 old_osdmap
= std::move(shard_osdmap
);
10780 shard_osdmap
= new_osdmap
;
10782 dout(10) << new_osdmap
->get_epoch()
10783 << " (was " << (old_osdmap
? old_osdmap
->get_epoch() : 0) << ")"
10785 bool queued
= false;
// Scan all slots; p is advanced or erased inside the loop.
10788 auto p
= pg_slots
.begin();
10789 while (p
!= pg_slots
.end()) {
10790 OSDShardPGSlot
*slot
= p
->second
.get();
10791 const spg_t
& pgid
= p
->first
;
10792 dout(20) << __func__
<< " " << pgid
<< dendl
;
// Slots parked on a pending split/merge are left alone.
10793 if (!slot
->waiting_for_split
.empty()) {
10794 dout(20) << __func__
<< " " << pgid
10795 << " waiting for split " << slot
->waiting_for_split
<< dendl
;
10799 if (slot
->waiting_for_merge_epoch
> new_osdmap
->get_epoch()) {
10800 dout(20) << __func__
<< " " << pgid
10801 << " waiting for merge by epoch " << slot
->waiting_for_merge_epoch
// Peering waiters whose target epoch has arrived get requeued.
10806 if (!slot
->waiting_peering
.empty()) {
10807 epoch_t first
= slot
->waiting_peering
.begin()->first
;
10808 if (first
<= new_osdmap
->get_epoch()) {
10809 dout(20) << __func__
<< " " << pgid
10810 << " pending_peering first epoch " << first
10811 << " <= " << new_osdmap
->get_epoch() << ", requeueing" << dendl
;
10812 _wake_pg_slot(pgid
, slot
);
// Ordinary waiters: keep them if the pg still maps to this OSD shard,
// otherwise drop items at/below the new epoch and reclaim their pushes.
10818 if (!slot
->waiting
.empty()) {
10819 if (new_osdmap
->is_up_acting_osd_shard(pgid
, osd
->get_nodeid())) {
10820 dout(20) << __func__
<< " " << pgid
<< " maps to us, keeping"
10825 while (!slot
->waiting
.empty() &&
10826 slot
->waiting
.front().get_map_epoch() <= new_osdmap
->get_epoch()) {
10827 auto& qi
= slot
->waiting
.front();
10828 dout(20) << __func__
<< " " << pgid
10829 << " waiting item " << qi
10830 << " epoch " << qi
.get_map_epoch()
10831 << " <= " << new_osdmap
->get_epoch()
10833 << (qi
.get_map_epoch() < new_osdmap
->get_epoch() ? "stale" :
10835 << ", dropping" << dendl
;
10836 *pushes_to_free
+= qi
.get_reserved_pushes();
10837 slot
->waiting
.pop_front();
// Prune a slot with no pg activity left at all.
10840 if (slot
->waiting
.empty() &&
10841 slot
->num_running
== 0 &&
10842 slot
->waiting_for_split
.empty() &&
10844 dout(20) << __func__
<< " " << pgid
<< " empty, pruning" << dendl
;
10845 p
= pg_slots
.erase(p
);
// Wake the shard worker (presumably only when something was requeued —
// the `if (queued)` line is lost to extraction).
10852 std::lock_guard l
{sdata_wait_lock
};
10853 sdata_cond
.notify_one();
// Requeue everything parked on a slot — to_process, waiting, and all
// waiting_peering epochs — at the front of the shard queue (reverse
// iteration preserves original order under _enqueue_front), then bump
// requeue_seq so in-flight workers notice the requeue.  Caller holds
// shard_lock.
10857 void OSDShard::_wake_pg_slot(
10859 OSDShardPGSlot
*slot
)
10861 dout(20) << __func__
<< " " << pgid
10862 << " to_process " << slot
->to_process
10863 << " waiting " << slot
->waiting
10864 << " waiting_peering " << slot
->waiting_peering
<< dendl
;
10865 for (auto i
= slot
->to_process
.rbegin();
10866 i
!= slot
->to_process
.rend();
10868 _enqueue_front(std::move(*i
), osd
->op_prio_cutoff
);
10870 slot
->to_process
.clear();
10871 for (auto i
= slot
->waiting
.rbegin();
10872 i
!= slot
->waiting
.rend();
10874 _enqueue_front(std::move(*i
), osd
->op_prio_cutoff
);
10876 slot
->waiting
.clear();
10877 for (auto i
= slot
->waiting_peering
.rbegin();
10878 i
!= slot
->waiting_peering
.rend();
10880 // this is overkill; we requeue everything, even if some of these
10881 // items are waiting for maps we don't have yet. FIXME, maybe,
10882 // someday, if we decide this inefficiency matters
10883 for (auto j
= i
->second
.rbegin(); j
!= i
->second
.rend(); ++j
) {
10884 _enqueue_front(std::move(*j
), osd
->op_prio_cutoff
);
10887 slot
->waiting_peering
.clear();
10888 ++slot
->requeue_seq
;
// For each slot on this shard, ask the OSD service which PG splits/merges
// occur between the shard's current map and as_of_osdmap; slots that only
// exist as split placeholders report splits only.
// NOTE(review): extraction-damaged — the branch selecting between the
// slot-has-pg and waiting_for_split cases is partially missing.
10891 void OSDShard::identify_splits_and_merges(
10892 const OSDMapRef
& as_of_osdmap
,
10893 set
<pair
<spg_t
,epoch_t
>> *split_pgs
,
10894 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
10896 std::lock_guard
l(shard_lock
);
10897 if (shard_osdmap
) {
10898 for (auto& i
: pg_slots
) {
10899 const spg_t
& pgid
= i
.first
;
10900 auto *slot
= i
.second
.get();
// Slot holds a live pg: report both splits and merges.
10902 osd
->service
.identify_splits_and_merges(
10903 shard_osdmap
, as_of_osdmap
, pgid
,
10904 split_pgs
, merge_pgs
);
// Placeholder slot awaiting a split: splits only (no merge candidate).
10905 } else if (!slot
->waiting_for_split
.empty()) {
10906 osd
->service
.identify_splits_and_merges(
10907 shard_osdmap
, as_of_osdmap
, pgid
,
10908 split_pgs
, nullptr);
10910 dout(20) << __func__
<< " slot " << pgid
10911 << " has no pg and waiting_for_split "
10912 << slot
->waiting_for_split
<< dendl
;
// Pre-create placeholder slots for split children.  If this shard's map is
// already newer than as_of_osdmap, also compute and prime the children's
// own children ("grandchildren") so nothing is missed.
10918 void OSDShard::prime_splits(const OSDMapRef
& as_of_osdmap
,
10919 set
<pair
<spg_t
,epoch_t
>> *pgids
)
10921 std::lock_guard
l(shard_lock
);
10922 _prime_splits(pgids
);
10923 if (shard_osdmap
->get_epoch() > as_of_osdmap
->get_epoch()) {
10924 set
<pair
<spg_t
,epoch_t
>> newer_children
;
10925 for (auto i
: *pgids
) {
10926 osd
->service
.identify_splits_and_merges(
10927 as_of_osdmap
, shard_osdmap
, i
.first
,
10928 &newer_children
, nullptr);
10930 newer_children
.insert(pgids
->begin(), pgids
->end());
10931 dout(10) << "as_of_osdmap " << as_of_osdmap
->get_epoch() << " < shard "
10932 << shard_osdmap
->get_epoch() << ", new children " << newer_children
10934 _prime_splits(&newer_children
);
10935 // note: we don't care what is left over here for other shards.
10936 // if this shard is ahead of us and one isn't, e.g., one thread is
10937 // calling into prime_splits via _process (due to a newly created
10938 // pg) and this shard has a newer map due to a racing consume_map,
10939 // then any grandchildren left here will be identified (or were
10940 // identified) when the slower shard's osdmap is advanced.
10941 // _prime_splits() will tolerate the case where the pgid is
// Consume from *pgids the split children that hash to THIS shard: create
// (or reuse) their slot, record the split epoch in waiting_for_split, and
// erase them from the set.  Children belonging to other shards are left in
// the set for the caller.  Caller holds shard_lock.
10946 void OSDShard::_prime_splits(set
<pair
<spg_t
,epoch_t
>> *pgids
)
10948 dout(10) << *pgids
<< dendl
;
10949 auto p
= pgids
->begin();
10950 while (p
!= pgids
->end()) {
10951 unsigned shard_index
= p
->first
.hash_to_shard(osd
->num_shards
);
10952 if (shard_index
== shard_id
) {
// emplace: r.second says whether a new slot was created.
10953 auto r
= pg_slots
.emplace(p
->first
, nullptr);
10955 dout(10) << "priming slot " << p
->first
<< " e" << p
->second
<< dendl
;
10956 r
.first
->second
= make_unique
<OSDShardPGSlot
>();
10957 r
.first
->second
->waiting_for_split
.insert(p
->second
);
// Existing slot path (lookup of q lost to extraction).
10960 ceph_assert(q
!= pg_slots
.end());
10961 dout(10) << "priming (existing) slot " << p
->first
<< " e" << p
->second
10963 q
->second
->waiting_for_split
.insert(p
->second
);
10965 p
= pgids
->erase(p
);
// Consume from *merge_pgs the merge participants that hash to THIS shard:
// ensure each has a slot, instantiate an empty placeholder PG if the
// participant doesn't exist (so the merge has something to merge from),
// and stamp the slot with the merge epoch.  Others are left for their
// shards.
10972 void OSDShard::prime_merges(const OSDMapRef
& as_of_osdmap
,
10973 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
10975 std::lock_guard
l(shard_lock
);
10976 dout(20) << __func__
<< " checking shard " << shard_id
10977 << " for remaining merge pgs " << merge_pgs
<< dendl
;
10978 auto p
= merge_pgs
->begin();
10979 while (p
!= merge_pgs
->end()) {
10980 spg_t pgid
= p
->first
;
10981 epoch_t epoch
= p
->second
;
10982 unsigned shard_index
= pgid
.hash_to_shard(osd
->num_shards
);
// Not ours — skip (the ++p/continue lines are lost to extraction).
10983 if (shard_index
!= shard_id
) {
10987 OSDShardPGSlot
*slot
;
10988 auto r
= pg_slots
.emplace(pgid
, nullptr);
10990 r
.first
->second
= make_unique
<OSDShardPGSlot
>();
10992 slot
= r
.first
->second
.get();
// Case 1: participant pg already instantiated on this shard.
10995 dout(20) << __func__
<< " have merge participant pg " << pgid
10996 << " " << slot
->pg
<< dendl
;
// Case 2: a split to this pg is still pending at an earlier epoch; the
// split will create it before the merge epoch.
10997 } else if (!slot
->waiting_for_split
.empty() &&
10998 *slot
->waiting_for_split
.begin() < epoch
) {
10999 dout(20) << __func__
<< " pending split on merge participant pg " << pgid
11000 << " " << slot
->waiting_for_split
<< dendl
;
// Case 3: create an empty placeholder participant now.
11002 dout(20) << __func__
<< " creating empty merge participant " << pgid
11003 << " for merge in " << epoch
<< dendl
;
11004 // leave history zeroed; PG::merge_from() will fill it in.
11005 pg_history_t history
;
11006 PGCreateInfo
cinfo(pgid
, epoch
- 1,
11007 history
, PastIntervals(), false);
11008 PGRef pg
= osd
->handle_pg_create_info(shard_osdmap
, &cinfo
);
11009 _attach_pg(r
.first
->second
.get(), pg
.get());
11010 _wake_pg_slot(pgid
, slot
);
11013 // mark slot for merge
11014 dout(20) << __func__
<< " marking merge participant " << pgid
<< dendl
;
11015 slot
->waiting_for_merge_epoch
= epoch
;
11016 p
= merge_pgs
->erase(p
);
// A split child PG has been constructed: attach it to its primed slot,
// clear the corresponding waiting_for_split epoch, requeue the slot's
// parked work once no splits remain pending, nudge the child with a
// peering event so it advances to the latest osdmap, and wake the shard.
// NOTE(review): extraction-damaged — `epoch`'s declaration and the peering
// event arguments are partially missing; tokens verbatim.
11020 void OSDShard::register_and_wake_split_child(PG
*pg
)
11024 std::lock_guard
l(shard_lock
);
11025 dout(10) << pg
->pg_id
<< " " << pg
<< dendl
;
11026 auto p
= pg_slots
.find(pg
->pg_id
);
11027 ceph_assert(p
!= pg_slots
.end());
11028 auto *slot
= p
->second
.get();
11029 dout(20) << pg
->pg_id
<< " waiting_for_split " << slot
->waiting_for_split
// Slot must be a primed (pg-less) split placeholder.
11031 ceph_assert(!slot
->pg
);
11032 ceph_assert(!slot
->waiting_for_split
.empty());
11033 _attach_pg(slot
, pg
);
11035 epoch
= pg
->get_osdmap_epoch();
11036 ceph_assert(slot
->waiting_for_split
.count(epoch
));
11037 slot
->waiting_for_split
.erase(epoch
);
11038 if (slot
->waiting_for_split
.empty()) {
11039 _wake_pg_slot(pg
->pg_id
, slot
);
11041 dout(10) << __func__
<< " still waiting for split on "
11042 << slot
->waiting_for_split
<< dendl
;
11046 // kick child to ensure it pulls up to the latest osdmap
11047 osd
->enqueue_peering_evt(
11050 std::make_shared
<PGPeeringEvent
>(
11055 std::lock_guard l
{sdata_wait_lock
};
11056 sdata_cond
.notify_one();
// Undo split priming: for every slot that is a child of `parent` under the
// old pg_num (but not the parent itself), requeue its parked work and
// delete the slot.  Erasure is deferred to a second loop so pg_slots is
// not mutated while being iterated.
11059 void OSDShard::unprime_split_children(spg_t parent
, unsigned old_pg_num
)
11061 std::lock_guard
l(shard_lock
);
11062 vector
<spg_t
> to_delete
;
11063 for (auto& i
: pg_slots
) {
11064 if (i
.first
!= parent
&&
11065 i
.first
.get_ancestor(old_pg_num
) == parent
) {
11066 dout(10) << __func__
<< " parent " << parent
<< " clearing " << i
.first
11068 _wake_pg_slot(i
.first
, i
.second
.get());
11069 to_delete
.push_back(i
.first
);
11072 for (auto pgid
: to_delete
) {
11073 pg_slots
.erase(pgid
);
11078 // =============================================================
11080 #undef dout_context
11081 #define dout_context osd->cct
11083 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
// Park a queue item on its slot until the needed osdmap arrives: peering
// items go into waiting_peering keyed by their map epoch, everything else
// onto the plain waiting list.
11085 void OSD::ShardedOpWQ::_add_slot_waiter(
11087 OSDShardPGSlot
*slot
,
11090 if (qi
.is_peering()) {
11091 dout(20) << __func__
<< " " << pgid
11092 << " peering, item epoch is "
11093 << qi
.get_map_epoch()
11094 << ", will wait on " << qi
<< dendl
;
11095 slot
->waiting_peering
[qi
.get_map_epoch()].push_back(std::move(qi
));
11097 dout(20) << __func__
<< " " << pgid
11098 << " item epoch is "
11099 << qi
.get_map_epoch()
11100 << ", will wait on " << qi
<< dendl
;
11101 slot
->waiting
.push_back(std::move(qi
));
11106 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
11108 void OSD::ShardedOpWQ::_process(uint32_t thread_index
, heartbeat_handle_d
*hb
)
11110 uint32_t shard_index
= thread_index
% osd
->num_shards
;
11111 auto& sdata
= osd
->shards
[shard_index
];
11112 ceph_assert(sdata
);
11114 // If all threads of shards do oncommits, there is a out-of-order
11115 // problem. So we choose the thread which has the smallest
11116 // thread_index(thread_index < num_shards) of shard to do oncommit
11118 bool is_smallest_thread_index
= thread_index
< osd
->num_shards
;
11121 sdata
->shard_lock
.lock();
11122 if (sdata
->pqueue
->empty() &&
11123 (!is_smallest_thread_index
|| sdata
->context_queue
.empty())) {
11124 std::unique_lock wait_lock
{sdata
->sdata_wait_lock
};
11125 if (is_smallest_thread_index
&& !sdata
->context_queue
.empty()) {
11126 // we raced with a context_queue addition, don't wait
11127 wait_lock
.unlock();
11128 } else if (!sdata
->stop_waiting
) {
11129 dout(20) << __func__
<< " empty q, waiting" << dendl
;
11130 osd
->cct
->get_heartbeat_map()->clear_timeout(hb
);
11131 sdata
->shard_lock
.unlock();
11132 sdata
->sdata_cond
.wait(wait_lock
);
11133 wait_lock
.unlock();
11134 sdata
->shard_lock
.lock();
11135 if (sdata
->pqueue
->empty() &&
11136 !(is_smallest_thread_index
&& !sdata
->context_queue
.empty())) {
11137 sdata
->shard_lock
.unlock();
11140 osd
->cct
->get_heartbeat_map()->reset_timeout(hb
,
11141 osd
->cct
->_conf
->threadpool_default_timeout
, 0);
11143 dout(20) << __func__
<< " need return immediately" << dendl
;
11144 wait_lock
.unlock();
11145 sdata
->shard_lock
.unlock();
11150 list
<Context
*> oncommits
;
11151 if (is_smallest_thread_index
&& !sdata
->context_queue
.empty()) {
11152 sdata
->context_queue
.swap(oncommits
);
11155 if (sdata
->pqueue
->empty()) {
11156 if (osd
->is_stopping()) {
11157 sdata
->shard_lock
.unlock();
11158 for (auto c
: oncommits
) {
11159 dout(10) << __func__
<< " discarding in-flight oncommit " << c
<< dendl
;
11162 return; // OSD shutdown, discard.
11164 sdata
->shard_lock
.unlock();
11165 handle_oncommits(oncommits
);
11169 OpQueueItem item
= sdata
->pqueue
->dequeue();
11170 if (osd
->is_stopping()) {
11171 sdata
->shard_lock
.unlock();
11172 for (auto c
: oncommits
) {
11173 dout(10) << __func__
<< " discarding in-flight oncommit " << c
<< dendl
;
11176 return; // OSD shutdown, discard.
11179 const auto token
= item
.get_ordering_token();
11180 auto r
= sdata
->pg_slots
.emplace(token
, nullptr);
11182 r
.first
->second
= make_unique
<OSDShardPGSlot
>();
11184 OSDShardPGSlot
*slot
= r
.first
->second
.get();
11185 dout(20) << __func__
<< " " << token
11186 << (r
.second
? " (new)" : "")
11187 << " to_process " << slot
->to_process
11188 << " waiting " << slot
->waiting
11189 << " waiting_peering " << slot
->waiting_peering
11191 slot
->to_process
.push_back(std::move(item
));
11192 dout(20) << __func__
<< " " << slot
->to_process
.back()
11193 << " queued" << dendl
;
11196 PGRef pg
= slot
->pg
;
11198 // lock pg (if we have it)
11200 // note the requeue seq now...
11201 uint64_t requeue_seq
= slot
->requeue_seq
;
11202 ++slot
->num_running
;
11204 sdata
->shard_lock
.unlock();
11205 osd
->service
.maybe_inject_dispatch_delay();
11207 osd
->service
.maybe_inject_dispatch_delay();
11208 sdata
->shard_lock
.lock();
11210 auto q
= sdata
->pg_slots
.find(token
);
11211 if (q
== sdata
->pg_slots
.end()) {
11212 // this can happen if we race with pg removal.
11213 dout(20) << __func__
<< " slot " << token
<< " no longer there" << dendl
;
11215 sdata
->shard_lock
.unlock();
11216 handle_oncommits(oncommits
);
11219 slot
= q
->second
.get();
11220 --slot
->num_running
;
11222 if (slot
->to_process
.empty()) {
11223 // raced with _wake_pg_slot or consume_map
11224 dout(20) << __func__
<< " " << token
11225 << " nothing queued" << dendl
;
11227 sdata
->shard_lock
.unlock();
11228 handle_oncommits(oncommits
);
11231 if (requeue_seq
!= slot
->requeue_seq
) {
11232 dout(20) << __func__
<< " " << token
11233 << " requeue_seq " << slot
->requeue_seq
<< " > our "
11234 << requeue_seq
<< ", we raced with _wake_pg_slot"
11237 sdata
->shard_lock
.unlock();
11238 handle_oncommits(oncommits
);
11241 if (slot
->pg
!= pg
) {
11242 // this can happen if we race with pg removal.
11243 dout(20) << __func__
<< " slot " << token
<< " no longer attached to "
11250 dout(20) << __func__
<< " " << token
11251 << " to_process " << slot
->to_process
11252 << " waiting " << slot
->waiting
11253 << " waiting_peering " << slot
->waiting_peering
<< dendl
;
11255 ThreadPool::TPHandle
tp_handle(osd
->cct
, hb
, timeout_interval
,
11259 auto qi
= std::move(slot
->to_process
.front());
11260 slot
->to_process
.pop_front();
11261 dout(20) << __func__
<< " " << qi
<< " pg " << pg
<< dendl
;
11262 set
<pair
<spg_t
,epoch_t
>> new_children
;
11266 // should this pg shard exist on this osd in this (or a later) epoch?
11267 osdmap
= sdata
->shard_osdmap
;
11268 const PGCreateInfo
*create_info
= qi
.creates_pg();
11269 if (!slot
->waiting_for_split
.empty()) {
11270 dout(20) << __func__
<< " " << token
11271 << " splitting " << slot
->waiting_for_split
<< dendl
;
11272 _add_slot_waiter(token
, slot
, std::move(qi
));
11273 } else if (qi
.get_map_epoch() > osdmap
->get_epoch()) {
11274 dout(20) << __func__
<< " " << token
11275 << " map " << qi
.get_map_epoch() << " > "
11276 << osdmap
->get_epoch() << dendl
;
11277 _add_slot_waiter(token
, slot
, std::move(qi
));
11278 } else if (qi
.is_peering()) {
11279 if (!qi
.peering_requires_pg()) {
11280 // for pg-less events, we run them under the ordering lock, since
11281 // we don't have the pg lock to keep them ordered.
11282 qi
.run(osd
, sdata
, pg
, tp_handle
);
11283 } else if (osdmap
->is_up_acting_osd_shard(token
, osd
->whoami
)) {
11285 if (create_info
->by_mon
&&
11286 osdmap
->get_pg_acting_primary(token
.pgid
) != osd
->whoami
) {
11287 dout(20) << __func__
<< " " << token
11288 << " no pg, no longer primary, ignoring mon create on "
11291 dout(20) << __func__
<< " " << token
11292 << " no pg, should create on " << qi
<< dendl
;
11293 pg
= osd
->handle_pg_create_info(osdmap
, create_info
);
11295 // we created the pg! drop out and continue "normally"!
11296 sdata
->_attach_pg(slot
, pg
.get());
11297 sdata
->_wake_pg_slot(token
, slot
);
11299 // identify split children between create epoch and shard epoch.
11300 osd
->service
.identify_splits_and_merges(
11301 pg
->get_osdmap(), osdmap
, pg
->pg_id
, &new_children
, nullptr);
11302 sdata
->_prime_splits(&new_children
);
11303 // distribute remaining split children to other shards below!
11306 dout(20) << __func__
<< " ignored create on " << qi
<< dendl
;
11309 dout(20) << __func__
<< " " << token
11310 << " no pg, peering, !create, discarding " << qi
<< dendl
;
11313 dout(20) << __func__
<< " " << token
11314 << " no pg, peering, doesn't map here e" << osdmap
->get_epoch()
11315 << ", discarding " << qi
11318 } else if (osdmap
->is_up_acting_osd_shard(token
, osd
->whoami
)) {
11319 dout(20) << __func__
<< " " << token
11320 << " no pg, should exist e" << osdmap
->get_epoch()
11321 << ", will wait on " << qi
<< dendl
;
11322 _add_slot_waiter(token
, slot
, std::move(qi
));
11324 dout(20) << __func__
<< " " << token
11325 << " no pg, shouldn't exist e" << osdmap
->get_epoch()
11326 << ", dropping " << qi
<< dendl
;
11327 // share map with client?
11328 if (boost::optional
<OpRequestRef
> _op
= qi
.maybe_get_op()) {
11329 auto priv
= (*_op
)->get_req()->get_connection()->get_priv();
11330 if (auto session
= static_cast<Session
*>(priv
.get()); session
) {
11331 osd
->maybe_share_map(session
, *_op
, sdata
->shard_osdmap
);
11334 unsigned pushes_to_free
= qi
.get_reserved_pushes();
11335 if (pushes_to_free
> 0) {
11336 sdata
->shard_lock
.unlock();
11337 osd
->service
.release_reserved_pushes(pushes_to_free
);
11338 handle_oncommits(oncommits
);
11342 sdata
->shard_lock
.unlock();
11343 handle_oncommits(oncommits
);
11346 if (qi
.is_peering()) {
11347 OSDMapRef osdmap
= sdata
->shard_osdmap
;
11348 if (qi
.get_map_epoch() > osdmap
->get_epoch()) {
11349 _add_slot_waiter(token
, slot
, std::move(qi
));
11350 sdata
->shard_lock
.unlock();
11352 handle_oncommits(oncommits
);
11356 sdata
->shard_lock
.unlock();
11358 if (!new_children
.empty()) {
11359 for (auto shard
: osd
->shards
) {
11360 shard
->prime_splits(osdmap
, &new_children
);
11362 ceph_assert(new_children
.empty());
11365 // osd_opwq_process marks the point at which an operation has been dequeued
11366 // and will begin to be handled by a worker thread.
11370 if (boost::optional
<OpRequestRef
> _op
= qi
.maybe_get_op()) {
11371 reqid
= (*_op
)->get_reqid();
11374 tracepoint(osd
, opwq_process_start
, reqid
.name
._type
,
11375 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
11378 lgeneric_subdout(osd
->cct
, osd
, 30) << "dequeue status: ";
11379 Formatter
*f
= Formatter::create("json");
11380 f
->open_object_section("q");
11382 f
->close_section();
11387 qi
.run(osd
, sdata
, pg
, tp_handle
);
11392 if (boost::optional
<OpRequestRef
> _op
= qi
.maybe_get_op()) {
11393 reqid
= (*_op
)->get_reqid();
11396 tracepoint(osd
, opwq_process_finish
, reqid
.name
._type
,
11397 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
11400 handle_oncommits(oncommits
);
11403 void OSD::ShardedOpWQ::_enqueue(OpQueueItem
&& item
) {
11404 uint32_t shard_index
=
11405 item
.get_ordering_token().hash_to_shard(osd
->shards
.size());
11407 OSDShard
* sdata
= osd
->shards
[shard_index
];
11408 assert (NULL
!= sdata
);
11409 unsigned priority
= item
.get_priority();
11410 unsigned cost
= item
.get_cost();
11411 sdata
->shard_lock
.lock();
11413 dout(20) << __func__
<< " " << item
<< dendl
;
11414 if (priority
>= osd
->op_prio_cutoff
)
11415 sdata
->pqueue
->enqueue_strict(
11416 item
.get_owner(), priority
, std::move(item
));
11418 sdata
->pqueue
->enqueue(
11419 item
.get_owner(), priority
, cost
, std::move(item
));
11420 sdata
->shard_lock
.unlock();
11422 std::lock_guard l
{sdata
->sdata_wait_lock
};
11423 sdata
->sdata_cond
.notify_one();
11426 void OSD::ShardedOpWQ::_enqueue_front(OpQueueItem
&& item
)
11428 auto shard_index
= item
.get_ordering_token().hash_to_shard(osd
->shards
.size());
11429 auto& sdata
= osd
->shards
[shard_index
];
11430 ceph_assert(sdata
);
11431 sdata
->shard_lock
.lock();
11432 auto p
= sdata
->pg_slots
.find(item
.get_ordering_token());
11433 if (p
!= sdata
->pg_slots
.end() &&
11434 !p
->second
->to_process
.empty()) {
11435 // we may be racing with _process, which has dequeued a new item
11436 // from pqueue, put it on to_process, and is now busy taking the
11437 // pg lock. ensure this old requeued item is ordered before any
11438 // such newer item in to_process.
11439 p
->second
->to_process
.push_front(std::move(item
));
11440 item
= std::move(p
->second
->to_process
.back());
11441 p
->second
->to_process
.pop_back();
11442 dout(20) << __func__
11443 << " " << p
->second
->to_process
.front()
11444 << " shuffled w/ " << item
<< dendl
;
11446 dout(20) << __func__
<< " " << item
<< dendl
;
11448 sdata
->_enqueue_front(std::move(item
), osd
->op_prio_cutoff
);
11449 sdata
->shard_lock
.unlock();
11450 std::lock_guard l
{sdata
->sdata_wait_lock
};
11451 sdata
->sdata_cond
.notify_one();
11455 namespace osd_cmds
{
11457 int heap(CephContext
& cct
, const cmdmap_t
& cmdmap
, Formatter
& f
,
11460 if (!ceph_using_tcmalloc()) {
11461 os
<< "could not issue heap profiler command -- not using tcmalloc!";
11462 return -EOPNOTSUPP
;
11466 if (!cmd_getval(&cct
, cmdmap
, "heapcmd", cmd
)) {
11467 os
<< "unable to get value for command \"" << cmd
<< "\"";
11471 std::vector
<std::string
> cmd_vec
;
11472 get_str_vec(cmd
, cmd_vec
);
11475 if (cmd_getval(&cct
, cmdmap
, "value", val
)) {
11476 cmd_vec
.push_back(val
);
11479 ceph_heap_profiler_handle_command(cmd_vec
, os
);
11484 }} // namespace ceph::osd_cmds
11487 std::ostream
& operator<<(std::ostream
& out
, const io_queue
& q
) {
11489 case io_queue::prioritized
:
11490 out
<< "prioritized";
11492 case io_queue::weightedpriority
:
11493 out
<< "weightedpriority";
11495 case io_queue::mclock_opclass
:
11496 out
<< "mclock_opclass";
11498 case io_queue::mclock_client
:
11499 out
<< "mclock_client";