1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
#include <algorithm>
#include <cmath>
#include <deque>
#include <mutex>
#include <sstream>
#include <utility>

#include <boost/scoped_ptr.hpp>

#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif

#ifdef HAVE_SYS_MOUNT_H
#include <sys/mount.h>
#endif

#include "include/types.h"
#include "include/compat.h"
#include "include/random.h"

#include "osdc/Objecter.h"

#include "common/errno.h"
#include "common/ceph_argparse.h"
#include "common/ceph_time.h"
#include "common/version.h"
#include "common/pick_address.h"
#include "common/blkdev.h"
#include "common/numa.h"

#include "os/ObjectStore.h"
#ifdef HAVE_LIBFUSE
#include "os/FuseStore.h"
#endif

#include "OSD.h"
#include "PrimaryLogPG.h"

#include "msg/Messenger.h"
#include "msg/Message.h"

#include "mon/MonClient.h"

#include "messages/MLog.h"

#include "messages/MGenericMessage.h"
#include "messages/MOSDPing.h"
#include "messages/MOSDFailure.h"
#include "messages/MOSDMarkMeDown.h"
#include "messages/MOSDFull.h"
#include "messages/MOSDOp.h"
#include "messages/MOSDOpReply.h"
#include "messages/MOSDBackoff.h"
#include "messages/MOSDBeacon.h"
#include "messages/MOSDRepOp.h"
#include "messages/MOSDRepOpReply.h"
#include "messages/MOSDBoot.h"
#include "messages/MOSDPGTemp.h"
#include "messages/MOSDPGReadyToMerge.h"

#include "messages/MOSDMap.h"
#include "messages/MMonGetOSDMap.h"
#include "messages/MOSDPGNotify.h"
#include "messages/MOSDPGQuery.h"
#include "messages/MOSDPGLog.h"
#include "messages/MOSDPGRemove.h"
#include "messages/MOSDPGInfo.h"
#include "messages/MOSDPGCreate.h"
#include "messages/MOSDPGCreate2.h"
#include "messages/MOSDPGTrim.h"
#include "messages/MOSDPGScan.h"
#include "messages/MBackfillReserve.h"
#include "messages/MRecoveryReserve.h"
#include "messages/MOSDForceRecovery.h"
#include "messages/MOSDECSubOpWrite.h"
#include "messages/MOSDECSubOpWriteReply.h"
#include "messages/MOSDECSubOpRead.h"
#include "messages/MOSDECSubOpReadReply.h"
#include "messages/MOSDPGCreated.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"

#include "messages/MOSDPeeringOp.h"

#include "messages/MOSDAlive.h"

#include "messages/MOSDScrub.h"
#include "messages/MOSDScrub2.h"
#include "messages/MOSDRepScrub.h"

#include "messages/MMonCommand.h"
#include "messages/MCommand.h"
#include "messages/MCommandReply.h"

#include "messages/MPGStats.h"
#include "messages/MPGStatsAck.h"

#include "messages/MWatchNotify.h"
#include "messages/MOSDPGPush.h"
#include "messages/MOSDPGPushReply.h"
#include "messages/MOSDPGPull.h"

#include "common/perf_counters.h"
#include "common/Timer.h"
#include "common/LogClient.h"
#include "common/AsyncReserver.h"
#include "common/HeartbeatMap.h"
#include "common/admin_socket.h"
#include "common/ceph_context.h"

#include "global/signal_handler.h"
#include "global/pidfile.h"

#include "include/color.h"
#include "perfglue/cpu_profiler.h"
#include "perfglue/heap_profiler.h"

#include "osd/OpRequest.h"

#include "auth/AuthAuthorizeHandler.h"
#include "auth/RotatingKeyRing.h"

#include "objclass/objclass.h"

#include "common/cmdparse.h"
#include "include/str_list.h"
#include "include/util.h"

#include "include/ceph_assert.h"
#include "common/config.h"
#include "common/EventTrace.h"

#include "json_spirit/json_spirit_reader.h"
#include "json_spirit/json_spirit_writer.h"
160 #define TRACEPOINT_DEFINE
161 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
162 #include "tracing/osd.h"
163 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
164 #undef TRACEPOINT_DEFINE
166 #define tracepoint(...)
169 #define dout_context cct
170 #define dout_subsys ceph_subsys_osd
172 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
175 static ostream
& _prefix(std::ostream
* _dout
, int whoami
, epoch_t epoch
) {
176 return *_dout
<< "osd." << whoami
<< " " << epoch
<< " ";
179 //Initial features in new superblock.
180 //Features here are also automatically upgraded
181 CompatSet
OSD::get_osd_initial_compat_set() {
182 CompatSet::FeatureSet ceph_osd_feature_compat
;
183 CompatSet::FeatureSet ceph_osd_feature_ro_compat
;
184 CompatSet::FeatureSet ceph_osd_feature_incompat
;
185 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE
);
186 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO
);
187 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC
);
188 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC
);
189 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES
);
190 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL
);
191 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO
);
192 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO
);
193 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG
);
194 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER
);
195 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS
);
196 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA
);
197 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING
);
198 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO
);
199 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES
);
200 return CompatSet(ceph_osd_feature_compat
, ceph_osd_feature_ro_compat
,
201 ceph_osd_feature_incompat
);
204 //Features are added here that this OSD supports.
205 CompatSet
OSD::get_osd_compat_set() {
206 CompatSet compat
= get_osd_initial_compat_set();
207 //Any features here can be set in code, but not in initial superblock
208 compat
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
);
212 OSDService::OSDService(OSD
*osd
) :
215 whoami(osd
->whoami
), store(osd
->store
),
216 log_client(osd
->log_client
), clog(osd
->clog
),
217 pg_recovery_stats(osd
->pg_recovery_stats
),
218 cluster_messenger(osd
->cluster_messenger
),
219 client_messenger(osd
->client_messenger
),
221 recoverystate_perf(osd
->recoverystate_perf
),
223 class_handler(osd
->class_handler
),
224 osd_max_object_size(cct
->_conf
, "osd_max_object_size"),
225 osd_skip_data_digest(cct
->_conf
, "osd_skip_data_digest"),
226 publish_lock
{ceph::make_mutex("OSDService::publish_lock")},
227 pre_publish_lock
{ceph::make_mutex("OSDService::pre_publish_lock")},
229 peer_map_epoch_lock("OSDService::peer_map_epoch_lock"),
230 sched_scrub_lock("OSDService::sched_scrub_lock"), scrubs_pending(0),
232 agent_lock("OSDService::agent_lock"),
233 agent_valid_iterator(false),
235 flush_mode_high_count(0),
238 agent_stop_flag(false),
239 agent_timer_lock("OSDService::agent_timer_lock"),
240 agent_timer(osd
->client_messenger
->cct
, agent_timer_lock
),
241 last_recalibrate(ceph_clock_now()),
242 promote_max_objects(0),
243 promote_max_bytes(0),
244 objecter(new Objecter(osd
->client_messenger
->cct
, osd
->objecter_messenger
, osd
->monc
, NULL
, 0, 0)),
245 m_objecter_finishers(cct
->_conf
->osd_objecter_finishers
),
246 watch_lock("OSDService::watch_lock"),
247 watch_timer(osd
->client_messenger
->cct
, watch_lock
),
249 recovery_request_lock("OSDService::recovery_request_lock"),
250 recovery_request_timer(cct
, recovery_request_lock
, false),
251 sleep_lock("OSDService::sleep_lock"),
252 sleep_timer(cct
, sleep_lock
, false),
253 reserver_finisher(cct
),
254 local_reserver(cct
, &reserver_finisher
, cct
->_conf
->osd_max_backfills
,
255 cct
->_conf
->osd_min_recovery_priority
),
256 remote_reserver(cct
, &reserver_finisher
, cct
->_conf
->osd_max_backfills
,
257 cct
->_conf
->osd_min_recovery_priority
),
258 pg_temp_lock("OSDService::pg_temp_lock"),
259 snap_reserver(cct
, &reserver_finisher
,
260 cct
->_conf
->osd_max_trimming_pgs
),
261 recovery_lock("OSDService::recovery_lock"),
262 recovery_ops_active(0),
263 recovery_ops_reserved(0),
264 recovery_paused(false),
265 map_cache_lock("OSDService::map_cache_lock"),
266 map_cache(cct
, cct
->_conf
->osd_map_cache_size
),
267 map_bl_cache(cct
->_conf
->osd_map_cache_size
),
268 map_bl_inc_cache(cct
->_conf
->osd_map_cache_size
),
269 stat_lock("OSDService::stat_lock"),
270 full_status_lock("OSDService::full_status_lock"),
272 cur_ratio(0), physical_ratio(0),
273 epoch_lock("OSDService::epoch_lock"),
274 boot_epoch(0), up_epoch(0), bind_epoch(0),
275 is_stopping_lock("OSDService::is_stopping_lock")
277 , pgid_lock("OSDService::pgid_lock")
282 for (int i
= 0; i
< m_objecter_finishers
; i
++) {
284 str
<< "objecter-finisher-" << i
;
285 Finisher
*fin
= new Finisher(osd
->client_messenger
->cct
, str
.str(), "finisher");
286 objecter_finishers
.push_back(fin
);
290 OSDService::~OSDService()
294 for (auto f
: objecter_finishers
) {
303 void OSDService::add_pgid(spg_t pgid
, PG
*pg
){
304 std::lock_guard
l(pgid_lock
);
305 if (!pgid_tracker
.count(pgid
)) {
308 pgid_tracker
[pgid
]++;
310 void OSDService::remove_pgid(spg_t pgid
, PG
*pg
)
312 std::lock_guard
l(pgid_lock
);
313 ceph_assert(pgid_tracker
.count(pgid
));
314 ceph_assert(pgid_tracker
[pgid
] > 0);
315 pgid_tracker
[pgid
]--;
316 if (pgid_tracker
[pgid
] == 0) {
317 pgid_tracker
.erase(pgid
);
318 live_pgs
.erase(pgid
);
321 void OSDService::dump_live_pgids()
323 std::lock_guard
l(pgid_lock
);
324 derr
<< "live pgids:" << dendl
;
325 for (map
<spg_t
, int>::const_iterator i
= pgid_tracker
.cbegin();
326 i
!= pgid_tracker
.cend();
328 derr
<< "\t" << *i
<< dendl
;
329 live_pgs
[i
->first
]->dump_live_ids();
336 void OSDService::identify_splits_and_merges(
340 set
<pair
<spg_t
,epoch_t
>> *split_children
,
341 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
343 if (!old_map
->have_pg_pool(pgid
.pool())) {
346 int old_pgnum
= old_map
->get_pg_num(pgid
.pool());
347 auto p
= osd
->pg_num_history
.pg_nums
.find(pgid
.pool());
348 if (p
== osd
->pg_num_history
.pg_nums
.end()) {
351 dout(20) << __func__
<< " " << pgid
<< " e" << old_map
->get_epoch()
352 << " to e" << new_map
->get_epoch()
353 << " pg_nums " << p
->second
<< dendl
;
355 queue
.push_back(pgid
);
356 while (!queue
.empty()) {
357 auto cur
= queue
.front();
359 unsigned pgnum
= old_pgnum
;
360 for (auto q
= p
->second
.lower_bound(old_map
->get_epoch());
361 q
!= p
->second
.end() &&
362 q
->first
<= new_map
->get_epoch();
364 if (pgnum
< q
->second
) {
366 if (cur
.ps() < pgnum
) {
368 if (cur
.is_split(pgnum
, q
->second
, &children
)) {
369 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
370 << " pg_num " << pgnum
<< " -> " << q
->second
371 << " children " << children
<< dendl
;
372 for (auto i
: children
) {
373 split_children
->insert(make_pair(i
, q
->first
));
377 } else if (cur
.ps() < q
->second
) {
378 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
379 << " pg_num " << pgnum
<< " -> " << q
->second
380 << " is a child" << dendl
;
381 // normally we'd capture this from the parent, but it's
382 // possible the parent doesn't exist yet (it will be
383 // fabricated to allow an intervening merge). note this PG
384 // as a split child here to be sure we catch it.
385 split_children
->insert(make_pair(cur
, q
->first
));
387 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
388 << " pg_num " << pgnum
<< " -> " << q
->second
389 << " is post-split, skipping" << dendl
;
391 } else if (merge_pgs
) {
393 if (cur
.ps() >= q
->second
) {
394 if (cur
.ps() < pgnum
) {
396 if (cur
.is_merge_source(pgnum
, q
->second
, &parent
)) {
398 parent
.is_split(q
->second
, pgnum
, &children
);
399 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
400 << " pg_num " << pgnum
<< " -> " << q
->second
401 << " is merge source, target " << parent
402 << ", source(s) " << children
<< dendl
;
403 merge_pgs
->insert(make_pair(parent
, q
->first
));
404 for (auto c
: children
) {
405 merge_pgs
->insert(make_pair(c
, q
->first
));
409 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
410 << " pg_num " << pgnum
<< " -> " << q
->second
411 << " is beyond old pgnum, skipping" << dendl
;
415 if (cur
.is_split(q
->second
, pgnum
, &children
)) {
416 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
417 << " pg_num " << pgnum
<< " -> " << q
->second
418 << " is merge target, source " << children
<< dendl
;
419 for (auto c
: children
) {
420 merge_pgs
->insert(make_pair(c
, q
->first
));
422 merge_pgs
->insert(make_pair(cur
, q
->first
));
431 void OSDService::need_heartbeat_peer_update()
433 osd
->need_heartbeat_peer_update();
436 void OSDService::start_shutdown()
439 std::lock_guard
l(agent_timer_lock
);
440 agent_timer
.shutdown();
444 std::lock_guard
l(sleep_lock
);
445 sleep_timer
.shutdown();
449 std::lock_guard
l(recovery_request_lock
);
450 recovery_request_timer
.shutdown();
454 void OSDService::shutdown_reserver()
456 reserver_finisher
.wait_for_empty();
457 reserver_finisher
.stop();
460 void OSDService::shutdown()
463 std::lock_guard
l(watch_lock
);
464 watch_timer
.shutdown();
467 objecter
->shutdown();
468 for (auto f
: objecter_finishers
) {
473 publish_map(OSDMapRef());
474 next_osdmap
= OSDMapRef();
477 void OSDService::init()
479 reserver_finisher
.start();
480 for (auto f
: objecter_finishers
) {
483 objecter
->set_client_incarnation(0);
485 // deprioritize objecter in daemonperf output
486 objecter
->get_logger()->set_prio_adjust(-3);
491 agent_thread
.create("osd_srv_agent");
493 if (cct
->_conf
->osd_recovery_delay_start
)
494 defer_recovery(cct
->_conf
->osd_recovery_delay_start
);
497 void OSDService::final_init()
499 objecter
->start(osdmap
.get());
502 void OSDService::activate_map()
504 // wake/unwake the tiering agent
507 !osdmap
->test_flag(CEPH_OSDMAP_NOTIERAGENT
) &&
513 void OSDService::request_osdmap_update(epoch_t e
)
515 osd
->osdmap_subscribe(e
, false);
518 class AgentTimeoutCB
: public Context
{
521 explicit AgentTimeoutCB(PGRef _pg
) : pg(_pg
) {}
522 void finish(int) override
{
523 pg
->agent_choose_mode_restart();
527 void OSDService::agent_entry()
529 dout(10) << __func__
<< " start" << dendl
;
532 while (!agent_stop_flag
) {
533 if (agent_queue
.empty()) {
534 dout(20) << __func__
<< " empty queue" << dendl
;
535 agent_cond
.Wait(agent_lock
);
538 uint64_t level
= agent_queue
.rbegin()->first
;
539 set
<PGRef
>& top
= agent_queue
.rbegin()->second
;
541 << " tiers " << agent_queue
.size()
542 << ", top is " << level
543 << " with pgs " << top
.size()
544 << ", ops " << agent_ops
<< "/"
545 << cct
->_conf
->osd_agent_max_ops
546 << (agent_active
? " active" : " NOT ACTIVE")
548 dout(20) << __func__
<< " oids " << agent_oids
<< dendl
;
549 int max
= cct
->_conf
->osd_agent_max_ops
- agent_ops
;
550 int agent_flush_quota
= max
;
551 if (!flush_mode_high_count
)
552 agent_flush_quota
= cct
->_conf
->osd_agent_max_low_ops
- agent_ops
;
553 if (agent_flush_quota
<= 0 || top
.empty() || !agent_active
) {
554 agent_cond
.Wait(agent_lock
);
558 if (!agent_valid_iterator
|| agent_queue_pos
== top
.end()) {
559 agent_queue_pos
= top
.begin();
560 agent_valid_iterator
= true;
562 PGRef pg
= *agent_queue_pos
;
563 dout(10) << "high_count " << flush_mode_high_count
564 << " agent_ops " << agent_ops
565 << " flush_quota " << agent_flush_quota
<< dendl
;
567 if (!pg
->agent_work(max
, agent_flush_quota
)) {
568 dout(10) << __func__
<< " " << pg
->pg_id
569 << " no agent_work, delay for " << cct
->_conf
->osd_agent_delay_time
570 << " seconds" << dendl
;
572 osd
->logger
->inc(l_osd_tier_delay
);
573 // Queue a timer to call agent_choose_mode for this pg in 5 seconds
574 agent_timer_lock
.Lock();
575 Context
*cb
= new AgentTimeoutCB(pg
);
576 agent_timer
.add_event_after(cct
->_conf
->osd_agent_delay_time
, cb
);
577 agent_timer_lock
.Unlock();
582 dout(10) << __func__
<< " finish" << dendl
;
585 void OSDService::agent_stop()
588 std::lock_guard
l(agent_lock
);
590 // By this time all ops should be cancelled
591 ceph_assert(agent_ops
== 0);
592 // By this time all PGs are shutdown and dequeued
593 if (!agent_queue
.empty()) {
594 set
<PGRef
>& top
= agent_queue
.rbegin()->second
;
595 derr
<< "agent queue not empty, for example " << (*top
.begin())->get_pgid() << dendl
;
596 ceph_abort_msg("agent queue not empty");
599 agent_stop_flag
= true;
605 // -------------------------------------
607 void OSDService::promote_throttle_recalibrate()
609 utime_t now
= ceph_clock_now();
610 double dur
= now
- last_recalibrate
;
611 last_recalibrate
= now
;
612 unsigned prob
= promote_probability_millis
;
614 uint64_t target_obj_sec
= cct
->_conf
->osd_tier_promote_max_objects_sec
;
615 uint64_t target_bytes_sec
= cct
->_conf
->osd_tier_promote_max_bytes_sec
;
617 unsigned min_prob
= 1;
619 uint64_t attempts
, obj
, bytes
;
620 promote_counter
.sample_and_attenuate(&attempts
, &obj
, &bytes
);
621 dout(10) << __func__
<< " " << attempts
<< " attempts, promoted "
622 << obj
<< " objects and " << byte_u_t(bytes
) << "; target "
623 << target_obj_sec
<< " obj/sec or "
624 << byte_u_t(target_bytes_sec
) << "/sec"
627 // calculate what the probability *should* be, given the targets
629 if (attempts
&& dur
> 0) {
630 uint64_t avg_size
= 1;
632 avg_size
= std::max
<uint64_t>(bytes
/ obj
, 1);
633 unsigned po
= (double)target_obj_sec
* dur
* 1000.0 / (double)attempts
;
634 unsigned pb
= (double)target_bytes_sec
/ (double)avg_size
* dur
* 1000.0
636 dout(20) << __func__
<< " po " << po
<< " pb " << pb
<< " avg_size "
637 << avg_size
<< dendl
;
638 if (target_obj_sec
&& target_bytes_sec
)
639 new_prob
= std::min(po
, pb
);
640 else if (target_obj_sec
)
642 else if (target_bytes_sec
)
649 dout(20) << __func__
<< " new_prob " << new_prob
<< dendl
;
651 // correct for persistent skew between target rate and actual rate, adjust
654 if (attempts
&& obj
) {
655 actual
= obj
* 1000 / attempts
;
656 ratio
= (double)actual
/ (double)prob
;
657 new_prob
= (double)new_prob
/ ratio
;
659 new_prob
= std::max(new_prob
, min_prob
);
660 new_prob
= std::min(new_prob
, 1000u);
663 prob
= (prob
+ new_prob
) / 2;
664 prob
= std::max(prob
, min_prob
);
665 prob
= std::min(prob
, 1000u);
666 dout(10) << __func__
<< " actual " << actual
667 << ", actual/prob ratio " << ratio
668 << ", adjusted new_prob " << new_prob
669 << ", prob " << promote_probability_millis
<< " -> " << prob
671 promote_probability_millis
= prob
;
673 // set hard limits for this interval to mitigate stampedes
674 promote_max_objects
= target_obj_sec
* osd
->OSD_TICK_INTERVAL
* 2;
675 promote_max_bytes
= target_bytes_sec
* osd
->OSD_TICK_INTERVAL
* 2;
678 // -------------------------------------
680 float OSDService::get_failsafe_full_ratio()
682 float full_ratio
= cct
->_conf
->osd_failsafe_full_ratio
;
683 if (full_ratio
> 1.0) full_ratio
/= 100.0;
687 OSDService::s_names
OSDService::recalc_full_state(float ratio
, float pratio
, string
&inject
)
689 // The OSDMap ratios take precendence. So if the failsafe is .95 and
690 // the admin sets the cluster full to .96, the failsafe moves up to .96
691 // too. (Not that having failsafe == full is ideal, but it's better than
692 // dropping writes before the clusters appears full.)
693 OSDMapRef osdmap
= get_osdmap();
694 if (!osdmap
|| osdmap
->get_epoch() == 0) {
697 float nearfull_ratio
= osdmap
->get_nearfull_ratio();
698 float backfillfull_ratio
= std::max(osdmap
->get_backfillfull_ratio(), nearfull_ratio
);
699 float full_ratio
= std::max(osdmap
->get_full_ratio(), backfillfull_ratio
);
700 float failsafe_ratio
= std::max(get_failsafe_full_ratio(), full_ratio
);
702 if (osdmap
->require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
703 // use the failsafe for nearfull and full; the mon isn't using the
704 // flags anyway because we're mid-upgrade.
705 full_ratio
= failsafe_ratio
;
706 backfillfull_ratio
= failsafe_ratio
;
707 nearfull_ratio
= failsafe_ratio
;
708 } else if (full_ratio
<= 0 ||
709 backfillfull_ratio
<= 0 ||
710 nearfull_ratio
<= 0) {
711 derr
<< __func__
<< " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl
;
712 // use failsafe flag. ick. the monitor did something wrong or the user
713 // did something stupid.
714 full_ratio
= failsafe_ratio
;
715 backfillfull_ratio
= failsafe_ratio
;
716 nearfull_ratio
= failsafe_ratio
;
719 if (injectfull_state
> NONE
&& injectfull
) {
720 inject
= "(Injected)";
721 return injectfull_state
;
722 } else if (pratio
> failsafe_ratio
) {
724 } else if (ratio
> full_ratio
) {
726 } else if (ratio
> backfillfull_ratio
) {
728 } else if (ratio
> nearfull_ratio
) {
734 void OSDService::check_full_status(float ratio
, float pratio
)
736 std::lock_guard
l(full_status_lock
);
739 physical_ratio
= pratio
;
743 new_state
= recalc_full_state(ratio
, pratio
, inject
);
745 dout(20) << __func__
<< " cur ratio " << ratio
746 << ", physical ratio " << pratio
747 << ", new state " << get_full_state_name(new_state
)
752 if (cur_state
!= new_state
) {
753 dout(10) << __func__
<< " " << get_full_state_name(cur_state
)
754 << " -> " << get_full_state_name(new_state
) << dendl
;
755 if (new_state
== FAILSAFE
) {
756 clog
->error() << "full status failsafe engaged, dropping updates, now "
757 << (int)roundf(ratio
* 100) << "% full";
758 } else if (cur_state
== FAILSAFE
) {
759 clog
->error() << "full status failsafe disengaged, no longer dropping "
760 << "updates, now " << (int)roundf(ratio
* 100) << "% full";
762 cur_state
= new_state
;
766 bool OSDService::need_fullness_update()
768 OSDMapRef osdmap
= get_osdmap();
770 if (osdmap
->exists(whoami
)) {
771 if (osdmap
->get_state(whoami
) & CEPH_OSD_FULL
) {
773 } else if (osdmap
->get_state(whoami
) & CEPH_OSD_BACKFILLFULL
) {
775 } else if (osdmap
->get_state(whoami
) & CEPH_OSD_NEARFULL
) {
782 else if (is_backfillfull())
784 else if (is_nearfull())
789 bool OSDService::_check_inject_full(DoutPrefixProvider
*dpp
, s_names type
) const
791 if (injectfull
&& injectfull_state
>= type
) {
792 // injectfull is either a count of the number of times to return failsafe full
793 // or if -1 then always return full
796 ldpp_dout(dpp
, 10) << __func__
<< " Injected " << get_full_state_name(type
) << " OSD ("
797 << (injectfull
< 0 ? "set" : std::to_string(injectfull
)) << ")"
804 bool OSDService::_check_full(DoutPrefixProvider
*dpp
, s_names type
) const
806 std::lock_guard
l(full_status_lock
);
808 if (_check_inject_full(dpp
, type
))
811 if (cur_state
>= type
)
812 ldpp_dout(dpp
, 10) << __func__
<< " current usage is " << cur_ratio
813 << " physical " << physical_ratio
<< dendl
;
815 return cur_state
>= type
;
818 bool OSDService::_tentative_full(DoutPrefixProvider
*dpp
, s_names type
, uint64_t adjust_used
, osd_stat_t adjusted_stat
)
820 ldpp_dout(dpp
, 20) << __func__
<< " type " << get_full_state_name(type
) << " adjust_used " << (adjust_used
>> 10) << "KiB" << dendl
;
822 std::lock_guard
l(full_status_lock
);
823 if (_check_inject_full(dpp
, type
)) {
829 float ratio
= compute_adjusted_ratio(adjusted_stat
, &pratio
, adjust_used
);
832 s_names tentative_state
= recalc_full_state(ratio
, pratio
, notused
);
834 if (tentative_state
>= type
)
835 ldpp_dout(dpp
, 10) << __func__
<< " tentative usage is " << ratio
<< dendl
;
837 return tentative_state
>= type
;
840 bool OSDService::check_failsafe_full(DoutPrefixProvider
*dpp
) const
842 return _check_full(dpp
, FAILSAFE
);
845 bool OSDService::check_full(DoutPrefixProvider
*dpp
) const
847 return _check_full(dpp
, FULL
);
850 bool OSDService::tentative_backfill_full(DoutPrefixProvider
*dpp
, uint64_t adjust_used
, osd_stat_t stats
)
852 return _tentative_full(dpp
, BACKFILLFULL
, adjust_used
, stats
);
855 bool OSDService::check_backfill_full(DoutPrefixProvider
*dpp
) const
857 return _check_full(dpp
, BACKFILLFULL
);
860 bool OSDService::check_nearfull(DoutPrefixProvider
*dpp
) const
862 return _check_full(dpp
, NEARFULL
);
865 bool OSDService::is_failsafe_full() const
867 std::lock_guard
l(full_status_lock
);
868 return cur_state
== FAILSAFE
;
871 bool OSDService::is_full() const
873 std::lock_guard
l(full_status_lock
);
874 return cur_state
>= FULL
;
877 bool OSDService::is_backfillfull() const
879 std::lock_guard
l(full_status_lock
);
880 return cur_state
>= BACKFILLFULL
;
883 bool OSDService::is_nearfull() const
885 std::lock_guard
l(full_status_lock
);
886 return cur_state
>= NEARFULL
;
889 void OSDService::set_injectfull(s_names type
, int64_t count
)
891 std::lock_guard
l(full_status_lock
);
892 injectfull_state
= type
;
896 void OSDService::set_statfs(const struct store_statfs_t
&stbuf
,
897 osd_alert_list_t
& alerts
)
899 uint64_t bytes
= stbuf
.total
;
900 uint64_t avail
= stbuf
.available
;
901 uint64_t used
= stbuf
.get_used_raw();
903 // For testing fake statfs values so it doesn't matter if all
904 // OSDs are using the same partition.
905 if (cct
->_conf
->fake_statfs_for_testing
) {
906 uint64_t total_num_bytes
= 0;
910 total_num_bytes
+= p
->get_stats_num_bytes();
912 bytes
= cct
->_conf
->fake_statfs_for_testing
;
913 if (total_num_bytes
< bytes
)
914 avail
= bytes
- total_num_bytes
;
917 dout(0) << __func__
<< " fake total " << cct
->_conf
->fake_statfs_for_testing
918 << " adjust available " << avail
920 used
= bytes
- avail
;
923 osd
->logger
->set(l_osd_stat_bytes
, bytes
);
924 osd
->logger
->set(l_osd_stat_bytes_used
, used
);
925 osd
->logger
->set(l_osd_stat_bytes_avail
, avail
);
927 std::lock_guard
l(stat_lock
);
928 osd_stat
.statfs
= stbuf
;
929 osd_stat
.os_alerts
.clear();
930 osd_stat
.os_alerts
[whoami
].swap(alerts
);
931 if (cct
->_conf
->fake_statfs_for_testing
) {
932 osd_stat
.statfs
.total
= bytes
;
933 osd_stat
.statfs
.available
= avail
;
934 // For testing don't want used to go negative, so clear reserved
935 osd_stat
.statfs
.internally_reserved
= 0;
939 osd_stat_t
OSDService::set_osd_stat(vector
<int>& hb_peers
,
942 std::lock_guard
l(stat_lock
);
943 osd_stat
.hb_peers
.swap(hb_peers
);
944 osd
->op_tracker
.get_age_ms_histogram(&osd_stat
.op_queue_age_hist
);
945 osd_stat
.num_pgs
= num_pgs
;
949 void OSDService::inc_osd_stat_repaired()
951 std::lock_guard
l(stat_lock
);
952 osd_stat
.num_shards_repaired
++;
956 float OSDService::compute_adjusted_ratio(osd_stat_t new_stat
, float *pratio
,
957 uint64_t adjust_used
)
960 ((float)new_stat
.statfs
.get_used()) / ((float)new_stat
.statfs
.total
);
963 dout(20) << __func__
<< " Before kb_used() " << new_stat
.statfs
.kb_used() << dendl
;
964 if (new_stat
.statfs
.available
> adjust_used
)
965 new_stat
.statfs
.available
-= adjust_used
;
967 new_stat
.statfs
.available
= 0;
968 dout(20) << __func__
<< " After kb_used() " << new_stat
.statfs
.kb_used() << dendl
;
971 // Check all pgs and adjust kb_used to include all pending backfill data
972 int backfill_adjusted
= 0;
976 backfill_adjusted
+= p
->pg_stat_adjust(&new_stat
);
978 if (backfill_adjusted
) {
979 dout(20) << __func__
<< " backfill adjusted " << new_stat
<< dendl
;
981 return ((float)new_stat
.statfs
.get_used()) / ((float)new_stat
.statfs
.total
);
984 bool OSDService::check_osdmap_full(const set
<pg_shard_t
> &missing_on
)
986 OSDMapRef osdmap
= get_osdmap();
987 for (auto shard
: missing_on
) {
988 if (osdmap
->get_state(shard
.osd
) & CEPH_OSD_FULL
)
994 void OSDService::send_message_osd_cluster(int peer
, Message
*m
, epoch_t from_epoch
)
996 OSDMapRef next_map
= get_nextmap_reserved();
997 // service map is always newer/newest
998 ceph_assert(from_epoch
<= next_map
->get_epoch());
1000 if (next_map
->is_down(peer
) ||
1001 next_map
->get_info(peer
).up_from
> from_epoch
) {
1003 release_map(next_map
);
1006 ConnectionRef peer_con
= osd
->cluster_messenger
->connect_to_osd(
1007 next_map
->get_cluster_addrs(peer
));
1008 share_map_peer(peer
, peer_con
.get(), next_map
);
1009 peer_con
->send_message(m
);
1010 release_map(next_map
);
1013 ConnectionRef
OSDService::get_con_osd_cluster(int peer
, epoch_t from_epoch
)
1015 OSDMapRef next_map
= get_nextmap_reserved();
1016 // service map is always newer/newest
1017 ceph_assert(from_epoch
<= next_map
->get_epoch());
1019 if (next_map
->is_down(peer
) ||
1020 next_map
->get_info(peer
).up_from
> from_epoch
) {
1021 release_map(next_map
);
1024 ConnectionRef con
= osd
->cluster_messenger
->connect_to_osd(
1025 next_map
->get_cluster_addrs(peer
));
1026 release_map(next_map
);
1030 pair
<ConnectionRef
,ConnectionRef
> OSDService::get_con_osd_hb(int peer
, epoch_t from_epoch
)
1032 OSDMapRef next_map
= get_nextmap_reserved();
1033 // service map is always newer/newest
1034 ceph_assert(from_epoch
<= next_map
->get_epoch());
1036 pair
<ConnectionRef
,ConnectionRef
> ret
;
1037 if (next_map
->is_down(peer
) ||
1038 next_map
->get_info(peer
).up_from
> from_epoch
) {
1039 release_map(next_map
);
1042 ret
.first
= osd
->hb_back_client_messenger
->connect_to_osd(
1043 next_map
->get_hb_back_addrs(peer
));
1044 ret
.second
= osd
->hb_front_client_messenger
->connect_to_osd(
1045 next_map
->get_hb_front_addrs(peer
));
1046 release_map(next_map
);
1050 entity_name_t
OSDService::get_cluster_msgr_name() const
1052 return cluster_messenger
->get_myname();
1055 void OSDService::queue_want_pg_temp(pg_t pgid
,
1056 const vector
<int>& want
,
1059 std::lock_guard
l(pg_temp_lock
);
1060 auto p
= pg_temp_pending
.find(pgid
);
1061 if (p
== pg_temp_pending
.end() ||
1062 p
->second
.acting
!= want
||
1064 pg_temp_wanted
[pgid
] = {want
, forced
};
1068 void OSDService::remove_want_pg_temp(pg_t pgid
)
1070 std::lock_guard
l(pg_temp_lock
);
1071 pg_temp_wanted
.erase(pgid
);
1072 pg_temp_pending
.erase(pgid
);
1075 void OSDService::_sent_pg_temp()
1077 #ifdef HAVE_STDLIB_MAP_SPLICING
1078 pg_temp_pending
.merge(pg_temp_wanted
);
1080 pg_temp_pending
.insert(make_move_iterator(begin(pg_temp_wanted
)),
1081 make_move_iterator(end(pg_temp_wanted
)));
1083 pg_temp_wanted
.clear();
1086 void OSDService::requeue_pg_temp()
1088 std::lock_guard
l(pg_temp_lock
);
1089 // wanted overrides pending. note that remove_want_pg_temp
1090 // clears the item out of both.
1091 unsigned old_wanted
= pg_temp_wanted
.size();
1092 unsigned old_pending
= pg_temp_pending
.size();
1094 pg_temp_wanted
.swap(pg_temp_pending
);
1095 dout(10) << __func__
<< " " << old_wanted
<< " + " << old_pending
<< " -> "
1096 << pg_temp_wanted
.size() << dendl
;
1099 std::ostream
& operator<<(std::ostream
& out
,
1100 const OSDService::pg_temp_t
& pg_temp
)
1102 out
<< pg_temp
.acting
;
1103 if (pg_temp
.forced
) {
1109 void OSDService::send_pg_temp()
1111 std::lock_guard
l(pg_temp_lock
);
1112 if (pg_temp_wanted
.empty())
1114 dout(10) << "send_pg_temp " << pg_temp_wanted
<< dendl
;
1115 MOSDPGTemp
*ms
[2] = {nullptr, nullptr};
1116 for (auto& [pgid
, pg_temp
] : pg_temp_wanted
) {
1117 auto& m
= ms
[pg_temp
.forced
];
1119 m
= new MOSDPGTemp(osdmap
->get_epoch());
1120 m
->forced
= pg_temp
.forced
;
1122 m
->pg_temp
.emplace(pgid
, pg_temp
.acting
);
1126 monc
->send_mon_message(m
);
1132 void OSDService::send_pg_created(pg_t pgid
)
1134 std::lock_guard
l(pg_created_lock
);
1135 dout(20) << __func__
<< dendl
;
1136 auto o
= get_osdmap();
1137 if (o
->require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
1138 pg_created
.insert(pgid
);
1139 monc
->send_mon_message(new MOSDPGCreated(pgid
));
1143 void OSDService::send_pg_created()
1145 std::lock_guard
l(pg_created_lock
);
1146 dout(20) << __func__
<< dendl
;
1147 auto o
= get_osdmap();
1148 if (o
->require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
1149 for (auto pgid
: pg_created
) {
1150 monc
->send_mon_message(new MOSDPGCreated(pgid
));
1155 void OSDService::prune_pg_created()
1157 std::lock_guard
l(pg_created_lock
);
1158 dout(20) << __func__
<< dendl
;
1159 auto o
= get_osdmap();
1160 auto i
= pg_created
.begin();
1161 while (i
!= pg_created
.end()) {
1162 auto p
= o
->get_pg_pool(i
->pool());
1163 if (!p
|| !p
->has_flag(pg_pool_t::FLAG_CREATING
)) {
1164 dout(20) << __func__
<< " pruning " << *i
<< dendl
;
1165 i
= pg_created
.erase(i
);
1167 dout(20) << __func__
<< " keeping " << *i
<< dendl
;
1174 // --------------------------------------
1177 epoch_t
OSDService::get_peer_epoch(int peer
)
1179 std::lock_guard
l(peer_map_epoch_lock
);
1180 map
<int,epoch_t
>::iterator p
= peer_map_epoch
.find(peer
);
1181 if (p
== peer_map_epoch
.end())
1186 epoch_t
OSDService::note_peer_epoch(int peer
, epoch_t e
)
1188 std::lock_guard
l(peer_map_epoch_lock
);
1189 map
<int,epoch_t
>::iterator p
= peer_map_epoch
.find(peer
);
1190 if (p
!= peer_map_epoch
.end()) {
1191 if (p
->second
< e
) {
1192 dout(10) << "note_peer_epoch osd." << peer
<< " has " << e
<< dendl
;
1195 dout(30) << "note_peer_epoch osd." << peer
<< " has " << p
->second
<< " >= " << e
<< dendl
;
1199 dout(10) << "note_peer_epoch osd." << peer
<< " now has " << e
<< dendl
;
1200 peer_map_epoch
[peer
] = e
;
1205 void OSDService::forget_peer_epoch(int peer
, epoch_t as_of
)
1207 std::lock_guard
l(peer_map_epoch_lock
);
1208 map
<int,epoch_t
>::iterator p
= peer_map_epoch
.find(peer
);
1209 if (p
!= peer_map_epoch
.end()) {
1210 if (p
->second
<= as_of
) {
1211 dout(10) << "forget_peer_epoch osd." << peer
<< " as_of " << as_of
1212 << " had " << p
->second
<< dendl
;
1213 peer_map_epoch
.erase(p
);
1215 dout(10) << "forget_peer_epoch osd." << peer
<< " as_of " << as_of
1216 << " has " << p
->second
<< " - not forgetting" << dendl
;
1221 bool OSDService::should_share_map(entity_name_t name
, Connection
*con
,
1222 epoch_t epoch
, const OSDMapRef
& osdmap
,
1223 const epoch_t
*sent_epoch_p
)
1225 dout(20) << "should_share_map "
1226 << name
<< " " << con
->get_peer_addr()
1227 << " " << epoch
<< dendl
;
1229 // does client have old map?
1230 if (name
.is_client()) {
1231 bool message_sendmap
= epoch
< osdmap
->get_epoch();
1232 if (message_sendmap
&& sent_epoch_p
) {
1233 dout(20) << "client session last_sent_epoch: "
1235 << " versus osdmap epoch " << osdmap
->get_epoch() << dendl
;
1236 if (*sent_epoch_p
< osdmap
->get_epoch()) {
1238 } // else we don't need to send it out again
1242 if (con
->get_messenger() == osd
->cluster_messenger
&&
1243 con
!= osd
->cluster_messenger
->get_loopback_connection() &&
1244 osdmap
->is_up(name
.num()) &&
1245 (osdmap
->get_cluster_addrs(name
.num()) == con
->get_peer_addrs() ||
1246 osdmap
->get_hb_back_addrs(name
.num()) == con
->get_peer_addrs())) {
1248 epoch_t has
= std::max(get_peer_epoch(name
.num()), epoch
);
1251 if (has
< osdmap
->get_epoch()) {
1252 dout(10) << name
<< " " << con
->get_peer_addr()
1253 << " has old map " << epoch
<< " < "
1254 << osdmap
->get_epoch() << dendl
;
1262 void OSDService::share_map(
1267 epoch_t
*sent_epoch_p
)
1269 dout(20) << "share_map "
1270 << name
<< " " << con
->get_peer_addr()
1271 << " " << epoch
<< dendl
;
1273 if (!osd
->is_active()) {
1274 /*It is safe not to proceed as OSD is not in healthy state*/
1278 bool want_shared
= should_share_map(name
, con
, epoch
,
1279 osdmap
, sent_epoch_p
);
1282 if (name
.is_client()) {
1283 dout(10) << name
<< " has old map " << epoch
1284 << " < " << osdmap
->get_epoch() << dendl
;
1285 // we know the Session is valid or we wouldn't be sending
1287 *sent_epoch_p
= osdmap
->get_epoch();
1289 send_incremental_map(epoch
, con
, osdmap
);
1290 } else if (con
->get_messenger() == osd
->cluster_messenger
&&
1291 osdmap
->is_up(name
.num()) &&
1292 (osdmap
->get_cluster_addrs(name
.num()) == con
->get_peer_addrs() ||
1293 osdmap
->get_hb_back_addrs(name
.num()) == con
->get_peer_addrs())) {
1294 dout(10) << name
<< " " << con
->get_peer_addrs()
1295 << " has old map " << epoch
<< " < "
1296 << osdmap
->get_epoch() << dendl
;
1297 note_peer_epoch(name
.num(), osdmap
->get_epoch());
1298 send_incremental_map(epoch
, con
, osdmap
);
1303 void OSDService::share_map_peer(int peer
, Connection
*con
, OSDMapRef map
)
1309 epoch_t pe
= get_peer_epoch(peer
);
1311 if (pe
< map
->get_epoch()) {
1312 send_incremental_map(pe
, con
, map
);
1313 note_peer_epoch(peer
, map
->get_epoch());
1315 dout(20) << "share_map_peer " << con
<< " already has epoch " << pe
<< dendl
;
1317 dout(20) << "share_map_peer " << con
<< " don't know epoch, doing nothing" << dendl
;
1318 // no idea about peer's epoch.
1319 // ??? send recent ???
1324 bool OSDService::can_inc_scrubs_pending()
1326 bool can_inc
= false;
1327 std::lock_guard
l(sched_scrub_lock
);
1329 if (scrubs_pending
+ scrubs_active
< cct
->_conf
->osd_max_scrubs
) {
1330 dout(20) << __func__
<< " " << scrubs_pending
<< " -> " << (scrubs_pending
+1)
1331 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", active " << scrubs_active
1335 dout(20) << __func__
<< " " << scrubs_pending
<< " + " << scrubs_active
1336 << " active >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1342 bool OSDService::inc_scrubs_pending()
1344 bool result
= false;
1346 sched_scrub_lock
.Lock();
1347 if (scrubs_pending
+ scrubs_active
< cct
->_conf
->osd_max_scrubs
) {
1348 dout(20) << "inc_scrubs_pending " << scrubs_pending
<< " -> " << (scrubs_pending
+1)
1349 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", active " << scrubs_active
<< ")" << dendl
;
1353 dout(20) << "inc_scrubs_pending " << scrubs_pending
<< " + " << scrubs_active
<< " active >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1355 sched_scrub_lock
.Unlock();
1360 void OSDService::dec_scrubs_pending()
1362 sched_scrub_lock
.Lock();
1363 dout(20) << "dec_scrubs_pending " << scrubs_pending
<< " -> " << (scrubs_pending
-1)
1364 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", active " << scrubs_active
<< ")" << dendl
;
1366 ceph_assert(scrubs_pending
>= 0);
1367 sched_scrub_lock
.Unlock();
1370 void OSDService::inc_scrubs_active(bool reserved
)
1372 sched_scrub_lock
.Lock();
1376 dout(20) << "inc_scrubs_active " << (scrubs_active
-1) << " -> " << scrubs_active
1377 << " (max " << cct
->_conf
->osd_max_scrubs
1378 << ", pending " << (scrubs_pending
+1) << " -> " << scrubs_pending
<< ")" << dendl
;
1379 ceph_assert(scrubs_pending
>= 0);
1381 dout(20) << "inc_scrubs_active " << (scrubs_active
-1) << " -> " << scrubs_active
1382 << " (max " << cct
->_conf
->osd_max_scrubs
1383 << ", pending " << scrubs_pending
<< ")" << dendl
;
1385 sched_scrub_lock
.Unlock();
1388 void OSDService::dec_scrubs_active()
1390 sched_scrub_lock
.Lock();
1391 dout(20) << "dec_scrubs_active " << scrubs_active
<< " -> " << (scrubs_active
-1)
1392 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", pending " << scrubs_pending
<< ")" << dendl
;
1394 ceph_assert(scrubs_active
>= 0);
1395 sched_scrub_lock
.Unlock();
1398 void OSDService::retrieve_epochs(epoch_t
*_boot_epoch
, epoch_t
*_up_epoch
,
1399 epoch_t
*_bind_epoch
) const
1401 std::lock_guard
l(epoch_lock
);
1403 *_boot_epoch
= boot_epoch
;
1405 *_up_epoch
= up_epoch
;
1407 *_bind_epoch
= bind_epoch
;
1410 void OSDService::set_epochs(const epoch_t
*_boot_epoch
, const epoch_t
*_up_epoch
,
1411 const epoch_t
*_bind_epoch
)
1413 std::lock_guard
l(epoch_lock
);
1415 ceph_assert(*_boot_epoch
== 0 || *_boot_epoch
>= boot_epoch
);
1416 boot_epoch
= *_boot_epoch
;
1419 ceph_assert(*_up_epoch
== 0 || *_up_epoch
>= up_epoch
);
1420 up_epoch
= *_up_epoch
;
1423 ceph_assert(*_bind_epoch
== 0 || *_bind_epoch
>= bind_epoch
);
1424 bind_epoch
= *_bind_epoch
;
1428 bool OSDService::prepare_to_stop()
1430 std::lock_guard
l(is_stopping_lock
);
1431 if (get_state() != NOT_STOPPING
)
1434 OSDMapRef osdmap
= get_osdmap();
1435 if (osdmap
&& osdmap
->is_up(whoami
)) {
1436 dout(0) << __func__
<< " telling mon we are shutting down" << dendl
;
1437 set_state(PREPARING_TO_STOP
);
1438 monc
->send_mon_message(
1442 osdmap
->get_addrs(whoami
),
1443 osdmap
->get_epoch(),
1446 utime_t now
= ceph_clock_now();
1448 timeout
.set_from_double(now
+ cct
->_conf
->osd_mon_shutdown_timeout
);
1449 while ((ceph_clock_now() < timeout
) &&
1450 (get_state() != STOPPING
)) {
1451 is_stopping_cond
.WaitUntil(is_stopping_lock
, timeout
);
1454 dout(0) << __func__
<< " starting shutdown" << dendl
;
1455 set_state(STOPPING
);
1459 void OSDService::got_stop_ack()
1461 std::lock_guard
l(is_stopping_lock
);
1462 if (get_state() == PREPARING_TO_STOP
) {
1463 dout(0) << __func__
<< " starting shutdown" << dendl
;
1464 set_state(STOPPING
);
1465 is_stopping_cond
.Signal();
1467 dout(10) << __func__
<< " ignoring msg" << dendl
;
1471 MOSDMap
*OSDService::build_incremental_map_msg(epoch_t since
, epoch_t to
,
1472 OSDSuperblock
& sblock
)
1474 MOSDMap
*m
= new MOSDMap(monc
->get_fsid(),
1475 osdmap
->get_encoding_features());
1476 m
->oldest_map
= max_oldest_map
;
1477 m
->newest_map
= sblock
.newest_map
;
1479 int max
= cct
->_conf
->osd_map_message_max
;
1480 ssize_t max_bytes
= cct
->_conf
->osd_map_message_max_bytes
;
1482 if (since
< m
->oldest_map
) {
1483 // we don't have the next map the target wants, so start with a
1486 dout(10) << __func__
<< " oldest map " << max_oldest_map
<< " > since "
1487 << since
<< ", starting with full map" << dendl
;
1488 since
= m
->oldest_map
;
1489 if (!get_map_bl(since
, bl
)) {
1490 derr
<< __func__
<< " missing full map " << since
<< dendl
;
1494 max_bytes
-= bl
.length();
1495 m
->maps
[since
].claim(bl
);
1497 for (epoch_t e
= since
+ 1; e
<= to
; ++e
) {
1499 if (get_inc_map_bl(e
, bl
)) {
1500 m
->incremental_maps
[e
].claim(bl
);
1502 derr
<< __func__
<< " missing incremental map " << e
<< dendl
;
1503 if (!get_map_bl(e
, bl
)) {
1504 derr
<< __func__
<< " also missing full map " << e
<< dendl
;
1507 m
->maps
[e
].claim(bl
);
1510 max_bytes
-= bl
.length();
1511 if (max
<= 0 || max_bytes
<= 0) {
1518 if (!m
->maps
.empty() ||
1519 !m
->incremental_maps
.empty()) {
1520 // send what we have so far
1525 if (get_inc_map_bl(m
->newest_map
, bl
)) {
1526 m
->incremental_maps
[m
->newest_map
].claim(bl
);
1528 derr
<< __func__
<< " unable to load latest map " << m
->newest_map
<< dendl
;
1529 if (!get_map_bl(m
->newest_map
, bl
)) {
1530 derr
<< __func__
<< " unable to load latest full map " << m
->newest_map
1534 m
->maps
[m
->newest_map
].claim(bl
);
1539 void OSDService::send_map(MOSDMap
*m
, Connection
*con
)
1541 con
->send_message(m
);
1544 void OSDService::send_incremental_map(epoch_t since
, Connection
*con
,
1547 epoch_t to
= osdmap
->get_epoch();
1548 dout(10) << "send_incremental_map " << since
<< " -> " << to
1549 << " to " << con
<< " " << con
->get_peer_addr() << dendl
;
1553 OSDSuperblock
sblock(get_superblock());
1554 if (since
< sblock
.oldest_map
) {
1555 // just send latest full map
1556 MOSDMap
*m
= new MOSDMap(monc
->get_fsid(),
1557 osdmap
->get_encoding_features());
1558 m
->oldest_map
= max_oldest_map
;
1559 m
->newest_map
= sblock
.newest_map
;
1560 get_map_bl(to
, m
->maps
[to
]);
1565 if (to
> since
&& (int64_t)(to
- since
) > cct
->_conf
->osd_map_share_max_epochs
) {
1566 dout(10) << " " << (to
- since
) << " > max " << cct
->_conf
->osd_map_share_max_epochs
1567 << ", only sending most recent" << dendl
;
1568 since
= to
- cct
->_conf
->osd_map_share_max_epochs
;
1571 m
= build_incremental_map_msg(since
, to
, sblock
);
1576 bool OSDService::_get_map_bl(epoch_t e
, bufferlist
& bl
)
1578 bool found
= map_bl_cache
.lookup(e
, &bl
);
1581 logger
->inc(l_osd_map_bl_cache_hit
);
1585 logger
->inc(l_osd_map_bl_cache_miss
);
1586 found
= store
->read(meta_ch
,
1587 OSD::get_osdmap_pobject_name(e
), 0, 0, bl
,
1588 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
) >= 0;
1595 bool OSDService::get_inc_map_bl(epoch_t e
, bufferlist
& bl
)
1597 std::lock_guard
l(map_cache_lock
);
1598 bool found
= map_bl_inc_cache
.lookup(e
, &bl
);
1601 logger
->inc(l_osd_map_bl_cache_hit
);
1605 logger
->inc(l_osd_map_bl_cache_miss
);
1606 found
= store
->read(meta_ch
,
1607 OSD::get_inc_osdmap_pobject_name(e
), 0, 0, bl
,
1608 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
) >= 0;
1610 _add_map_inc_bl(e
, bl
);
1615 void OSDService::_add_map_bl(epoch_t e
, bufferlist
& bl
)
1617 dout(10) << "add_map_bl " << e
<< " " << bl
.length() << " bytes" << dendl
;
1618 // cache a contiguous buffer
1619 if (bl
.get_num_buffers() > 1) {
1622 bl
.try_assign_to_mempool(mempool::mempool_osd_mapbl
);
1623 map_bl_cache
.add(e
, bl
);
1626 void OSDService::_add_map_inc_bl(epoch_t e
, bufferlist
& bl
)
1628 dout(10) << "add_map_inc_bl " << e
<< " " << bl
.length() << " bytes" << dendl
;
1629 // cache a contiguous buffer
1630 if (bl
.get_num_buffers() > 1) {
1633 bl
.try_assign_to_mempool(mempool::mempool_osd_mapbl
);
1634 map_bl_inc_cache
.add(e
, bl
);
1637 int OSDService::get_deleted_pool_pg_num(int64_t pool
)
1639 std::lock_guard
l(map_cache_lock
);
1640 auto p
= deleted_pool_pg_nums
.find(pool
);
1641 if (p
!= deleted_pool_pg_nums
.end()) {
1644 dout(20) << __func__
<< " " << pool
<< " loading" << dendl
;
1645 ghobject_t oid
= OSD::make_final_pool_info_oid(pool
);
1647 int r
= store
->read(meta_ch
, oid
, 0, 0, bl
);
1648 ceph_assert(r
>= 0);
1649 auto blp
= bl
.cbegin();
1652 deleted_pool_pg_nums
[pool
] = pi
.get_pg_num();
1653 dout(20) << __func__
<< " " << pool
<< " got " << pi
.get_pg_num() << dendl
;
1654 return pi
.get_pg_num();
1657 OSDMapRef
OSDService::_add_map(OSDMap
*o
)
1659 epoch_t e
= o
->get_epoch();
1661 if (cct
->_conf
->osd_map_dedup
) {
1662 // Dedup against an existing map at a nearby epoch
1663 OSDMapRef for_dedup
= map_cache
.lower_bound(e
);
1665 OSDMap::dedup(for_dedup
.get(), o
);
1669 OSDMapRef l
= map_cache
.add(e
, o
, &existed
);
1676 OSDMapRef
OSDService::try_get_map(epoch_t epoch
)
1678 std::lock_guard
l(map_cache_lock
);
1679 OSDMapRef retval
= map_cache
.lookup(epoch
);
1681 dout(30) << "get_map " << epoch
<< " -cached" << dendl
;
1683 logger
->inc(l_osd_map_cache_hit
);
1688 logger
->inc(l_osd_map_cache_miss
);
1689 epoch_t lb
= map_cache
.cached_key_lower_bound();
1691 dout(30) << "get_map " << epoch
<< " - miss, below lower bound" << dendl
;
1692 logger
->inc(l_osd_map_cache_miss_low
);
1693 logger
->inc(l_osd_map_cache_miss_low_avg
, lb
- epoch
);
1697 OSDMap
*map
= new OSDMap
;
1699 dout(20) << "get_map " << epoch
<< " - loading and decoding " << map
<< dendl
;
1701 if (!_get_map_bl(epoch
, bl
) || bl
.length() == 0) {
1702 derr
<< "failed to load OSD map for epoch " << epoch
<< ", got " << bl
.length() << " bytes" << dendl
;
1708 dout(20) << "get_map " << epoch
<< " - return initial " << map
<< dendl
;
1710 return _add_map(map
);
1716 void OSDService::reply_op_error(OpRequestRef op
, int err
)
1718 reply_op_error(op
, err
, eversion_t(), 0);
1721 void OSDService::reply_op_error(OpRequestRef op
, int err
, eversion_t v
,
1724 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
1725 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1727 flags
= m
->get_flags() & (CEPH_OSD_FLAG_ACK
|CEPH_OSD_FLAG_ONDISK
);
1729 MOSDOpReply
*reply
= new MOSDOpReply(m
, err
, osdmap
->get_epoch(), flags
, true);
1730 reply
->set_reply_versions(v
, uv
);
1731 m
->get_connection()->send_message(reply
);
1734 void OSDService::handle_misdirected_op(PG
*pg
, OpRequestRef op
)
1736 if (!cct
->_conf
->osd_debug_misdirected_ops
) {
1740 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
1741 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1743 ceph_assert(m
->get_map_epoch() >= pg
->get_history().same_primary_since
);
1745 if (pg
->is_ec_pg()) {
1747 * OSD recomputes op target based on current OSDMap. With an EC pg, we
1748 * can get this result:
1749 * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
1750 * [CRUSH_ITEM_NONE, 2, 3]/3
1751 * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
1753 * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
1755 * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
1758 * We can't compute the op target based on the sending map epoch due to
1759 * splitting. The simplest thing is to detect such cases here and drop
1760 * them without an error (the client will resend anyway).
1762 ceph_assert(m
->get_map_epoch() <= superblock
.newest_map
);
1763 OSDMapRef opmap
= try_get_map(m
->get_map_epoch());
1765 dout(7) << __func__
<< ": " << *pg
<< " no longer have map for "
1766 << m
->get_map_epoch() << ", dropping" << dendl
;
1769 pg_t _pgid
= m
->get_raw_pg();
1771 if ((m
->get_flags() & CEPH_OSD_FLAG_PGOP
) == 0)
1772 _pgid
= opmap
->raw_pg_to_pg(_pgid
);
1773 if (opmap
->get_primary_shard(_pgid
, &pgid
) &&
1774 pgid
.shard
!= pg
->pg_id
.shard
) {
1775 dout(7) << __func__
<< ": " << *pg
<< " primary changed since "
1776 << m
->get_map_epoch() << ", dropping" << dendl
;
1781 dout(7) << *pg
<< " misdirected op in " << m
->get_map_epoch() << dendl
;
1782 clog
->warn() << m
->get_source_inst() << " misdirected " << m
->get_reqid()
1783 << " pg " << m
->get_raw_pg()
1784 << " to osd." << whoami
1785 << " not " << pg
->get_acting()
1786 << " in e" << m
->get_map_epoch() << "/" << osdmap
->get_epoch();
1789 void OSDService::enqueue_back(OpQueueItem
&& qi
)
1791 osd
->op_shardedwq
.queue(std::move(qi
));
1794 void OSDService::enqueue_front(OpQueueItem
&& qi
)
1796 osd
->op_shardedwq
.queue_front(std::move(qi
));
1799 void OSDService::queue_recovery_context(
1801 GenContext
<ThreadPool::TPHandle
&> *c
)
1803 epoch_t e
= get_osdmap_epoch();
1806 unique_ptr
<OpQueueItem::OpQueueable
>(
1807 new PGRecoveryContext(pg
->get_pgid(), c
, e
)),
1808 cct
->_conf
->osd_recovery_cost
,
1809 cct
->_conf
->osd_recovery_priority
,
1815 void OSDService::queue_for_snap_trim(PG
*pg
)
1817 dout(10) << "queueing " << *pg
<< " for snaptrim" << dendl
;
1820 unique_ptr
<OpQueueItem::OpQueueable
>(
1821 new PGSnapTrim(pg
->get_pgid(), pg
->get_osdmap_epoch())),
1822 cct
->_conf
->osd_snap_trim_cost
,
1823 cct
->_conf
->osd_snap_trim_priority
,
1826 pg
->get_osdmap_epoch()));
1829 void OSDService::queue_for_scrub(PG
*pg
, bool with_high_priority
)
1831 unsigned scrub_queue_priority
= pg
->scrubber
.priority
;
1832 if (with_high_priority
&& scrub_queue_priority
< cct
->_conf
->osd_client_op_priority
) {
1833 scrub_queue_priority
= cct
->_conf
->osd_client_op_priority
;
1835 const auto epoch
= pg
->get_osdmap_epoch();
1838 unique_ptr
<OpQueueItem::OpQueueable
>(new PGScrub(pg
->get_pgid(), epoch
)),
1839 cct
->_conf
->osd_scrub_cost
,
1840 scrub_queue_priority
,
1846 void OSDService::queue_for_pg_delete(spg_t pgid
, epoch_t e
)
1848 dout(10) << __func__
<< " on " << pgid
<< " e " << e
<< dendl
;
1851 unique_ptr
<OpQueueItem::OpQueueable
>(
1852 new PGDelete(pgid
, e
)),
1853 cct
->_conf
->osd_pg_delete_cost
,
1854 cct
->_conf
->osd_pg_delete_priority
,
1860 bool OSDService::try_finish_pg_delete(PG
*pg
, unsigned old_pg_num
)
1862 return osd
->try_finish_pg_delete(pg
, old_pg_num
);
1867 void OSDService::set_ready_to_merge_source(PG
*pg
, eversion_t version
)
1869 std::lock_guard
l(merge_lock
);
1870 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1871 ready_to_merge_source
[pg
->pg_id
.pgid
] = version
;
1872 assert(not_ready_to_merge_source
.count(pg
->pg_id
.pgid
) == 0);
1873 _send_ready_to_merge();
1876 void OSDService::set_ready_to_merge_target(PG
*pg
,
1878 epoch_t last_epoch_started
,
1879 epoch_t last_epoch_clean
)
1881 std::lock_guard
l(merge_lock
);
1882 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1883 ready_to_merge_target
.insert(make_pair(pg
->pg_id
.pgid
,
1886 last_epoch_clean
)));
1887 assert(not_ready_to_merge_target
.count(pg
->pg_id
.pgid
) == 0);
1888 _send_ready_to_merge();
1891 void OSDService::set_not_ready_to_merge_source(pg_t source
)
1893 std::lock_guard
l(merge_lock
);
1894 dout(10) << __func__
<< " " << source
<< dendl
;
1895 not_ready_to_merge_source
.insert(source
);
1896 assert(ready_to_merge_source
.count(source
) == 0);
1897 _send_ready_to_merge();
1900 void OSDService::set_not_ready_to_merge_target(pg_t target
, pg_t source
)
1902 std::lock_guard
l(merge_lock
);
1903 dout(10) << __func__
<< " " << target
<< " source " << source
<< dendl
;
1904 not_ready_to_merge_target
[target
] = source
;
1905 assert(ready_to_merge_target
.count(target
) == 0);
1906 _send_ready_to_merge();
1909 void OSDService::send_ready_to_merge()
1911 std::lock_guard
l(merge_lock
);
1912 _send_ready_to_merge();
1915 void OSDService::_send_ready_to_merge()
1917 dout(20) << __func__
1918 << " ready_to_merge_source " << ready_to_merge_source
1919 << " not_ready_to_merge_source " << not_ready_to_merge_source
1920 << " ready_to_merge_target " << ready_to_merge_target
1921 << " not_ready_to_merge_target " << not_ready_to_merge_target
1922 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1924 for (auto src
: not_ready_to_merge_source
) {
1925 if (sent_ready_to_merge_source
.count(src
) == 0) {
1926 monc
->send_mon_message(new MOSDPGReadyToMerge(
1930 osdmap
->get_epoch()));
1931 sent_ready_to_merge_source
.insert(src
);
1934 for (auto p
: not_ready_to_merge_target
) {
1935 if (sent_ready_to_merge_source
.count(p
.second
) == 0) {
1936 monc
->send_mon_message(new MOSDPGReadyToMerge(
1940 osdmap
->get_epoch()));
1941 sent_ready_to_merge_source
.insert(p
.second
);
1944 for (auto src
: ready_to_merge_source
) {
1945 if (not_ready_to_merge_source
.count(src
.first
) ||
1946 not_ready_to_merge_target
.count(src
.first
.get_parent())) {
1949 auto p
= ready_to_merge_target
.find(src
.first
.get_parent());
1950 if (p
!= ready_to_merge_target
.end() &&
1951 sent_ready_to_merge_source
.count(src
.first
) == 0) {
1952 monc
->send_mon_message(new MOSDPGReadyToMerge(
1953 src
.first
, // source pgid
1954 src
.second
, // src version
1955 std::get
<0>(p
->second
), // target version
1956 std::get
<1>(p
->second
), // PG's last_epoch_started
1957 std::get
<2>(p
->second
), // PG's last_epoch_clean
1959 osdmap
->get_epoch()));
1960 sent_ready_to_merge_source
.insert(src
.first
);
1965 void OSDService::clear_ready_to_merge(PG
*pg
)
1967 std::lock_guard
l(merge_lock
);
1968 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1969 ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
1970 ready_to_merge_target
.erase(pg
->pg_id
.pgid
);
1971 not_ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
1972 not_ready_to_merge_target
.erase(pg
->pg_id
.pgid
);
1973 sent_ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
1976 void OSDService::clear_sent_ready_to_merge()
1978 std::lock_guard
l(merge_lock
);
1979 sent_ready_to_merge_source
.clear();
1982 void OSDService::prune_sent_ready_to_merge(OSDMapRef
& osdmap
)
1984 std::lock_guard
l(merge_lock
);
1985 auto i
= sent_ready_to_merge_source
.begin();
1986 while (i
!= sent_ready_to_merge_source
.end()) {
1987 if (!osdmap
->pg_exists(*i
)) {
1988 dout(10) << __func__
<< " " << *i
<< dendl
;
1989 i
= sent_ready_to_merge_source
.erase(i
);
1998 void OSDService::_queue_for_recovery(
1999 std::pair
<epoch_t
, PGRef
> p
,
2000 uint64_t reserved_pushes
)
2002 ceph_assert(recovery_lock
.is_locked_by_me());
2005 unique_ptr
<OpQueueItem::OpQueueable
>(
2007 p
.second
->get_pgid(), p
.first
, reserved_pushes
)),
2008 cct
->_conf
->osd_recovery_cost
,
2009 cct
->_conf
->osd_recovery_priority
,
2015 // ====================================================================
2019 #define dout_prefix *_dout
2021 // Commands shared between OSD's console and admin console:
2023 namespace osd_cmds
{
2025 int heap(CephContext
& cct
, const cmdmap_t
& cmdmap
, Formatter
& f
, std::ostream
& os
);
2027 }} // namespace ceph::osd_cmds
2029 int OSD::mkfs(CephContext
*cct
, ObjectStore
*store
, uuid_d fsid
, int whoami
)
2035 ObjectStore::CollectionHandle ch
;
2037 // if we are fed a uuid for this osd, use it.
2038 store
->set_fsid(cct
->_conf
->osd_uuid
);
2040 ret
= store
->mkfs();
2042 derr
<< "OSD::mkfs: ObjectStore::mkfs failed with error "
2043 << cpp_strerror(ret
) << dendl
;
2047 store
->set_cache_shards(1); // doesn't matter for mkfs!
2049 ret
= store
->mount();
2051 derr
<< "OSD::mkfs: couldn't mount ObjectStore: error "
2052 << cpp_strerror(ret
) << dendl
;
2056 ch
= store
->open_collection(coll_t::meta());
2058 ret
= store
->read(ch
, OSD_SUPERBLOCK_GOBJECT
, 0, 0, sbbl
);
2060 derr
<< "OSD::mkfs: have meta collection but no superblock" << dendl
;
2063 /* if we already have superblock, check content of superblock */
2064 dout(0) << " have superblock" << dendl
;
2065 auto p
= sbbl
.cbegin();
2067 if (whoami
!= sb
.whoami
) {
2068 derr
<< "provided osd id " << whoami
<< " != superblock's " << sb
.whoami
2073 if (fsid
!= sb
.cluster_fsid
) {
2074 derr
<< "provided cluster fsid " << fsid
2075 << " != superblock's " << sb
.cluster_fsid
<< dendl
;
2080 // create superblock
2081 sb
.cluster_fsid
= fsid
;
2082 sb
.osd_fsid
= store
->get_fsid();
2084 sb
.compat_features
= get_osd_initial_compat_set();
2089 ObjectStore::CollectionHandle ch
= store
->create_new_collection(
2091 ObjectStore::Transaction t
;
2092 t
.create_collection(coll_t::meta(), 0);
2093 t
.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT
, 0, bl
.length(), bl
);
2094 ret
= store
->queue_transaction(ch
, std::move(t
));
2096 derr
<< "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
2097 << "queue_transaction returned " << cpp_strerror(ret
) << dendl
;
2102 ret
= write_meta(cct
, store
, sb
.cluster_fsid
, sb
.osd_fsid
, whoami
);
2104 derr
<< "OSD::mkfs: failed to write fsid file: error "
2105 << cpp_strerror(ret
) << dendl
;
2119 int OSD::write_meta(CephContext
*cct
, ObjectStore
*store
, uuid_d
& cluster_fsid
, uuid_d
& osd_fsid
, int whoami
)
2124 snprintf(val
, sizeof(val
), "%s", CEPH_OSD_ONDISK_MAGIC
);
2125 r
= store
->write_meta("magic", val
);
2129 snprintf(val
, sizeof(val
), "%d", whoami
);
2130 r
= store
->write_meta("whoami", val
);
2134 cluster_fsid
.print(val
);
2135 r
= store
->write_meta("ceph_fsid", val
);
2139 string key
= cct
->_conf
.get_val
<string
>("key");
2141 r
= store
->write_meta("osd_key", key
);
2145 string keyfile
= cct
->_conf
.get_val
<string
>("keyfile");
2146 if (!keyfile
.empty()) {
2149 r
= keybl
.read_file(keyfile
.c_str(), &err
);
2151 derr
<< __func__
<< " failed to read keyfile " << keyfile
<< ": "
2152 << err
<< ": " << cpp_strerror(r
) << dendl
;
2155 r
= store
->write_meta("osd_key", keybl
.to_str());
2161 r
= store
->write_meta("ready", "ready");
2168 int OSD::peek_meta(ObjectStore
*store
,
2170 uuid_d
*cluster_fsid
,
2173 int *require_osd_release
)
2177 int r
= store
->read_meta("magic", &val
);
2182 r
= store
->read_meta("whoami", &val
);
2185 *whoami
= atoi(val
.c_str());
2187 r
= store
->read_meta("ceph_fsid", &val
);
2190 r
= cluster_fsid
->parse(val
.c_str());
2194 r
= store
->read_meta("fsid", &val
);
2196 *osd_fsid
= uuid_d();
2198 r
= osd_fsid
->parse(val
.c_str());
2203 r
= store
->read_meta("require_osd_release", &val
);
2205 *require_osd_release
= atoi(val
.c_str());
2213 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2217 OSD::OSD(CephContext
*cct_
, ObjectStore
*store_
,
2219 Messenger
*internal_messenger
,
2220 Messenger
*external_messenger
,
2221 Messenger
*hb_client_front
,
2222 Messenger
*hb_client_back
,
2223 Messenger
*hb_front_serverm
,
2224 Messenger
*hb_back_serverm
,
2225 Messenger
*osdc_messenger
,
2227 const std::string
&dev
, const std::string
&jdev
) :
2229 osd_lock("OSD::osd_lock"),
2230 tick_timer(cct
, osd_lock
),
2231 tick_timer_lock("OSD::tick_timer_lock"),
2232 tick_timer_without_osd_lock(cct
, tick_timer_lock
),
2233 gss_ktfile_client(cct
->_conf
.get_val
<std::string
>("gss_ktab_client_file")),
2234 cluster_messenger(internal_messenger
),
2235 client_messenger(external_messenger
),
2236 objecter_messenger(osdc_messenger
),
2238 mgrc(cct_
, client_messenger
),
2240 recoverystate_perf(NULL
),
2242 log_client(cct
, client_messenger
, &mc
->monmap
, LogClient::NO_FLAGS
),
2243 clog(log_client
.create_channel()),
2245 dev_path(dev
), journal_path(jdev
),
2246 store_is_rotational(store
->is_rotational()),
2247 trace_endpoint("0.0.0.0", 0, "osd"),
2249 m_osd_pg_epoch_max_lag_factor(cct
->_conf
.get_val
<double>(
2250 "osd_pg_epoch_max_lag_factor")),
2251 osd_compat(get_osd_compat_set()),
2252 osd_op_tp(cct
, "OSD::osd_op_tp", "tp_osd_tp",
2253 get_num_op_threads()),
2254 command_tp(cct
, "OSD::command_tp", "tp_osd_cmd", 1),
2255 session_waiting_lock("OSD::session_waiting_lock"),
2256 osdmap_subscribe_lock("OSD::osdmap_subscribe_lock"),
2257 heartbeat_lock("OSD::heartbeat_lock"),
2258 heartbeat_stop(false),
2259 heartbeat_need_update(true),
2260 hb_front_client_messenger(hb_client_front
),
2261 hb_back_client_messenger(hb_client_back
),
2262 hb_front_server_messenger(hb_front_serverm
),
2263 hb_back_server_messenger(hb_back_serverm
),
2265 heartbeat_thread(this),
2266 heartbeat_dispatcher(this),
2267 op_tracker(cct
, cct
->_conf
->osd_enable_op_tracker
,
2268 cct
->_conf
->osd_num_op_tracker_shard
),
2269 test_ops_hook(NULL
),
2270 op_queue(get_io_queue()),
2271 op_prio_cutoff(get_io_prio_cut()),
2274 cct
->_conf
->osd_op_thread_timeout
,
2275 cct
->_conf
->osd_op_thread_suicide_timeout
,
2277 map_lock("OSD::map_lock"),
2278 last_pg_create_epoch(0),
2279 mon_report_lock("OSD::mon_report_lock"),
2282 requested_full_first(0),
2283 requested_full_last(0),
2286 cct
->_conf
->osd_command_thread_timeout
,
2287 cct
->_conf
->osd_command_thread_suicide_timeout
,
2292 if (!gss_ktfile_client
.empty()) {
2293 // Assert we can export environment variable
2295 The default client keytab is used, if it is present and readable,
2296 to automatically obtain initial credentials for GSSAPI client
2297 applications. The principal name of the first entry in the client
2298 keytab is used by default when obtaining initial credentials.
2299 1. The KRB5_CLIENT_KTNAME environment variable.
2300 2. The default_client_keytab_name profile variable in [libdefaults].
2301 3. The hardcoded default, DEFCKTNAME.
2303 const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
2304 gss_ktfile_client
.c_str(), 1));
2305 ceph_assert(set_result
== 0);
2308 monc
->set_messenger(client_messenger
);
2309 op_tracker
.set_complaint_and_threshold(cct
->_conf
->osd_op_complaint_time
,
2310 cct
->_conf
->osd_op_log_threshold
);
2311 op_tracker
.set_history_size_and_duration(cct
->_conf
->osd_op_history_size
,
2312 cct
->_conf
->osd_op_history_duration
);
2313 op_tracker
.set_history_slow_op_size_and_threshold(cct
->_conf
->osd_op_history_slow_op_size
,
2314 cct
->_conf
->osd_op_history_slow_op_threshold
);
2316 std::stringstream ss
;
2317 ss
<< "osd." << whoami
;
2318 trace_endpoint
.copy_name(ss
.str());
2321 // initialize shards
2322 num_shards
= get_num_op_shards();
2323 for (uint32_t i
= 0; i
< num_shards
; i
++) {
2324 OSDShard
*one_shard
= new OSDShard(
2328 cct
->_conf
->osd_op_pq_max_tokens_per_priority
,
2329 cct
->_conf
->osd_op_pq_min_cost
,
2331 shards
.push_back(one_shard
);
2337 while (!shards
.empty()) {
2338 delete shards
.back();
2341 delete class_handler
;
2342 cct
->get_perfcounters_collection()->remove(recoverystate_perf
);
2343 cct
->get_perfcounters_collection()->remove(logger
);
2344 delete recoverystate_perf
;
2349 double OSD::get_tick_interval() const
2351 // vary +/- 5% to avoid scrub scheduling livelocks
2352 constexpr auto delta
= 0.05;
2353 return (OSD_TICK_INTERVAL
*
2354 ceph::util::generate_random_number(1.0 - delta
, 1.0 + delta
));
2357 void cls_initialize(ClassHandler
*ch
);
2359 void OSD::handle_signal(int signum
)
2361 ceph_assert(signum
== SIGINT
|| signum
== SIGTERM
);
2362 derr
<< "*** Got signal " << sig_str(signum
) << " ***" << dendl
;
2368 std::lock_guard
lock(osd_lock
);
2372 if (store
->test_mount_in_use()) {
2373 derr
<< "OSD::pre_init: object store '" << dev_path
<< "' is "
2374 << "currently in use. (Is ceph-osd already running?)" << dendl
;
2378 cct
->_conf
.add_observer(this);
2382 int OSD::set_numa_affinity()
2384 // storage numa node
2385 int store_node
= -1;
2386 store
->get_numa_node(&store_node
, nullptr, nullptr);
2387 if (store_node
>= 0) {
2388 dout(1) << __func__
<< " storage numa node " << store_node
<< dendl
;
2391 // check network numa node(s)
2392 int front_node
= -1, back_node
= -1;
2393 string front_iface
= pick_iface(
2395 client_messenger
->get_myaddrs().front().get_sockaddr_storage());
2396 string back_iface
= pick_iface(
2398 cluster_messenger
->get_myaddrs().front().get_sockaddr_storage());
2399 int r
= get_iface_numa_node(front_iface
, &front_node
);
2401 dout(1) << __func__
<< " public network " << front_iface
<< " numa node "
2402 << front_node
<< dendl
;
2403 r
= get_iface_numa_node(back_iface
, &back_node
);
2405 dout(1) << __func__
<< " cluster network " << back_iface
<< " numa node "
2406 << back_node
<< dendl
;
2407 if (front_node
== back_node
&&
2408 front_node
== store_node
) {
2409 dout(1) << " objectstore and network numa nodes all match" << dendl
;
2410 if (g_conf().get_val
<bool>("osd_numa_auto_affinity")) {
2411 numa_node
= front_node
;
2414 dout(1) << __func__
<< " objectstore and network numa nodes do not match"
2419 derr
<< __func__
<< " unable to identify public interface '" << front_iface
2420 << "' numa node: " << cpp_strerror(r
) << dendl
;
2422 if (int node
= g_conf().get_val
<int64_t>("osd_numa_node"); node
>= 0) {
2423 // this takes precedence over the automagic logic above
2426 if (numa_node
>= 0) {
2427 int r
= get_numa_node_cpu_set(numa_node
, &numa_cpu_set_size
, &numa_cpu_set
);
2429 dout(1) << __func__
<< " unable to determine numa node " << numa_node
2430 << " CPUs" << dendl
;
2433 dout(1) << __func__
<< " setting numa affinity to node " << numa_node
2435 << cpu_set_to_str_list(numa_cpu_set_size
, &numa_cpu_set
)
2437 r
= sched_setaffinity(getpid(), numa_cpu_set_size
, &numa_cpu_set
);
2440 derr
<< __func__
<< " failed to set numa affinity: " << cpp_strerror(r
)
2446 dout(1) << __func__
<< " not setting numa affinity" << dendl
;
2453 class OSDSocketHook
: public AdminSocketHook
{
2456 explicit OSDSocketHook(OSD
*o
) : osd(o
) {}
2457 bool call(std::string_view admin_command
, const cmdmap_t
& cmdmap
,
2458 std::string_view format
, bufferlist
& out
) override
{
2462 r
= osd
->asok_command(admin_command
, cmdmap
, format
, ss
);
2463 } catch (const bad_cmd_get
& e
) {
2472 std::set
<int64_t> OSD::get_mapped_pools()
2474 std::set
<int64_t> pools
;
2475 std::vector
<spg_t
> pgids
;
2477 for (const auto &pgid
: pgids
) {
2478 pools
.insert(pgid
.pool());
2483 bool OSD::asok_command(std::string_view admin_command
, const cmdmap_t
& cmdmap
,
2484 std::string_view format
, ostream
& ss
)
2486 Formatter
*f
= Formatter::create(format
, "json-pretty", "json-pretty");
2487 if (admin_command
== "status") {
2488 f
->open_object_section("status");
2489 f
->dump_stream("cluster_fsid") << superblock
.cluster_fsid
;
2490 f
->dump_stream("osd_fsid") << superblock
.osd_fsid
;
2491 f
->dump_unsigned("whoami", superblock
.whoami
);
2492 f
->dump_string("state", get_state_name(get_state()));
2493 f
->dump_unsigned("oldest_map", superblock
.oldest_map
);
2494 f
->dump_unsigned("newest_map", superblock
.newest_map
);
2495 f
->dump_unsigned("num_pgs", num_pgs
);
2497 } else if (admin_command
== "flush_journal") {
2498 store
->flush_journal();
2499 } else if (admin_command
== "dump_ops_in_flight" ||
2500 admin_command
== "ops" ||
2501 admin_command
== "dump_blocked_ops" ||
2502 admin_command
== "dump_historic_ops" ||
2503 admin_command
== "dump_historic_ops_by_duration" ||
2504 admin_command
== "dump_historic_slow_ops") {
2506 const string error_str
= "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2507 even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2508 will start to track new ops received afterwards.";
2510 set
<string
> filters
;
2511 vector
<string
> filter_str
;
2512 if (cmd_getval(cct
, cmdmap
, "filterstr", filter_str
)) {
2513 copy(filter_str
.begin(), filter_str
.end(),
2514 inserter(filters
, filters
.end()));
2517 if (admin_command
== "dump_ops_in_flight" ||
2518 admin_command
== "ops") {
2519 if (!op_tracker
.dump_ops_in_flight(f
, false, filters
)) {
2523 if (admin_command
== "dump_blocked_ops") {
2524 if (!op_tracker
.dump_ops_in_flight(f
, true, filters
)) {
2528 if (admin_command
== "dump_historic_ops") {
2529 if (!op_tracker
.dump_historic_ops(f
, false, filters
)) {
2533 if (admin_command
== "dump_historic_ops_by_duration") {
2534 if (!op_tracker
.dump_historic_ops(f
, true, filters
)) {
2538 if (admin_command
== "dump_historic_slow_ops") {
2539 if (!op_tracker
.dump_historic_slow_ops(f
, filters
)) {
2543 } else if (admin_command
== "dump_op_pq_state") {
2544 f
->open_object_section("pq");
2545 op_shardedwq
.dump(f
);
2547 } else if (admin_command
== "dump_blacklist") {
2548 list
<pair
<entity_addr_t
,utime_t
> > bl
;
2549 OSDMapRef curmap
= service
.get_osdmap();
2551 f
->open_array_section("blacklist");
2552 curmap
->get_blacklist(&bl
);
2553 for (list
<pair
<entity_addr_t
,utime_t
> >::iterator it
= bl
.begin();
2554 it
!= bl
.end(); ++it
) {
2555 f
->open_object_section("entry");
2556 f
->open_object_section("entity_addr_t");
2558 f
->close_section(); //entity_addr_t
2559 it
->second
.localtime(f
->dump_stream("expire_time"));
2560 f
->close_section(); //entry
2562 f
->close_section(); //blacklist
2563 } else if (admin_command
== "dump_watchers") {
2564 list
<obj_watch_item_t
> watchers
;
2568 for (auto& pg
: pgs
) {
2569 list
<obj_watch_item_t
> pg_watchers
;
2570 pg
->get_watchers(&pg_watchers
);
2571 watchers
.splice(watchers
.end(), pg_watchers
);
2574 f
->open_array_section("watchers");
2575 for (list
<obj_watch_item_t
>::iterator it
= watchers
.begin();
2576 it
!= watchers
.end(); ++it
) {
2578 f
->open_object_section("watch");
2580 f
->dump_string("namespace", it
->obj
.nspace
);
2581 f
->dump_string("object", it
->obj
.oid
.name
);
2583 f
->open_object_section("entity_name");
2584 it
->wi
.name
.dump(f
);
2585 f
->close_section(); //entity_name_t
2587 f
->dump_unsigned("cookie", it
->wi
.cookie
);
2588 f
->dump_unsigned("timeout", it
->wi
.timeout_seconds
);
2590 f
->open_object_section("entity_addr_t");
2591 it
->wi
.addr
.dump(f
);
2592 f
->close_section(); //entity_addr_t
2594 f
->close_section(); //watch
2597 f
->close_section(); //watchers
2598 } else if (admin_command
== "dump_reservations") {
2599 f
->open_object_section("reservations");
2600 f
->open_object_section("local_reservations");
2601 service
.local_reserver
.dump(f
);
2603 f
->open_object_section("remote_reservations");
2604 service
.remote_reserver
.dump(f
);
2607 } else if (admin_command
== "get_latest_osdmap") {
2608 get_latest_osdmap();
2609 } else if (admin_command
== "heap") {
2610 auto result
= ceph::osd_cmds::heap(*cct
, cmdmap
, *f
, ss
);
2612 // Note: Failed heap profile commands won't necessarily trigger an error:
2613 f
->open_object_section("result");
2614 f
->dump_string("error", cpp_strerror(result
));
2615 f
->dump_bool("success", result
>= 0);
2617 } else if (admin_command
== "set_heap_property") {
2621 bool success
= false;
2622 if (!cmd_getval(cct
, cmdmap
, "property", property
)) {
2623 error
= "unable to get property";
2625 } else if (!cmd_getval(cct
, cmdmap
, "value", value
)) {
2626 error
= "unable to get value";
2628 } else if (value
< 0) {
2629 error
= "negative value not allowed";
2631 } else if (!ceph_heap_set_numeric_property(property
.c_str(), (size_t)value
)) {
2632 error
= "invalid property";
2637 f
->open_object_section("result");
2638 f
->dump_string("error", error
);
2639 f
->dump_bool("success", success
);
2641 } else if (admin_command
== "get_heap_property") {
2645 bool success
= false;
2646 if (!cmd_getval(cct
, cmdmap
, "property", property
)) {
2647 error
= "unable to get property";
2649 } else if (!ceph_heap_get_numeric_property(property
.c_str(), &value
)) {
2650 error
= "invalid property";
2655 f
->open_object_section("result");
2656 f
->dump_string("error", error
);
2657 f
->dump_bool("success", success
);
2658 f
->dump_int("value", value
);
2660 } else if (admin_command
== "dump_objectstore_kv_stats") {
2661 store
->get_db_statistics(f
);
2662 } else if (admin_command
== "dump_scrubs") {
2663 service
.dumps_scrub(f
);
2664 } else if (admin_command
== "calc_objectstore_db_histogram") {
2665 store
->generate_db_histogram(f
);
2666 } else if (admin_command
== "flush_store_cache") {
2667 store
->flush_cache(&ss
);
2668 } else if (admin_command
== "dump_pgstate_history") {
2669 f
->open_object_section("pgstate_history");
2672 for (auto& pg
: pgs
) {
2673 f
->dump_stream("pg") << pg
->pg_id
;
2674 pg
->dump_pgstate_history(f
);
2677 } else if (admin_command
== "compact") {
2678 dout(1) << "triggering manual compaction" << dendl
;
2679 auto start
= ceph::coarse_mono_clock::now();
2681 auto end
= ceph::coarse_mono_clock::now();
2682 double duration
= std::chrono::duration
<double>(end
-start
).count();
2683 dout(1) << "finished manual compaction in "
2685 << " seconds" << dendl
;
2686 f
->open_object_section("compact_result");
2687 f
->dump_float("elapsed_time", duration
);
2689 } else if (admin_command
== "get_mapped_pools") {
2690 f
->open_array_section("mapped_pools");
2691 set
<int64_t> poollist
= get_mapped_pools();
2692 for (auto pool
: poollist
) {
2693 f
->dump_int("pool_id", pool
);
2696 } else if (admin_command
== "smart") {
2698 cmd_getval(cct
, cmdmap
, "devid", devid
);
2699 probe_smart(devid
, ss
);
2700 } else if (admin_command
== "list_devices") {
2701 set
<string
> devnames
;
2702 store
->get_devices(&devnames
);
2703 f
->open_object_section("list_devices");
2704 for (auto dev
: devnames
) {
2705 if (dev
.find("dm-") == 0) {
2708 f
->dump_string("device", "/dev/" + dev
);
2711 } else if (admin_command
== "send_beacon") {
2713 send_beacon(ceph::coarse_mono_clock::now());
2716 ceph_abort_msg("broken asok registration");
2723 class TestOpsSocketHook
: public AdminSocketHook
{
2724 OSDService
*service
;
2727 TestOpsSocketHook(OSDService
*s
, ObjectStore
*st
) : service(s
), store(st
) {}
2728 bool call(std::string_view command
, const cmdmap_t
& cmdmap
,
2729 std::string_view format
, bufferlist
& out
) override
{
2732 test_ops(service
, store
, command
, cmdmap
, ss
);
2733 } catch (const bad_cmd_get
& e
) {
2739 void test_ops(OSDService
*service
, ObjectStore
*store
,
2740 std::string_view command
, const cmdmap_t
& cmdmap
, ostream
&ss
);
2744 class OSD::C_Tick
: public Context
{
2747 explicit C_Tick(OSD
*o
) : osd(o
) {}
2748 void finish(int r
) override
{
2753 class OSD::C_Tick_WithoutOSDLock
: public Context
{
2756 explicit C_Tick_WithoutOSDLock(OSD
*o
) : osd(o
) {}
2757 void finish(int r
) override
{
2758 osd
->tick_without_osd_lock();
2762 int OSD::enable_disable_fuse(bool stop
)
2766 string mntpath
= cct
->_conf
->osd_data
+ "/fuse";
2767 if (fuse_store
&& (stop
|| !cct
->_conf
->osd_objectstore_fuse
)) {
2768 dout(1) << __func__
<< " disabling" << dendl
;
2772 r
= ::rmdir(mntpath
.c_str());
2775 derr
<< __func__
<< " failed to rmdir " << mntpath
<< ": "
2776 << cpp_strerror(r
) << dendl
;
2781 if (!fuse_store
&& cct
->_conf
->osd_objectstore_fuse
) {
2782 dout(1) << __func__
<< " enabling" << dendl
;
2783 r
= ::mkdir(mntpath
.c_str(), 0700);
2786 if (r
< 0 && r
!= -EEXIST
) {
2787 derr
<< __func__
<< " unable to create " << mntpath
<< ": "
2788 << cpp_strerror(r
) << dendl
;
2791 fuse_store
= new FuseStore(store
, mntpath
);
2792 r
= fuse_store
->start();
2794 derr
<< __func__
<< " unable to start fuse: " << cpp_strerror(r
) << dendl
;
2800 #endif // HAVE_LIBFUSE
2804 int OSD::get_num_op_shards()
2806 if (cct
->_conf
->osd_op_num_shards
)
2807 return cct
->_conf
->osd_op_num_shards
;
2808 if (store_is_rotational
)
2809 return cct
->_conf
->osd_op_num_shards_hdd
;
2811 return cct
->_conf
->osd_op_num_shards_ssd
;
2814 int OSD::get_num_op_threads()
2816 if (cct
->_conf
->osd_op_num_threads_per_shard
)
2817 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard
;
2818 if (store_is_rotational
)
2819 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard_hdd
;
2821 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard_ssd
;
2824 float OSD::get_osd_recovery_sleep()
2826 if (cct
->_conf
->osd_recovery_sleep
)
2827 return cct
->_conf
->osd_recovery_sleep
;
2828 if (!store_is_rotational
&& !journal_is_rotational
)
2829 return cct
->_conf
->osd_recovery_sleep_ssd
;
2830 else if (store_is_rotational
&& !journal_is_rotational
)
2831 return cct
->_conf
.get_val
<double>("osd_recovery_sleep_hybrid");
2833 return cct
->_conf
->osd_recovery_sleep_hdd
;
2836 float OSD::get_osd_delete_sleep()
2838 float osd_delete_sleep
= cct
->_conf
.get_val
<double>("osd_delete_sleep");
2839 if (osd_delete_sleep
> 0)
2840 return osd_delete_sleep
;
2841 if (!store_is_rotational
&& !journal_is_rotational
)
2842 return cct
->_conf
.get_val
<double>("osd_delete_sleep_ssd");
2843 if (store_is_rotational
&& !journal_is_rotational
)
2844 return cct
->_conf
.get_val
<double>("osd_delete_sleep_hybrid");
2845 return cct
->_conf
.get_val
<double>("osd_delete_sleep_hdd");
2848 float OSD::get_osd_snap_trim_sleep()
2850 float osd_snap_trim_sleep
= cct
->_conf
.get_val
<double>("osd_snap_trim_sleep");
2851 if (osd_snap_trim_sleep
> 0)
2852 return osd_snap_trim_sleep
;
2853 if (!store_is_rotational
&& !journal_is_rotational
)
2854 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_ssd");
2855 if (store_is_rotational
&& !journal_is_rotational
)
2856 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_hybrid");
2857 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_hdd");
2862 CompatSet initial
, diff
;
2863 std::lock_guard
lock(osd_lock
);
2868 tick_timer_without_osd_lock
.init();
2869 service
.recovery_request_timer
.init();
2870 service
.sleep_timer
.init();
2872 boot_finisher
.start();
2876 store
->read_meta("require_osd_release", &val
);
2877 last_require_osd_release
= atoi(val
.c_str());
2881 dout(2) << "init " << dev_path
2882 << " (looks like " << (store_is_rotational
? "hdd" : "ssd") << ")"
2884 dout(2) << "journal " << journal_path
<< dendl
;
2885 ceph_assert(store
); // call pre_init() first!
2887 store
->set_cache_shards(get_num_op_shards());
2889 int r
= store
->mount();
2891 derr
<< "OSD:init: unable to mount object store" << dendl
;
2894 journal_is_rotational
= store
->is_journal_rotational();
2895 dout(2) << "journal looks like " << (journal_is_rotational
? "hdd" : "ssd")
2898 enable_disable_fuse(false);
2900 dout(2) << "boot" << dendl
;
2902 service
.meta_ch
= store
->open_collection(coll_t::meta());
2904 // initialize the daily loadavg with current 15min loadavg
2906 if (getloadavg(loadavgs
, 3) == 3) {
2907 daily_loadavg
= loadavgs
[2];
2909 derr
<< "OSD::init() : couldn't read loadavgs\n" << dendl
;
2910 daily_loadavg
= 1.0;
2913 int rotating_auth_attempts
= 0;
2914 auto rotating_auth_timeout
=
2915 g_conf().get_val
<int64_t>("rotating_keys_bootstrap_timeout");
2917 // sanity check long object name handling
2920 l
.oid
.name
= string(cct
->_conf
->osd_max_object_name_len
, 'n');
2921 l
.set_key(string(cct
->_conf
->osd_max_object_name_len
, 'k'));
2922 l
.nspace
= string(cct
->_conf
->osd_max_object_namespace_len
, 's');
2923 r
= store
->validate_hobject_key(l
);
2925 derr
<< "backend (" << store
->get_type() << ") is unable to support max "
2926 << "object name[space] len" << dendl
;
2927 derr
<< " osd max object name len = "
2928 << cct
->_conf
->osd_max_object_name_len
<< dendl
;
2929 derr
<< " osd max object namespace len = "
2930 << cct
->_conf
->osd_max_object_namespace_len
<< dendl
;
2931 derr
<< cpp_strerror(r
) << dendl
;
2932 if (cct
->_conf
->osd_check_max_object_name_len_on_startup
) {
2935 derr
<< "osd_check_max_object_name_len_on_startup = false, starting anyway"
2938 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl
;
2943 r
= read_superblock();
2945 derr
<< "OSD::init() : unable to read osd superblock" << dendl
;
2950 if (osd_compat
.compare(superblock
.compat_features
) < 0) {
2951 derr
<< "The disk uses features unsupported by the executable." << dendl
;
2952 derr
<< " ondisk features " << superblock
.compat_features
<< dendl
;
2953 derr
<< " daemon features " << osd_compat
<< dendl
;
2955 if (osd_compat
.writeable(superblock
.compat_features
)) {
2956 CompatSet diff
= osd_compat
.unsupported(superblock
.compat_features
);
2957 derr
<< "it is still writeable, though. Missing features: " << diff
<< dendl
;
2962 CompatSet diff
= osd_compat
.unsupported(superblock
.compat_features
);
2963 derr
<< "Cannot write to disk! Missing features: " << diff
<< dendl
;
2969 assert_warn(whoami
== superblock
.whoami
);
2970 if (whoami
!= superblock
.whoami
) {
2971 derr
<< "OSD::init: superblock says osd"
2972 << superblock
.whoami
<< " but I am osd." << whoami
<< dendl
;
2977 // load up "current" osdmap
2978 assert_warn(!osdmap
);
2980 derr
<< "OSD::init: unable to read current osdmap" << dendl
;
2984 osdmap
= get_map(superblock
.current_epoch
);
2986 // make sure we don't have legacy pgs deleting
2989 int r
= store
->list_collections(ls
);
2990 ceph_assert(r
>= 0);
2993 if (c
.is_pg(&pgid
) &&
2994 !osdmap
->have_pg_pool(pgid
.pool())) {
2995 ghobject_t oid
= make_final_pool_info_oid(pgid
.pool());
2996 if (!store
->exists(service
.meta_ch
, oid
)) {
2997 derr
<< __func__
<< " missing pg_pool_t for deleted pool "
2998 << pgid
.pool() << " for pg " << pgid
2999 << "; please downgrade to luminous and allow "
3000 << "pg deletion to complete before upgrading" << dendl
;
3007 initial
= get_osd_initial_compat_set();
3008 diff
= superblock
.compat_features
.unsupported(initial
);
3009 if (superblock
.compat_features
.merge(initial
)) {
3010 // We need to persist the new compat_set before we
3012 dout(5) << "Upgrading superblock adding: " << diff
<< dendl
;
3013 ObjectStore::Transaction t
;
3014 write_superblock(t
);
3015 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3020 // make sure snap mapper object exists
3021 if (!store
->exists(service
.meta_ch
, OSD::make_snapmapper_oid())) {
3022 dout(10) << "init creating/touching snapmapper object" << dendl
;
3023 ObjectStore::Transaction t
;
3024 t
.touch(coll_t::meta(), OSD::make_snapmapper_oid());
3025 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3030 class_handler
= new ClassHandler(cct
);
3031 cls_initialize(class_handler
);
3033 if (cct
->_conf
->osd_open_classes_on_start
) {
3034 int r
= class_handler
->open_all_classes();
3036 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r
) << dendl
;
3039 check_osdmap_features();
3041 create_recoverystate_perf();
3044 epoch_t bind_epoch
= osdmap
->get_epoch();
3045 service
.set_epochs(NULL
, NULL
, &bind_epoch
);
3048 clear_temp_objects();
3050 // initialize osdmap references in sharded wq
3051 for (auto& shard
: shards
) {
3052 std::lock_guard
l(shard
->osdmap_lock
);
3053 shard
->shard_osdmap
= osdmap
;
3056 // load up pgs (as they previously existed)
3059 dout(2) << "superblock: I am osd." << superblock
.whoami
<< dendl
;
3060 dout(0) << "using " << op_queue
<< " op queue with priority op cut off at " <<
3061 op_prio_cutoff
<< "." << dendl
;
3067 struct store_statfs_t stbuf
;
3068 osd_alert_list_t alerts
;
3069 int r
= store
->statfs(&stbuf
, &alerts
);
3070 ceph_assert(r
== 0);
3071 service
.set_statfs(stbuf
, alerts
);
3074 // client_messenger auth_client is already set up by monc.
3075 for (auto m
: { cluster_messenger
,
3077 hb_front_client_messenger
,
3078 hb_back_client_messenger
,
3079 hb_front_server_messenger
,
3080 hb_back_server_messenger
} ) {
3081 m
->set_auth_client(monc
);
3083 for (auto m
: { client_messenger
,
3085 hb_front_server_messenger
,
3086 hb_back_server_messenger
}) {
3087 m
->set_auth_server(monc
);
3089 monc
->set_handle_authentication_dispatcher(this);
3091 monc
->set_want_keys(CEPH_ENTITY_TYPE_MON
| CEPH_ENTITY_TYPE_OSD
3092 | CEPH_ENTITY_TYPE_MGR
);
3097 mgrc
.set_pgstats_cb([this](){ return collect_pg_stats(); });
3098 mgrc
.set_perf_metric_query_cb(
3099 [this](const std::map
<OSDPerfMetricQuery
, OSDPerfMetricLimits
> &queries
) {
3100 set_perf_queries(queries
);
3102 [this](std::map
<OSDPerfMetricQuery
, OSDPerfMetricReport
> *reports
) {
3103 get_perf_reports(reports
);
3107 // tell monc about log_client so it will know about mon session resets
3108 monc
->set_log_client(&log_client
);
3109 update_log_config();
3112 client_messenger
->add_dispatcher_tail(&mgrc
);
3113 client_messenger
->add_dispatcher_tail(this);
3114 cluster_messenger
->add_dispatcher_head(this);
3116 hb_front_client_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3117 hb_back_client_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3118 hb_front_server_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3119 hb_back_server_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3121 objecter_messenger
->add_dispatcher_head(service
.objecter
);
3124 service
.publish_map(osdmap
);
3125 service
.publish_superblock(superblock
);
3126 service
.max_oldest_map
= superblock
.oldest_map
;
3128 for (auto& shard
: shards
) {
3129 // put PGs in a temporary set because we may modify pg_slots
3130 // unordered_map below.
3132 for (auto& i
: shard
->pg_slots
) {
3133 PGRef pg
= i
.second
->pg
;
3139 for (auto pg
: pgs
) {
3141 set
<pair
<spg_t
,epoch_t
>> new_children
;
3142 set
<pair
<spg_t
,epoch_t
>> merge_pgs
;
3143 service
.identify_splits_and_merges(pg
->get_osdmap(), osdmap
, pg
->pg_id
,
3144 &new_children
, &merge_pgs
);
3145 if (!new_children
.empty()) {
3146 for (auto shard
: shards
) {
3147 shard
->prime_splits(osdmap
, &new_children
);
3149 assert(new_children
.empty());
3151 if (!merge_pgs
.empty()) {
3152 for (auto shard
: shards
) {
3153 shard
->prime_merges(osdmap
, &merge_pgs
);
3155 assert(merge_pgs
.empty());
3164 // start the heartbeat
3165 heartbeat_thread
.create("osd_srv_heartbt");
3168 tick_timer
.add_event_after(get_tick_interval(),
3171 std::lock_guard
l(tick_timer_lock
);
3172 tick_timer_without_osd_lock
.add_event_after(get_tick_interval(),
3173 new C_Tick_WithoutOSDLock(this));
3178 r
= monc
->authenticate();
3180 derr
<< __func__
<< " authentication failed: " << cpp_strerror(r
)
3185 while (monc
->wait_auth_rotating(rotating_auth_timeout
) < 0) {
3186 derr
<< "unable to obtain rotating service keys; retrying" << dendl
;
3187 ++rotating_auth_attempts
;
3188 if (rotating_auth_attempts
> g_conf()->max_rotating_auth_attempts
) {
3189 derr
<< __func__
<< " wait_auth_rotating timed out" << dendl
;
3194 r
= update_crush_device_class();
3196 derr
<< __func__
<< " unable to update_crush_device_class: "
3197 << cpp_strerror(r
) << dendl
;
3201 r
= update_crush_location();
3203 derr
<< __func__
<< " unable to update_crush_location: "
3204 << cpp_strerror(r
) << dendl
;
3212 // start objecter *after* we have authenticated, so that we don't ignore
3213 // the OSDMaps it requests.
3214 service
.final_init();
3218 dout(10) << "ensuring pgs have consumed prior maps" << dendl
;
3221 dout(0) << "done with init, starting boot process" << dendl
;
3223 // subscribe to any pg creations
3224 monc
->sub_want("osd_pg_creates", last_pg_create_epoch
, 0);
3226 // MgrClient needs this (it doesn't have MonClient reference itself)
3227 monc
->sub_want("mgrmap", 0, 0);
3229 // we don't need to ask for an osdmap here; objecter will
3230 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
3239 enable_disable_fuse(true);
3246 void OSD::final_init()
3248 AdminSocket
*admin_socket
= cct
->get_admin_socket();
3249 asok_hook
= new OSDSocketHook(this);
3250 int r
= admin_socket
->register_command("status", "status", asok_hook
,
3251 "high-level status of OSD");
3252 ceph_assert(r
== 0);
3253 r
= admin_socket
->register_command("flush_journal", "flush_journal",
3255 "flush the journal to permanent store");
3256 ceph_assert(r
== 0);
3257 r
= admin_socket
->register_command("dump_ops_in_flight",
3258 "dump_ops_in_flight " \
3259 "name=filterstr,type=CephString,n=N,req=false",
3261 "show the ops currently in flight");
3262 ceph_assert(r
== 0);
3263 r
= admin_socket
->register_command("ops",
3265 "name=filterstr,type=CephString,n=N,req=false",
3267 "show the ops currently in flight");
3268 ceph_assert(r
== 0);
3269 r
= admin_socket
->register_command("dump_blocked_ops",
3270 "dump_blocked_ops " \
3271 "name=filterstr,type=CephString,n=N,req=false",
3273 "show the blocked ops currently in flight");
3274 ceph_assert(r
== 0);
3275 r
= admin_socket
->register_command("dump_historic_ops",
3276 "dump_historic_ops " \
3277 "name=filterstr,type=CephString,n=N,req=false",
3280 ceph_assert(r
== 0);
3281 r
= admin_socket
->register_command("dump_historic_slow_ops",
3282 "dump_historic_slow_ops " \
3283 "name=filterstr,type=CephString,n=N,req=false",
3285 "show slowest recent ops");
3286 ceph_assert(r
== 0);
3287 r
= admin_socket
->register_command("dump_historic_ops_by_duration",
3288 "dump_historic_ops_by_duration " \
3289 "name=filterstr,type=CephString,n=N,req=false",
3291 "show slowest recent ops, sorted by duration");
3292 ceph_assert(r
== 0);
3293 r
= admin_socket
->register_command("dump_op_pq_state", "dump_op_pq_state",
3295 "dump op priority queue state");
3296 ceph_assert(r
== 0);
3297 r
= admin_socket
->register_command("dump_blacklist", "dump_blacklist",
3299 "dump blacklisted clients and times");
3300 ceph_assert(r
== 0);
3301 r
= admin_socket
->register_command("dump_watchers", "dump_watchers",
3303 "show clients which have active watches,"
3304 " and on which objects");
3305 ceph_assert(r
== 0);
3306 r
= admin_socket
->register_command("dump_reservations", "dump_reservations",
3308 "show recovery reservations");
3309 ceph_assert(r
== 0);
3310 r
= admin_socket
->register_command("get_latest_osdmap", "get_latest_osdmap",
3312 "force osd to update the latest map from "
3314 ceph_assert(r
== 0);
3316 r
= admin_socket
->register_command( "heap",
3318 "name=heapcmd,type=CephString " \
3319 "name=value,type=CephString,req=false",
3321 "show heap usage info (available only if "
3322 "compiled with tcmalloc)");
3323 ceph_assert(r
== 0);
3325 r
= admin_socket
->register_command("set_heap_property",
3326 "set_heap_property " \
3327 "name=property,type=CephString " \
3328 "name=value,type=CephInt",
3330 "update malloc extension heap property");
3331 ceph_assert(r
== 0);
3333 r
= admin_socket
->register_command("get_heap_property",
3334 "get_heap_property " \
3335 "name=property,type=CephString",
3337 "get malloc extension heap property");
3338 ceph_assert(r
== 0);
3340 r
= admin_socket
->register_command("dump_objectstore_kv_stats",
3341 "dump_objectstore_kv_stats",
3343 "print statistics of kvdb which used by bluestore");
3344 ceph_assert(r
== 0);
3346 r
= admin_socket
->register_command("dump_scrubs",
3349 "print scheduled scrubs");
3350 ceph_assert(r
== 0);
3352 r
= admin_socket
->register_command("calc_objectstore_db_histogram",
3353 "calc_objectstore_db_histogram",
3355 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
3356 ceph_assert(r
== 0);
3358 r
= admin_socket
->register_command("flush_store_cache",
3359 "flush_store_cache",
3361 "Flush bluestore internal cache");
3362 ceph_assert(r
== 0);
3363 r
= admin_socket
->register_command("dump_pgstate_history", "dump_pgstate_history",
3365 "show recent state history");
3366 ceph_assert(r
== 0);
3368 r
= admin_socket
->register_command("compact", "compact",
3370 "Commpact object store's omap."
3371 " WARNING: Compaction probably slows your requests");
3372 ceph_assert(r
== 0);
3374 r
= admin_socket
->register_command("get_mapped_pools", "get_mapped_pools",
3376 "dump pools whose PG(s) are mapped to this OSD.");
3378 ceph_assert(r
== 0);
3380 r
= admin_socket
->register_command("smart", "smart name=devid,type=CephString,req=False",
3382 "probe OSD devices for SMART data.");
3384 ceph_assert(r
== 0);
3386 r
= admin_socket
->register_command("list_devices", "list_devices",
3388 "list OSD devices.");
3389 r
= admin_socket
->register_command("send_beacon", "send_beacon",
3391 "send OSD beacon to mon immediately");
3393 test_ops_hook
= new TestOpsSocketHook(&(this->service
), this->store
);
3394 // Note: pools are CephString instead of CephPoolname because
3395 // these commands traditionally support both pool names and numbers
3396 r
= admin_socket
->register_command(
3399 "name=pool,type=CephString " \
3400 "name=objname,type=CephObjectname " \
3401 "name=key,type=CephString "\
3402 "name=val,type=CephString",
3405 ceph_assert(r
== 0);
3406 r
= admin_socket
->register_command(
3409 "name=pool,type=CephString " \
3410 "name=objname,type=CephObjectname " \
3411 "name=key,type=CephString",
3414 ceph_assert(r
== 0);
3415 r
= admin_socket
->register_command(
3418 "name=pool,type=CephString " \
3419 "name=objname,type=CephObjectname " \
3420 "name=header,type=CephString",
3423 ceph_assert(r
== 0);
3425 r
= admin_socket
->register_command(
3428 "name=pool,type=CephString " \
3429 "name=objname,type=CephObjectname",
3431 "output entire object map");
3432 ceph_assert(r
== 0);
3434 r
= admin_socket
->register_command(
3437 "name=pool,type=CephString " \
3438 "name=objname,type=CephObjectname " \
3439 "name=len,type=CephInt",
3441 "truncate object to length");
3442 ceph_assert(r
== 0);
3444 r
= admin_socket
->register_command(
3447 "name=pool,type=CephString " \
3448 "name=objname,type=CephObjectname " \
3449 "name=shardid,type=CephInt,req=false,range=0|255",
3451 "inject data error to an object");
3452 ceph_assert(r
== 0);
3454 r
= admin_socket
->register_command(
3457 "name=pool,type=CephString " \
3458 "name=objname,type=CephObjectname " \
3459 "name=shardid,type=CephInt,req=false,range=0|255",
3461 "inject metadata error to an object");
3462 ceph_assert(r
== 0);
3463 r
= admin_socket
->register_command(
3464 "set_recovery_delay",
3465 "set_recovery_delay " \
3466 "name=utime,type=CephInt,req=false",
3468 "Delay osd recovery by specified seconds");
3469 ceph_assert(r
== 0);
3470 r
= admin_socket
->register_command(
3473 "name=pgid,type=CephString " \
3474 "name=time,type=CephInt,req=false",
3476 "Trigger a scheduled scrub ");
3477 ceph_assert(r
== 0);
3478 r
= admin_socket
->register_command(
3479 "trigger_deep_scrub",
3480 "trigger_deep_scrub " \
3481 "name=pgid,type=CephString " \
3482 "name=time,type=CephInt,req=false",
3484 "Trigger a scheduled deep scrub ");
3485 ceph_assert(r
== 0);
3486 r
= admin_socket
->register_command(
3489 "name=type,type=CephString,req=false " \
3490 "name=count,type=CephInt,req=false ",
3492 "Inject a full disk (optional count times)");
3493 ceph_assert(r
== 0);
// Build and register the OSD's main perf-counter set (the "osd" logger):
// client op counts/latencies, subop/recovery counters, tiering/agent stats,
// osdmap cache stats and PG counts.  Two log2 histogram axes (latency x
// request size) are configured for the op latency histograms.
// NOTE(review): this chunk is a lossy extraction -- opening braces, several
// counter-id arguments and axis-config fields are missing below; visible
// tokens are preserved exactly as found.
void OSD::create_logger()
  dout(10) << "create_logger" << dendl;

  PerfCountersBuilder osd_plb(cct, "osd", l_osd_first, l_osd_last);

  // Latency axis configuration for op histograms, values are in nanoseconds
  // NOTE(review): the axis label field and closing brace appear truncated here.
  PerfHistogramCommon::axis_config_d op_hist_x_axis_config{
    PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale
    100000, ///< Quantization unit is 100usec
    32, ///< Enough to cover much longer than slow requests

  // Op size axis configuration for op histograms, values are in bytes
  PerfHistogramCommon::axis_config_d op_hist_y_axis_config{
    "Request size (bytes)",
    PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
    512, ///< Quantization unit is 512 bytes
    32, ///< Enough to cover requests larger than GB

  // All the basic OSD operation stats are to be considered useful
  osd_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);

  // NOTE(review): the add_u64(...) call head for op_wip is missing here.
    l_osd_op_wip, "op_wip",
    "Replication operations currently being processed (primary)");
  // NOTE(review): counter id / "op" name arguments appear truncated here.
  osd_plb.add_u64_counter(
    "Client operations",
    "ops", PerfCountersBuilder::PRIO_CRITICAL);
  osd_plb.add_u64_counter(
    l_osd_op_inb, "op_in_bytes",
    "Client operations total write size",
    "wr", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
  osd_plb.add_u64_counter(
    l_osd_op_outb, "op_out_bytes",
    "Client operations total read size",
    "rd", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
  osd_plb.add_time_avg(
    l_osd_op_lat, "op_latency",
    "Latency of client operations (including queue time)",
  osd_plb.add_time_avg(
    l_osd_op_process_lat, "op_process_latency",
    "Latency of client operations (excluding queue time)");
  osd_plb.add_time_avg(
    l_osd_op_prepare_lat, "op_prepare_latency",
    "Latency of client operations (excluding queue time and wait for finished)");

  // Per-direction (read / write / read-modify-write) breakdowns.
  osd_plb.add_u64_counter(
    l_osd_op_r, "op_r", "Client read operations");
  osd_plb.add_u64_counter(
    l_osd_op_r_outb, "op_r_out_bytes", "Client data read", NULL,
    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  osd_plb.add_time_avg(
    l_osd_op_r_lat, "op_r_latency",
    "Latency of read operation (including queue time)");
  osd_plb.add_u64_counter_histogram(
    l_osd_op_r_lat_outb_hist, "op_r_latency_out_bytes_histogram",
    op_hist_x_axis_config, op_hist_y_axis_config,
    "Histogram of operation latency (including queue time) + data read");
  osd_plb.add_time_avg(
    l_osd_op_r_process_lat, "op_r_process_latency",
    "Latency of read operation (excluding queue time)");
  osd_plb.add_time_avg(
    l_osd_op_r_prepare_lat, "op_r_prepare_latency",
    "Latency of read operations (excluding queue time and wait for finished)");
  osd_plb.add_u64_counter(
    l_osd_op_w, "op_w", "Client write operations");
  osd_plb.add_u64_counter(
    l_osd_op_w_inb, "op_w_in_bytes", "Client data written");
  osd_plb.add_time_avg(
    l_osd_op_w_lat, "op_w_latency",
    "Latency of write operation (including queue time)");
  osd_plb.add_u64_counter_histogram(
    l_osd_op_w_lat_inb_hist, "op_w_latency_in_bytes_histogram",
    op_hist_x_axis_config, op_hist_y_axis_config,
    "Histogram of operation latency (including queue time) + data written");
  osd_plb.add_time_avg(
    l_osd_op_w_process_lat, "op_w_process_latency",
    "Latency of write operation (excluding queue time)");
  osd_plb.add_time_avg(
    l_osd_op_w_prepare_lat, "op_w_prepare_latency",
    "Latency of write operations (excluding queue time and wait for finished)");
  osd_plb.add_u64_counter(
    l_osd_op_rw, "op_rw",
    "Client read-modify-write operations");
  osd_plb.add_u64_counter(
    l_osd_op_rw_inb, "op_rw_in_bytes",
    "Client read-modify-write operations write in", NULL,
    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  osd_plb.add_u64_counter(
    l_osd_op_rw_outb,"op_rw_out_bytes",
    "Client read-modify-write operations read out ", NULL,
    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  osd_plb.add_time_avg(
    l_osd_op_rw_lat, "op_rw_latency",
    "Latency of read-modify-write operation (including queue time)");
  osd_plb.add_u64_counter_histogram(
    l_osd_op_rw_lat_inb_hist, "op_rw_latency_in_bytes_histogram",
    op_hist_x_axis_config, op_hist_y_axis_config,
    "Histogram of rw operation latency (including queue time) + data written");
  osd_plb.add_u64_counter_histogram(
    l_osd_op_rw_lat_outb_hist, "op_rw_latency_out_bytes_histogram",
    op_hist_x_axis_config, op_hist_y_axis_config,
    "Histogram of rw operation latency (including queue time) + data read");
  osd_plb.add_time_avg(
    l_osd_op_rw_process_lat, "op_rw_process_latency",
    "Latency of read-modify-write operation (excluding queue time)");
  osd_plb.add_time_avg(
    l_osd_op_rw_prepare_lat, "op_rw_prepare_latency",
    "Latency of read-modify-write operations (excluding queue time and wait for finished)");

  // Now we move on to some more obscure stats, revert to assuming things
  // are low priority unless otherwise specified.
  osd_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);

  osd_plb.add_time_avg(l_osd_op_before_queue_op_lat, "op_before_queue_op_lat",
    "Latency of IO before calling queue(before really queue into ShardedOpWq)"); // client io before queue op_wq latency
  osd_plb.add_time_avg(l_osd_op_before_dequeue_op_lat, "op_before_dequeue_op_lat",
    "Latency of IO before calling dequeue_op(already dequeued and get PG lock)"); // client io before dequeue_op latency

  // Replication suboperation stats.
  osd_plb.add_u64_counter(
    l_osd_sop, "subop", "Suboperations");
  osd_plb.add_u64_counter(
    l_osd_sop_inb, "subop_in_bytes", "Suboperations total size", NULL,
    0, unit_t(UNIT_BYTES));
  osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency", "Suboperations latency");
  osd_plb.add_u64_counter(l_osd_sop_w, "subop_w", "Replicated writes");
  osd_plb.add_u64_counter(
    l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size", NULL,
    0, unit_t(UNIT_BYTES));
  osd_plb.add_time_avg(
    l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency");
  osd_plb.add_u64_counter(
    l_osd_sop_pull, "subop_pull", "Suboperations pull requests");
  osd_plb.add_time_avg(
    l_osd_sop_pull_lat, "subop_pull_latency", "Suboperations pull latency");
  osd_plb.add_u64_counter(
    l_osd_sop_push, "subop_push", "Suboperations push messages");
  osd_plb.add_u64_counter(
    l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size", NULL,
    0, unit_t(UNIT_BYTES));
  osd_plb.add_time_avg(
    l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency");

  // Recovery traffic counters.
  osd_plb.add_u64_counter(l_osd_pull, "pull", "Pull requests sent");
  osd_plb.add_u64_counter(l_osd_push, "push", "Push messages sent");
  osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size", NULL,
    0, unit_t(UNIT_BYTES));
  osd_plb.add_u64_counter(
    l_osd_rop, "recovery_ops",
    "Started recovery operations",
    "rop", PerfCountersBuilder::PRIO_INTERESTING);
  // NOTE(review): the description argument for recovery_bytes appears truncated.
  osd_plb.add_u64_counter(
    l_osd_rbytes, "recovery_bytes",
    "rbt", PerfCountersBuilder::PRIO_INTERESTING);

  osd_plb.add_u64(l_osd_loadavg, "loadavg", "CPU load");
  // NOTE(review): the add_u64(...) call heads for the crc-cache counters are missing.
    l_osd_cached_crc, "cached_crc", "Total number getting crc from crc_cache");
    l_osd_cached_crc_adjusted, "cached_crc_adjusted",
    "Total number getting crc from crc_cache with adjusting");
  osd_plb.add_u64(l_osd_missed_crc, "missed_crc",
    "Total number of crc cache misses");

  // PG population gauges.
  osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups",
    "pgs", PerfCountersBuilder::PRIO_USEFUL);
  // NOTE(review): call heads for the following PG gauges are missing.
    l_osd_pg_primary, "numpg_primary",
    "Placement groups for which this osd is primary");
    l_osd_pg_replica, "numpg_replica",
    "Placement groups for which this osd is replica");
    l_osd_pg_stray, "numpg_stray",
    "Placement groups ready to be deleted from this osd");
    l_osd_pg_removing, "numpg_removing",
    "Placement groups queued for local deletion", "pgsr",
    PerfCountersBuilder::PRIO_USEFUL);
    l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");

  // OSDMap message / cache statistics.
  osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages");
  osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs");
  osd_plb.add_u64_counter(
    l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates");
  osd_plb.add_u64_counter(
    l_osd_waiting_for_map, "messages_delayed_for_map",
    "Operations waiting for OSD map");

  osd_plb.add_u64_counter(
    l_osd_map_cache_hit, "osd_map_cache_hit", "osdmap cache hit");
  osd_plb.add_u64_counter(
    l_osd_map_cache_miss, "osd_map_cache_miss", "osdmap cache miss");
  osd_plb.add_u64_counter(
    l_osd_map_cache_miss_low, "osd_map_cache_miss_low",
    "osdmap cache miss below cache lower bound");
  osd_plb.add_u64_avg(
    l_osd_map_cache_miss_low_avg, "osd_map_cache_miss_low_avg",
    "osdmap cache miss, avg distance below cache lower bound");
  osd_plb.add_u64_counter(
    l_osd_map_bl_cache_hit, "osd_map_bl_cache_hit",
    "OSDMap buffer cache hits");
  osd_plb.add_u64_counter(
    l_osd_map_bl_cache_miss, "osd_map_bl_cache_miss",
    "OSDMap buffer cache misses");

  // Space gauges.  NOTE(review): call heads for stat_bytes/stat_bytes_used missing.
    l_osd_stat_bytes, "stat_bytes", "OSD size", "size",
    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
    l_osd_stat_bytes_used, "stat_bytes_used", "Used space", "used",
    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space", NULL,
    0, unit_t(UNIT_BYTES));

  // Cache-tiering / agent statistics.
  osd_plb.add_u64_counter(
    l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations");
  osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote", "Tier promotions");
  osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush", "Tier flushes");
  osd_plb.add_u64_counter(
    l_osd_tier_flush_fail, "tier_flush_fail", "Failed tier flushes");
  osd_plb.add_u64_counter(
    l_osd_tier_try_flush, "tier_try_flush", "Tier flush attempts");
  osd_plb.add_u64_counter(
    l_osd_tier_try_flush_fail, "tier_try_flush_fail",
    "Failed tier flush attempts");
  osd_plb.add_u64_counter(
    l_osd_tier_evict, "tier_evict", "Tier evictions");
  osd_plb.add_u64_counter(
    l_osd_tier_whiteout, "tier_whiteout", "Tier whiteouts");
  osd_plb.add_u64_counter(
    l_osd_tier_dirty, "tier_dirty", "Dirty tier flag set");
  osd_plb.add_u64_counter(
    l_osd_tier_clean, "tier_clean", "Dirty tier flag cleaned");
  osd_plb.add_u64_counter(
    l_osd_tier_delay, "tier_delay", "Tier delays (agent waiting)");
  osd_plb.add_u64_counter(
    l_osd_tier_proxy_read, "tier_proxy_read", "Tier proxy reads");
  osd_plb.add_u64_counter(
    l_osd_tier_proxy_write, "tier_proxy_write", "Tier proxy writes");

  osd_plb.add_u64_counter(
    l_osd_agent_wake, "agent_wake", "Tiering agent wake up");
  osd_plb.add_u64_counter(
    l_osd_agent_skip, "agent_skip", "Objects skipped by agent");
  osd_plb.add_u64_counter(
    l_osd_agent_flush, "agent_flush", "Tiering agent flushes");
  osd_plb.add_u64_counter(
    l_osd_agent_evict, "agent_evict", "Tiering agent evictions");

  osd_plb.add_u64_counter(
    l_osd_object_ctx_cache_hit, "object_ctx_cache_hit", "Object context cache hits");
  osd_plb.add_u64_counter(
    l_osd_object_ctx_cache_total, "object_ctx_cache_total", "Object context cache lookups");

  // NOTE(review): op_cache_hit has no description argument visible here.
  osd_plb.add_u64_counter(l_osd_op_cache_hit, "op_cache_hit");
  osd_plb.add_time_avg(
    l_osd_tier_flush_lat, "osd_tier_flush_lat", "Object flush latency");
  osd_plb.add_time_avg(
    l_osd_tier_promote_lat, "osd_tier_promote_lat", "Object promote latency");
  osd_plb.add_time_avg(
    l_osd_tier_r_lat, "osd_tier_r_lat", "Object proxy read latency");

  // PG info persistence counters.
  osd_plb.add_u64_counter(
    l_osd_pg_info, "osd_pg_info", "PG updated its info (using any method)");
  osd_plb.add_u64_counter(
    l_osd_pg_fastinfo, "osd_pg_fastinfo",
    "PG updated its info using fastinfo attr");
  osd_plb.add_u64_counter(
    l_osd_pg_biginfo, "osd_pg_biginfo", "PG updated its biginfo attr");

  // Materialize the counters and register them with the global collection.
  logger = osd_plb.create_perf_counters();
  cct->get_perfcounters_collection()->add(logger);
// Build and register the "recoverystate_perf" counter set: one time-average
// latency counter per PG recovery/peering state machine state, measuring how
// long PGs spend in each state.
// NOTE(review): lossy extraction -- opening/closing braces are not visible;
// tokens preserved as found.
void OSD::create_recoverystate_perf()
  dout(10) << "create_recoverystate_perf" << dendl;

  PerfCountersBuilder rs_perf(cct, "recoverystate_perf", rs_first, rs_last);

  // One latency counter per recovery-state-machine state.
  rs_perf.add_time_avg(rs_initial_latency, "initial_latency", "Initial recovery state latency");
  rs_perf.add_time_avg(rs_started_latency, "started_latency", "Started recovery state latency");
  rs_perf.add_time_avg(rs_reset_latency, "reset_latency", "Reset recovery state latency");
  rs_perf.add_time_avg(rs_start_latency, "start_latency", "Start recovery state latency");
  rs_perf.add_time_avg(rs_primary_latency, "primary_latency", "Primary recovery state latency");
  rs_perf.add_time_avg(rs_peering_latency, "peering_latency", "Peering recovery state latency");
  rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency", "Backfilling recovery state latency");
  rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency");
  rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency");
  rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency", "Notbackfilling recovery state latency");
  rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency", "Repnotrecovering recovery state latency");
  rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency");
  rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency");
  rs_perf.add_time_avg(rs_reprecovering_latency, "reprecovering_latency", "RepRecovering recovery state latency");
  rs_perf.add_time_avg(rs_activating_latency, "activating_latency", "Activating recovery state latency");
  rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency");
  rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency");
  rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency", "Recovering recovery state latency");
  rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency", "Recovered recovery state latency");
  rs_perf.add_time_avg(rs_clean_latency, "clean_latency", "Clean recovery state latency");
  rs_perf.add_time_avg(rs_active_latency, "active_latency", "Active recovery state latency");
  rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency", "Replicaactive recovery state latency");
  rs_perf.add_time_avg(rs_stray_latency, "stray_latency", "Stray recovery state latency");
  rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency", "Getinfo recovery state latency");
  rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency", "Getlog recovery state latency");
  rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency", "Waitactingchange recovery state latency");
  rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency", "Incomplete recovery state latency");
  rs_perf.add_time_avg(rs_down_latency, "down_latency", "Down recovery state latency");
  rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency");
  rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency");
  rs_perf.add_time_avg(rs_notrecovering_latency, "notrecovering_latency", "Notrecovering recovery state latency");

  // Materialize and register with the global perf-counter collection.
  recoverystate_perf = rs_perf.create_perf_counters();
  cct->get_perfcounters_collection()->add(recoverystate_perf);
// Body of OSD::shutdown() -- the function header line was lost in this
// extraction.  Orderly teardown: refuse if already stopping, optionally raise
// debug levels, stop new work (sharded op wq), drain queues, unregister admin
// socket commands, stop heartbeats/timers/agents, persist the superblock,
// verify all PG refcounts are released, sync the store, and shut down every
// messenger.  NOTE(review): lossy extraction -- braces and some statements
// (e.g. the pgs vector declaration, loop bodies) are missing; tokens
// preserved as found.
if (!service.prepare_to_stop())
  return 0; // already shutting down
if (is_stopping()) {
dout(0) << "shutdown" << dendl;

set_state(STATE_STOPPING);

// Crank up debug output for the teardown when requested.
if (cct->_conf.get_val<bool>("osd_debug_shutdown")) {
  cct->_conf.set_val("debug_osd", "100");
  cct->_conf.set_val("debug_journal", "100");
  cct->_conf.set_val("debug_filestore", "100");
  cct->_conf.set_val("debug_bluestore", "100");
  cct->_conf.set_val("debug_ms", "100");
  cct->_conf.apply_changes(nullptr);

// stop MgrClient earlier as it's more like an internal consumer of OSD
service.start_shutdown();

// stop sending work to pgs. this just prevents any new work in _process
// from racing with on_shutdown and potentially entering the pg after.
op_shardedwq.drain();

for (auto pg : pgs) {

// drain op queue again (in case PGs requeued something)
op_shardedwq.drain();

finished.clear(); // zap waiters (bleh, this is messy)
waiting_for_osdmap.clear();

// unregister commands
cct->get_admin_socket()->unregister_commands(asok_hook);
cct->get_admin_socket()->unregister_commands(test_ops_hook);
delete test_ops_hook;
test_ops_hook = NULL;

// Stop the heartbeat thread before tearing down its peers.
heartbeat_lock.Lock();
heartbeat_stop = true;
heartbeat_cond.Signal();
heartbeat_lock.Unlock();
heartbeat_thread.join();

dout(10) << "op sharded tp stopped" << dendl;
dout(10) << "command tp stopped" << dendl;

dout(10) << "stopping agent" << dendl;
service.agent_stop();

boot_finisher.wait_for_empty();
boot_finisher.stop();
reset_heartbeat_peers(true);

tick_timer.shutdown();
std::lock_guard l(tick_timer_lock);
tick_timer_without_osd_lock.shutdown();

// note unmount epoch
dout(10) << "noting clean unmount in epoch " << osdmap->get_epoch() << dendl;
superblock.mounted = service.get_boot_epoch();
superblock.clean_thru = osdmap->get_epoch();
ObjectStore::Transaction t;
write_superblock(t);
int r = store->queue_transaction(service.meta_ch, std::move(t));
// NOTE(review): the error-check condition guarding this derr is not visible.
derr << "OSD::shutdown: error writing superblock: "
     << cpp_strerror(r) << dendl;

service.shutdown_reserver();

#ifdef PG_DEBUG_REFS
service.dump_live_pgids();

_get_pgs(&pgs, true);
// Every PG should hold exactly one remaining reference at this point.
for (auto& pg : pgs) {
  if (pg->is_deleted()) {
  dout(20) << " kicking pg " << pg << dendl;
  if (pg->get_num_ref() != 1) {
    derr << "pgid " << pg->get_pgid() << " has ref count of "
         << pg->get_num_ref() << dendl;
#ifdef PG_DEBUG_REFS
    pg->dump_live_ids();
    if (cct->_conf->osd_shutdown_pgref_assert) {
#ifdef PG_DEBUG_REFS
service.dump_live_pgids();

cct->_conf.remove_observer(this);

service.meta_ch.reset();

dout(10) << "syncing store" << dendl;
enable_disable_fuse(true);

if (cct->_conf->osd_journal_flush_on_shutdown) {
  dout(10) << "flushing journal" << dendl;
  store->flush_journal();

// Drop our osdmap references (global and per-shard).
map_lock.get_write();
osdmap = OSDMapRef();
map_lock.put_write();
for (auto s : shards) {
  std::lock_guard l(s->osdmap_lock);
  s->shard_osdmap = OSDMapRef();
std::lock_guard lock(osd_lock);
dout(10) << "Store synced" << dendl;

op_tracker.on_shutdown();

class_handler->shutdown();
client_messenger->shutdown();
cluster_messenger->shutdown();
hb_front_client_messenger->shutdown();
hb_back_client_messenger->shutdown();
objecter_messenger->shutdown();
hb_front_server_messenger->shutdown();
hb_back_server_messenger->shutdown();
// Send a monitor command; if it fails with -ENOENT (this osd id does not
// exist yet), issue an "osd create" for our id/fsid (from the superblock)
// and retry the original command once.
// NOTE(review): lossy extraction -- declarations of inbl/outs/w/r, the
// wait_and_get_rc calls, braces and the retry loop are not visible.
int OSD::mon_cmd_maybe_osd_create(string &cmd)
  bool created = false;
  dout(10) << __func__ << " cmd: " << cmd << dendl;
  vector<string> vcmd{cmd};
  monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
  if (r == -ENOENT && !created) {
    // OSD id unknown to the mon: create it, then retry the original command.
    string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
      + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
    vector<string> vnewcmd{newcmd};
    monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
    derr << __func__ << " fail: osd does not exist and created failed: "
         << cpp_strerror(r) << dendl;
  derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
// Push this OSD's crush location/weight to the monitors on startup via
// "osd crush create-or-move".  Weight comes from osd_crush_initial_weight
// when set (>= 0), otherwise it is derived from the store's statfs total
// (normalized to TiB).  Returns the result of mon_cmd_maybe_osd_create().
// NOTE(review): lossy extraction -- the weight[] buffer declaration, error
// returns and parts of the statfs-based snprintf are not visible.
int OSD::update_crush_location()
  if (!cct->_conf->osd_crush_update_on_start) {
    dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
  // Prefer an explicitly configured initial weight.
  if (cct->_conf->osd_crush_initial_weight >= 0) {
    snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
    struct store_statfs_t st;
    osd_alert_list_t alerts;
    int r = store->statfs(&st, &alerts);
    derr << "statfs: " << cpp_strerror(r) << dendl;
    // Derive weight from total capacity, normalized to TiB.
    snprintf(weight, sizeof(weight), "%.4lf",
      double(1ull << 40 /* TB */)));

  std::multimap<string,string> loc = cct->crush_location.get_location();
  dout(10) << __func__ << " crush location is " << loc << dendl;

  // Build the mon command JSON by hand; args is the crush location key=value list.
    string("{\"prefix\": \"osd crush create-or-move\", ") +
    string("\"id\": ") + stringify(whoami) + string(", ") +
    string("\"weight\":") + weight + string(", ") +
    string("\"args\": [");
  for (multimap<string,string>::iterator p = loc.begin(); p != loc.end(); ++p) {
    if (p != loc.begin())
    cmd += "\"" + p->first + "=" + p->second + "\"";
  return mon_cmd_maybe_osd_create(cmd);
// Report this OSD's device class (hdd/ssd/...) to the monitors on startup.
// Reads "crush_device_class" from store metadata, falling back to the
// store's default device class; a no-op when disabled or no class is known.
// NOTE(review): lossy extraction -- return statements, the cmd declaration
// and the -EBUSY/-EPERM handling around the final call are not visible.
int OSD::update_crush_device_class()
  if (!cct->_conf->osd_class_update_on_start) {
    dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
  string device_class;
  int r = store->read_meta("crush_device_class", &device_class);
  if (r < 0 || device_class.empty()) {
    // Fall back to what the object store thinks this device is.
    device_class = store->get_default_device_class();
  if (device_class.empty()) {
    dout(20) << __func__ << " no device class stored locally" << dendl;
  // Hand-built mon command JSON.
    string("{\"prefix\": \"osd crush set-device-class\", ") +
    string("\"class\": \"") + device_class + string("\", ") +
    string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
  r = mon_cmd_maybe_osd_create(cmd);
  // good, already bound to a device-class
// Stage a write of the OSD superblock into transaction t (caller queues it).
// Ensures the baseline incompat feature bit is always present before
// encoding.  NOTE(review): lossy extraction -- braces and the bufferlist
// declaration are not visible; tokens preserved as found.
void OSD::write_superblock(ObjectStore::Transaction& t)
  dout(10) << "write_superblock " << superblock << dendl;

  //hack: at minimum it's using the baseline feature set
  if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
    superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);

  encode(superblock, bl);
  t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
// Read and decode the OSD superblock from the meta collection.
// NOTE(review): lossy extraction -- bufferlist declaration, error check and
// return statements are not visible; tokens preserved as found.
int OSD::read_superblock()
  int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
  auto p = bl.cbegin();
  decode(superblock, p);

  dout(10) << "read_superblock " << superblock << dendl;
// Scan every PG collection and delete leftover temp objects (including
// Hammer-era temps stored with pool == -1), batching removals into
// transactions bounded by osd_target_transaction_size.
// NOTE(review): lossy extraction -- the ls/next/removed declarations, the
// while-loop heads, break statements and t.remove(...) are not visible.
void OSD::clear_temp_objects()
  dout(10) << __func__ << dendl;
  store->list_collections(ls);
  for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
    if (!p->is_pg(&pgid))

    // list temp objects
    dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;

    vector<ghobject_t> temps;
    vector<ghobject_t> objects;
    auto ch = store->open_collection(*p);
    store->collection_list(ch, next, ghobject_t::get_max(),
      store->get_ideal_list_max(),
    if (objects.empty())
    vector<ghobject_t>::iterator q;
    for (q = objects.begin(); q != objects.end(); ++q) {
      // Hammer set pool for temps to -1, so check for clean-up
      if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
        temps.push_back(*q);
    // If we saw a non-temp object and hit the break above we can
    // break out of the while loop too.
    if (q != objects.end())

    if (!temps.empty()) {
      ObjectStore::Transaction t;
      for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
        dout(20) << " removing " << *p << " object " << *q << dendl;
        // Flush the transaction once it reaches the configured size budget.
        if (++removed > cct->_conf->osd_target_transaction_size) {
          store->queue_transaction(service.meta_ch, std::move(t));
          t = ObjectStore::Transaction();
      store->queue_transaction(service.meta_ch, std::move(t));
// Destroy an entire PG collection: remove each object's snap-mapper entries
// and the objects themselves in transactions bounded by
// osd_target_transaction_size, then remove the collection and wait for the
// final transaction to commit.
// NOTE(review): lossy extraction -- the fourth parameter (the coll_t tmp),
// the OSDriver/waiter declarations, loop heads and t.remove(...) calls are
// not visible; tokens preserved as found.
void OSD::recursive_remove_collection(CephContext* cct,
  ObjectStore *store, spg_t pgid,
  make_snapmapper_oid());

  ObjectStore::CollectionHandle ch = store->open_collection(tmp);
  ObjectStore::Transaction t;
  SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);

  int max = cct->_conf->osd_target_transaction_size;
  vector<ghobject_t> objects;
  objects.reserve(max);
  store->collection_list(ch, next, ghobject_t::get_max(),
    max, &objects, &next);
  generic_dout(10) << __func__ << " " << objects << dendl;
  if (objects.empty())
  for (auto& p : objects) {
    // Clean up the snap mapper entry for each object before removal.
    OSDriver::OSTransaction _t(driver.get_transaction(&t));
    int r = mapper.remove_oid(p.hobj, &_t);
    if (r != 0 && r != -ENOENT)
  int r = store->queue_transaction(ch, std::move(t));
  ceph_assert(r == 0);
  t = ObjectStore::Transaction();

  t.remove_collection(tmp);
  int r = store->queue_transaction(ch, std::move(t));
  ceph_assert(r == 0);

  if (!ch->flush_commit(&waiter)) {
// ======================================================

// Body of OSD::_make_pg() -- the function header line was lost in this
// extraction.  Constructs a PG object for pgid: pool info comes from the
// given createmap when the pool still exists, otherwise from the final
// pg_pool_t tombstone persisted on disk; replicated and erasure pools get a
// PrimaryLogPG.  NOTE(review): lossy extraction -- the pi/name/bl/pg
// declarations, decode of pi/name and the return statement are not visible.
  OSDMapRef createmap,
  dout(10) << __func__ << " " << pgid << dendl;
  map<string,string> ec_profile;
  if (createmap->have_pg_pool(pgid.pool())) {
    pi = *createmap->get_pg_pool(pgid.pool());
    name = createmap->get_pool_name(pgid.pool());
    if (pi.is_erasure()) {
      ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile);
    // pool was deleted; grab final pg_pool_t off disk.
    ghobject_t oid = make_final_pool_info_oid(pgid.pool());
    int r = store->read(service.meta_ch, oid, 0, 0, bl);
    derr << __func__ << " missing pool " << pgid.pool() << " tombstone"
    ceph_assert(r >= 0);
    auto p = bl.cbegin();
    if (p.end()) { // dev release v13.0.2 did not include ec_profile
      derr << __func__ << " missing ec_profile from pool " << pgid.pool()
           << " tombstone" << dendl;
    decode(ec_profile, p);
  PGPool pool(cct, createmap, pgid.pool(), pi, name);
  if (pi.type == pg_pool_t::TYPE_REPLICATED ||
      pi.type == pg_pool_t::TYPE_ERASURE)
    pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid);
// Collect references to all non-deleted PGs across every shard into *v;
// when clear_too is set, also detach each PG from its shard slot.
// NOTE(review): lossy extraction -- braces and the pg/clear_too condition
// lines are partially missing; tokens preserved as found.
void OSD::_get_pgs(vector<PGRef> *v, bool clear_too)
  v->reserve(get_num_pgs());
  for (auto& s : shards) {
    std::lock_guard l(s->shard_lock);
    for (auto& j : s->pg_slots) {
        !j.second->pg->is_deleted()) {
        v->push_back(j.second->pg);
          s->_detach_pg(j.second.get());
// Collect the spg_t ids of all non-deleted PGs across every shard into *v.
// NOTE(review): lossy extraction -- braces and part of the slot condition
// are missing; tokens preserved as found.
void OSD::_get_pgids(vector<spg_t> *v)
  v->reserve(get_num_pgs());
  for (auto& s : shards) {
    std::lock_guard l(s->shard_lock);
    for (auto& j : s->pg_slots) {
        !j.second->pg->is_deleted()) {
        v->push_back(j.first);
// Register a newly created PG with its owning shard: create a fresh slot
// (must not already exist -- asserted) and attach the PG to it under the
// shard lock.  NOTE(review): braces are missing from this extraction.
void OSD::register_pg(PGRef pg)
  spg_t pgid = pg->get_pgid();
  // The owning shard is determined by hashing the pgid.
  uint32_t shard_index = pgid.hash_to_shard(num_shards);
  auto sdata = shards[shard_index];
  std::lock_guard l(sdata->shard_lock);
  auto r = sdata->pg_slots.emplace(pgid, make_unique<OSDShardPGSlot>());
  ceph_assert(r.second);  // slot must not already exist
  auto *slot = r.first->second.get();
  dout(20) << __func__ << " " << pgid << " " << pg << dendl;
  sdata->_attach_pg(slot, pg.get());
// Finalize local deletion of a PG: detach it from its shard slot (unless the
// slot is gone or the PG is waiting on a merge epoch), unprime any split
// children on every shard, and decrement the per-role PG gauges.
// NOTE(review): lossy extraction -- return statements, braces and part of
// the slot-mismatch condition are not visible; tokens preserved as found.
bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
  auto sdata = pg->osd_shard;
  std::lock_guard l(sdata->shard_lock);
  auto p = sdata->pg_slots.find(pg->pg_id);
  if (p == sdata->pg_slots.end() ||
    dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl;
  if (p->second->waiting_for_merge_epoch) {
    // Cannot delete yet; a merge is pending on this PG.
    dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl;
  dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl;
  sdata->_detach_pg(p->second.get());

  for (auto shard : shards) {
    shard->unprime_split_children(pg->pg_id, old_pg_num);

  // update pg count now since we might not get an osdmap any time soon.
  if (pg->is_primary())
    service.logger->dec(l_osd_pg_primary);
  else if (pg->is_replica())
    service.logger->dec(l_osd_pg_replica);
    service.logger->dec(l_osd_pg_stray);
// Look up a PG by id in its owning shard's slot table; returns the PG ref,
// or (per the not-found branch, whose return is truncated here) nothing.
// NOTE(review): braces and the not-found return are missing from this
// extraction; tokens preserved as found.
PGRef OSD::_lookup_pg(spg_t pgid)
  uint32_t shard_index = pgid.hash_to_shard(num_shards);
  auto sdata = shards[shard_index];
  std::lock_guard l(sdata->shard_lock);
  auto p = sdata->pg_slots.find(pgid);
  if (p == sdata->pg_slots.end()) {
  return p->second->pg;
// Look up a PG and return it (locked, per the surrounding call sites);
// the deleted-PG handling and lock/return statements are truncated in this
// extraction.  NOTE(review): tokens preserved as found.
PGRef OSD::_lookup_lock_pg(spg_t pgid)
  PGRef pg = _lookup_pg(pgid);
    if (!pg->is_deleted()) {
// Public wrapper: look up a PG by id and return it locked.
PGRef OSD::lookup_lock_pg(spg_t pgid)
  return _lookup_lock_pg(pgid);
// Load all PGs from disk at startup (osd_lock held): read pg_num_history,
// enumerate collections, delete legacy/removal-flagged collections, skip
// non-PG collections, instantiate each surviving PG against the right map
// epoch, read its on-disk state, wire its collection to the owning shard's
// commit queue, and schedule its next scrub.
// NOTE(review): lossy extraction -- the bl/ls/num declarations, loop
// conditions/continues, pg registration and several braces are not visible;
// tokens preserved as found.
void OSD::load_pgs()
  ceph_assert(osd_lock.is_locked());
  dout(0) << "load_pgs" << dendl;

  auto pghist = make_pg_num_history_oid();
  int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0);
  if (r >= 0 && bl.length() > 0) {
    auto p = bl.cbegin();
    decode(pg_num_history, p);
  dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;

  int r = store->list_collections(ls);
    derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;

  for (vector<coll_t>::iterator it = ls.begin();
    // Temp collections and PGs flagged for removal are deleted outright.
    if (it->is_temp(&pgid) ||
        (it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
      dout(10) << "load_pgs " << *it
               << " removing, legacy or flagged for removal pg" << dendl;
      recursive_remove_collection(cct, store, pgid, *it);

    if (!it->is_pg(&pgid)) {
      dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;

    dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
    epoch_t map_epoch = 0;
    int r = PG::peek_map_epoch(store, pgid, &map_epoch);
      derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"

    if (map_epoch > 0) {
      OSDMapRef pgosdmap = service.try_get_map(map_epoch);
        // Map for the PG's epoch is gone; tolerate only if the pool is also
        // gone from the current map (bug 10617 leftovers), else crash.
        if (!osdmap->have_pg_pool(pgid.pool())) {
          derr << __func__ << ": could not find map for epoch " << map_epoch
               << " on pg " << pgid << ", but the pool is not present in the "
               << "current map, so this is probably a result of bug 10617. "
               << "Skipping the pg for now, you can use ceph-objectstore-tool "
               << "to clean it up later." << dendl;
          derr << __func__ << ": have pgid " << pgid << " at epoch "
               << map_epoch << ", but missing map. Crashing."
          ceph_abort_msg("Missing map in load_pgs");
      pg = _make_pg(pgosdmap, pgid);
      pg = _make_pg(osdmap, pgid);
      recursive_remove_collection(cct, store, pgid, *it);

    // there can be no waiters here, so we don't call _wake_pg_slot

    pg->ch = store->open_collection(pg->coll);

    // read pg state, log
    pg->read_state(store);
      dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
      recursive_remove_collection(cct, store, pgid, *it);

    // Route this collection's commit completions to its shard's context queue.
    uint32_t shard_index = pgid.hash_to_shard(shards.size());
    assert(NULL != shards[shard_index]);
    store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));

    pg->reg_next_scrub();

    dout(10) << __func__ << " loaded " << *pg << dendl;
  dout(0) << __func__ << " opened " << num << " pgs" << dendl;
// Instantiate a brand-new PG from a create request: enforce the max-PG
// limit, validate the pool still exists (and, on Nautilus+, still carries
// the CREATING flag so stale create messages are dropped), compute the
// up/acting sets at the creation epoch, warn about ec-overwrites on
// non-bluestore, create/init the on-disk collection, and drive the new PG
// through initialize/activate before dispatching the recovery context.
// NOTE(review): lossy extraction -- several returns, the init_collection_pool_opts
// call, pg->init(...) arguments and braces are not visible; tokens preserved.
PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap,
  const PGCreateInfo *info)
  spg_t pgid = info->pgid;

  if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) {
    dout(10) << __func__ << " hit max pg, dropping" << dendl;

  PG::RecoveryCtx rctx = create_context();

  OSDMapRef startmap = get_map(info->epoch);

  int64_t pool_id = pgid.pgid.pool();
  const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
    dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl;

  if (osdmap->require_osd_release >= CEPH_RELEASE_NAUTILUS &&
      !pool->has_flag(pg_pool_t::FLAG_CREATING)) {
    // this ensures we do not process old creating messages after the
    // pool's initial pgs have been created (and pg are subsequently
    // allowed to split or merge).
    dout(20) << __func__ << " dropping " << pgid
             << "create, pool does not have CREATING flag set" << dendl;

  int up_primary, acting_primary;
  vector<int> up, acting;
  startmap->pg_to_up_acting_osds(
    pgid.pgid, &up, &up_primary, &acting, &acting_primary);

  const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool());
  if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
      store->get_type() != "bluestore") {
    clog->warn() << "pg " << pgid
                 << " is at risk of silent data corruption: "
                 << "the pool allows ec overwrites but is not stored in "
                 << "bluestore, so deep scrubbing will not detect bitrot";
  PG::_create(*rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
  PG::_init(*rctx.transaction, pgid, pp);

  int role = startmap->calc_pg_role(whoami, acting, acting.size());
  if (!pp->is_replicated() && role != pgid.shard) {

  PGRef pg = _make_pg(startmap, pgid);
  pg->ch = store->create_new_collection(pg->coll);

  // Route collection commit completions to the owning shard's context queue.
  uint32_t shard_index = pgid.hash_to_shard(shards.size());
  assert(NULL != shards[shard_index]);
  store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));

  // we are holding the shard lock
  ceph_assert(!pg->is_deleted());

    info->past_intervals,

  if (pg->is_primary()) {
    // Hand any active perf-stat queries to the new primary PG.
    Mutex::Locker locker(m_perf_queries_lock);
    pg->set_dynamic_perf_stats_queries(m_perf_queries);

  pg->handle_initialize(&rctx);
  pg->handle_activate_map(&rctx);

  dispatch_context(rctx, pg.get(), osdmap, nullptr);

  dout(10) << __func__ << " new pg " << *pg << dendl;
4608 bool OSD::maybe_wait_for_max_pg(const OSDMapRef
& osdmap
,
4612 const auto max_pgs_per_osd
=
4613 (cct
->_conf
.get_val
<uint64_t>("mon_max_pg_per_osd") *
4614 cct
->_conf
.get_val
<double>("osd_max_pg_per_osd_hard_ratio"));
4616 if (num_pgs
< max_pgs_per_osd
) {
4620 std::lock_guard
l(pending_creates_lock
);
4621 if (is_mon_create
) {
4622 pending_creates_from_mon
++;
4624 bool is_primary
= osdmap
->get_pg_acting_rank(pgid
.pgid
, whoami
) == 0;
4625 pending_creates_from_osd
.emplace(pgid
.pgid
, is_primary
);
4627 dout(1) << __func__
<< " withhold creation of pg " << pgid
4628 << ": " << num_pgs
<< " >= "<< max_pgs_per_osd
<< dendl
;
// to re-trigger a peering, we have to twiddle the pg mapping a little bit,
// see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
// to up set if pg_temp is empty. so an empty pg_temp won't work.
static std::vector<int32_t> twiddle(const std::vector<int>& acting) {
  if (acting.size() > 1) {
    // more than one member: drop the tail so the temp mapping differs
    return {acting.begin(), acting.end() - 1};
  } else {
    // zero or one member: pad with a nonexistent (-1) osd instead
    std::vector<int32_t> twiddled(acting.begin(), acting.end());
    twiddled.push_back(-1);
    return twiddled;
  }
}
4645 void OSD::resume_creating_pg()
4647 bool do_sub_pg_creates
= false;
4648 bool have_pending_creates
= false;
4650 const auto max_pgs_per_osd
=
4651 (cct
->_conf
.get_val
<uint64_t>("mon_max_pg_per_osd") *
4652 cct
->_conf
.get_val
<double>("osd_max_pg_per_osd_hard_ratio"));
4653 if (max_pgs_per_osd
<= num_pgs
) {
4654 // this could happen if admin decreases this setting before a PG is removed
4657 unsigned spare_pgs
= max_pgs_per_osd
- num_pgs
;
4658 std::lock_guard
l(pending_creates_lock
);
4659 if (pending_creates_from_mon
> 0) {
4660 dout(20) << __func__
<< " pending_creates_from_mon "
4661 << pending_creates_from_mon
<< dendl
;
4662 do_sub_pg_creates
= true;
4663 if (pending_creates_from_mon
>= spare_pgs
) {
4664 spare_pgs
= pending_creates_from_mon
= 0;
4666 spare_pgs
-= pending_creates_from_mon
;
4667 pending_creates_from_mon
= 0;
4670 auto pg
= pending_creates_from_osd
.cbegin();
4671 while (spare_pgs
> 0 && pg
!= pending_creates_from_osd
.cend()) {
4672 dout(20) << __func__
<< " pg " << pg
->first
<< dendl
;
4674 osdmap
->pg_to_up_acting_osds(pg
->first
, nullptr, nullptr, &acting
, nullptr);
4675 service
.queue_want_pg_temp(pg
->first
, twiddle(acting
), true);
4676 pg
= pending_creates_from_osd
.erase(pg
);
4677 do_sub_pg_creates
= true;
4680 have_pending_creates
= (pending_creates_from_mon
> 0 ||
4681 !pending_creates_from_osd
.empty());
4684 bool do_renew_subs
= false;
4685 if (do_sub_pg_creates
) {
4686 if (monc
->sub_want("osd_pg_creates", last_pg_create_epoch
, 0)) {
4687 dout(4) << __func__
<< ": resolicit pg creates from mon since "
4688 << last_pg_create_epoch
<< dendl
;
4689 do_renew_subs
= true;
4692 version_t start
= osdmap
->get_epoch() + 1;
4693 if (have_pending_creates
) {
4694 // don't miss any new osdmap deleting PGs
4695 if (monc
->sub_want("osdmap", start
, 0)) {
4696 dout(4) << __func__
<< ": resolicit osdmap from mon since "
4698 do_renew_subs
= true;
4700 } else if (do_sub_pg_creates
) {
4701 // no need to subscribe the osdmap continuously anymore
4702 // once the pgtemp and/or mon_subscribe(pg_creates) is sent
4703 if (monc
->sub_want_increment("osdmap", start
, CEPH_SUBSCRIBE_ONETIME
)) {
4704 dout(4) << __func__
<< ": re-subscribe osdmap(onetime) since "
4706 do_renew_subs
= true;
4710 if (do_renew_subs
) {
4714 service
.send_pg_temp();
4717 void OSD::build_initial_pg_history(
4720 utime_t created_stamp
,
4724 dout(10) << __func__
<< " " << pgid
<< " created " << created
<< dendl
;
4725 h
->epoch_created
= created
;
4726 h
->epoch_pool_created
= created
;
4727 h
->same_interval_since
= created
;
4728 h
->same_up_since
= created
;
4729 h
->same_primary_since
= created
;
4730 h
->last_scrub_stamp
= created_stamp
;
4731 h
->last_deep_scrub_stamp
= created_stamp
;
4732 h
->last_clean_scrub_stamp
= created_stamp
;
4734 OSDMapRef lastmap
= service
.get_map(created
);
4735 int up_primary
, acting_primary
;
4736 vector
<int> up
, acting
;
4737 lastmap
->pg_to_up_acting_osds(
4738 pgid
.pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
4740 ostringstream debug
;
4741 for (epoch_t e
= created
+ 1; e
<= osdmap
->get_epoch(); ++e
) {
4742 OSDMapRef osdmap
= service
.get_map(e
);
4743 int new_up_primary
, new_acting_primary
;
4744 vector
<int> new_up
, new_acting
;
4745 osdmap
->pg_to_up_acting_osds(
4746 pgid
.pgid
, &new_up
, &new_up_primary
, &new_acting
, &new_acting_primary
);
4748 // this is a bit imprecise, but sufficient?
4749 struct min_size_predicate_t
: public IsPGRecoverablePredicate
{
4750 const pg_pool_t
*pi
;
4751 bool operator()(const set
<pg_shard_t
> &have
) const {
4752 return have
.size() >= pi
->min_size
;
4754 explicit min_size_predicate_t(const pg_pool_t
*i
) : pi(i
) {}
4755 } min_size_predicate(osdmap
->get_pg_pool(pgid
.pgid
.pool()));
4757 bool new_interval
= PastIntervals::check_new_interval(
4764 h
->same_interval_since
,
4765 h
->last_epoch_clean
,
4769 &min_size_predicate
,
4773 h
->same_interval_since
= e
;
4775 h
->same_up_since
= e
;
4777 if (acting_primary
!= new_acting_primary
) {
4778 h
->same_primary_since
= e
;
4780 if (pgid
.pgid
.is_split(lastmap
->get_pg_num(pgid
.pgid
.pool()),
4781 osdmap
->get_pg_num(pgid
.pgid
.pool()),
4783 h
->last_epoch_split
= e
;
4786 acting
= new_acting
;
4787 up_primary
= new_up_primary
;
4788 acting_primary
= new_acting_primary
;
4792 dout(20) << __func__
<< " " << debug
.str() << dendl
;
4793 dout(10) << __func__
<< " " << *h
<< " " << *pi
4794 << " [" << (pi
->empty() ? pair
<epoch_t
,epoch_t
>(0,0) :
4795 pi
->get_bounds()) << ")"
4799 void OSD::_add_heartbeat_peer(int p
)
4805 map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.find(p
);
4806 if (i
== heartbeat_peers
.end()) {
4807 pair
<ConnectionRef
,ConnectionRef
> cons
= service
.get_con_osd_hb(p
, osdmap
->get_epoch());
4810 hi
= &heartbeat_peers
[p
];
4812 RefCountedPtr s
{new HeartbeatSession
{p
}, false};
4813 hi
->con_back
= cons
.first
.get();
4814 hi
->con_back
->set_priv(s
);
4816 hi
->con_front
= cons
.second
.get();
4817 hi
->con_front
->set_priv(s
);
4818 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4819 << " " << hi
->con_back
->get_peer_addr()
4820 << " " << hi
->con_front
->get_peer_addr()
4823 hi
->con_front
.reset(NULL
);
4824 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4825 << " " << hi
->con_back
->get_peer_addr()
4831 hi
->epoch
= osdmap
->get_epoch();
4834 void OSD::_remove_heartbeat_peer(int n
)
4836 map
<int,HeartbeatInfo
>::iterator q
= heartbeat_peers
.find(n
);
4837 ceph_assert(q
!= heartbeat_peers
.end());
4838 dout(20) << " removing heartbeat peer osd." << n
4839 << " " << q
->second
.con_back
->get_peer_addr()
4840 << " " << (q
->second
.con_front
? q
->second
.con_front
->get_peer_addr() : entity_addr_t())
4842 q
->second
.con_back
->mark_down();
4843 if (q
->second
.con_front
) {
4844 q
->second
.con_front
->mark_down();
4846 heartbeat_peers
.erase(q
);
4849 void OSD::need_heartbeat_peer_update()
4853 dout(20) << "need_heartbeat_peer_update" << dendl
;
4854 heartbeat_set_peers_need_update();
4857 void OSD::maybe_update_heartbeat_peers()
4859 ceph_assert(osd_lock
.is_locked());
4861 if (is_waiting_for_healthy() || is_active()) {
4862 utime_t now
= ceph_clock_now();
4863 if (last_heartbeat_resample
== utime_t()) {
4864 last_heartbeat_resample
= now
;
4865 heartbeat_set_peers_need_update();
4866 } else if (!heartbeat_peers_need_update()) {
4867 utime_t dur
= now
- last_heartbeat_resample
;
4868 if (dur
> cct
->_conf
->osd_heartbeat_grace
) {
4869 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur
<< " seconds" << dendl
;
4870 heartbeat_set_peers_need_update();
4871 last_heartbeat_resample
= now
;
4872 // automatically clean up any stale heartbeat peers
4873 // if we are unhealthy, then clean all
4874 reset_heartbeat_peers(is_waiting_for_healthy());
4879 if (!heartbeat_peers_need_update())
4881 heartbeat_clear_peers_need_update();
4883 std::lock_guard
l(heartbeat_lock
);
4885 dout(10) << "maybe_update_heartbeat_peers updating" << dendl
;
4888 // build heartbeat from set
4892 for (auto& pg
: pgs
) {
4893 pg
->with_heartbeat_peers([&](int peer
) {
4894 if (osdmap
->is_up(peer
)) {
4895 _add_heartbeat_peer(peer
);
4901 // include next and previous up osds to ensure we have a fully-connected set
4902 set
<int> want
, extras
;
4903 const int next
= osdmap
->get_next_up_osd_after(whoami
);
4906 int prev
= osdmap
->get_previous_up_osd_before(whoami
);
4907 if (prev
>= 0 && prev
!= next
)
4910 // make sure we have at least **min_down** osds coming from different
4911 // subtree level (e.g., hosts) for fast failure detection.
4912 auto min_down
= cct
->_conf
.get_val
<uint64_t>("mon_osd_min_down_reporters");
4913 auto subtree
= cct
->_conf
.get_val
<string
>("mon_osd_reporter_subtree_level");
4914 osdmap
->get_random_up_osds_by_subtree(
4915 whoami
, subtree
, min_down
, want
, &want
);
4917 for (set
<int>::iterator p
= want
.begin(); p
!= want
.end(); ++p
) {
4918 dout(10) << " adding neighbor peer osd." << *p
<< dendl
;
4920 _add_heartbeat_peer(*p
);
4923 // remove down peers; enumerate extras
4924 map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
4925 while (p
!= heartbeat_peers
.end()) {
4926 if (!osdmap
->is_up(p
->first
)) {
4929 _remove_heartbeat_peer(o
);
4932 if (p
->second
.epoch
< osdmap
->get_epoch()) {
4933 extras
.insert(p
->first
);
4939 for (int n
= next
; n
>= 0; ) {
4940 if ((int)heartbeat_peers
.size() >= cct
->_conf
->osd_heartbeat_min_peers
)
4942 if (!extras
.count(n
) && !want
.count(n
) && n
!= whoami
) {
4943 dout(10) << " adding random peer osd." << n
<< dendl
;
4945 _add_heartbeat_peer(n
);
4947 n
= osdmap
->get_next_up_osd_after(n
);
4949 break; // came full circle; stop
4953 for (set
<int>::iterator p
= extras
.begin();
4954 (int)heartbeat_peers
.size() > cct
->_conf
->osd_heartbeat_min_peers
&& p
!= extras
.end();
4958 _remove_heartbeat_peer(*p
);
4961 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers
.size() << " peers, extras " << extras
<< dendl
;
4964 void OSD::reset_heartbeat_peers(bool all
)
4966 ceph_assert(osd_lock
.is_locked());
4967 dout(10) << "reset_heartbeat_peers" << dendl
;
4968 utime_t stale
= ceph_clock_now();
4969 stale
-= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_stale");
4970 std::lock_guard
l(heartbeat_lock
);
4971 for (auto it
= heartbeat_peers
.begin(); it
!= heartbeat_peers
.end();) {
4972 HeartbeatInfo
& hi
= it
->second
;
4973 if (all
|| hi
.is_stale(stale
)) {
4974 hi
.con_back
->mark_down();
4976 hi
.con_front
->mark_down();
4978 // stop sending failure_report to mon too
4979 failure_queue
.erase(it
->first
);
4980 heartbeat_peers
.erase(it
++);
4987 void OSD::handle_osd_ping(MOSDPing
*m
)
4989 if (superblock
.cluster_fsid
!= m
->fsid
) {
4990 dout(20) << "handle_osd_ping from " << m
->get_source_inst()
4991 << " bad fsid " << m
->fsid
<< " != " << superblock
.cluster_fsid
<< dendl
;
4996 int from
= m
->get_source().num();
4998 heartbeat_lock
.Lock();
4999 if (is_stopping()) {
5000 heartbeat_lock
.Unlock();
5005 OSDMapRef curmap
= service
.get_osdmap();
5007 heartbeat_lock
.Unlock();
5014 case MOSDPing::PING
:
5016 if (cct
->_conf
->osd_debug_drop_ping_probability
> 0) {
5017 auto heartbeat_drop
= debug_heartbeat_drops_remaining
.find(from
);
5018 if (heartbeat_drop
!= debug_heartbeat_drops_remaining
.end()) {
5019 if (heartbeat_drop
->second
== 0) {
5020 debug_heartbeat_drops_remaining
.erase(heartbeat_drop
);
5022 --heartbeat_drop
->second
;
5023 dout(5) << "Dropping heartbeat from " << from
5024 << ", " << heartbeat_drop
->second
5025 << " remaining to drop" << dendl
;
5028 } else if (cct
->_conf
->osd_debug_drop_ping_probability
>
5029 ((((double)(rand()%100))/100.0))) {
5031 debug_heartbeat_drops_remaining
.insert(std::make_pair(from
,
5032 cct
->_conf
->osd_debug_drop_ping_duration
)).first
;
5033 dout(5) << "Dropping heartbeat from " << from
5034 << ", " << heartbeat_drop
->second
5035 << " remaining to drop" << dendl
;
5040 if (!cct
->get_heartbeat_map()->is_healthy()) {
5041 dout(10) << "internal heartbeat not healthy, dropping ping request" << dendl
;
5045 Message
*r
= new MOSDPing(monc
->get_fsid(),
5046 curmap
->get_epoch(),
5047 MOSDPing::PING_REPLY
, m
->stamp
,
5048 cct
->_conf
->osd_heartbeat_min_size
);
5049 m
->get_connection()->send_message(r
);
5051 if (curmap
->is_up(from
)) {
5052 service
.note_peer_epoch(from
, m
->map_epoch
);
5054 ConnectionRef con
= service
.get_con_osd_cluster(from
, curmap
->get_epoch());
5056 service
.share_map_peer(from
, con
.get());
5059 } else if (!curmap
->exists(from
) ||
5060 curmap
->get_down_at(from
) > m
->map_epoch
) {
5061 // tell them they have died
5062 Message
*r
= new MOSDPing(monc
->get_fsid(),
5063 curmap
->get_epoch(),
5066 cct
->_conf
->osd_heartbeat_min_size
);
5067 m
->get_connection()->send_message(r
);
5072 case MOSDPing::PING_REPLY
:
5074 map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.find(from
);
5075 if (i
!= heartbeat_peers
.end()) {
5076 auto acked
= i
->second
.ping_history
.find(m
->stamp
);
5077 if (acked
!= i
->second
.ping_history
.end()) {
5078 utime_t now
= ceph_clock_now();
5079 int &unacknowledged
= acked
->second
.second
;
5080 if (m
->get_connection() == i
->second
.con_back
) {
5081 dout(25) << "handle_osd_ping got reply from osd." << from
5082 << " first_tx " << i
->second
.first_tx
5083 << " last_tx " << i
->second
.last_tx
5084 << " last_rx_back " << i
->second
.last_rx_back
<< " -> " << now
5085 << " last_rx_front " << i
->second
.last_rx_front
5087 i
->second
.last_rx_back
= now
;
5088 ceph_assert(unacknowledged
> 0);
5090 // if there is no front con, set both stamps.
5091 if (i
->second
.con_front
== NULL
) {
5092 i
->second
.last_rx_front
= now
;
5093 ceph_assert(unacknowledged
> 0);
5096 } else if (m
->get_connection() == i
->second
.con_front
) {
5097 dout(25) << "handle_osd_ping got reply from osd." << from
5098 << " first_tx " << i
->second
.first_tx
5099 << " last_tx " << i
->second
.last_tx
5100 << " last_rx_back " << i
->second
.last_rx_back
5101 << " last_rx_front " << i
->second
.last_rx_front
<< " -> " << now
5103 i
->second
.last_rx_front
= now
;
5104 ceph_assert(unacknowledged
> 0);
5108 if (unacknowledged
== 0) {
5109 // succeeded in getting all replies
5110 dout(25) << "handle_osd_ping got all replies from osd." << from
5111 << " , erase pending ping(sent at " << m
->stamp
<< ")"
5112 << " and older pending ping(s)"
5114 i
->second
.ping_history
.erase(i
->second
.ping_history
.begin(), ++acked
);
5117 if (i
->second
.is_healthy(now
)) {
5118 // Cancel false reports
5119 auto failure_queue_entry
= failure_queue
.find(from
);
5120 if (failure_queue_entry
!= failure_queue
.end()) {
5121 dout(10) << "handle_osd_ping canceling queued "
5122 << "failure report for osd." << from
<< dendl
;
5123 failure_queue
.erase(failure_queue_entry
);
5126 auto failure_pending_entry
= failure_pending
.find(from
);
5127 if (failure_pending_entry
!= failure_pending
.end()) {
5128 dout(10) << "handle_osd_ping canceling in-flight "
5129 << "failure report for osd." << from
<< dendl
;
5130 send_still_alive(curmap
->get_epoch(),
5132 failure_pending_entry
->second
.second
);
5133 failure_pending
.erase(failure_pending_entry
);
5137 // old replies, deprecated by newly sent pings.
5138 dout(10) << "handle_osd_ping no pending ping(sent at " << m
->stamp
5139 << ") is found, treat as covered by newly sent pings "
5146 curmap
->is_up(from
)) {
5147 service
.note_peer_epoch(from
, m
->map_epoch
);
5149 ConnectionRef con
= service
.get_con_osd_cluster(from
, curmap
->get_epoch());
5151 service
.share_map_peer(from
, con
.get());
5158 case MOSDPing::YOU_DIED
:
5159 dout(10) << "handle_osd_ping " << m
->get_source_inst()
5160 << " says i am down in " << m
->map_epoch
<< dendl
;
5161 osdmap_subscribe(curmap
->get_epoch()+1, false);
5165 heartbeat_lock
.Unlock();
5169 void OSD::heartbeat_entry()
5171 std::lock_guard
l(heartbeat_lock
);
5174 while (!heartbeat_stop
) {
5177 double wait
= .5 + ((float)(rand() % 10)/10.0) * (float)cct
->_conf
->osd_heartbeat_interval
;
5179 w
.set_from_double(wait
);
5180 dout(30) << "heartbeat_entry sleeping for " << wait
<< dendl
;
5181 heartbeat_cond
.WaitInterval(heartbeat_lock
, w
);
5184 dout(30) << "heartbeat_entry woke up" << dendl
;
5188 void OSD::heartbeat_check()
5190 ceph_assert(heartbeat_lock
.is_locked());
5191 utime_t now
= ceph_clock_now();
5193 // check for incoming heartbeats (move me elsewhere?)
5194 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5195 p
!= heartbeat_peers
.end();
5198 if (p
->second
.first_tx
== utime_t()) {
5199 dout(25) << "heartbeat_check we haven't sent ping to osd." << p
->first
5200 << " yet, skipping" << dendl
;
5204 dout(25) << "heartbeat_check osd." << p
->first
5205 << " first_tx " << p
->second
.first_tx
5206 << " last_tx " << p
->second
.last_tx
5207 << " last_rx_back " << p
->second
.last_rx_back
5208 << " last_rx_front " << p
->second
.last_rx_front
5210 if (p
->second
.is_unhealthy(now
)) {
5211 utime_t oldest_deadline
= p
->second
.ping_history
.begin()->second
.first
;
5212 if (p
->second
.last_rx_back
== utime_t() ||
5213 p
->second
.last_rx_front
== utime_t()) {
5214 derr
<< "heartbeat_check: no reply from "
5215 << p
->second
.con_front
->get_peer_addr().get_sockaddr()
5216 << " osd." << p
->first
5217 << " ever on either front or back, first ping sent "
5218 << p
->second
.first_tx
5219 << " (oldest deadline " << oldest_deadline
<< ")"
5222 failure_queue
[p
->first
] = p
->second
.first_tx
;
5224 derr
<< "heartbeat_check: no reply from "
5225 << p
->second
.con_front
->get_peer_addr().get_sockaddr()
5226 << " osd." << p
->first
<< " since back " << p
->second
.last_rx_back
5227 << " front " << p
->second
.last_rx_front
5228 << " (oldest deadline " << oldest_deadline
<< ")"
5231 failure_queue
[p
->first
] = std::min(p
->second
.last_rx_back
, p
->second
.last_rx_front
);
5237 void OSD::heartbeat()
5239 ceph_assert(heartbeat_lock
.is_locked_by_me());
5240 dout(30) << "heartbeat" << dendl
;
5244 int hb_interval
= cct
->_conf
->osd_heartbeat_interval
;
5245 int n_samples
= 86400;
5246 if (hb_interval
> 1) {
5247 n_samples
/= hb_interval
;
5252 if (getloadavg(loadavgs
, 1) == 1) {
5253 logger
->set(l_osd_loadavg
, 100 * loadavgs
[0]);
5254 daily_loadavg
= (daily_loadavg
* (n_samples
- 1) + loadavgs
[0]) / n_samples
;
5255 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg
<< dendl
;
5258 dout(30) << "heartbeat checking stats" << dendl
;
5260 // refresh peer list and osd stats
5261 vector
<int> hb_peers
;
5262 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5263 p
!= heartbeat_peers
.end();
5265 hb_peers
.push_back(p
->first
);
5267 auto new_stat
= service
.set_osd_stat(hb_peers
, get_num_pgs());
5268 dout(5) << __func__
<< " " << new_stat
<< dendl
;
5269 ceph_assert(new_stat
.statfs
.total
);
5272 float ratio
= service
.compute_adjusted_ratio(new_stat
, &pratio
);
5274 service
.check_full_status(ratio
, pratio
);
5276 utime_t now
= ceph_clock_now();
5277 utime_t deadline
= now
;
5278 deadline
+= cct
->_conf
->osd_heartbeat_grace
;
5281 for (map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.begin();
5282 i
!= heartbeat_peers
.end();
5284 int peer
= i
->first
;
5285 i
->second
.last_tx
= now
;
5286 if (i
->second
.first_tx
== utime_t())
5287 i
->second
.first_tx
= now
;
5288 i
->second
.ping_history
[now
] = make_pair(deadline
,
5289 HeartbeatInfo::HEARTBEAT_MAX_CONN
);
5290 dout(30) << "heartbeat sending ping to osd." << peer
<< dendl
;
5291 i
->second
.con_back
->send_message(new MOSDPing(monc
->get_fsid(),
5292 service
.get_osdmap_epoch(),
5293 MOSDPing::PING
, now
,
5294 cct
->_conf
->osd_heartbeat_min_size
));
5296 if (i
->second
.con_front
)
5297 i
->second
.con_front
->send_message(new MOSDPing(monc
->get_fsid(),
5298 service
.get_osdmap_epoch(),
5299 MOSDPing::PING
, now
,
5300 cct
->_conf
->osd_heartbeat_min_size
));
5303 logger
->set(l_osd_hb_to
, heartbeat_peers
.size());
5305 // hmm.. am i all alone?
5306 dout(30) << "heartbeat lonely?" << dendl
;
5307 if (heartbeat_peers
.empty()) {
5308 if (now
- last_mon_heartbeat
> cct
->_conf
->osd_mon_heartbeat_interval
&& is_active()) {
5309 last_mon_heartbeat
= now
;
5310 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl
;
5311 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
5315 dout(30) << "heartbeat done" << dendl
;
5318 bool OSD::heartbeat_reset(Connection
*con
)
5320 std::lock_guard
l(heartbeat_lock
);
5321 auto s
= con
->get_priv();
5322 con
->set_priv(nullptr);
5324 if (is_stopping()) {
5327 auto heartbeat_session
= static_cast<HeartbeatSession
*>(s
.get());
5328 auto p
= heartbeat_peers
.find(heartbeat_session
->peer
);
5329 if (p
!= heartbeat_peers
.end() &&
5330 (p
->second
.con_back
== con
||
5331 p
->second
.con_front
== con
)) {
5332 dout(10) << "heartbeat_reset failed hb con " << con
<< " for osd." << p
->second
.peer
5333 << ", reopening" << dendl
;
5334 if (con
!= p
->second
.con_back
) {
5335 p
->second
.con_back
->mark_down();
5337 p
->second
.con_back
.reset(NULL
);
5338 if (p
->second
.con_front
&& con
!= p
->second
.con_front
) {
5339 p
->second
.con_front
->mark_down();
5341 p
->second
.con_front
.reset(NULL
);
5342 pair
<ConnectionRef
,ConnectionRef
> newcon
= service
.get_con_osd_hb(p
->second
.peer
, p
->second
.epoch
);
5344 p
->second
.con_back
= newcon
.first
.get();
5345 p
->second
.con_back
->set_priv(s
);
5346 if (newcon
.second
) {
5347 p
->second
.con_front
= newcon
.second
.get();
5348 p
->second
.con_front
->set_priv(s
);
5350 p
->second
.ping_history
.clear();
5352 dout(10) << "heartbeat_reset failed hb con " << con
<< " for osd." << p
->second
.peer
5353 << ", raced with osdmap update, closing out peer" << dendl
;
5354 heartbeat_peers
.erase(p
);
5357 dout(10) << "heartbeat_reset closing (old) failed hb con " << con
<< dendl
;
5365 // =========================================
5369 ceph_assert(osd_lock
.is_locked());
5370 dout(10) << "tick" << dendl
;
5372 if (is_active() || is_waiting_for_healthy()) {
5373 maybe_update_heartbeat_peers();
5376 if (is_waiting_for_healthy()) {
5380 if (is_waiting_for_healthy() || is_booting()) {
5381 std::lock_guard
l(heartbeat_lock
);
5382 utime_t now
= ceph_clock_now();
5383 if (now
- last_mon_heartbeat
> cct
->_conf
->osd_mon_heartbeat_interval
) {
5384 last_mon_heartbeat
= now
;
5385 dout(1) << __func__
<< " checking mon for new map" << dendl
;
5386 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
5392 tick_timer
.add_event_after(get_tick_interval(), new C_Tick(this));
5395 void OSD::tick_without_osd_lock()
5397 ceph_assert(tick_timer_lock
.is_locked());
5398 dout(10) << "tick_without_osd_lock" << dendl
;
5400 logger
->set(l_osd_cached_crc
, buffer::get_cached_crc());
5401 logger
->set(l_osd_cached_crc_adjusted
, buffer::get_cached_crc_adjusted());
5402 logger
->set(l_osd_missed_crc
, buffer::get_missed_crc());
5404 // refresh osd stats
5405 struct store_statfs_t stbuf
;
5406 osd_alert_list_t alerts
;
5407 int r
= store
->statfs(&stbuf
, &alerts
);
5408 ceph_assert(r
== 0);
5409 service
.set_statfs(stbuf
, alerts
);
5411 // osd_lock is not being held, which means the OSD state
5412 // might change when doing the monitor report
5413 if (is_active() || is_waiting_for_healthy()) {
5414 heartbeat_lock
.Lock();
5416 heartbeat_lock
.Unlock();
5418 map_lock
.get_read();
5419 std::lock_guard
l(mon_report_lock
);
5422 utime_t now
= ceph_clock_now();
5423 if (service
.need_fullness_update() ||
5424 now
- last_mon_report
> cct
->_conf
->osd_mon_report_interval
) {
5425 last_mon_report
= now
;
5429 map_lock
.put_read();
5431 epoch_t max_waiting_epoch
= 0;
5432 for (auto s
: shards
) {
5433 max_waiting_epoch
= std::max(max_waiting_epoch
,
5434 s
->get_max_waiting_epoch());
5436 if (max_waiting_epoch
> get_osdmap()->get_epoch()) {
5437 dout(20) << __func__
<< " max_waiting_epoch " << max_waiting_epoch
5438 << ", requesting new map" << dendl
;
5439 osdmap_subscribe(superblock
.newest_map
+ 1, false);
5444 if (!scrub_random_backoff()) {
5447 service
.promote_throttle_recalibrate();
5448 resume_creating_pg();
5449 bool need_send_beacon
= false;
5450 const auto now
= ceph::coarse_mono_clock::now();
5452 // borrow lec lock to pretect last_sent_beacon from changing
5453 std::lock_guard l
{min_last_epoch_clean_lock
};
5454 const auto elapsed
= now
- last_sent_beacon
;
5455 if (chrono::duration_cast
<chrono::seconds
>(elapsed
).count() >
5456 cct
->_conf
->osd_beacon_report_interval
) {
5457 need_send_beacon
= true;
5460 if (need_send_beacon
) {
5465 mgrc
.update_daemon_health(get_health_metrics());
5466 service
.kick_recovery_queue();
5467 tick_timer_without_osd_lock
.add_event_after(get_tick_interval(),
5468 new C_Tick_WithoutOSDLock(this));
5472 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5473 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
5474 // setomapheader <pool-id> [namespace/]<obj-name> <header>
5475 // getomap <pool> [namespace/]<obj-name>
5476 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
5477 // injectmdataerr [namespace/]<obj-name> [shardid]
5478 // injectdataerr [namespace/]<obj-name> [shardid]
5480 // set_recovery_delay [utime]
5481 void TestOpsSocketHook::test_ops(OSDService
*service
, ObjectStore
*store
,
5482 std::string_view command
,
5483 const cmdmap_t
& cmdmap
, ostream
&ss
)
5486 //Support changing the omap on a single osd by using the Admin Socket to
5487 //directly request the osd make a change.
5488 if (command
== "setomapval" || command
== "rmomapkey" ||
5489 command
== "setomapheader" || command
== "getomap" ||
5490 command
== "truncobj" || command
== "injectmdataerr" ||
5491 command
== "injectdataerr"
5495 OSDMapRef curmap
= service
->get_osdmap();
5500 cmd_getval(service
->cct
, cmdmap
, "pool", poolstr
);
5501 pool
= curmap
->lookup_pg_pool_name(poolstr
);
5502 //If we can't find it by name then maybe id specified
5503 if (pool
< 0 && isdigit(poolstr
[0]))
5504 pool
= atoll(poolstr
.c_str());
5506 ss
<< "Invalid pool '" << poolstr
<< "''";
5510 string objname
, nspace
;
5511 cmd_getval(service
->cct
, cmdmap
, "objname", objname
);
5512 std::size_t found
= objname
.find_first_of('/');
5513 if (found
!= string::npos
) {
5514 nspace
= objname
.substr(0, found
);
5515 objname
= objname
.substr(found
+1);
5517 object_locator_t
oloc(pool
, nspace
);
5518 r
= curmap
->object_locator_to_pg(object_t(objname
), oloc
, rawpg
);
5521 ss
<< "Invalid namespace/objname";
5526 cmd_getval(service
->cct
, cmdmap
, "shardid", shardid
, int64_t(shard_id_t::NO_SHARD
));
5527 hobject_t
obj(object_t(objname
), string(""), CEPH_NOSNAP
, rawpg
.ps(), pool
, nspace
);
5528 ghobject_t
gobj(obj
, ghobject_t::NO_GEN
, shard_id_t(uint8_t(shardid
)));
5529 spg_t
pgid(curmap
->raw_pg_to_pg(rawpg
), shard_id_t(shardid
));
5530 if (curmap
->pg_is_ec(rawpg
)) {
5531 if ((command
!= "injectdataerr") && (command
!= "injectmdataerr")) {
5532 ss
<< "Must not call on ec pool, except injectdataerr or injectmdataerr";
5537 ObjectStore::Transaction t
;
5539 if (command
== "setomapval") {
5540 map
<string
, bufferlist
> newattrs
;
5543 cmd_getval(service
->cct
, cmdmap
, "key", key
);
5544 cmd_getval(service
->cct
, cmdmap
, "val", valstr
);
5547 newattrs
[key
] = val
;
5548 t
.omap_setkeys(coll_t(pgid
), ghobject_t(obj
), newattrs
);
5549 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
5551 ss
<< "error=" << r
;
5554 } else if (command
== "rmomapkey") {
5557 cmd_getval(service
->cct
, cmdmap
, "key", key
);
5560 t
.omap_rmkeys(coll_t(pgid
), ghobject_t(obj
), keys
);
5561 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
5563 ss
<< "error=" << r
;
5566 } else if (command
== "setomapheader") {
5567 bufferlist newheader
;
5570 cmd_getval(service
->cct
, cmdmap
, "header", headerstr
);
5571 newheader
.append(headerstr
);
5572 t
.omap_setheader(coll_t(pgid
), ghobject_t(obj
), newheader
);
5573 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
5575 ss
<< "error=" << r
;
5578 } else if (command
== "getomap") {
5579 //Debug: Output entire omap
5581 map
<string
, bufferlist
> keyvals
;
5582 auto ch
= store
->open_collection(coll_t(pgid
));
5584 ss
<< "unable to open collection for " << pgid
;
5587 r
= store
->omap_get(ch
, ghobject_t(obj
), &hdrbl
, &keyvals
);
5589 ss
<< "header=" << string(hdrbl
.c_str(), hdrbl
.length());
5590 for (map
<string
, bufferlist
>::iterator it
= keyvals
.begin();
5591 it
!= keyvals
.end(); ++it
)
5592 ss
<< " key=" << (*it
).first
<< " val="
5593 << string((*it
).second
.c_str(), (*it
).second
.length());
5595 ss
<< "error=" << r
;
5598 } else if (command
== "truncobj") {
5600 cmd_getval(service
->cct
, cmdmap
, "len", trunclen
);
5601 t
.truncate(coll_t(pgid
), ghobject_t(obj
), trunclen
);
5602 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
5604 ss
<< "error=" << r
;
5607 } else if (command
== "injectdataerr") {
5608 store
->inject_data_error(gobj
);
5610 } else if (command
== "injectmdataerr") {
5611 store
->inject_mdata_error(gobj
);
5616 if (command
== "set_recovery_delay") {
5618 cmd_getval(service
->cct
, cmdmap
, "utime", delay
, (int64_t)0);
5621 int r
= service
->cct
->_conf
.set_val("osd_recovery_delay_start",
5624 ss
<< "set_recovery_delay: error setting "
5625 << "osd_recovery_delay_start to '" << delay
<< "': error "
5629 service
->cct
->_conf
.apply_changes(nullptr);
5630 ss
<< "set_recovery_delay: set osd_recovery_delay_start "
5631 << "to " << service
->cct
->_conf
->osd_recovery_delay_start
;
5634 if (command
== "trigger_scrub" || command
== "trigger_deep_scrub") {
5636 bool deep
= (command
== "trigger_deep_scrub");
5637 OSDMapRef curmap
= service
->get_osdmap();
5641 cmd_getval(service
->cct
, cmdmap
, "pgid", pgidstr
);
5642 if (!pgid
.parse(pgidstr
.c_str())) {
5643 ss
<< "Invalid pgid specified";
5648 cmd_getval(service
->cct
, cmdmap
, "time", time
, (int64_t)0);
5650 PGRef pg
= service
->osd
->_lookup_lock_pg(pgid
);
5651 if (pg
== nullptr) {
5652 ss
<< "Can't find pg " << pgid
;
5656 if (pg
->is_primary()) {
5657 pg
->unreg_next_scrub();
5658 const pg_pool_t
*p
= curmap
->get_pg_pool(pgid
.pool());
5659 double pool_scrub_max_interval
= 0;
5660 double scrub_max_interval
;
5662 p
->opts
.get(pool_opts_t::DEEP_SCRUB_INTERVAL
, &pool_scrub_max_interval
);
5663 scrub_max_interval
= pool_scrub_max_interval
> 0 ?
5664 pool_scrub_max_interval
: g_conf()->osd_deep_scrub_interval
;
5666 p
->opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &pool_scrub_max_interval
);
5667 scrub_max_interval
= pool_scrub_max_interval
> 0 ?
5668 pool_scrub_max_interval
: g_conf()->osd_scrub_max_interval
;
5670 // Instead of marking must_scrub force a schedule scrub
5671 utime_t stamp
= ceph_clock_now();
5673 stamp
-= scrub_max_interval
;
5675 stamp
-= (float)time
;
5676 stamp
-= 100.0; // push back last scrub more for good measure
5678 pg
->set_last_deep_scrub_stamp(stamp
);
5680 pg
->set_last_scrub_stamp(stamp
);
5682 pg
->reg_next_scrub();
5683 pg
->publish_stats_to_osd();
5684 ss
<< "ok - set" << (deep
? " deep" : "" ) << " stamp " << stamp
;
5686 ss
<< "Not primary";
5691 if (command
== "injectfull") {
5694 OSDService::s_names state
;
5695 cmd_getval(service
->cct
, cmdmap
, "type", type
, string("full"));
5696 cmd_getval(service
->cct
, cmdmap
, "count", count
, (int64_t)-1);
5697 if (type
== "none" || count
== 0) {
5701 state
= service
->get_full_state(type
);
5702 if (state
== OSDService::s_names::INVALID
) {
5703 ss
<< "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
5706 service
->set_injectfull(state
, count
);
5709 ss
<< "Internal error - command=" << command
;
5712 // =========================================
5714 void OSD::ms_handle_connect(Connection
*con
)
5716 dout(10) << __func__
<< " con " << con
<< dendl
;
5717 if (con
->get_peer_type() == CEPH_ENTITY_TYPE_MON
) {
5718 std::lock_guard
l(osd_lock
);
5721 dout(10) << __func__
<< " on mon" << dendl
;
5725 } else if (is_booting()) {
5726 _send_boot(); // resend boot message
5728 map_lock
.get_read();
5729 std::lock_guard
l2(mon_report_lock
);
5731 utime_t now
= ceph_clock_now();
5732 last_mon_report
= now
;
5734 // resend everything, it's a new session
5737 service
.requeue_pg_temp();
5738 service
.clear_sent_ready_to_merge();
5739 service
.send_pg_temp();
5740 service
.send_ready_to_merge();
5741 service
.send_pg_created();
5745 map_lock
.put_read();
5747 send_beacon(ceph::coarse_mono_clock::now());
5751 // full map requests may happen while active or pre-boot
5752 if (requested_full_first
) {
5753 rerequest_full_maps();
5758 void OSD::ms_handle_fast_connect(Connection
*con
)
5760 if (con
->get_peer_type() != CEPH_ENTITY_TYPE_MON
&&
5761 con
->get_peer_type() != CEPH_ENTITY_TYPE_MGR
) {
5762 auto priv
= con
->get_priv();
5763 auto s
= static_cast<Session
*>(priv
.get());
5765 s
= new Session
{cct
, con
};
5766 con
->set_priv(RefCountedPtr
{s
, false});
5767 dout(10) << " new session (outgoing) " << s
<< " con=" << s
->con
5768 << " addr=" << s
->con
->get_peer_addr() << dendl
;
5769 // we don't connect to clients
5770 ceph_assert(con
->get_peer_type() == CEPH_ENTITY_TYPE_OSD
);
5771 s
->entity_name
.set_type(CEPH_ENTITY_TYPE_OSD
);
5776 void OSD::ms_handle_fast_accept(Connection
*con
)
5778 if (con
->get_peer_type() != CEPH_ENTITY_TYPE_MON
&&
5779 con
->get_peer_type() != CEPH_ENTITY_TYPE_MGR
) {
5780 auto priv
= con
->get_priv();
5781 auto s
= static_cast<Session
*>(priv
.get());
5783 s
= new Session
{cct
, con
};
5784 con
->set_priv(RefCountedPtr
{s
, false});
5785 dout(10) << "new session (incoming)" << s
<< " con=" << con
5786 << " addr=" << con
->get_peer_addr()
5787 << " must have raced with connect" << dendl
;
5788 ceph_assert(con
->get_peer_type() == CEPH_ENTITY_TYPE_OSD
);
5789 s
->entity_name
.set_type(CEPH_ENTITY_TYPE_OSD
);
5794 bool OSD::ms_handle_reset(Connection
*con
)
5796 auto s
= con
->get_priv();
5797 auto session
= static_cast<Session
*>(s
.get());
5798 dout(2) << "ms_handle_reset con " << con
<< " session " << session
<< dendl
;
5801 session
->wstate
.reset(con
);
5802 session
->con
->set_priv(nullptr);
5803 session
->con
.reset(); // break con <-> session ref cycle
5804 // note that we break session->con *before* the session_handle_reset
5805 // cleanup below. this avoids a race between us and
5806 // PG::add_backoff, Session::check_backoff, etc.
5807 session_handle_reset(SessionRef
{session
});
5811 bool OSD::ms_handle_refused(Connection
*con
)
5813 if (!cct
->_conf
->osd_fast_fail_on_connection_refused
)
5816 auto priv
= con
->get_priv();
5817 auto session
= static_cast<Session
*>(priv
.get());
5818 dout(2) << "ms_handle_refused con " << con
<< " session " << session
<< dendl
;
5821 int type
= con
->get_peer_type();
5822 // handle only OSD failures here
5823 if (monc
&& (type
== CEPH_ENTITY_TYPE_OSD
)) {
5824 OSDMapRef osdmap
= get_osdmap();
5826 int id
= osdmap
->identify_osd_on_all_channels(con
->get_peer_addr());
5827 if (id
>= 0 && osdmap
->is_up(id
)) {
5828 // I'm cheating mon heartbeat grace logic, because we know it's not going
5829 // to respawn alone. +1 so we won't hit any boundary case.
5830 monc
->send_mon_message(
5834 osdmap
->get_addrs(id
),
5835 cct
->_conf
->osd_heartbeat_grace
+ 1,
5836 osdmap
->get_epoch(),
5837 MOSDFailure::FLAG_IMMEDIATE
| MOSDFailure::FLAG_FAILED
5845 struct C_OSD_GetVersion
: public Context
{
5847 uint64_t oldest
, newest
;
5848 explicit C_OSD_GetVersion(OSD
*o
) : osd(o
), oldest(0), newest(0) {}
5849 void finish(int r
) override
{
5851 osd
->_got_mon_epochs(oldest
, newest
);
5855 void OSD::start_boot()
5857 if (!_is_healthy()) {
5858 // if we are not healthy, do not mark ourselves up (yet)
5859 dout(1) << "not healthy; waiting to boot" << dendl
;
5860 if (!is_waiting_for_healthy())
5861 start_waiting_for_healthy();
5862 // send pings sooner rather than later
5866 dout(1) << __func__
<< dendl
;
5867 set_state(STATE_PREBOOT
);
5868 dout(10) << "start_boot - have maps " << superblock
.oldest_map
5869 << ".." << superblock
.newest_map
<< dendl
;
5870 C_OSD_GetVersion
*c
= new C_OSD_GetVersion(this);
5871 monc
->get_version("osdmap", &c
->newest
, &c
->oldest
, c
);
5874 void OSD::_got_mon_epochs(epoch_t oldest
, epoch_t newest
)
5876 std::lock_guard
l(osd_lock
);
5878 _preboot(oldest
, newest
);
5882 void OSD::_preboot(epoch_t oldest
, epoch_t newest
)
5884 ceph_assert(is_preboot());
5885 dout(10) << __func__
<< " _preboot mon has osdmaps "
5886 << oldest
<< ".." << newest
<< dendl
;
5888 // ensure our local fullness awareness is accurate
5890 std::lock_guard
l(heartbeat_lock
);
5894 // if our map within recent history, try to add ourselves to the osdmap.
5895 if (osdmap
->get_epoch() == 0) {
5896 derr
<< "waiting for initial osdmap" << dendl
;
5897 } else if (osdmap
->is_destroyed(whoami
)) {
5898 derr
<< "osdmap says I am destroyed" << dendl
;
5899 // provide a small margin so we don't livelock seeing if we
5900 // un-destroyed ourselves.
5901 if (osdmap
->get_epoch() > newest
- 1) {
5904 } else if (osdmap
->is_noup(whoami
)) {
5905 derr
<< "osdmap NOUP flag is set, waiting for it to clear" << dendl
;
5906 } else if (!osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
5907 derr
<< "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
5909 } else if (osdmap
->require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
5910 derr
<< "osdmap require_osd_release < luminous; please upgrade to luminous"
5912 } else if (service
.need_fullness_update()) {
5913 derr
<< "osdmap fullness state needs update" << dendl
;
5915 } else if (osdmap
->get_epoch() >= oldest
- 1 &&
5916 osdmap
->get_epoch() + cct
->_conf
->osd_map_message_max
> newest
) {
5918 // wait for pgs to fully catch up in a different thread, since
5919 // this thread might be required for splitting and merging PGs to
5921 boot_finisher
.queue(
5922 new FunctionContext(
5924 std::lock_guard
l(osd_lock
);
5926 dout(10) << __func__
<< " waiting for peering work to drain"
5929 for (auto shard
: shards
) {
5930 shard
->wait_min_pg_epoch(osdmap
->get_epoch());
5941 // get all the latest maps
5942 if (osdmap
->get_epoch() + 1 >= oldest
)
5943 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
5945 osdmap_subscribe(oldest
- 1, true);
5948 void OSD::send_full_update()
5950 if (!service
.need_fullness_update())
5953 if (service
.is_full()) {
5954 state
= CEPH_OSD_FULL
;
5955 } else if (service
.is_backfillfull()) {
5956 state
= CEPH_OSD_BACKFILLFULL
;
5957 } else if (service
.is_nearfull()) {
5958 state
= CEPH_OSD_NEARFULL
;
5961 OSDMap::calc_state_set(state
, s
);
5962 dout(10) << __func__
<< " want state " << s
<< dendl
;
5963 monc
->send_mon_message(new MOSDFull(osdmap
->get_epoch(), state
));
5966 void OSD::start_waiting_for_healthy()
5968 dout(1) << "start_waiting_for_healthy" << dendl
;
5969 set_state(STATE_WAITING_FOR_HEALTHY
);
5970 last_heartbeat_resample
= utime_t();
5972 // subscribe to osdmap updates, in case our peers really are known to be dead
5973 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
5976 bool OSD::_is_healthy()
5978 if (!cct
->get_heartbeat_map()->is_healthy()) {
5979 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl
;
5983 if (is_waiting_for_healthy()) {
5984 utime_t now
= ceph_clock_now();
5985 utime_t grace
= utime_t(cct
->_conf
->osd_max_markdown_period
, 0);
5986 while (!osd_markdown_log
.empty() &&
5987 osd_markdown_log
.front() + grace
< now
)
5988 osd_markdown_log
.pop_front();
5989 if (osd_markdown_log
.size() <= 1) {
5990 dout(5) << __func__
<< " first time marked as down,"
5991 << " try reboot unconditionally" << dendl
;
5994 std::lock_guard
l(heartbeat_lock
);
5995 int num
= 0, up
= 0;
5996 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5997 p
!= heartbeat_peers
.end();
5999 if (p
->second
.is_healthy(now
))
6003 if ((float)up
< (float)num
* cct
->_conf
->osd_heartbeat_min_healthy_ratio
) {
6004 dout(1) << "is_healthy false -- only " << up
<< "/" << num
<< " up peers (less than "
6005 << int(cct
->_conf
->osd_heartbeat_min_healthy_ratio
* 100.0) << "%)" << dendl
;
6013 void OSD::_send_boot()
6015 dout(10) << "_send_boot" << dendl
;
6016 Connection
*local_connection
=
6017 cluster_messenger
->get_loopback_connection().get();
6018 entity_addrvec_t client_addrs
= client_messenger
->get_myaddrs();
6019 entity_addrvec_t cluster_addrs
= cluster_messenger
->get_myaddrs();
6020 entity_addrvec_t hb_back_addrs
= hb_back_server_messenger
->get_myaddrs();
6021 entity_addrvec_t hb_front_addrs
= hb_front_server_messenger
->get_myaddrs();
6023 dout(20) << " initial client_addrs " << client_addrs
6024 << ", cluster_addrs " << cluster_addrs
6025 << ", hb_back_addrs " << hb_back_addrs
6026 << ", hb_front_addrs " << hb_front_addrs
6028 if (cluster_messenger
->set_addr_unknowns(client_addrs
)) {
6029 dout(10) << " assuming cluster_addrs match client_addrs "
6030 << client_addrs
<< dendl
;
6031 cluster_addrs
= cluster_messenger
->get_myaddrs();
6033 if (auto session
= local_connection
->get_priv(); !session
) {
6034 cluster_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6037 local_connection
= hb_back_server_messenger
->get_loopback_connection().get();
6038 if (hb_back_server_messenger
->set_addr_unknowns(cluster_addrs
)) {
6039 dout(10) << " assuming hb_back_addrs match cluster_addrs "
6040 << cluster_addrs
<< dendl
;
6041 hb_back_addrs
= hb_back_server_messenger
->get_myaddrs();
6043 if (auto session
= local_connection
->get_priv(); !session
) {
6044 hb_back_server_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6047 local_connection
= hb_front_server_messenger
->get_loopback_connection().get();
6048 if (hb_front_server_messenger
->set_addr_unknowns(client_addrs
)) {
6049 dout(10) << " assuming hb_front_addrs match client_addrs "
6050 << client_addrs
<< dendl
;
6051 hb_front_addrs
= hb_front_server_messenger
->get_myaddrs();
6053 if (auto session
= local_connection
->get_priv(); !session
) {
6054 hb_front_server_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6057 // we now know what our front and back addrs will be, and we are
6058 // about to tell the mon what our metadata (including numa bindings)
6059 // are, so now is a good time!
6060 set_numa_affinity();
6062 MOSDBoot
*mboot
= new MOSDBoot(
6063 superblock
, get_osdmap_epoch(), service
.get_boot_epoch(),
6064 hb_back_addrs
, hb_front_addrs
, cluster_addrs
,
6066 dout(10) << " final client_addrs " << client_addrs
6067 << ", cluster_addrs " << cluster_addrs
6068 << ", hb_back_addrs " << hb_back_addrs
6069 << ", hb_front_addrs " << hb_front_addrs
6071 _collect_metadata(&mboot
->metadata
);
6072 monc
->send_mon_message(mboot
);
6073 set_state(STATE_BOOTING
);
6076 void OSD::_collect_metadata(map
<string
,string
> *pm
)
6079 (*pm
)["osd_data"] = dev_path
;
6080 if (store
->get_type() == "filestore") {
6081 // not applicable for bluestore
6082 (*pm
)["osd_journal"] = journal_path
;
6084 (*pm
)["front_addr"] = stringify(client_messenger
->get_myaddrs());
6085 (*pm
)["back_addr"] = stringify(cluster_messenger
->get_myaddrs());
6086 (*pm
)["hb_front_addr"] = stringify(hb_front_server_messenger
->get_myaddrs());
6087 (*pm
)["hb_back_addr"] = stringify(hb_back_server_messenger
->get_myaddrs());
6090 (*pm
)["osd_objectstore"] = store
->get_type();
6091 (*pm
)["rotational"] = store_is_rotational
? "1" : "0";
6092 (*pm
)["journal_rotational"] = journal_is_rotational
? "1" : "0";
6093 (*pm
)["default_device_class"] = store
->get_default_device_class();
6094 store
->collect_metadata(pm
);
6096 collect_sys_info(pm
, cct
);
6098 (*pm
)["front_iface"] = pick_iface(
6100 client_messenger
->get_myaddrs().front().get_sockaddr_storage());
6101 (*pm
)["back_iface"] = pick_iface(
6103 cluster_messenger
->get_myaddrs().front().get_sockaddr_storage());
6109 set
<string
> unknown
;
6110 for (auto nm
: { "front_iface", "back_iface" }) {
6111 if (!(*pm
)[nm
].size()) {
6116 int r
= get_iface_numa_node((*pm
)[nm
], &n
);
6118 unknown
.insert((*pm
)[nm
]);
6126 if (unknown
.size()) {
6127 (*pm
)["network_numa_unknown_ifaces"] = stringify(unknown
);
6129 if (!nodes
.empty()) {
6130 (*pm
)["network_numa_nodes"] = stringify(nodes
);
6132 if (node
>= 0 && nodes
.size() == 1 && unknown
.empty()) {
6133 (*pm
)["network_numa_node"] = stringify(node
);
6137 if (numa_node
>= 0) {
6138 (*pm
)["numa_node"] = stringify(numa_node
);
6139 (*pm
)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size
,
6143 set
<string
> devnames
;
6144 store
->get_devices(&devnames
);
6145 (*pm
)["devices"] = stringify(devnames
);
6147 for (auto& dev
: devnames
) {
6149 string id
= get_device_id(dev
, &err
);
6151 if (!devids
.empty()) {
6154 devids
+= dev
+ "=" + id
;
6156 dout(10) << __func__
<< " no unique device id for " << dev
<< ": "
6160 (*pm
)["device_ids"] = devids
;
6162 dout(10) << __func__
<< " " << *pm
<< dendl
;
6165 void OSD::queue_want_up_thru(epoch_t want
)
6167 map_lock
.get_read();
6168 epoch_t cur
= osdmap
->get_up_thru(whoami
);
6169 std::lock_guard
l(mon_report_lock
);
6170 if (want
> up_thru_wanted
) {
6171 dout(10) << "queue_want_up_thru now " << want
<< " (was " << up_thru_wanted
<< ")"
6172 << ", currently " << cur
6174 up_thru_wanted
= want
;
6177 dout(10) << "queue_want_up_thru want " << want
<< " <= queued " << up_thru_wanted
6178 << ", currently " << cur
6181 map_lock
.put_read();
6184 void OSD::send_alive()
6186 ceph_assert(mon_report_lock
.is_locked());
6187 if (!osdmap
->exists(whoami
))
6189 epoch_t up_thru
= osdmap
->get_up_thru(whoami
);
6190 dout(10) << "send_alive up_thru currently " << up_thru
<< " want " << up_thru_wanted
<< dendl
;
6191 if (up_thru_wanted
> up_thru
) {
6192 dout(10) << "send_alive want " << up_thru_wanted
<< dendl
;
6193 monc
->send_mon_message(new MOSDAlive(osdmap
->get_epoch(), up_thru_wanted
));
6197 void OSD::request_full_map(epoch_t first
, epoch_t last
)
6199 dout(10) << __func__
<< " " << first
<< ".." << last
6200 << ", previously requested "
6201 << requested_full_first
<< ".." << requested_full_last
<< dendl
;
6202 ceph_assert(osd_lock
.is_locked());
6203 ceph_assert(first
> 0 && last
> 0);
6204 ceph_assert(first
<= last
);
6205 ceph_assert(first
>= requested_full_first
); // we shouldn't ever ask for older maps
6206 if (requested_full_first
== 0) {
6208 requested_full_first
= first
;
6209 requested_full_last
= last
;
6210 } else if (last
<= requested_full_last
) {
6214 // additional request
6215 first
= requested_full_last
+ 1;
6216 requested_full_last
= last
;
6218 MMonGetOSDMap
*req
= new MMonGetOSDMap
;
6219 req
->request_full(first
, last
);
6220 monc
->send_mon_message(req
);
6223 void OSD::got_full_map(epoch_t e
)
6225 ceph_assert(requested_full_first
<= requested_full_last
);
6226 ceph_assert(osd_lock
.is_locked());
6227 if (requested_full_first
== 0) {
6228 dout(20) << __func__
<< " " << e
<< ", nothing requested" << dendl
;
6231 if (e
< requested_full_first
) {
6232 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6233 << ".." << requested_full_last
6234 << ", ignoring" << dendl
;
6237 if (e
>= requested_full_last
) {
6238 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6239 << ".." << requested_full_last
<< ", resetting" << dendl
;
6240 requested_full_first
= requested_full_last
= 0;
6244 requested_full_first
= e
+ 1;
6246 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6247 << ".." << requested_full_last
6248 << ", still need more" << dendl
;
6251 void OSD::requeue_failures()
6253 std::lock_guard
l(heartbeat_lock
);
6254 unsigned old_queue
= failure_queue
.size();
6255 unsigned old_pending
= failure_pending
.size();
6256 for (auto p
= failure_pending
.begin(); p
!= failure_pending
.end(); ) {
6257 failure_queue
[p
->first
] = p
->second
.first
;
6258 failure_pending
.erase(p
++);
6260 dout(10) << __func__
<< " " << old_queue
<< " + " << old_pending
<< " -> "
6261 << failure_queue
.size() << dendl
;
6264 void OSD::send_failures()
6266 ceph_assert(map_lock
.is_locked());
6267 ceph_assert(mon_report_lock
.is_locked());
6268 std::lock_guard
l(heartbeat_lock
);
6269 utime_t now
= ceph_clock_now();
6270 while (!failure_queue
.empty()) {
6271 int osd
= failure_queue
.begin()->first
;
6272 if (!failure_pending
.count(osd
)) {
6273 int failed_for
= (int)(double)(now
- failure_queue
.begin()->second
);
6274 monc
->send_mon_message(
6278 osdmap
->get_addrs(osd
),
6280 osdmap
->get_epoch()));
6281 failure_pending
[osd
] = make_pair(failure_queue
.begin()->second
,
6282 osdmap
->get_addrs(osd
));
6284 failure_queue
.erase(osd
);
6288 void OSD::send_still_alive(epoch_t epoch
, int osd
, const entity_addrvec_t
&addrs
)
6290 MOSDFailure
*m
= new MOSDFailure(monc
->get_fsid(), osd
, addrs
, 0, epoch
,
6291 MOSDFailure::FLAG_ALIVE
);
6292 monc
->send_mon_message(m
);
6295 void OSD::cancel_pending_failures()
6297 std::lock_guard
l(heartbeat_lock
);
6298 auto it
= failure_pending
.begin();
6299 while (it
!= failure_pending
.end()) {
6300 dout(10) << __func__
<< " canceling in-flight failure report for osd."
6301 << it
->first
<< dendl
;
6302 send_still_alive(osdmap
->get_epoch(), it
->first
, it
->second
.second
);
6303 failure_pending
.erase(it
++);
6307 void OSD::send_beacon(const ceph::coarse_mono_clock::time_point
& now
)
6309 const auto& monmap
= monc
->monmap
;
6310 // send beacon to mon even if we are just connected, and the monmap is not
6311 // initialized yet by then.
6312 if (monmap
.epoch
> 0 &&
6313 monmap
.get_required_features().contains_all(
6314 ceph::features::mon::FEATURE_LUMINOUS
)) {
6315 dout(20) << __func__
<< " sending" << dendl
;
6316 MOSDBeacon
* beacon
= nullptr;
6318 std::lock_guard l
{min_last_epoch_clean_lock
};
6319 beacon
= new MOSDBeacon(osdmap
->get_epoch(), min_last_epoch_clean
);
6320 beacon
->pgs
= min_last_epoch_clean_pgs
;
6321 last_sent_beacon
= now
;
6323 monc
->send_mon_message(beacon
);
6325 dout(20) << __func__
<< " not sending" << dendl
;
6329 void OSD::handle_command(MMonCommand
*m
)
6331 if (!require_mon_peer(m
)) {
6336 Command
*c
= new Command(m
->cmd
, m
->get_tid(), m
->get_data(), NULL
);
6337 command_wq
.queue(c
);
6341 void OSD::handle_command(MCommand
*m
)
6343 ConnectionRef con
= m
->get_connection();
6344 auto priv
= con
->get_priv();
6345 auto session
= static_cast<Session
*>(priv
.get());
6347 con
->send_message(new MCommandReply(m
, -EPERM
));
6352 OSDCap
& caps
= session
->caps
;
6355 if (!caps
.allow_all() || m
->get_source().is_mon()) {
6356 con
->send_message(new MCommandReply(m
, -EPERM
));
6361 Command
*c
= new Command(m
->cmd
, m
->get_tid(), m
->get_data(), con
.get());
6362 command_wq
.queue(c
);
6372 } osd_commands
[] = {
6374 #define COMMAND(parsesig, helptext, module, perm) \
6375 {parsesig, helptext, module, perm},
6377 // yes, these are really pg commands, but there's a limit to how
6378 // much work it's worth. The OSD returns all of them. Make this
6379 // form (pg <pgid> <cmd>) valid only for the cli.
6380 // Rest uses "tell <pgid> <cmd>"
6383 "name=pgid,type=CephPgid " \
6384 "name=cmd,type=CephChoices,strings=query", \
6385 "show details of a specific pg", "osd", "r")
6387 "name=pgid,type=CephPgid " \
6388 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
6389 "name=mulcmd,type=CephChoices,strings=revert|delete", \
6390 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
6393 "name=pgid,type=CephPgid " \
6394 "name=cmd,type=CephChoices,strings=list_unfound " \
6395 "name=offset,type=CephString,req=false",
6396 "list unfound objects on this pg, perhaps starting at an offset given in JSON",
6399 // new form: tell <pgid> <cmd> for both cli and rest
6402 "show details of a specific pg", "osd", "r")
6403 COMMAND("mark_unfound_lost " \
6404 "name=mulcmd,type=CephChoices,strings=revert|delete", \
6405 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
6407 COMMAND("list_unfound " \
6408 "name=offset,type=CephString,req=false",
6409 "list unfound objects on this pg, perhaps starting at an offset given in JSON",
6411 COMMAND("perf histogram dump "
6412 "name=logger,type=CephString,req=false "
6413 "name=counter,type=CephString,req=false",
6414 "Get histogram data",
6417 // tell <osd.n> commands. Validation of osd.n must be special-cased in client
6418 COMMAND("version", "report version of OSD", "osd", "r")
6419 COMMAND("get_command_descriptions", "list commands descriptions", "osd", "r")
6420 COMMAND("injectargs " \
6421 "name=injected_args,type=CephString,n=N",
6422 "inject configuration arguments into running OSD",
6424 COMMAND("config set " \
6425 "name=key,type=CephString name=value,type=CephString",
6426 "Set a configuration option at runtime (not persistent)",
6428 COMMAND("config get " \
6429 "name=key,type=CephString",
6430 "Get a configuration option at runtime",
6432 COMMAND("config unset " \
6433 "name=key,type=CephString",
6434 "Unset a configuration option at runtime (not persistent)",
6436 COMMAND("cluster_log " \
6437 "name=level,type=CephChoices,strings=error,warning,info,debug " \
6438 "name=message,type=CephString,n=N",
6439 "log a message to the cluster log",
6442 "name=count,type=CephInt,req=false " \
6443 "name=size,type=CephInt,req=false " \
6444 "name=object_size,type=CephInt,req=false " \
6445 "name=object_num,type=CephInt,req=false ", \
6446 "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
6447 "(default count=1G default size=4MB). Results in log.",
6449 COMMAND("flush_pg_stats", "flush pg stats", "osd", "rw")
6451 "name=heapcmd,type=CephChoices,strings="\
6452 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
6453 "name=value,type=CephString,req=false",
6454 "show heap usage info (available only if compiled with tcmalloc)",
6456 COMMAND("debug dump_missing " \
6457 "name=filename,type=CephFilepath",
6458 "dump missing objects to a named file", "osd", "r")
6459 COMMAND("debug kick_recovery_wq " \
6460 "name=delay,type=CephInt,range=0",
6461 "set osd_recovery_delay_start to <val>", "osd", "rw")
6462 COMMAND("cpu_profiler " \
6463 "name=arg,type=CephChoices,strings=status|flush",
6464 "run cpu profiling on daemon", "osd", "rw")
6465 COMMAND("dump_pg_recovery_stats", "dump pg recovery statistics",
6467 COMMAND("reset_pg_recovery_stats", "reset pg recovery statistics",
6470 "compact object store's omap. "
6471 "WARNING: Compaction probably slows your requests",
6473 COMMAND("smart name=devid,type=CephString,req=False",
6474 "runs smartctl on this osd devices. ",
6476 COMMAND("cache drop",
6477 "Drop all OSD caches",
6479 COMMAND("cache status",
6480 "Get OSD caches statistics",
6482 COMMAND("send_beacon",
6483 "Send OSD beacon to mon immediately",
6487 void OSD::do_command(
6488 Connection
*con
, ceph_tid_t tid
, vector
<string
>& cmd
, bufferlist
& data
)
6490 dout(20) << "do_command tid " << tid
<< " " << cmd
<< dendl
;
6493 stringstream ss
, ds
;
6497 ss
<< "no command given";
6500 if (!cmdmap_from_json(cmd
, &cmdmap
, ss
)) {
6506 r
= _do_command(con
, cmdmap
, tid
, data
, odata
, ss
, ds
);
6507 } catch (const bad_cmd_get
& e
) {
6515 string rs
= ss
.str();
6517 dout(0) << "do_command r=" << r
<< " " << rs
<< dendl
;
6520 MCommandReply
*reply
= new MCommandReply(r
, rs
);
6521 reply
->set_tid(tid
);
6522 reply
->set_data(odata
);
6523 con
->send_message(reply
);
6528 class unlock_guard
{
6531 explicit unlock_guard(Mutex
& mutex
)
6536 unlock_guard(unlock_guard
&) = delete;
6543 int OSD::_do_command(
6544 Connection
*con
, cmdmap_t
& cmdmap
, ceph_tid_t tid
, bufferlist
& data
,
6545 bufferlist
& odata
, stringstream
& ss
, stringstream
& ds
)
6551 boost::scoped_ptr
<Formatter
> f
;
6553 cmd_getval(cct
, cmdmap
, "prefix", prefix
);
6555 if (prefix
== "get_command_descriptions") {
6557 JSONFormatter
*f
= new JSONFormatter();
6558 f
->open_object_section("command_descriptions");
6559 for (OSDCommand
*cp
= osd_commands
;
6560 cp
< &osd_commands
[std::size(osd_commands
)]; cp
++) {
6562 ostringstream secname
;
6563 secname
<< "cmd" << setfill('0') << std::setw(3) << cmdnum
;
6564 dump_cmddesc_to_json(f
, con
->get_features(),
6565 secname
.str(), cp
->cmdstring
, cp
->helpstring
,
6566 cp
->module
, cp
->perm
, 0);
6569 f
->close_section(); // command_descriptions
6576 cmd_getval(cct
, cmdmap
, "format", format
);
6577 f
.reset(Formatter::create(format
));
6579 if (prefix
== "version") {
6581 f
->open_object_section("version");
6582 f
->dump_string("version", pretty_version_to_str());
6586 ds
<< pretty_version_to_str();
6590 else if (prefix
== "injectargs") {
6591 vector
<string
> argsvec
;
6592 cmd_getval(cct
, cmdmap
, "injected_args", argsvec
);
6594 if (argsvec
.empty()) {
6596 ss
<< "ignoring empty injectargs";
6599 string args
= argsvec
.front();
6600 for (vector
<string
>::iterator a
= ++argsvec
.begin(); a
!= argsvec
.end(); ++a
)
6602 unlock_guard unlock
{osd_lock
};
6603 r
= cct
->_conf
.injectargs(args
, &ss
);
6605 else if (prefix
== "config set") {
6608 cmd_getval(cct
, cmdmap
, "key", key
);
6609 cmd_getval(cct
, cmdmap
, "value", val
);
6610 unlock_guard unlock
{osd_lock
};
6611 r
= cct
->_conf
.set_val(key
, val
, &ss
);
6613 cct
->_conf
.apply_changes(nullptr);
6616 else if (prefix
== "config get") {
6618 cmd_getval(cct
, cmdmap
, "key", key
);
6619 unlock_guard unlock
{osd_lock
};
6621 r
= cct
->_conf
.get_val(key
, &val
);
6626 else if (prefix
== "config unset") {
6628 cmd_getval(cct
, cmdmap
, "key", key
);
6629 unlock_guard unlock
{osd_lock
};
6630 r
= cct
->_conf
.rm_val(key
);
6632 cct
->_conf
.apply_changes(nullptr);
6635 r
= 0; // make command idempotent
6638 else if (prefix
== "cluster_log") {
6640 cmd_getval(cct
, cmdmap
, "message", msg
);
6643 ss
<< "ignoring empty log message";
6646 string message
= msg
.front();
6647 for (vector
<string
>::iterator a
= ++msg
.begin(); a
!= msg
.end(); ++a
)
6648 message
+= " " + *a
;
6650 cmd_getval(cct
, cmdmap
, "level", lvl
);
6651 clog_type level
= string_to_clog_type(lvl
);
6654 ss
<< "unknown level '" << lvl
<< "'";
6657 clog
->do_log(level
, message
);
6660 // either 'pg <pgid> <command>' or
6661 // 'tell <pgid>' (which comes in without any of that prefix)?
6663 else if (prefix
== "pg" ||
6664 prefix
== "query" ||
6665 prefix
== "mark_unfound_lost" ||
6666 prefix
== "list_unfound"
6670 if (!cmd_getval(cct
, cmdmap
, "pgid", pgidstr
)) {
6671 ss
<< "no pgid specified";
6673 } else if (!pgid
.parse(pgidstr
.c_str())) {
6674 ss
<< "couldn't parse pgid '" << pgidstr
<< "'";
6679 if (osdmap
->get_primary_shard(pgid
, &pcand
) &&
6680 (pg
= _lookup_lock_pg(pcand
))) {
6681 if (pg
->is_primary()) {
6682 // simulate pg <pgid> cmd= for pg->do-command
6684 cmd_putval(cct
, cmdmap
, "cmd", prefix
);
6686 r
= pg
->do_command(cmdmap
, ss
, data
, odata
, con
, tid
);
6687 } catch (const bad_cmd_get
& e
) {
6694 // don't reply, pg will do so async
6698 ss
<< "not primary for pgid " << pgid
;
6700 // send them the latest diff to ensure they realize the mapping
6702 service
.send_incremental_map(osdmap
->get_epoch() - 1, con
, osdmap
);
6704 // do not reply; they will get newer maps and realize they
6711 ss
<< "i don't have pgid " << pgid
;
6717 else if (prefix
== "bench") {
6720 int64_t osize
, onum
;
6721 // default count 1G, size 4MB
6722 cmd_getval(cct
, cmdmap
, "count", count
, (int64_t)1 << 30);
6723 cmd_getval(cct
, cmdmap
, "size", bsize
, (int64_t)4 << 20);
6724 cmd_getval(cct
, cmdmap
, "object_size", osize
, (int64_t)0);
6725 cmd_getval(cct
, cmdmap
, "object_num", onum
, (int64_t)0);
6727 uint32_t duration
= cct
->_conf
->osd_bench_duration
;
6729 if (bsize
> (int64_t) cct
->_conf
->osd_bench_max_block_size
) {
6730 // let us limit the block size because the next checks rely on it
6731 // having a sane value. If we allow any block size to be set things
6732 // can still go sideways.
6733 ss
<< "block 'size' values are capped at "
6734 << byte_u_t(cct
->_conf
->osd_bench_max_block_size
) << ". If you wish to use"
6735 << " a higher value, please adjust 'osd_bench_max_block_size'";
6738 } else if (bsize
< (int64_t) (1 << 20)) {
6739 // entering the realm of small block sizes.
6740 // limit the count to a sane value, assuming a configurable amount of
6741 // IOPS and duration, so that the OSD doesn't get hung up on this,
6742 // preventing timeouts from going off
6744 bsize
* duration
* cct
->_conf
->osd_bench_small_size_max_iops
;
6745 if (count
> max_count
) {
6746 ss
<< "'count' values greater than " << max_count
6747 << " for a block size of " << byte_u_t(bsize
) << ", assuming "
6748 << cct
->_conf
->osd_bench_small_size_max_iops
<< " IOPS,"
6749 << " for " << duration
<< " seconds,"
6750 << " can cause ill effects on osd. "
6751 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
6752 << " value if you wish to use a higher 'count'.";
6757 // 1MB block sizes are big enough so that we get more stuff done.
6758 // However, to avoid the osd from getting hung on this and having
6759 // timers being triggered, we are going to limit the count assuming
6760 // a configurable throughput and duration.
6761 // NOTE: max_count is the total amount of bytes that we believe we
6762 // will be able to write during 'duration' for the given
6763 // throughput. The block size hardly impacts this unless it's
6764 // way too big. Given we already check how big the block size
6765 // is, it's safe to assume everything will check out.
6767 cct
->_conf
->osd_bench_large_size_max_throughput
* duration
;
6768 if (count
> max_count
) {
6769 ss
<< "'count' values greater than " << max_count
6770 << " for a block size of " << byte_u_t(bsize
) << ", assuming "
6771 << byte_u_t(cct
->_conf
->osd_bench_large_size_max_throughput
) << "/s,"
6772 << " for " << duration
<< " seconds,"
6773 << " can cause ill effects on osd. "
6774 << " Please adjust 'osd_bench_large_size_max_throughput'"
6775 << " with a higher value if you wish to use a higher 'count'.";
6781 if (osize
&& bsize
> osize
)
6784 dout(1) << " bench count " << count
6785 << " bsize " << byte_u_t(bsize
) << dendl
;
6787 ObjectStore::Transaction cleanupt
;
6789 if (osize
&& onum
) {
6791 bufferptr
bp(osize
);
6793 bl
.push_back(std::move(bp
));
6794 bl
.rebuild_page_aligned();
6795 for (int i
=0; i
<onum
; ++i
) {
6797 snprintf(nm
, sizeof(nm
), "disk_bw_test_%d", i
);
6799 hobject_t
soid(sobject_t(oid
, 0));
6800 ObjectStore::Transaction t
;
6801 t
.write(coll_t(), ghobject_t(soid
), 0, osize
, bl
);
6802 store
->queue_transaction(service
.meta_ch
, std::move(t
), NULL
);
6803 cleanupt
.remove(coll_t(), ghobject_t(soid
));
6808 bufferptr
bp(bsize
);
6810 bl
.push_back(std::move(bp
));
6811 bl
.rebuild_page_aligned();
6815 if (!service
.meta_ch
->flush_commit(&waiter
)) {
6820 utime_t start
= ceph_clock_now();
6821 for (int64_t pos
= 0; pos
< count
; pos
+= bsize
) {
6823 unsigned offset
= 0;
6824 if (onum
&& osize
) {
6825 snprintf(nm
, sizeof(nm
), "disk_bw_test_%d", (int)(rand() % onum
));
6826 offset
= rand() % (osize
/ bsize
) * bsize
;
6828 snprintf(nm
, sizeof(nm
), "disk_bw_test_%lld", (long long)pos
);
6831 hobject_t
soid(sobject_t(oid
, 0));
6832 ObjectStore::Transaction t
;
6833 t
.write(coll_t::meta(), ghobject_t(soid
), offset
, bsize
, bl
);
6834 store
->queue_transaction(service
.meta_ch
, std::move(t
), NULL
);
6835 if (!onum
|| !osize
)
6836 cleanupt
.remove(coll_t::meta(), ghobject_t(soid
));
6841 if (!service
.meta_ch
->flush_commit(&waiter
)) {
6845 utime_t end
= ceph_clock_now();
6848 store
->queue_transaction(service
.meta_ch
, std::move(cleanupt
), NULL
);
6851 if (!service
.meta_ch
->flush_commit(&waiter
)) {
6856 double elapsed
= end
- start
;
6857 double rate
= count
/ elapsed
;
6858 double iops
= rate
/ bsize
;
6860 f
->open_object_section("osd_bench_results");
6861 f
->dump_int("bytes_written", count
);
6862 f
->dump_int("blocksize", bsize
);
6863 f
->dump_float("elapsed_sec", elapsed
);
6864 f
->dump_float("bytes_per_sec", rate
);
6865 f
->dump_float("iops", iops
);
6869 ds
<< "bench: wrote " << byte_u_t(count
)
6870 << " in blocks of " << byte_u_t(bsize
) << " in "
6871 << elapsed
<< " sec at " << byte_u_t(rate
) << "/sec "
6872 << si_u_t(iops
) << " IOPS";
6876 else if (prefix
== "flush_pg_stats") {
6877 mgrc
.send_pgstats();
6878 ds
<< service
.get_osd_stat_seq() << "\n";
6881 else if (prefix
== "heap") {
6882 r
= ceph::osd_cmds::heap(*cct
, cmdmap
, *f
, ds
);
6885 else if (prefix
== "debug dump_missing") {
6887 f
.reset(new JSONFormatter(true));
6889 f
->open_array_section("pgs");
6892 for (auto& pg
: pgs
) {
6893 string s
= stringify(pg
->pg_id
);
6894 f
->open_array_section(s
.c_str());
6896 pg
->dump_missing(f
.get());
6903 else if (prefix
== "debug kick_recovery_wq") {
6905 cmd_getval(cct
, cmdmap
, "delay", delay
);
6908 unlock_guard unlock
{osd_lock
};
6909 r
= cct
->_conf
.set_val("osd_recovery_delay_start", oss
.str().c_str());
6911 ss
<< "kick_recovery_wq: error setting "
6912 << "osd_recovery_delay_start to '" << delay
<< "': error "
6916 cct
->_conf
.apply_changes(nullptr);
6917 ss
<< "kicking recovery queue. set osd_recovery_delay_start "
6918 << "to " << cct
->_conf
->osd_recovery_delay_start
;
6921 else if (prefix
== "cpu_profiler") {
6923 cmd_getval(cct
, cmdmap
, "arg", arg
);
6924 vector
<string
> argvec
;
6925 get_str_vec(arg
, argvec
);
6926 cpu_profiler_handle_command(argvec
, ds
);
6929 else if (prefix
== "dump_pg_recovery_stats") {
6932 pg_recovery_stats
.dump_formatted(f
.get());
6935 pg_recovery_stats
.dump(s
);
6936 ds
<< "dump pg recovery stats: " << s
.str();
6940 else if (prefix
== "reset_pg_recovery_stats") {
6941 ss
<< "reset pg recovery stats";
6942 pg_recovery_stats
.reset();
6945 else if (prefix
== "perf histogram dump") {
6947 std::string counter
;
6948 cmd_getval(cct
, cmdmap
, "logger", logger
);
6949 cmd_getval(cct
, cmdmap
, "counter", counter
);
6951 cct
->get_perfcounters_collection()->dump_formatted_histograms(
6952 f
.get(), false, logger
, counter
);
6957 else if (prefix
== "compact") {
6958 dout(1) << "triggering manual compaction" << dendl
;
6959 auto start
= ceph::coarse_mono_clock::now();
6961 auto end
= ceph::coarse_mono_clock::now();
6962 double duration
= std::chrono::duration
<double>(end
-start
).count();
6963 dout(1) << "finished manual compaction in "
6965 << " seconds" << dendl
;
6966 ss
<< "compacted omap in " << duration
<< " seconds";
6969 else if (prefix
== "smart") {
6971 cmd_getval(cct
, cmdmap
, "devid", devid
);
6972 probe_smart(devid
, ds
);
6975 else if (prefix
== "cache drop") {
6976 dout(20) << "clearing all caches" << dendl
;
6977 // Clear the objectstore's cache - onode and buffer for Bluestore,
6978 // system's pagecache for Filestore
6979 r
= store
->flush_cache(&ss
);
6981 ds
<< "Error flushing objectstore cache: " << cpp_strerror(r
);
6984 // Clear the objectcontext cache (per PG)
6987 for (auto& pg
: pgs
) {
6992 else if (prefix
== "cache status") {
6993 int obj_ctx_count
= 0;
6996 for (auto& pg
: pgs
) {
6997 obj_ctx_count
+= pg
->get_cache_obj_count();
7000 f
->open_object_section("cache_status");
7001 f
->dump_int("object_ctx", obj_ctx_count
);
7002 store
->dump_cache_stats(f
.get());
7006 ds
<< "object_ctx: " << obj_ctx_count
;
7007 store
->dump_cache_stats(ds
);
7010 else if (prefix
== "send_beacon") {
7012 send_beacon(ceph::coarse_mono_clock::now());
7015 ss
<< "unrecognized command '" << prefix
<< "'";
7023 void OSD::probe_smart(const string
& only_devid
, ostream
& ss
)
7025 set
<string
> devnames
;
7026 store
->get_devices(&devnames
);
7027 uint64_t smart_timeout
= cct
->_conf
.get_val
<uint64_t>(
7028 "osd_smart_report_timeout");
7030 // == typedef std::map<std::string, mValue> mObject;
7031 json_spirit::mObject json_map
;
7033 for (auto dev
: devnames
) {
7034 // smartctl works only on physical devices; filter out any logical device
7035 if (dev
.find("dm-") == 0) {
7040 string devid
= get_device_id(dev
, &err
);
7041 if (devid
.size() == 0) {
7042 dout(10) << __func__
<< " no unique id for dev " << dev
<< " ("
7043 << err
<< "), skipping" << dendl
;
7046 if (only_devid
.size() && devid
!= only_devid
) {
7050 json_spirit::mValue smart_json
;
7051 if (block_device_get_metrics(dev
, smart_timeout
,
7053 dout(10) << "block_device_get_metrics failed for /dev/" << dev
<< dendl
;
7056 json_map
[devid
] = smart_json
;
7058 json_spirit::write(json_map
, ss
, json_spirit::pretty_print
);
7061 bool OSD::heartbeat_dispatch(Message
*m
)
7063 dout(30) << "heartbeat_dispatch " << m
<< dendl
;
7064 switch (m
->get_type()) {
7067 dout(10) << "ping from " << m
->get_source_inst() << dendl
;
7072 handle_osd_ping(static_cast<MOSDPing
*>(m
));
7076 dout(0) << "dropping unexpected message " << *m
<< " from " << m
->get_source_inst() << dendl
;
7083 bool OSD::ms_dispatch(Message
*m
)
7085 dout(20) << "OSD::ms_dispatch: " << *m
<< dendl
;
7086 if (m
->get_type() == MSG_OSD_MARK_ME_DOWN
) {
7087 service
.got_stop_ack();
7095 if (is_stopping()) {
7109 void OSD::maybe_share_map(
7114 if (!op
->check_send_map
) {
7117 epoch_t last_sent_epoch
= 0;
7119 session
->sent_epoch_lock
.lock();
7120 last_sent_epoch
= session
->last_sent_epoch
;
7121 session
->sent_epoch_lock
.unlock();
7123 // assume the peer has the newer of the op's sent_epoch and what
7124 // we think we sent them.
7125 epoch_t from
= std::max(last_sent_epoch
, op
->sent_epoch
);
7127 const Message
*m
= op
->get_req();
7130 m
->get_connection().get(),
7133 session
? &last_sent_epoch
: NULL
);
7135 session
->sent_epoch_lock
.lock();
7136 if (session
->last_sent_epoch
< last_sent_epoch
) {
7137 session
->last_sent_epoch
= last_sent_epoch
;
7139 session
->sent_epoch_lock
.unlock();
7141 op
->check_send_map
= false;
7144 void OSD::dispatch_session_waiting(SessionRef session
, OSDMapRef osdmap
)
7146 ceph_assert(session
->session_dispatch_lock
.is_locked());
7148 auto i
= session
->waiting_on_map
.begin();
7149 while (i
!= session
->waiting_on_map
.end()) {
7150 OpRequestRef op
= &(*i
);
7151 ceph_assert(ms_can_fast_dispatch(op
->get_req()));
7152 const MOSDFastDispatchOp
*m
= static_cast<const MOSDFastDispatchOp
*>(
7154 if (m
->get_min_epoch() > osdmap
->get_epoch()) {
7157 session
->waiting_on_map
.erase(i
++);
7161 if (m
->get_type() == CEPH_MSG_OSD_OP
) {
7162 pg_t actual_pgid
= osdmap
->raw_pg_to_pg(
7163 static_cast<const MOSDOp
*>(m
)->get_pg());
7164 if (!osdmap
->get_primary_shard(actual_pgid
, &pgid
)) {
7168 pgid
= m
->get_spg();
7170 enqueue_op(pgid
, std::move(op
), m
->get_map_epoch());
7173 if (session
->waiting_on_map
.empty()) {
7174 clear_session_waiting_on_map(session
);
7176 register_session_waiting_on_map(session
);
7180 void OSD::ms_fast_dispatch(Message
*m
)
7183 if (service
.is_stopping()) {
7189 switch (m
->get_type()) {
7191 dout(10) << "ping from " << m
->get_source() << dendl
;
7194 case MSG_MON_COMMAND
:
7195 handle_command(static_cast<MMonCommand
*>(m
));
7197 case MSG_OSD_FORCE_RECOVERY
:
7198 handle_fast_force_recovery(static_cast<MOSDForceRecovery
*>(m
));
7200 case MSG_OSD_SCRUB2
:
7201 handle_fast_scrub(static_cast<MOSDScrub2
*>(m
));
7204 case MSG_OSD_PG_CREATE2
:
7205 return handle_fast_pg_create(static_cast<MOSDPGCreate2
*>(m
));
7206 case MSG_OSD_PG_QUERY
:
7207 return handle_fast_pg_query(static_cast<MOSDPGQuery
*>(m
));
7208 case MSG_OSD_PG_NOTIFY
:
7209 return handle_fast_pg_notify(static_cast<MOSDPGNotify
*>(m
));
7210 case MSG_OSD_PG_INFO
:
7211 return handle_fast_pg_info(static_cast<MOSDPGInfo
*>(m
));
7212 case MSG_OSD_PG_REMOVE
:
7213 return handle_fast_pg_remove(static_cast<MOSDPGRemove
*>(m
));
7215 // these are single-pg messages that handle themselves
7216 case MSG_OSD_PG_LOG
:
7217 case MSG_OSD_PG_TRIM
:
7218 case MSG_OSD_BACKFILL_RESERVE
:
7219 case MSG_OSD_RECOVERY_RESERVE
:
7221 MOSDPeeringOp
*pm
= static_cast<MOSDPeeringOp
*>(m
);
7222 if (require_osd_peer(pm
)) {
7223 enqueue_peering_evt(
7225 PGPeeringEventRef(pm
->get_event()));
7232 OpRequestRef op
= op_tracker
.create_request
<OpRequest
, Message
*>(m
);
7235 osd_reqid_t reqid
= op
->get_reqid();
7237 tracepoint(osd
, ms_fast_dispatch
, reqid
.name
._type
,
7238 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
7242 op
->osd_trace
.init("osd op", &trace_endpoint
, &m
->trace
);
7244 // note sender epoch, min req's epoch
7245 op
->sent_epoch
= static_cast<MOSDFastDispatchOp
*>(m
)->get_map_epoch();
7246 op
->min_epoch
= static_cast<MOSDFastDispatchOp
*>(m
)->get_min_epoch();
7247 ceph_assert(op
->min_epoch
<= op
->sent_epoch
); // sanity check!
7249 service
.maybe_inject_dispatch_delay();
7251 if (m
->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT
) ||
7252 m
->get_type() != CEPH_MSG_OSD_OP
) {
7253 // queue it directly
7255 static_cast<MOSDFastDispatchOp
*>(m
)->get_spg(),
7257 static_cast<MOSDFastDispatchOp
*>(m
)->get_map_epoch());
7259 // legacy client, and this is an MOSDOp (the *only* fast dispatch
7260 // message that didn't have an explicit spg_t); we need to map
7261 // them to an spg_t while preserving delivery order.
7262 auto priv
= m
->get_connection()->get_priv();
7263 if (auto session
= static_cast<Session
*>(priv
.get()); session
) {
7264 std::lock_guard l
{session
->session_dispatch_lock
};
7266 session
->waiting_on_map
.push_back(*op
);
7267 OSDMapRef nextmap
= service
.get_nextmap_reserved();
7268 dispatch_session_waiting(session
, nextmap
);
7269 service
.release_map(nextmap
);
7272 OID_EVENT_TRACE_WITH_MSG(m
, "MS_FAST_DISPATCH_END", false);
7275 bool OSD::ms_get_authorizer(int dest_type
, AuthAuthorizer
**authorizer
)
7277 dout(10) << "OSD::ms_get_authorizer type=" << ceph_entity_type_name(dest_type
) << dendl
;
7279 if (is_stopping()) {
7280 dout(10) << __func__
<< " bailing, we are shutting down" << dendl
;
7284 if (dest_type
== CEPH_ENTITY_TYPE_MON
)
7287 *authorizer
= monc
->build_authorizer(dest_type
);
7288 return *authorizer
!= NULL
;
7291 KeyStore
*OSD::ms_get_auth1_authorizer_keystore()
7293 return monc
->rotating_secrets
.get();
7296 int OSD::ms_handle_authentication(Connection
*con
)
7299 auto priv
= con
->get_priv();
7300 Session
*s
= static_cast<Session
*>(priv
.get());
7302 s
= new Session(cct
, con
);
7303 con
->set_priv(RefCountedPtr
{s
, false});
7304 s
->entity_name
= con
->get_peer_entity_name();
7305 dout(10) << __func__
<< " new session " << s
<< " con " << s
->con
7306 << " entity " << s
->entity_name
7307 << " addr " << con
->get_peer_addrs() << dendl
;
7309 dout(10) << __func__
<< " existing session " << s
<< " con " << s
->con
7310 << " entity " << s
->entity_name
7311 << " addr " << con
->get_peer_addrs() << dendl
;
7314 AuthCapsInfo
&caps_info
= con
->get_peer_caps_info();
7315 if (caps_info
.allow_all
)
7316 s
->caps
.set_allow_all();
7318 if (caps_info
.caps
.length() > 0) {
7319 bufferlist::const_iterator p
= caps_info
.caps
.cbegin();
7324 catch (buffer::error
& e
) {
7325 dout(10) << __func__
<< " session " << s
<< " " << s
->entity_name
7326 << " failed to decode caps string" << dendl
;
7330 bool success
= s
->caps
.parse(str
);
7332 dout(10) << __func__
<< " session " << s
7333 << " " << s
->entity_name
7334 << " has caps " << s
->caps
<< " '" << str
<< "'" << dendl
;
7337 dout(10) << __func__
<< " session " << s
<< " " << s
->entity_name
7338 << " failed to parse caps '" << str
<< "'" << dendl
;
7346 void OSD::do_waiters()
7348 ceph_assert(osd_lock
.is_locked());
7350 dout(10) << "do_waiters -- start" << dendl
;
7351 while (!finished
.empty()) {
7352 OpRequestRef next
= finished
.front();
7353 finished
.pop_front();
7356 dout(10) << "do_waiters -- finish" << dendl
;
7359 void OSD::dispatch_op(OpRequestRef op
)
7361 switch (op
->get_req()->get_type()) {
7363 case MSG_OSD_PG_CREATE
:
7364 handle_pg_create(op
);
7369 void OSD::_dispatch(Message
*m
)
7371 ceph_assert(osd_lock
.is_locked());
7372 dout(20) << "_dispatch " << m
<< " " << *m
<< dendl
;
7374 switch (m
->get_type()) {
7375 // -- don't need OSDMap --
7377 // map and replication
7378 case CEPH_MSG_OSD_MAP
:
7379 handle_osd_map(static_cast<MOSDMap
*>(m
));
7384 handle_scrub(static_cast<MOSDScrub
*>(m
));
7388 handle_command(static_cast<MCommand
*>(m
));
7391 // -- need OSDMap --
7393 case MSG_OSD_PG_CREATE
:
7395 OpRequestRef op
= op_tracker
.create_request
<OpRequest
, Message
*>(m
);
7397 op
->osd_trace
.init("osd op", &trace_endpoint
, &m
->trace
);
7398 // no map? starting up?
7400 dout(7) << "no OSDMap, not booted" << dendl
;
7401 logger
->inc(l_osd_waiting_for_map
);
7402 waiting_for_osdmap
.push_back(op
);
7403 op
->mark_delayed("no osdmap");
7413 // remove me post-nautilus
7414 void OSD::handle_scrub(MOSDScrub
*m
)
7416 dout(10) << "handle_scrub " << *m
<< dendl
;
7417 if (!require_mon_or_mgr_peer(m
)) {
7421 if (m
->fsid
!= monc
->get_fsid()) {
7422 dout(0) << "handle_scrub fsid " << m
->fsid
<< " != " << monc
->get_fsid()
7431 if (!m
->scrub_pgs
.empty()) {
7433 for (auto pgid
: m
->scrub_pgs
) {
7435 if (osdmap
->get_primary_shard(pgid
, &pcand
) &&
7436 std::find(spgs
.begin(), spgs
.end(), pcand
) != spgs
.end()) {
7443 for (auto pgid
: spgs
) {
7444 enqueue_peering_evt(
7447 std::make_shared
<PGPeeringEvent
>(
7450 PG::RequestScrub(m
->deep
, m
->repair
))));
7456 void OSD::handle_fast_scrub(MOSDScrub2
*m
)
7458 dout(10) << __func__
<< " " << *m
<< dendl
;
7459 if (!require_mon_or_mgr_peer(m
)) {
7463 if (m
->fsid
!= monc
->get_fsid()) {
7464 dout(0) << __func__
<< " fsid " << m
->fsid
<< " != " << monc
->get_fsid()
7469 for (auto pgid
: m
->scrub_pgs
) {
7470 enqueue_peering_evt(
7473 std::make_shared
<PGPeeringEvent
>(
7476 PG::RequestScrub(m
->deep
, m
->repair
))));
7481 bool OSD::scrub_random_backoff()
7483 bool coin_flip
= (rand() / (double)RAND_MAX
>=
7484 cct
->_conf
->osd_scrub_backoff_ratio
);
7486 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl
;
7492 OSDService::ScrubJob::ScrubJob(CephContext
* cct
,
7493 const spg_t
& pg
, const utime_t
& timestamp
,
7494 double pool_scrub_min_interval
,
7495 double pool_scrub_max_interval
, bool must
)
7498 sched_time(timestamp
),
7501 // if not explicitly requested, postpone the scrub with a random delay
7503 double scrub_min_interval
= pool_scrub_min_interval
> 0 ?
7504 pool_scrub_min_interval
: cct
->_conf
->osd_scrub_min_interval
;
7505 double scrub_max_interval
= pool_scrub_max_interval
> 0 ?
7506 pool_scrub_max_interval
: cct
->_conf
->osd_scrub_max_interval
;
7508 sched_time
+= scrub_min_interval
;
7509 double r
= rand() / (double)RAND_MAX
;
7511 scrub_min_interval
* cct
->_conf
->osd_scrub_interval_randomize_ratio
* r
;
7512 if (scrub_max_interval
== 0) {
7513 deadline
= utime_t();
7515 deadline
+= scrub_max_interval
;
7521 bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob
& rhs
) const {
7522 if (sched_time
< rhs
.sched_time
)
7524 if (sched_time
> rhs
.sched_time
)
7526 return pgid
< rhs
.pgid
;
7529 bool OSD::scrub_time_permit(utime_t now
)
7532 time_t tt
= now
.sec();
7533 localtime_r(&tt
, &bdt
);
7535 bool day_permit
= false;
7536 if (cct
->_conf
->osd_scrub_begin_week_day
< cct
->_conf
->osd_scrub_end_week_day
) {
7537 if (bdt
.tm_wday
>= cct
->_conf
->osd_scrub_begin_week_day
&& bdt
.tm_wday
< cct
->_conf
->osd_scrub_end_week_day
) {
7541 if (bdt
.tm_wday
>= cct
->_conf
->osd_scrub_begin_week_day
|| bdt
.tm_wday
< cct
->_conf
->osd_scrub_end_week_day
) {
7547 dout(20) << __func__
<< " should run between week day " << cct
->_conf
->osd_scrub_begin_week_day
7548 << " - " << cct
->_conf
->osd_scrub_end_week_day
7549 << " now " << bdt
.tm_wday
<< " = no" << dendl
;
7553 bool time_permit
= false;
7554 if (cct
->_conf
->osd_scrub_begin_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7555 if (bdt
.tm_hour
>= cct
->_conf
->osd_scrub_begin_hour
&& bdt
.tm_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7559 if (bdt
.tm_hour
>= cct
->_conf
->osd_scrub_begin_hour
|| bdt
.tm_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7564 dout(20) << __func__
<< " should run between " << cct
->_conf
->osd_scrub_begin_hour
7565 << " - " << cct
->_conf
->osd_scrub_end_hour
7566 << " now " << bdt
.tm_hour
<< " = no" << dendl
;
7568 dout(20) << __func__
<< " should run between " << cct
->_conf
->osd_scrub_begin_hour
7569 << " - " << cct
->_conf
->osd_scrub_end_hour
7570 << " now " << bdt
.tm_hour
<< " = yes" << dendl
;
7575 bool OSD::scrub_load_below_threshold()
7578 if (getloadavg(loadavgs
, 3) != 3) {
7579 dout(10) << __func__
<< " couldn't read loadavgs\n" << dendl
;
7583 // allow scrub if below configured threshold
7584 long cpus
= sysconf(_SC_NPROCESSORS_ONLN
);
7585 double loadavg_per_cpu
= cpus
> 0 ? loadavgs
[0] / cpus
: loadavgs
[0];
7586 if (loadavg_per_cpu
< cct
->_conf
->osd_scrub_load_threshold
) {
7587 dout(20) << __func__
<< " loadavg per cpu " << loadavg_per_cpu
7588 << " < max " << cct
->_conf
->osd_scrub_load_threshold
7589 << " = yes" << dendl
;
7593 // allow scrub if below daily avg and currently decreasing
7594 if (loadavgs
[0] < daily_loadavg
&& loadavgs
[0] < loadavgs
[2]) {
7595 dout(20) << __func__
<< " loadavg " << loadavgs
[0]
7596 << " < daily_loadavg " << daily_loadavg
7597 << " and < 15m avg " << loadavgs
[2]
7598 << " = yes" << dendl
;
7602 dout(20) << __func__
<< " loadavg " << loadavgs
[0]
7603 << " >= max " << cct
->_conf
->osd_scrub_load_threshold
7604 << " and ( >= daily_loadavg " << daily_loadavg
7605 << " or >= 15m avg " << loadavgs
[2]
7606 << ") = no" << dendl
;
7610 void OSD::sched_scrub()
7612 // if not permitted, fail fast
7613 if (!service
.can_inc_scrubs_pending()) {
7616 if (!cct
->_conf
->osd_scrub_during_recovery
&& service
.is_recovery_active()) {
7617 dout(20) << __func__
<< " not scheduling scrubs due to active recovery" << dendl
;
7622 utime_t now
= ceph_clock_now();
7623 bool time_permit
= scrub_time_permit(now
);
7624 bool load_is_low
= scrub_load_below_threshold();
7625 dout(20) << "sched_scrub load_is_low=" << (int)load_is_low
<< dendl
;
7627 OSDService::ScrubJob scrub
;
7628 if (service
.first_scrub_stamp(&scrub
)) {
7630 dout(30) << "sched_scrub examine " << scrub
.pgid
<< " at " << scrub
.sched_time
<< dendl
;
7632 if (scrub
.sched_time
> now
) {
7633 // save ourselves some effort
7634 dout(10) << "sched_scrub " << scrub
.pgid
<< " scheduled at " << scrub
.sched_time
7635 << " > " << now
<< dendl
;
7639 if ((scrub
.deadline
.is_zero() || scrub
.deadline
>= now
) && !(time_permit
&& load_is_low
)) {
7640 dout(10) << __func__
<< " not scheduling scrub for " << scrub
.pgid
<< " due to "
7641 << (!time_permit
? "time not permit" : "high load") << dendl
;
7645 PGRef pg
= _lookup_lock_pg(scrub
.pgid
);
7648 // This has already started, so go on to the next scrub job
7649 if (pg
->scrubber
.active
) {
7651 dout(30) << __func__
<< ": already in progress pgid " << scrub
.pgid
<< dendl
;
7654 // If it is reserving, let it resolve before going to the next scrub job
7655 if (pg
->scrubber
.reserved
) {
7657 dout(30) << __func__
<< ": reserve in progress pgid " << scrub
.pgid
<< dendl
;
7660 dout(10) << "sched_scrub scrubbing " << scrub
.pgid
<< " at " << scrub
.sched_time
7661 << (pg
->get_must_scrub() ? ", explicitly requested" :
7662 (load_is_low
? ", load_is_low" : " deadline < now"))
7664 if (pg
->sched_scrub()) {
7669 } while (service
.next_scrub_stamp(scrub
, &scrub
));
7671 dout(20) << "sched_scrub done" << dendl
;
7674 void OSD::resched_all_scrubs()
7676 dout(10) << __func__
<< ": start" << dendl
;
7677 OSDService::ScrubJob scrub
;
7678 if (service
.first_scrub_stamp(&scrub
)) {
7680 dout(20) << __func__
<< ": examine " << scrub
.pgid
<< dendl
;
7682 PGRef pg
= _lookup_lock_pg(scrub
.pgid
);
7685 if (!pg
->scrubber
.must_scrub
&& !pg
->scrubber
.need_auto
) {
7686 dout(20) << __func__
<< ": reschedule " << scrub
.pgid
<< dendl
;
7687 pg
->on_info_history_change();
7690 } while (service
.next_scrub_stamp(scrub
, &scrub
));
7692 dout(10) << __func__
<< ": done" << dendl
;
7695 MPGStats
* OSD::collect_pg_stats()
7697 // This implementation unconditionally sends every is_primary PG's
7698 // stats every time we're called. This has equivalent cost to the
7699 // previous implementation's worst case where all PGs are busy and
7700 // their stats are always enqueued for sending.
7701 RWLock::RLocker
l(map_lock
);
7703 utime_t had_for
= ceph_clock_now() - had_map_since
;
7704 osd_stat_t cur_stat
= service
.get_osd_stat();
7705 cur_stat
.os_perf_stat
= store
->get_cur_stats();
7707 auto m
= new MPGStats(monc
->get_fsid(), osdmap
->get_epoch(), had_for
);
7708 m
->osd_stat
= cur_stat
;
7710 std::lock_guard lec
{min_last_epoch_clean_lock
};
7711 min_last_epoch_clean
= osdmap
->get_epoch();
7712 min_last_epoch_clean_pgs
.clear();
7714 std::set
<int64_t> pool_set
;
7717 for (auto& pg
: pgs
) {
7718 auto pool
= pg
->pg_id
.pgid
.pool();
7719 pool_set
.emplace((int64_t)pool
);
7720 if (!pg
->is_primary()) {
7723 pg
->get_pg_stats([&](const pg_stat_t
& s
, epoch_t lec
) {
7724 m
->pg_stat
[pg
->pg_id
.pgid
] = s
;
7725 min_last_epoch_clean
= min(min_last_epoch_clean
, lec
);
7726 min_last_epoch_clean_pgs
.push_back(pg
->pg_id
.pgid
);
7730 bool per_pool_stats
= false;
7731 for (auto p
: pool_set
) {
7732 int r
= store
->pool_statfs(p
, &st
);
7733 if (r
== -ENOTSUP
) {
7737 m
->pool_stat
[p
] = st
;
7738 per_pool_stats
= true;
7742 // indicate whether we are reporting per-pool stats
7743 m
->osd_stat
.num_osds
= 1;
7744 m
->osd_stat
.num_per_pool_osds
= per_pool_stats
? 1 : 0;
7749 vector
<DaemonHealthMetric
> OSD::get_health_metrics()
7751 vector
<DaemonHealthMetric
> metrics
;
7753 utime_t oldest_secs
;
7754 const utime_t now
= ceph_clock_now();
7756 too_old
-= cct
->_conf
.get_val
<double>("osd_op_complaint_time");
7758 TrackedOpRef oldest_op
;
7759 auto count_slow_ops
= [&](TrackedOp
& op
) {
7760 if (op
.get_initiated() < too_old
) {
7761 lgeneric_subdout(cct
,osd
,20) << "slow op " << op
.get_desc()
7763 << op
.get_initiated() << dendl
;
7765 if (!oldest_op
|| op
.get_initiated() < oldest_op
->get_initiated()) {
7773 if (op_tracker
.visit_ops_in_flight(&oldest_secs
, count_slow_ops
)) {
7775 derr
<< __func__
<< " reporting " << slow
<< " slow ops, oldest is "
7776 << oldest_op
->get_desc() << dendl
;
7778 metrics
.emplace_back(daemon_metric::SLOW_OPS
, slow
, oldest_secs
);
7780 // no news is not good news.
7781 metrics
.emplace_back(daemon_metric::SLOW_OPS
, 0, 0);
7785 std::lock_guard
l(pending_creates_lock
);
7786 auto n_primaries
= pending_creates_from_mon
;
7787 for (const auto& create
: pending_creates_from_osd
) {
7788 if (create
.second
) {
7792 metrics
.emplace_back(daemon_metric::PENDING_CREATING_PGS
, n_primaries
);
7797 // =====================================================
7800 void OSD::wait_for_new_map(OpRequestRef op
)
7803 if (waiting_for_osdmap
.empty()) {
7804 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
7807 logger
->inc(l_osd_waiting_for_map
);
7808 waiting_for_osdmap
.push_back(op
);
7809 op
->mark_delayed("wait for new map");
7814 * assimilate new OSDMap(s). scan pgs, etc.
7817 void OSD::note_down_osd(int peer
)
7819 ceph_assert(osd_lock
.is_locked());
7820 cluster_messenger
->mark_down_addrs(osdmap
->get_cluster_addrs(peer
));
7822 heartbeat_lock
.Lock();
7823 failure_queue
.erase(peer
);
7824 failure_pending
.erase(peer
);
7825 map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.find(peer
);
7826 if (p
!= heartbeat_peers
.end()) {
7827 p
->second
.con_back
->mark_down();
7828 if (p
->second
.con_front
) {
7829 p
->second
.con_front
->mark_down();
7831 heartbeat_peers
.erase(p
);
7833 heartbeat_lock
.Unlock();
7836 void OSD::note_up_osd(int peer
)
7838 service
.forget_peer_epoch(peer
, osdmap
->get_epoch() - 1);
7839 heartbeat_set_peers_need_update();
7842 struct C_OnMapCommit
: public Context
{
7844 epoch_t first
, last
;
7846 C_OnMapCommit(OSD
*o
, epoch_t f
, epoch_t l
, MOSDMap
*m
)
7847 : osd(o
), first(f
), last(l
), msg(m
) {}
7848 void finish(int r
) override
{
7849 osd
->_committed_osd_maps(first
, last
, msg
);
7854 void OSD::osdmap_subscribe(version_t epoch
, bool force_request
)
7856 std::lock_guard
l(osdmap_subscribe_lock
);
7857 if (latest_subscribed_epoch
>= epoch
&& !force_request
)
7860 latest_subscribed_epoch
= std::max
<uint64_t>(epoch
, latest_subscribed_epoch
);
7862 if (monc
->sub_want_increment("osdmap", epoch
, CEPH_SUBSCRIBE_ONETIME
) ||
7868 void OSD::trim_maps(epoch_t oldest
, int nreceived
, bool skip_maps
)
7870 epoch_t min
= std::min(oldest
, service
.map_cache
.cached_key_lower_bound());
7871 if (min
<= superblock
.oldest_map
)
7875 ObjectStore::Transaction t
;
7876 for (epoch_t e
= superblock
.oldest_map
; e
< min
; ++e
) {
7877 dout(20) << " removing old osdmap epoch " << e
<< dendl
;
7878 t
.remove(coll_t::meta(), get_osdmap_pobject_name(e
));
7879 t
.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e
));
7880 superblock
.oldest_map
= e
+ 1;
7882 if (num
>= cct
->_conf
->osd_target_transaction_size
&& num
>= nreceived
) {
7883 service
.publish_superblock(superblock
);
7884 write_superblock(t
);
7885 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
7886 ceph_assert(tr
== 0);
7889 // skip_maps leaves us with a range of old maps if we fail to remove all
7890 // of them before moving superblock.oldest_map forward to the first map
7891 // in the incoming MOSDMap msg. so we should continue removing them in
7892 // this case, even we could do huge series of delete transactions all at
7899 service
.publish_superblock(superblock
);
7900 write_superblock(t
);
7901 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
7902 ceph_assert(tr
== 0);
7904 // we should not remove the cached maps
7905 ceph_assert(min
<= service
.map_cache
.cached_key_lower_bound());
7908 void OSD::handle_osd_map(MOSDMap
*m
)
7910 // wait for pgs to catch up
7912 // we extend the map cache pins to accomodate pgs slow to consume maps
7913 // for some period, until we hit the max_lag_factor bound, at which point
7914 // we block here to stop injesting more maps than they are able to keep
7916 epoch_t max_lag
= cct
->_conf
->osd_map_cache_size
*
7917 m_osd_pg_epoch_max_lag_factor
;
7918 ceph_assert(max_lag
> 0);
7919 epoch_t osd_min
= 0;
7920 for (auto shard
: shards
) {
7921 epoch_t min
= shard
->get_min_pg_epoch();
7922 if (osd_min
== 0 || min
< osd_min
) {
7927 osdmap
->get_epoch() > max_lag
&&
7928 osdmap
->get_epoch() - max_lag
> osd_min
) {
7929 epoch_t need
= osdmap
->get_epoch() - max_lag
;
7930 dout(10) << __func__
<< " waiting for pgs to catch up (need " << need
7931 << " max_lag " << max_lag
<< ")" << dendl
;
7932 for (auto shard
: shards
) {
7933 epoch_t min
= shard
->get_min_pg_epoch();
7935 dout(10) << __func__
<< " waiting for pgs to consume " << need
7936 << " (shard " << shard
->shard_id
<< " min " << min
7937 << ", map cache is " << cct
->_conf
->osd_map_cache_size
7938 << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
7940 unlock_guard unlock
{osd_lock
};
7941 shard
->wait_min_pg_epoch(need
);
7947 ceph_assert(osd_lock
.is_locked());
7948 map
<epoch_t
,OSDMapRef
> added_maps
;
7949 map
<epoch_t
,bufferlist
> added_maps_bl
;
7950 if (m
->fsid
!= monc
->get_fsid()) {
7951 dout(0) << "handle_osd_map fsid " << m
->fsid
<< " != "
7952 << monc
->get_fsid() << dendl
;
7956 if (is_initializing()) {
7957 dout(0) << "ignoring osdmap until we have initialized" << dendl
;
7962 auto priv
= m
->get_connection()->get_priv();
7963 if (auto session
= static_cast<Session
*>(priv
.get());
7964 session
&& !(session
->entity_name
.is_mon() ||
7965 session
->entity_name
.is_osd())) {
7967 dout(10) << "got osd map from Session " << session
7968 << " which we can't take maps from (not a mon or osd)" << dendl
;
7973 // share with the objecter
7975 service
.objecter
->handle_osd_map(m
);
7977 epoch_t first
= m
->get_first();
7978 epoch_t last
= m
->get_last();
7979 dout(3) << "handle_osd_map epochs [" << first
<< "," << last
<< "], i have "
7980 << superblock
.newest_map
7981 << ", src has [" << m
->oldest_map
<< "," << m
->newest_map
<< "]"
7984 logger
->inc(l_osd_map
);
7985 logger
->inc(l_osd_mape
, last
- first
+ 1);
7986 if (first
<= superblock
.newest_map
)
7987 logger
->inc(l_osd_mape_dup
, superblock
.newest_map
- first
+ 1);
7988 if (service
.max_oldest_map
< m
->oldest_map
) {
7989 service
.max_oldest_map
= m
->oldest_map
;
7990 ceph_assert(service
.max_oldest_map
>= superblock
.oldest_map
);
7993 // make sure there is something new, here, before we bother flushing
7994 // the queues and such
7995 if (last
<= superblock
.newest_map
) {
7996 dout(10) << " no new maps here, dropping" << dendl
;
8002 bool skip_maps
= false;
8003 if (first
> superblock
.newest_map
+ 1) {
8004 dout(10) << "handle_osd_map message skips epochs "
8005 << superblock
.newest_map
+ 1 << ".." << (first
-1) << dendl
;
8006 if (m
->oldest_map
<= superblock
.newest_map
+ 1) {
8007 osdmap_subscribe(superblock
.newest_map
+ 1, false);
8011 // always try to get the full range of maps--as many as we can. this
8012 // 1- is good to have
8013 // 2- is at present the only way to ensure that we get a *full* map as
8015 if (m
->oldest_map
< first
) {
8016 osdmap_subscribe(m
->oldest_map
- 1, true);
8023 ObjectStore::Transaction t
;
8024 uint64_t txn_size
= 0;
8026 // store new maps: queue for disk and put in the osdmap cache
8027 epoch_t start
= std::max(superblock
.newest_map
+ 1, first
);
8028 for (epoch_t e
= start
; e
<= last
; e
++) {
8029 if (txn_size
>= t
.get_num_bytes()) {
8030 derr
<< __func__
<< " transaction size overflowed" << dendl
;
8031 ceph_assert(txn_size
< t
.get_num_bytes());
8033 txn_size
= t
.get_num_bytes();
8034 map
<epoch_t
,bufferlist
>::iterator p
;
8035 p
= m
->maps
.find(e
);
8036 if (p
!= m
->maps
.end()) {
8037 dout(10) << "handle_osd_map got full map for epoch " << e
<< dendl
;
8038 OSDMap
*o
= new OSDMap
;
8039 bufferlist
& bl
= p
->second
;
8043 ghobject_t fulloid
= get_osdmap_pobject_name(e
);
8044 t
.write(coll_t::meta(), fulloid
, 0, bl
.length(), bl
);
8045 added_maps
[e
] = add_map(o
);
8046 added_maps_bl
[e
] = bl
;
8051 p
= m
->incremental_maps
.find(e
);
8052 if (p
!= m
->incremental_maps
.end()) {
8053 dout(10) << "handle_osd_map got inc map for epoch " << e
<< dendl
;
8054 bufferlist
& bl
= p
->second
;
8055 ghobject_t oid
= get_inc_osdmap_pobject_name(e
);
8056 t
.write(coll_t::meta(), oid
, 0, bl
.length(), bl
);
8058 OSDMap
*o
= new OSDMap
;
8061 bool got
= get_map_bl(e
- 1, obl
);
8063 auto p
= added_maps_bl
.find(e
- 1);
8064 ceph_assert(p
!= added_maps_bl
.end());
8070 OSDMap::Incremental inc
;
8071 auto p
= bl
.cbegin();
8074 if (o
->apply_incremental(inc
) < 0) {
8075 derr
<< "ERROR: bad fsid? i have " << osdmap
->get_fsid() << " and inc has " << inc
.fsid
<< dendl
;
8076 ceph_abort_msg("bad fsid");
8080 o
->encode(fbl
, inc
.encode_features
| CEPH_FEATURE_RESERVED
);
8082 bool injected_failure
= false;
8083 if (cct
->_conf
->osd_inject_bad_map_crc_probability
> 0 &&
8084 (rand() % 10000) < cct
->_conf
->osd_inject_bad_map_crc_probability
*10000.0) {
8085 derr
<< __func__
<< " injecting map crc failure" << dendl
;
8086 injected_failure
= true;
8089 if ((inc
.have_crc
&& o
->get_crc() != inc
.full_crc
) || injected_failure
) {
8090 dout(2) << "got incremental " << e
8091 << " but failed to encode full with correct crc; requesting"
8093 clog
->warn() << "failed to encode map e" << e
<< " with expected crc";
8094 dout(20) << "my encoded map was:\n";
8095 fbl
.hexdump(*_dout
);
8098 request_full_map(e
, last
);
8104 ghobject_t fulloid
= get_osdmap_pobject_name(e
);
8105 t
.write(coll_t::meta(), fulloid
, 0, fbl
.length(), fbl
);
8106 added_maps
[e
] = add_map(o
);
8107 added_maps_bl
[e
] = fbl
;
8111 ceph_abort_msg("MOSDMap lied about what maps it had?");
8114 // even if this map isn't from a mon, we may have satisfied our subscription
8115 monc
->sub_got("osdmap", last
);
8117 if (!m
->maps
.empty() && requested_full_first
) {
8118 dout(10) << __func__
<< " still missing full maps " << requested_full_first
8119 << ".." << requested_full_last
<< dendl
;
8120 rerequest_full_maps();
8123 if (superblock
.oldest_map
) {
8124 // make sure we at least keep pace with incoming maps
8125 trim_maps(m
->oldest_map
, last
- first
+ 1, skip_maps
);
8126 pg_num_history
.prune(superblock
.oldest_map
);
8129 if (!superblock
.oldest_map
|| skip_maps
)
8130 superblock
.oldest_map
= first
;
8131 superblock
.newest_map
= last
;
8132 superblock
.current_epoch
= last
;
8134 // note in the superblock that we were clean thru the prior epoch
8135 epoch_t boot_epoch
= service
.get_boot_epoch();
8136 if (boot_epoch
&& boot_epoch
>= superblock
.mounted
) {
8137 superblock
.mounted
= boot_epoch
;
8138 superblock
.clean_thru
= last
;
8141 // check for pg_num changes and deleted pools
8143 for (auto& i
: added_maps
) {
8145 if (!(lastmap
= service
.try_get_map(i
.first
- 1))) {
8146 dout(10) << __func__
<< " can't get previous map " << i
.first
- 1
8147 << " probably first start of this osd" << dendl
;
8151 ceph_assert(lastmap
->get_epoch() + 1 == i
.second
->get_epoch());
8152 for (auto& j
: lastmap
->get_pools()) {
8153 if (!i
.second
->have_pg_pool(j
.first
)) {
8154 pg_num_history
.log_pool_delete(i
.first
, j
.first
);
8155 dout(10) << __func__
<< " recording final pg_pool_t for pool "
8156 << j
.first
<< dendl
;
8157 // this information is needed by _make_pg() if have to restart before
8158 // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
8159 ghobject_t obj
= make_final_pool_info_oid(j
.first
);
8161 encode(j
.second
, bl
, CEPH_FEATURES_ALL
);
8162 string name
= lastmap
->get_pool_name(j
.first
);
8164 map
<string
,string
> profile
;
8165 if (lastmap
->get_pg_pool(j
.first
)->is_erasure()) {
8166 profile
= lastmap
->get_erasure_code_profile(
8167 lastmap
->get_pg_pool(j
.first
)->erasure_code_profile
);
8169 encode(profile
, bl
);
8170 t
.write(coll_t::meta(), obj
, 0, bl
.length(), bl
);
8171 service
.store_deleted_pool_pg_num(j
.first
, j
.second
.get_pg_num());
8172 } else if (unsigned new_pg_num
= i
.second
->get_pg_num(j
.first
);
8173 new_pg_num
!= j
.second
.get_pg_num()) {
8174 dout(10) << __func__
<< " recording pool " << j
.first
<< " pg_num "
8175 << j
.second
.get_pg_num() << " -> " << new_pg_num
<< dendl
;
8176 pg_num_history
.log_pg_num_change(i
.first
, j
.first
, new_pg_num
);
8179 for (auto& j
: i
.second
->get_pools()) {
8180 if (!lastmap
->have_pg_pool(j
.first
)) {
8181 dout(10) << __func__
<< " recording new pool " << j
.first
<< " pg_num "
8182 << j
.second
.get_pg_num() << dendl
;
8183 pg_num_history
.log_pg_num_change(i
.first
, j
.first
,
8184 j
.second
.get_pg_num());
8189 pg_num_history
.epoch
= last
;
8192 ::encode(pg_num_history
, bl
);
8193 t
.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl
.length(), bl
);
8194 dout(20) << __func__
<< " pg_num_history " << pg_num_history
<< dendl
;
8197 // superblock and commit
8198 write_superblock(t
);
8199 t
.register_on_commit(new C_OnMapCommit(this, start
, last
, m
));
8200 store
->queue_transaction(
8203 service
.publish_superblock(superblock
);
8206 void OSD::_committed_osd_maps(epoch_t first
, epoch_t last
, MOSDMap
*m
)
8208 dout(10) << __func__
<< " " << first
<< ".." << last
<< dendl
;
8209 if (is_stopping()) {
8210 dout(10) << __func__
<< " bailing, we are shutting down" << dendl
;
8213 std::lock_guard
l(osd_lock
);
8214 if (is_stopping()) {
8215 dout(10) << __func__
<< " bailing, we are shutting down" << dendl
;
8218 map_lock
.get_write();
8220 bool do_shutdown
= false;
8221 bool do_restart
= false;
8222 bool network_error
= false;
8224 // advance through the new maps
8225 for (epoch_t cur
= first
; cur
<= last
; cur
++) {
8226 dout(10) << " advance to epoch " << cur
8227 << " (<= last " << last
8228 << " <= newest_map " << superblock
.newest_map
8231 OSDMapRef newmap
= get_map(cur
);
8232 ceph_assert(newmap
); // we just cached it above!
8234 // start blacklisting messages sent to peers that go down.
8235 service
.pre_publish_map(newmap
);
8237 // kill connections to newly down osds
8238 bool waited_for_reservations
= false;
8240 osdmap
->get_all_osds(old
);
8241 for (set
<int>::iterator p
= old
.begin(); p
!= old
.end(); ++p
) {
8243 osdmap
->is_up(*p
) && // in old map
8244 newmap
->is_down(*p
)) { // but not the new one
8245 if (!waited_for_reservations
) {
8246 service
.await_reserved_maps();
8247 waited_for_reservations
= true;
8250 } else if (*p
!= whoami
&&
8251 osdmap
->is_down(*p
) &&
8252 newmap
->is_up(*p
)) {
8257 if (osdmap
->is_noup(whoami
) != newmap
->is_noup(whoami
)) {
8258 dout(10) << __func__
<< " NOUP flag changed in " << newmap
->get_epoch()
8261 // this captures the case where we sent the boot message while
8262 // NOUP was being set on the mon and our boot request was
8263 // dropped, and then later it is cleared. it imperfectly
8264 // handles the case where our original boot message was not
8265 // dropped and we restart even though we might have booted, but
8266 // that is harmless (boot will just take slightly longer).
8274 service
.retrieve_epochs(&boot_epoch
, &up_epoch
, NULL
);
8276 osdmap
->is_up(whoami
) &&
8277 osdmap
->get_addrs(whoami
) == client_messenger
->get_myaddrs()) {
8278 up_epoch
= osdmap
->get_epoch();
8279 dout(10) << "up_epoch is " << up_epoch
<< dendl
;
8281 boot_epoch
= osdmap
->get_epoch();
8282 dout(10) << "boot_epoch is " << boot_epoch
<< dendl
;
8284 service
.set_epochs(&boot_epoch
, &up_epoch
, NULL
);
8288 had_map_since
= ceph_clock_now();
8290 epoch_t _bind_epoch
= service
.get_bind_epoch();
8291 if (osdmap
->is_up(whoami
) &&
8292 osdmap
->get_addrs(whoami
).legacy_equals(
8293 client_messenger
->get_myaddrs()) &&
8294 _bind_epoch
< osdmap
->get_up_from(whoami
)) {
8297 dout(1) << "state: booting -> active" << dendl
;
8298 set_state(STATE_ACTIVE
);
8301 // set incarnation so that osd_reqid_t's we generate for our
8302 // objecter requests are unique across restarts.
8303 service
.objecter
->set_client_incarnation(osdmap
->get_epoch());
8304 cancel_pending_failures();
8308 if (osdmap
->get_epoch() > 0 &&
8310 if (!osdmap
->exists(whoami
)) {
8311 dout(0) << "map says i do not exist. shutting down." << dendl
;
8312 do_shutdown
= true; // don't call shutdown() while we have
8313 // everything paused
8314 } else if (!osdmap
->is_up(whoami
) ||
8315 !osdmap
->get_addrs(whoami
).legacy_equals(
8316 client_messenger
->get_myaddrs()) ||
8317 !osdmap
->get_cluster_addrs(whoami
).legacy_equals(
8318 cluster_messenger
->get_myaddrs()) ||
8319 !osdmap
->get_hb_back_addrs(whoami
).legacy_equals(
8320 hb_back_server_messenger
->get_myaddrs()) ||
8321 !osdmap
->get_hb_front_addrs(whoami
).legacy_equals(
8322 hb_front_server_messenger
->get_myaddrs())) {
8323 if (!osdmap
->is_up(whoami
)) {
8324 if (service
.is_preparing_to_stop() || service
.is_stopping()) {
8325 service
.got_stop_ack();
8327 clog
->warn() << "Monitor daemon marked osd." << whoami
<< " down, "
8328 "but it is still running";
8329 clog
->debug() << "map e" << osdmap
->get_epoch()
8330 << " wrongly marked me down at e"
8331 << osdmap
->get_down_at(whoami
);
8333 } else if (!osdmap
->get_addrs(whoami
).legacy_equals(
8334 client_messenger
->get_myaddrs())) {
8335 clog
->error() << "map e" << osdmap
->get_epoch()
8336 << " had wrong client addr (" << osdmap
->get_addrs(whoami
)
8337 << " != my " << client_messenger
->get_myaddrs() << ")";
8338 } else if (!osdmap
->get_cluster_addrs(whoami
).legacy_equals(
8339 cluster_messenger
->get_myaddrs())) {
8340 clog
->error() << "map e" << osdmap
->get_epoch()
8341 << " had wrong cluster addr ("
8342 << osdmap
->get_cluster_addrs(whoami
)
8343 << " != my " << cluster_messenger
->get_myaddrs() << ")";
8344 } else if (!osdmap
->get_hb_back_addrs(whoami
).legacy_equals(
8345 hb_back_server_messenger
->get_myaddrs())) {
8346 clog
->error() << "map e" << osdmap
->get_epoch()
8347 << " had wrong heartbeat back addr ("
8348 << osdmap
->get_hb_back_addrs(whoami
)
8349 << " != my " << hb_back_server_messenger
->get_myaddrs()
8351 } else if (!osdmap
->get_hb_front_addrs(whoami
).legacy_equals(
8352 hb_front_server_messenger
->get_myaddrs())) {
8353 clog
->error() << "map e" << osdmap
->get_epoch()
8354 << " had wrong heartbeat front addr ("
8355 << osdmap
->get_hb_front_addrs(whoami
)
8356 << " != my " << hb_front_server_messenger
->get_myaddrs()
8360 if (!service
.is_stopping()) {
8361 epoch_t up_epoch
= 0;
8362 epoch_t bind_epoch
= osdmap
->get_epoch();
8363 service
.set_epochs(NULL
,&up_epoch
, &bind_epoch
);
8367 utime_t now
= ceph_clock_now();
8368 utime_t grace
= utime_t(cct
->_conf
->osd_max_markdown_period
, 0);
8369 osd_markdown_log
.push_back(now
);
8370 //clear all out-of-date log
8371 while (!osd_markdown_log
.empty() &&
8372 osd_markdown_log
.front() + grace
< now
)
8373 osd_markdown_log
.pop_front();
8374 if ((int)osd_markdown_log
.size() > cct
->_conf
->osd_max_markdown_count
) {
8375 dout(0) << __func__
<< " marked down "
8376 << osd_markdown_log
.size()
8377 << " > osd_max_markdown_count "
8378 << cct
->_conf
->osd_max_markdown_count
8379 << " in last " << grace
<< " seconds, shutting down"
8385 start_waiting_for_healthy();
8387 set
<int> avoid_ports
;
8388 #if defined(__FreeBSD__)
8389 // prevent FreeBSD from grabbing the client_messenger port during
8390 // rebinding. In which case a cluster_messenger will connect also
8392 client_messenger
->get_myaddrs().get_ports(&avoid_ports
);
8394 cluster_messenger
->get_myaddrs().get_ports(&avoid_ports
);
8395 hb_back_server_messenger
->get_myaddrs().get_ports(&avoid_ports
);
8396 hb_front_server_messenger
->get_myaddrs().get_ports(&avoid_ports
);
8398 int r
= cluster_messenger
->rebind(avoid_ports
);
8400 do_shutdown
= true; // FIXME: do_restart?
8401 network_error
= true;
8402 dout(0) << __func__
<< " marked down:"
8403 << " rebind cluster_messenger failed" << dendl
;
8406 r
= hb_back_server_messenger
->rebind(avoid_ports
);
8408 do_shutdown
= true; // FIXME: do_restart?
8409 network_error
= true;
8410 dout(0) << __func__
<< " marked down:"
8411 << " rebind hb_back_server_messenger failed" << dendl
;
8414 r
= hb_front_server_messenger
->rebind(avoid_ports
);
8416 do_shutdown
= true; // FIXME: do_restart?
8417 network_error
= true;
8418 dout(0) << __func__
<< " marked down:"
8419 << " rebind hb_front_server_messenger failed" << dendl
;
8422 hb_front_client_messenger
->mark_down_all();
8423 hb_back_client_messenger
->mark_down_all();
8425 reset_heartbeat_peers(true);
8430 map_lock
.put_write();
8432 check_osdmap_features();
8437 if (is_active() || is_waiting_for_healthy())
8438 maybe_update_heartbeat_peers();
8445 if (network_error
) {
8446 cancel_pending_failures();
8448 // trigger shutdown in a different thread
8449 dout(0) << __func__
<< " shutdown OSD via async signal" << dendl
;
8450 queue_async_signal(SIGINT
);
8452 else if (m
->newest_map
&& m
->newest_map
> last
) {
8453 dout(10) << " msg say newest map is " << m
->newest_map
8454 << ", requesting more" << dendl
;
8455 osdmap_subscribe(osdmap
->get_epoch()+1, false);
8457 else if (is_preboot()) {
8458 if (m
->get_source().is_mon())
8459 _preboot(m
->oldest_map
, m
->newest_map
);
8463 else if (do_restart
)
8468 void OSD::check_osdmap_features()
8470 // adjust required feature bits?
8472 // we have to be a bit careful here, because we are accessing the
8473 // Policy structures without taking any lock. in particular, only
8474 // modify integer values that can safely be read by a racing CPU.
8475 // since we are only accessing existing Policy structures at their
8476 // current memory location, and setting or clearing bits in integer
8477 // fields, and we are the only writer, this is not a problem.
8480 Messenger::Policy p
= client_messenger
->get_default_policy();
8482 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_CLIENT
, &mask
);
8483 if ((p
.features_required
& mask
) != features
) {
8484 dout(0) << "crush map has features " << features
8485 << ", adjusting msgr requires for clients" << dendl
;
8486 p
.features_required
= (p
.features_required
& ~mask
) | features
;
8487 client_messenger
->set_default_policy(p
);
8491 Messenger::Policy p
= client_messenger
->get_policy(entity_name_t::TYPE_MON
);
8493 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_MON
, &mask
);
8494 if ((p
.features_required
& mask
) != features
) {
8495 dout(0) << "crush map has features " << features
8496 << " was " << p
.features_required
8497 << ", adjusting msgr requires for mons" << dendl
;
8498 p
.features_required
= (p
.features_required
& ~mask
) | features
;
8499 client_messenger
->set_policy(entity_name_t::TYPE_MON
, p
);
8503 Messenger::Policy p
= cluster_messenger
->get_policy(entity_name_t::TYPE_OSD
);
8505 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_OSD
, &mask
);
8507 if ((p
.features_required
& mask
) != features
) {
8508 dout(0) << "crush map has features " << features
8509 << ", adjusting msgr requires for osds" << dendl
;
8510 p
.features_required
= (p
.features_required
& ~mask
) | features
;
8511 cluster_messenger
->set_policy(entity_name_t::TYPE_OSD
, p
);
8514 if (!superblock
.compat_features
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
)) {
8515 dout(0) << __func__
<< " enabling on-disk ERASURE CODES compat feature" << dendl
;
8516 superblock
.compat_features
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
);
8517 ObjectStore::Transaction t
;
8518 write_superblock(t
);
8519 int err
= store
->queue_transaction(service
.meta_ch
, std::move(t
), NULL
);
8520 ceph_assert(err
== 0);
8524 if (osdmap
->require_osd_release
< CEPH_RELEASE_NAUTILUS
) {
8525 heartbeat_dispatcher
.ms_set_require_authorizer(false);
8528 if (osdmap
->require_osd_release
!= last_require_osd_release
) {
8529 dout(1) << __func__
<< " require_osd_release " << last_require_osd_release
8530 << " -> " << to_string(osdmap
->require_osd_release
) << dendl
;
8531 store
->write_meta("require_osd_release",
8532 stringify((int)osdmap
->require_osd_release
));
8533 last_require_osd_release
= osdmap
->require_osd_release
;
8537 struct C_FinishSplits
: public Context
{
8540 C_FinishSplits(OSD
*osd
, const set
<PGRef
> &in
)
8541 : osd(osd
), pgs(in
) {}
8542 void finish(int r
) override
{
8543 osd
->_finish_splits(pgs
);
8547 void OSD::_finish_splits(set
<PGRef
>& pgs
)
8549 dout(10) << __func__
<< " " << pgs
<< dendl
;
8552 PG::RecoveryCtx rctx
= create_context();
8553 for (set
<PGRef
>::iterator i
= pgs
.begin();
8559 dout(10) << __func__
<< " " << *pg
<< dendl
;
8560 epoch_t e
= pg
->get_osdmap_epoch();
8561 pg
->handle_initialize(&rctx
);
8562 pg
->queue_null(e
, e
);
8563 dispatch_context_transaction(rctx
, pg
);
8566 unsigned shard_index
= pg
->pg_id
.hash_to_shard(num_shards
);
8567 shards
[shard_index
]->register_and_wake_split_child(pg
);
8570 dispatch_context(rctx
, 0, service
.get_osdmap());
8573 bool OSD::add_merge_waiter(OSDMapRef nextmap
, spg_t target
, PGRef src
,
8576 std::lock_guard
l(merge_lock
);
8577 auto& p
= merge_waiters
[nextmap
->get_epoch()][target
];
8578 p
[src
->pg_id
] = src
;
8579 dout(10) << __func__
<< " added merge_waiter " << src
->pg_id
8580 << " for " << target
<< ", have " << p
.size() << "/" << need
8582 return p
.size() == need
;
8585 bool OSD::advance_pg(
8588 ThreadPool::TPHandle
&handle
,
8589 PG::RecoveryCtx
*rctx
)
8591 if (osd_epoch
<= pg
->get_osdmap_epoch()) {
8594 ceph_assert(pg
->is_locked());
8595 OSDMapRef lastmap
= pg
->get_osdmap();
8596 ceph_assert(lastmap
->get_epoch() < osd_epoch
);
8597 set
<PGRef
> new_pgs
; // any split children
8600 unsigned old_pg_num
= lastmap
->have_pg_pool(pg
->pg_id
.pool()) ?
8601 lastmap
->get_pg_num(pg
->pg_id
.pool()) : 0;
8602 for (epoch_t next_epoch
= pg
->get_osdmap_epoch() + 1;
8603 next_epoch
<= osd_epoch
;
8605 OSDMapRef nextmap
= service
.try_get_map(next_epoch
);
8607 dout(20) << __func__
<< " missing map " << next_epoch
<< dendl
;
8611 unsigned new_pg_num
=
8612 (old_pg_num
&& nextmap
->have_pg_pool(pg
->pg_id
.pool())) ?
8613 nextmap
->get_pg_num(pg
->pg_id
.pool()) : 0;
8614 if (old_pg_num
&& new_pg_num
&& old_pg_num
!= new_pg_num
) {
8616 if (nextmap
->have_pg_pool(pg
->pg_id
.pool())) {
8618 if (pg
->pg_id
.is_merge_source(
8622 // we are merge source
8623 PGRef spg
= pg
; // carry a ref
8624 dout(1) << __func__
<< " " << pg
->pg_id
8625 << " is merge source, target is " << parent
8627 pg
->write_if_dirty(rctx
);
8628 dispatch_context_transaction(*rctx
, pg
, &handle
);
8631 OSDShard
*sdata
= pg
->osd_shard
;
8633 std::lock_guard
l(sdata
->shard_lock
);
8635 sdata
->_detach_pg(pg
->pg_slot
);
8636 // update pg count now since we might not get an osdmap
8638 if (pg
->is_primary())
8639 logger
->dec(l_osd_pg_primary
);
8640 else if (pg
->is_replica())
8641 logger
->dec(l_osd_pg_replica
);
8643 logger
->dec(l_osd_pg_stray
);
8648 set
<spg_t
> children
;
8649 parent
.is_split(new_pg_num
, old_pg_num
, &children
);
8650 if (add_merge_waiter(nextmap
, parent
, pg
, children
.size())) {
8651 enqueue_peering_evt(
8654 std::make_shared
<PGPeeringEvent
>(
8655 nextmap
->get_epoch(),
8656 nextmap
->get_epoch(),
8661 } else if (pg
->pg_id
.is_merge_target(old_pg_num
, new_pg_num
)) {
8662 // we are merge target
8663 set
<spg_t
> children
;
8664 pg
->pg_id
.is_split(new_pg_num
, old_pg_num
, &children
);
8665 dout(20) << __func__
<< " " << pg
->pg_id
8666 << " is merge target, sources are " << children
8668 map
<spg_t
,PGRef
> sources
;
8670 std::lock_guard
l(merge_lock
);
8671 auto& s
= merge_waiters
[nextmap
->get_epoch()][pg
->pg_id
];
8672 unsigned need
= children
.size();
8673 dout(20) << __func__
<< " have " << s
.size() << "/"
8675 if (s
.size() == need
) {
8677 merge_waiters
[nextmap
->get_epoch()].erase(pg
->pg_id
);
8678 if (merge_waiters
[nextmap
->get_epoch()].empty()) {
8679 merge_waiters
.erase(nextmap
->get_epoch());
8683 if (!sources
.empty()) {
8684 unsigned new_pg_num
= nextmap
->get_pg_num(pg
->pg_id
.pool());
8685 unsigned split_bits
= pg
->pg_id
.get_split_bits(new_pg_num
);
8686 dout(1) << __func__
<< " merging " << pg
->pg_id
<< dendl
;
8688 sources
, rctx
, split_bits
,
8689 nextmap
->get_pg_pool(
8690 pg
->pg_id
.pool())->last_pg_merge_meta
);
8691 pg
->pg_slot
->waiting_for_merge_epoch
= 0;
8693 dout(20) << __func__
<< " not ready to merge yet" << dendl
;
8694 pg
->write_if_dirty(rctx
);
8696 // kick source(s) to get them ready
8697 for (auto& i
: children
) {
8698 dout(20) << __func__
<< " kicking source " << i
<< dendl
;
8699 enqueue_peering_evt(
8702 std::make_shared
<PGPeeringEvent
>(
8703 nextmap
->get_epoch(),
8704 nextmap
->get_epoch(),
8714 vector
<int> newup
, newacting
;
8715 int up_primary
, acting_primary
;
8716 nextmap
->pg_to_up_acting_osds(
8718 &newup
, &up_primary
,
8719 &newacting
, &acting_primary
);
8720 pg
->handle_advance_map(
8721 nextmap
, lastmap
, newup
, up_primary
,
8722 newacting
, acting_primary
, rctx
);
8724 auto oldpool
= lastmap
->get_pools().find(pg
->pg_id
.pool());
8725 auto newpool
= nextmap
->get_pools().find(pg
->pg_id
.pool());
8726 if (oldpool
!= lastmap
->get_pools().end()
8727 && newpool
!= nextmap
->get_pools().end()) {
8728 dout(20) << __func__
8729 << " new pool opts " << newpool
->second
.opts
8730 << " old pool opts " << oldpool
->second
.opts
8733 double old_min_interval
= 0, new_min_interval
= 0;
8734 oldpool
->second
.opts
.get(pool_opts_t::SCRUB_MIN_INTERVAL
, &old_min_interval
);
8735 newpool
->second
.opts
.get(pool_opts_t::SCRUB_MIN_INTERVAL
, &new_min_interval
);
8737 double old_max_interval
= 0, new_max_interval
= 0;
8738 oldpool
->second
.opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &old_max_interval
);
8739 newpool
->second
.opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &new_max_interval
);
8741 // Assume if an interval is changed from set to unset or vice versa the actual config
8742 // is different. Keep it simple even if it is possible to call resched_all_scrub()
8744 if (old_min_interval
!= new_min_interval
|| old_max_interval
!= new_max_interval
) {
8745 pg
->on_info_history_change();
8749 if (new_pg_num
&& old_pg_num
!= new_pg_num
) {
8751 set
<spg_t
> children
;
8752 if (pg
->pg_id
.is_split(
8757 pg
, children
, &new_pgs
, lastmap
, nextmap
,
8763 old_pg_num
= new_pg_num
;
8764 handle
.reset_tp_timeout();
8766 pg
->handle_activate_map(rctx
);
8770 if (!new_pgs
.empty()) {
8771 rctx
->transaction
->register_on_applied(new C_FinishSplits(this, new_pgs
));
8776 void OSD::consume_map()
8778 ceph_assert(osd_lock
.is_locked());
8779 dout(7) << "consume_map version " << osdmap
->get_epoch() << dendl
;
8781 /** make sure the cluster is speaking in SORTBITWISE, because we don't
8782 * speak the older sorting version any more. Be careful not to force
8783 * a shutdown if we are merely processing old maps, though.
8785 if (!osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
) && is_active()) {
8786 derr
<< __func__
<< " SORTBITWISE flag is not set" << dendl
;
8790 service
.pre_publish_map(osdmap
);
8791 service
.await_reserved_maps();
8792 service
.publish_map(osdmap
);
8794 // prime splits and merges
8795 set
<pair
<spg_t
,epoch_t
>> newly_split
; // splits, and when
8796 set
<pair
<spg_t
,epoch_t
>> merge_pgs
; // merge participants, and when
8797 for (auto& shard
: shards
) {
8798 shard
->identify_splits_and_merges(osdmap
, &newly_split
, &merge_pgs
);
8800 if (!newly_split
.empty()) {
8801 for (auto& shard
: shards
) {
8802 shard
->prime_splits(osdmap
, &newly_split
);
8804 ceph_assert(newly_split
.empty());
8807 // prune sent_ready_to_merge
8808 service
.prune_sent_ready_to_merge(osdmap
);
8810 // FIXME, maybe: We could race against an incoming peering message
8811 // that instantiates a merge PG after identify_merges() below and
8812 // never set up its peer to complete the merge. An OSD restart
8813 // would clear it up. This is a hard race to resolve,
8814 // extraordinarily rare (we only merge PGs that are stable and
8815 // clean, so it'd have to be an imported PG to an OSD with a
8816 // slightly stale OSDMap...), so I'm ignoring it for now. We plan to
8817 // replace all of this with a seastar-based code soon anyway.
8818 if (!merge_pgs
.empty()) {
8819 // mark the pgs we already have, or create new and empty merge
8820 // participants for those we are missing. do this all under the
8821 // shard lock so we don't have to worry about racing pg creates
8823 for (auto& shard
: shards
) {
8824 shard
->prime_merges(osdmap
, &merge_pgs
);
8826 ceph_assert(merge_pgs
.empty());
8829 service
.prune_pg_created();
8831 unsigned pushes_to_free
= 0;
8832 for (auto& shard
: shards
) {
8833 shard
->consume_map(osdmap
, &pushes_to_free
);
8836 vector
<spg_t
> pgids
;
8839 // count (FIXME, probably during seastar rewrite)
8840 int num_pg_primary
= 0, num_pg_replica
= 0, num_pg_stray
= 0;
8843 for (auto& pg
: pgs
) {
8844 // FIXME (probably during seastar rewrite): this is lockless and
8845 // racy, but we don't want to take pg lock here.
8846 if (pg
->is_primary())
8848 else if (pg
->is_replica())
8855 // FIXME (as part of seastar rewrite): move to OSDShard
8856 std::lock_guard
l(pending_creates_lock
);
8857 for (auto pg
= pending_creates_from_osd
.begin();
8858 pg
!= pending_creates_from_osd
.end();) {
8859 if (osdmap
->get_pg_acting_rank(pg
->first
, whoami
) < 0) {
8860 dout(10) << __func__
<< " pg " << pg
->first
<< " doesn't map here, "
8861 << "discarding pending_create_from_osd" << dendl
;
8862 pg
= pending_creates_from_osd
.erase(pg
);
8869 service
.maybe_inject_dispatch_delay();
8871 dispatch_sessions_waiting_on_map();
8873 service
.maybe_inject_dispatch_delay();
8875 service
.release_reserved_pushes(pushes_to_free
);
8877 // queue null events to push maps down to individual PGs
8878 for (auto pgid
: pgids
) {
8879 enqueue_peering_evt(
8882 std::make_shared
<PGPeeringEvent
>(
8883 osdmap
->get_epoch(),
8884 osdmap
->get_epoch(),
8887 logger
->set(l_osd_pg
, pgids
.size());
8888 logger
->set(l_osd_pg_primary
, num_pg_primary
);
8889 logger
->set(l_osd_pg_replica
, num_pg_replica
);
8890 logger
->set(l_osd_pg_stray
, num_pg_stray
);
8893 void OSD::activate_map()
8895 ceph_assert(osd_lock
.is_locked());
8897 dout(7) << "activate_map version " << osdmap
->get_epoch() << dendl
;
8899 if (osdmap
->test_flag(CEPH_OSDMAP_FULL
)) {
8900 dout(10) << " osdmap flagged full, doing onetime osdmap subscribe" << dendl
;
8901 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
8905 if (osdmap
->test_flag(CEPH_OSDMAP_NORECOVER
)) {
8906 if (!service
.recovery_is_paused()) {
8907 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl
;
8908 service
.pause_recovery();
8911 if (service
.recovery_is_paused()) {
8912 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl
;
8913 service
.unpause_recovery();
8917 service
.activate_map();
8920 take_waiters(waiting_for_osdmap
);
8923 bool OSD::require_mon_peer(const Message
*m
)
8925 if (!m
->get_connection()->peer_is_mon()) {
8926 dout(0) << "require_mon_peer received from non-mon "
8927 << m
->get_connection()->get_peer_addr()
8928 << " " << *m
<< dendl
;
8934 bool OSD::require_mon_or_mgr_peer(const Message
*m
)
8936 if (!m
->get_connection()->peer_is_mon() &&
8937 !m
->get_connection()->peer_is_mgr()) {
8938 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8939 << m
->get_connection()->get_peer_addr()
8940 << " " << *m
<< dendl
;
8946 bool OSD::require_osd_peer(const Message
*m
)
8948 if (!m
->get_connection()->peer_is_osd()) {
8949 dout(0) << "require_osd_peer received from non-osd "
8950 << m
->get_connection()->get_peer_addr()
8951 << " " << *m
<< dendl
;
8957 bool OSD::require_self_aliveness(const Message
*m
, epoch_t epoch
)
8959 epoch_t up_epoch
= service
.get_up_epoch();
8960 if (epoch
< up_epoch
) {
8961 dout(7) << "from pre-up epoch " << epoch
<< " < " << up_epoch
<< dendl
;
8966 dout(7) << "still in boot state, dropping message " << *m
<< dendl
;
8973 bool OSD::require_same_peer_instance(const Message
*m
, OSDMapRef
& map
,
8974 bool is_fast_dispatch
)
8976 int from
= m
->get_source().num();
8978 if (map
->is_down(from
) ||
8979 (map
->get_cluster_addrs(from
) != m
->get_source_addrs())) {
8980 dout(5) << "from dead osd." << from
<< ", marking down, "
8981 << " msg was " << m
->get_source_inst().addr
8983 << (map
->is_up(from
) ?
8984 map
->get_cluster_addrs(from
) : entity_addrvec_t())
8986 ConnectionRef con
= m
->get_connection();
8988 auto priv
= con
->get_priv();
8989 if (auto s
= static_cast<Session
*>(priv
.get()); s
) {
8990 if (!is_fast_dispatch
)
8991 s
->session_dispatch_lock
.Lock();
8992 clear_session_waiting_on_map(s
);
8993 con
->set_priv(nullptr); // break ref <-> session cycle, if any
8995 if (!is_fast_dispatch
)
8996 s
->session_dispatch_lock
.Unlock();
9005 * require that we have same (or newer) map, and that
9006 * the source is the pg primary.
9008 bool OSD::require_same_or_newer_map(OpRequestRef
& op
, epoch_t epoch
,
9009 bool is_fast_dispatch
)
9011 const Message
*m
= op
->get_req();
9012 dout(15) << "require_same_or_newer_map " << epoch
9013 << " (i am " << osdmap
->get_epoch() << ") " << m
<< dendl
;
9015 ceph_assert(osd_lock
.is_locked());
9017 // do they have a newer map?
9018 if (epoch
> osdmap
->get_epoch()) {
9019 dout(7) << "waiting for newer map epoch " << epoch
9020 << " > my " << osdmap
->get_epoch() << " with " << m
<< dendl
;
9021 wait_for_new_map(op
);
9025 if (!require_self_aliveness(op
->get_req(), epoch
)) {
9029 // ok, our map is same or newer.. do they still exist?
9030 if (m
->get_connection()->get_messenger() == cluster_messenger
&&
9031 !require_same_peer_instance(op
->get_req(), osdmap
, is_fast_dispatch
)) {
9042 // ----------------------------------------
9045 void OSD::split_pgs(
9047 const set
<spg_t
> &childpgids
, set
<PGRef
> *out_pgs
,
9050 PG::RecoveryCtx
*rctx
)
9052 unsigned pg_num
= nextmap
->get_pg_num(parent
->pg_id
.pool());
9053 parent
->update_snap_mapper_bits(parent
->get_pgid().get_split_bits(pg_num
));
9055 vector
<object_stat_sum_t
> updated_stats
;
9056 parent
->start_split_stats(childpgids
, &updated_stats
);
9058 vector
<object_stat_sum_t
>::iterator stat_iter
= updated_stats
.begin();
9059 for (set
<spg_t
>::const_iterator i
= childpgids
.begin();
9060 i
!= childpgids
.end();
9062 ceph_assert(stat_iter
!= updated_stats
.end());
9063 dout(10) << __func__
<< " splitting " << *parent
<< " into " << *i
<< dendl
;
9064 PG
* child
= _make_pg(nextmap
, *i
);
9066 out_pgs
->insert(child
);
9067 child
->ch
= store
->create_new_collection(child
->coll
);
9070 uint32_t shard_index
= i
->hash_to_shard(shards
.size());
9071 assert(NULL
!= shards
[shard_index
]);
9072 store
->set_collection_commit_queue(child
->coll
, &(shards
[shard_index
]->context_queue
));
9075 unsigned split_bits
= i
->get_split_bits(pg_num
);
9076 dout(10) << " pg_num is " << pg_num
9077 << ", m_seed " << i
->ps()
9078 << ", split_bits is " << split_bits
<< dendl
;
9079 parent
->split_colls(
9083 &child
->get_pool().info
,
9090 child
->finish_split_stats(*stat_iter
, rctx
->transaction
);
9093 ceph_assert(stat_iter
!= updated_stats
.end());
9094 parent
->finish_split_stats(*stat_iter
, rctx
->transaction
);
9100 void OSD::handle_pg_create(OpRequestRef op
)
9102 const MOSDPGCreate
*m
= static_cast<const MOSDPGCreate
*>(op
->get_req());
9103 ceph_assert(m
->get_type() == MSG_OSD_PG_CREATE
);
9105 dout(10) << "handle_pg_create " << *m
<< dendl
;
9107 if (!require_mon_peer(op
->get_req())) {
9111 if (!require_same_or_newer_map(op
, m
->epoch
, false))
9116 map
<pg_t
,utime_t
>::const_iterator ci
= m
->ctimes
.begin();
9117 for (map
<pg_t
,pg_create_t
>::const_iterator p
= m
->mkpg
.begin();
9120 ceph_assert(ci
!= m
->ctimes
.end() && ci
->first
== p
->first
);
9121 epoch_t created
= p
->second
.created
;
9122 if (p
->second
.split_bits
) // Skip split pgs
9126 if (!osdmap
->have_pg_pool(on
.pool())) {
9127 dout(20) << "ignoring pg on deleted pool " << on
<< dendl
;
9131 dout(20) << "mkpg " << on
<< " e" << created
<< "@" << ci
->second
<< dendl
;
9133 // is it still ours?
9134 vector
<int> up
, acting
;
9135 int up_primary
= -1;
9136 int acting_primary
= -1;
9137 osdmap
->pg_to_up_acting_osds(on
, &up
, &up_primary
, &acting
, &acting_primary
);
9138 int role
= osdmap
->calc_pg_role(whoami
, acting
, acting
.size());
9140 if (acting_primary
!= whoami
) {
9141 dout(10) << "mkpg " << on
<< " not acting_primary (" << acting_primary
9142 << "), my role=" << role
<< ", skipping" << dendl
;
9147 bool mapped
= osdmap
->get_primary_shard(on
, &pgid
);
9148 ceph_assert(mapped
);
9151 pg_history_t history
;
9152 build_initial_pg_history(pgid
, created
, ci
->second
, &history
, &pi
);
9154 // The mon won't resend unless the primary changed, so we ignore
9155 // same_interval_since. We'll pass this history with the current
9156 // epoch as the event.
9157 if (history
.same_primary_since
> m
->epoch
) {
9158 dout(10) << __func__
<< ": got obsolete pg create on pgid "
9159 << pgid
<< " from epoch " << m
->epoch
9160 << ", primary changed in " << history
.same_primary_since
9164 enqueue_peering_evt(
9167 std::make_shared
<PGPeeringEvent
>(
9168 osdmap
->get_epoch(),
9169 osdmap
->get_epoch(),
9174 osdmap
->get_epoch(),
9182 std::lock_guard
l(pending_creates_lock
);
9183 if (pending_creates_from_mon
== 0) {
9184 last_pg_create_epoch
= m
->epoch
;
9188 maybe_update_heartbeat_peers();
9192 // ----------------------------------------
9193 // peering and recovery
9195 PG::RecoveryCtx
OSD::create_context()
9197 ObjectStore::Transaction
*t
= new ObjectStore::Transaction
;
9198 map
<int, map
<spg_t
,pg_query_t
> > *query_map
=
9199 new map
<int, map
<spg_t
, pg_query_t
> >;
9200 map
<int,vector
<pair
<pg_notify_t
, PastIntervals
> > > *notify_list
=
9201 new map
<int, vector
<pair
<pg_notify_t
, PastIntervals
> > >;
9202 map
<int,vector
<pair
<pg_notify_t
, PastIntervals
> > > *info_map
=
9203 new map
<int,vector
<pair
<pg_notify_t
, PastIntervals
> > >;
9204 PG::RecoveryCtx
rctx(query_map
, info_map
, notify_list
, t
);
9208 void OSD::dispatch_context_transaction(PG::RecoveryCtx
&ctx
, PG
*pg
,
9209 ThreadPool::TPHandle
*handle
)
9211 if (!ctx
.transaction
->empty() || ctx
.transaction
->has_contexts()) {
9212 int tr
= store
->queue_transaction(
9214 std::move(*ctx
.transaction
), TrackedOpRef(), handle
);
9215 ceph_assert(tr
== 0);
9216 delete (ctx
.transaction
);
9217 ctx
.transaction
= new ObjectStore::Transaction
;
9221 void OSD::dispatch_context(PG::RecoveryCtx
&ctx
, PG
*pg
, OSDMapRef curmap
,
9222 ThreadPool::TPHandle
*handle
)
9224 if (!service
.get_osdmap()->is_up(whoami
)) {
9225 dout(20) << __func__
<< " not up in osdmap" << dendl
;
9226 } else if (!is_active()) {
9227 dout(20) << __func__
<< " not active" << dendl
;
9229 do_notifies(*ctx
.notify_list
, curmap
);
9230 do_queries(*ctx
.query_map
, curmap
);
9231 do_infos(*ctx
.info_map
, curmap
);
9233 if ((!ctx
.transaction
->empty() || ctx
.transaction
->has_contexts()) && pg
) {
9234 int tr
= store
->queue_transaction(
9236 std::move(*ctx
.transaction
), TrackedOpRef(),
9238 ceph_assert(tr
== 0);
9240 delete ctx
.notify_list
;
9241 delete ctx
.query_map
;
9242 delete ctx
.info_map
;
9243 delete ctx
.transaction
;
9246 void OSD::discard_context(PG::RecoveryCtx
& ctx
)
9248 delete ctx
.notify_list
;
9249 delete ctx
.query_map
;
9250 delete ctx
.info_map
;
9251 delete ctx
.transaction
;
9256 * Send an MOSDPGNotify to a primary, with a list of PGs that I have
9257 * content for, and they are primary for.
9260 void OSD::do_notifies(
9261 map
<int,vector
<pair
<pg_notify_t
,PastIntervals
> > >& notify_list
,
9265 vector
<pair
<pg_notify_t
,PastIntervals
> > >::iterator it
=
9266 notify_list
.begin();
9267 it
!= notify_list
.end();
9269 if (!curmap
->is_up(it
->first
)) {
9270 dout(20) << __func__
<< " skipping down osd." << it
->first
<< dendl
;
9273 ConnectionRef con
= service
.get_con_osd_cluster(
9274 it
->first
, curmap
->get_epoch());
9276 dout(20) << __func__
<< " skipping osd." << it
->first
9277 << " (NULL con)" << dendl
;
9280 service
.share_map_peer(it
->first
, con
.get(), curmap
);
9281 dout(7) << __func__
<< " osd." << it
->first
9282 << " on " << it
->second
.size() << " PGs" << dendl
;
9283 MOSDPGNotify
*m
= new MOSDPGNotify(curmap
->get_epoch(),
9285 con
->send_message(m
);
9291 * send out pending queries for info | summaries
9293 void OSD::do_queries(map
<int, map
<spg_t
,pg_query_t
> >& query_map
,
9296 for (map
<int, map
<spg_t
,pg_query_t
> >::iterator pit
= query_map
.begin();
9297 pit
!= query_map
.end();
9299 if (!curmap
->is_up(pit
->first
)) {
9300 dout(20) << __func__
<< " skipping down osd." << pit
->first
<< dendl
;
9303 int who
= pit
->first
;
9304 ConnectionRef con
= service
.get_con_osd_cluster(who
, curmap
->get_epoch());
9306 dout(20) << __func__
<< " skipping osd." << who
9307 << " (NULL con)" << dendl
;
9310 service
.share_map_peer(who
, con
.get(), curmap
);
9311 dout(7) << __func__
<< " querying osd." << who
9312 << " on " << pit
->second
.size() << " PGs" << dendl
;
9313 MOSDPGQuery
*m
= new MOSDPGQuery(curmap
->get_epoch(), pit
->second
);
9314 con
->send_message(m
);
9319 void OSD::do_infos(map
<int,
9320 vector
<pair
<pg_notify_t
, PastIntervals
> > >& info_map
,
9324 vector
<pair
<pg_notify_t
, PastIntervals
> > >::iterator p
=
9326 p
!= info_map
.end();
9328 if (!curmap
->is_up(p
->first
)) {
9329 dout(20) << __func__
<< " skipping down osd." << p
->first
<< dendl
;
9332 for (vector
<pair
<pg_notify_t
,PastIntervals
> >::iterator i
= p
->second
.begin();
9333 i
!= p
->second
.end();
9335 dout(20) << __func__
<< " sending info " << i
->first
.info
9336 << " to shard " << p
->first
<< dendl
;
9338 ConnectionRef con
= service
.get_con_osd_cluster(
9339 p
->first
, curmap
->get_epoch());
9341 dout(20) << __func__
<< " skipping osd." << p
->first
9342 << " (NULL con)" << dendl
;
9345 service
.share_map_peer(p
->first
, con
.get(), curmap
);
9346 MOSDPGInfo
*m
= new MOSDPGInfo(curmap
->get_epoch());
9347 m
->pg_list
= p
->second
;
9348 con
->send_message(m
);
9353 void OSD::handle_fast_pg_create(MOSDPGCreate2
*m
)
9355 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9356 if (!require_mon_peer(m
)) {
9360 for (auto& p
: m
->pgs
) {
9361 spg_t pgid
= p
.first
;
9362 epoch_t created
= p
.second
.first
;
9363 utime_t created_stamp
= p
.second
.second
;
9364 dout(20) << __func__
<< " " << pgid
<< " e" << created
9365 << "@" << created_stamp
<< dendl
;
9367 h
.epoch_created
= created
;
9368 h
.epoch_pool_created
= created
;
9369 h
.same_up_since
= created
;
9370 h
.same_interval_since
= created
;
9371 h
.same_primary_since
= created
;
9372 h
.last_scrub_stamp
= created_stamp
;
9373 h
.last_deep_scrub_stamp
= created_stamp
;
9374 h
.last_clean_scrub_stamp
= created_stamp
;
9376 enqueue_peering_evt(
9379 std::make_shared
<PGPeeringEvent
>(
9394 std::lock_guard
l(pending_creates_lock
);
9395 if (pending_creates_from_mon
== 0) {
9396 last_pg_create_epoch
= m
->epoch
;
9403 void OSD::handle_fast_pg_query(MOSDPGQuery
*m
)
9405 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9406 if (!require_osd_peer(m
)) {
9410 int from
= m
->get_source().num();
9411 for (auto& p
: m
->pg_list
) {
9412 enqueue_peering_evt(
9415 std::make_shared
<PGPeeringEvent
>(
9416 p
.second
.epoch_sent
, p
.second
.epoch_sent
,
9419 pg_shard_t(from
, p
.second
.from
),
9421 p
.second
.epoch_sent
),
9428 void OSD::handle_fast_pg_notify(MOSDPGNotify
* m
)
9430 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9431 if (!require_osd_peer(m
)) {
9435 int from
= m
->get_source().num();
9436 for (auto& p
: m
->get_pg_list()) {
9437 spg_t
pgid(p
.first
.info
.pgid
.pgid
, p
.first
.to
);
9438 enqueue_peering_evt(
9441 std::make_shared
<PGPeeringEvent
>(
9443 p
.first
.query_epoch
,
9445 pgid
, pg_shard_t(from
, p
.first
.from
),
9447 m
->get_connection()->get_features(),
9452 p
.first
.query_epoch
,
9453 p
.first
.info
.history
,
9461 void OSD::handle_fast_pg_info(MOSDPGInfo
* m
)
9463 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9464 if (!require_osd_peer(m
)) {
9468 int from
= m
->get_source().num();
9469 for (auto& p
: m
->pg_list
) {
9470 enqueue_peering_evt(
9471 spg_t(p
.first
.info
.pgid
.pgid
, p
.first
.to
),
9473 std::make_shared
<PGPeeringEvent
>(
9474 p
.first
.epoch_sent
, p
.first
.query_epoch
,
9476 pg_shard_t(from
, p
.first
.from
),
9478 p
.first
.epoch_sent
)))
9484 void OSD::handle_fast_pg_remove(MOSDPGRemove
*m
)
9486 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9487 if (!require_osd_peer(m
)) {
9491 for (auto& pgid
: m
->pg_list
) {
9492 enqueue_peering_evt(
9495 std::make_shared
<PGPeeringEvent
>(
9496 m
->get_epoch(), m
->get_epoch(),
9497 PG::DeleteStart())));
9502 void OSD::handle_fast_force_recovery(MOSDForceRecovery
*m
)
9504 dout(10) << __func__
<< " " << *m
<< dendl
;
9505 if (!require_mon_or_mgr_peer(m
)) {
9509 epoch_t epoch
= get_osdmap_epoch();
9510 for (auto pgid
: m
->forced_pgs
) {
9511 if (m
->options
& OFR_BACKFILL
) {
9512 if (m
->options
& OFR_CANCEL
) {
9513 enqueue_peering_evt(
9516 std::make_shared
<PGPeeringEvent
>(
9518 PG::UnsetForceBackfill())));
9520 enqueue_peering_evt(
9523 std::make_shared
<PGPeeringEvent
>(
9525 PG::SetForceBackfill())));
9527 } else if (m
->options
& OFR_RECOVERY
) {
9528 if (m
->options
& OFR_CANCEL
) {
9529 enqueue_peering_evt(
9532 std::make_shared
<PGPeeringEvent
>(
9534 PG::UnsetForceRecovery())));
9536 enqueue_peering_evt(
9539 std::make_shared
<PGPeeringEvent
>(
9541 PG::SetForceRecovery())));
9548 void OSD::handle_pg_query_nopg(const MQuery
& q
)
9550 spg_t pgid
= q
.pgid
;
9551 dout(10) << __func__
<< " " << pgid
<< dendl
;
9553 OSDMapRef osdmap
= get_osdmap();
9554 if (!osdmap
->have_pg_pool(pgid
.pool()))
9557 dout(10) << " pg " << pgid
<< " dne" << dendl
;
9558 pg_info_t
empty(spg_t(pgid
.pgid
, q
.query
.to
));
9559 ConnectionRef con
= service
.get_con_osd_cluster(q
.from
.osd
, osdmap
->get_epoch());
9562 if (q
.query
.type
== pg_query_t::LOG
||
9563 q
.query
.type
== pg_query_t::FULLLOG
) {
9565 q
.query
.from
, q
.query
.to
,
9566 osdmap
->get_epoch(), empty
,
9567 q
.query
.epoch_sent
);
9569 vector
<pair
<pg_notify_t
,PastIntervals
>> ls
;
9573 q
.query
.from
, q
.query
.to
,
9575 osdmap
->get_epoch(),
9578 m
= new MOSDPGNotify(osdmap
->get_epoch(), ls
);
9580 service
.share_map_peer(q
.from
.osd
, con
.get(), osdmap
);
9581 con
->send_message(m
);
9586 // =========================================================
9589 void OSDService::_maybe_queue_recovery() {
9590 ceph_assert(recovery_lock
.is_locked_by_me());
9591 uint64_t available_pushes
;
9592 while (!awaiting_throttle
.empty() &&
9593 _recover_now(&available_pushes
)) {
9594 uint64_t to_start
= std::min(
9596 cct
->_conf
->osd_recovery_max_single_start
);
9597 _queue_for_recovery(awaiting_throttle
.front(), to_start
);
9598 awaiting_throttle
.pop_front();
9599 dout(10) << __func__
<< " starting " << to_start
9600 << ", recovery_ops_reserved " << recovery_ops_reserved
9601 << " -> " << (recovery_ops_reserved
+ to_start
) << dendl
;
9602 recovery_ops_reserved
+= to_start
;
9606 bool OSDService::_recover_now(uint64_t *available_pushes
)
9608 if (available_pushes
)
9609 *available_pushes
= 0;
9611 if (ceph_clock_now() < defer_recovery_until
) {
9612 dout(15) << __func__
<< " defer until " << defer_recovery_until
<< dendl
;
9616 if (recovery_paused
) {
9617 dout(15) << __func__
<< " paused" << dendl
;
9621 uint64_t max
= cct
->_conf
->osd_recovery_max_active
;
9622 if (max
<= recovery_ops_active
+ recovery_ops_reserved
) {
9623 dout(15) << __func__
<< " active " << recovery_ops_active
9624 << " + reserved " << recovery_ops_reserved
9625 << " >= max " << max
<< dendl
;
9629 if (available_pushes
)
9630 *available_pushes
= max
- recovery_ops_active
- recovery_ops_reserved
;
9635 void OSD::do_recovery(
9636 PG
*pg
, epoch_t queued
, uint64_t reserved_pushes
,
9637 ThreadPool::TPHandle
&handle
)
9639 uint64_t started
= 0;
9642 * When the value of osd_recovery_sleep is set greater than zero, recovery
9643 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9644 * recovery event's schedule time. This is done by adding a
9645 * recovery_requeue_callback event, which re-queues the recovery op using
9646 * queue_recovery_after_sleep.
9648 float recovery_sleep
= get_osd_recovery_sleep();
9650 std::lock_guard
l(service
.sleep_lock
);
9651 if (recovery_sleep
> 0 && service
.recovery_needs_sleep
) {
9653 auto recovery_requeue_callback
= new FunctionContext([this, pgref
, queued
, reserved_pushes
](int r
) {
9654 dout(20) << "do_recovery wake up at "
9656 << ", re-queuing recovery" << dendl
;
9657 std::lock_guard
l(service
.sleep_lock
);
9658 service
.recovery_needs_sleep
= false;
9659 service
.queue_recovery_after_sleep(pgref
.get(), queued
, reserved_pushes
);
9662 // This is true for the first recovery op and when the previous recovery op
9663 // has been scheduled in the past. The next recovery op is scheduled after
9664 // completing the sleep from now.
9665 if (service
.recovery_schedule_time
< ceph_clock_now()) {
9666 service
.recovery_schedule_time
= ceph_clock_now();
9668 service
.recovery_schedule_time
+= recovery_sleep
;
9669 service
.sleep_timer
.add_event_at(service
.recovery_schedule_time
,
9670 recovery_requeue_callback
);
9671 dout(20) << "Recovery event scheduled at "
9672 << service
.recovery_schedule_time
<< dendl
;
9679 std::lock_guard
l(service
.sleep_lock
);
9680 service
.recovery_needs_sleep
= true;
9683 if (pg
->pg_has_reset_since(queued
)) {
9687 dout(10) << "do_recovery starting " << reserved_pushes
<< " " << *pg
<< dendl
;
9688 #ifdef DEBUG_RECOVERY_OIDS
9689 dout(20) << " active was " << service
.recovery_oids
[pg
->pg_id
] << dendl
;
9692 bool do_unfound
= pg
->start_recovery_ops(reserved_pushes
, handle
, &started
);
9693 dout(10) << "do_recovery started " << started
<< "/" << reserved_pushes
9694 << " on " << *pg
<< dendl
;
9697 PG::RecoveryCtx rctx
= create_context();
9698 rctx
.handle
= &handle
;
9699 pg
->find_unfound(queued
, &rctx
);
9700 dispatch_context(rctx
, pg
, pg
->get_osdmap());
9705 ceph_assert(started
<= reserved_pushes
);
9706 service
.release_reserved_pushes(reserved_pushes
);
9709 void OSDService::start_recovery_op(PG
*pg
, const hobject_t
& soid
)
9711 std::lock_guard
l(recovery_lock
);
9712 dout(10) << "start_recovery_op " << *pg
<< " " << soid
9713 << " (" << recovery_ops_active
<< "/"
9714 << cct
->_conf
->osd_recovery_max_active
<< " rops)"
9716 recovery_ops_active
++;
9718 #ifdef DEBUG_RECOVERY_OIDS
9719 dout(20) << " active was " << recovery_oids
[pg
->pg_id
] << dendl
;
9720 ceph_assert(recovery_oids
[pg
->pg_id
].count(soid
) == 0);
9721 recovery_oids
[pg
->pg_id
].insert(soid
);
9725 void OSDService::finish_recovery_op(PG
*pg
, const hobject_t
& soid
, bool dequeue
)
9727 std::lock_guard
l(recovery_lock
);
9728 dout(10) << "finish_recovery_op " << *pg
<< " " << soid
9729 << " dequeue=" << dequeue
9730 << " (" << recovery_ops_active
<< "/" << cct
->_conf
->osd_recovery_max_active
<< " rops)"
9734 ceph_assert(recovery_ops_active
> 0);
9735 recovery_ops_active
--;
9737 #ifdef DEBUG_RECOVERY_OIDS
9738 dout(20) << " active oids was " << recovery_oids
[pg
->pg_id
] << dendl
;
9739 ceph_assert(recovery_oids
[pg
->pg_id
].count(soid
));
9740 recovery_oids
[pg
->pg_id
].erase(soid
);
9743 _maybe_queue_recovery();
9746 bool OSDService::is_recovery_active()
9748 return local_reserver
.has_reservation() || remote_reserver
.has_reservation();
9751 void OSDService::release_reserved_pushes(uint64_t pushes
)
9753 std::lock_guard
l(recovery_lock
);
9754 dout(10) << __func__
<< "(" << pushes
<< "), recovery_ops_reserved "
9755 << recovery_ops_reserved
<< " -> " << (recovery_ops_reserved
-pushes
)
9757 ceph_assert(recovery_ops_reserved
>= pushes
);
9758 recovery_ops_reserved
-= pushes
;
9759 _maybe_queue_recovery();
9762 // =========================================================
9765 bool OSD::op_is_discardable(const MOSDOp
*op
)
9767 // drop client request if they are not connected and can't get the
9769 if (!op
->get_connection()->is_connected()) {
9775 void OSD::enqueue_op(spg_t pg
, OpRequestRef
&& op
, epoch_t epoch
)
9777 const utime_t stamp
= op
->get_req()->get_recv_stamp();
9778 const utime_t latency
= ceph_clock_now() - stamp
;
9779 const unsigned priority
= op
->get_req()->get_priority();
9780 const int cost
= op
->get_req()->get_cost();
9781 const uint64_t owner
= op
->get_req()->get_source().num();
9783 dout(15) << "enqueue_op " << op
<< " prio " << priority
9785 << " latency " << latency
9786 << " epoch " << epoch
9787 << " " << *(op
->get_req()) << dendl
;
9788 op
->osd_trace
.event("enqueue op");
9789 op
->osd_trace
.keyval("priority", priority
);
9790 op
->osd_trace
.keyval("cost", cost
);
9791 op
->mark_queued_for_pg();
9792 logger
->tinc(l_osd_op_before_queue_op_lat
, latency
);
9795 unique_ptr
<OpQueueItem::OpQueueable
>(new PGOpItem(pg
, std::move(op
))),
9796 cost
, priority
, stamp
, owner
, epoch
));
9799 void OSD::enqueue_peering_evt(spg_t pgid
, PGPeeringEventRef evt
)
9801 dout(15) << __func__
<< " " << pgid
<< " " << evt
->get_desc() << dendl
;
9804 unique_ptr
<OpQueueItem::OpQueueable
>(new PGPeeringItem(pgid
, evt
)),
9806 cct
->_conf
->osd_peering_op_priority
,
9809 evt
->get_epoch_sent()));
9812 void OSD::enqueue_peering_evt_front(spg_t pgid
, PGPeeringEventRef evt
)
9814 dout(15) << __func__
<< " " << pgid
<< " " << evt
->get_desc() << dendl
;
9815 op_shardedwq
.queue_front(
9817 unique_ptr
<OpQueueItem::OpQueueable
>(new PGPeeringItem(pgid
, evt
)),
9819 cct
->_conf
->osd_peering_op_priority
,
9822 evt
->get_epoch_sent()));
9826 * NOTE: dequeue called in worker thread, with pg lock
9828 void OSD::dequeue_op(
9829 PGRef pg
, OpRequestRef op
,
9830 ThreadPool::TPHandle
&handle
)
9833 OID_EVENT_TRACE_WITH_MSG(op
->get_req(), "DEQUEUE_OP_BEGIN", false);
9835 utime_t now
= ceph_clock_now();
9836 op
->set_dequeued_time(now
);
9837 utime_t latency
= now
- op
->get_req()->get_recv_stamp();
9838 dout(10) << "dequeue_op " << op
<< " prio " << op
->get_req()->get_priority()
9839 << " cost " << op
->get_req()->get_cost()
9840 << " latency " << latency
9841 << " " << *(op
->get_req())
9842 << " pg " << *pg
<< dendl
;
9844 logger
->tinc(l_osd_op_before_dequeue_op_lat
, latency
);
9846 auto priv
= op
->get_req()->get_connection()->get_priv();
9847 if (auto session
= static_cast<Session
*>(priv
.get()); session
) {
9848 maybe_share_map(session
, op
, pg
->get_osdmap());
9851 if (pg
->is_deleting())
9854 op
->mark_reached_pg();
9855 op
->osd_trace
.event("dequeue_op");
9857 pg
->do_request(op
, handle
);
9860 dout(10) << "dequeue_op " << op
<< " finish" << dendl
;
9861 OID_EVENT_TRACE_WITH_MSG(op
->get_req(), "DEQUEUE_OP_END", false);
9865 void OSD::dequeue_peering_evt(
9868 PGPeeringEventRef evt
,
9869 ThreadPool::TPHandle
& handle
)
9871 PG::RecoveryCtx rctx
= create_context();
9872 auto curmap
= sdata
->get_osdmap();
9873 epoch_t need_up_thru
= 0, same_interval_since
= 0;
9875 if (const MQuery
*q
= dynamic_cast<const MQuery
*>(evt
->evt
.get())) {
9876 handle_pg_query_nopg(*q
);
9878 derr
<< __func__
<< " unrecognized pg-less event " << evt
->get_desc() << dendl
;
9881 } else if (advance_pg(curmap
->get_epoch(), pg
, handle
, &rctx
)) {
9882 pg
->do_peering_event(evt
, &rctx
);
9883 if (pg
->is_deleted()) {
9884 // do not dispatch rctx; the final _delete_some already did it.
9885 discard_context(rctx
);
9889 dispatch_context_transaction(rctx
, pg
, &handle
);
9890 need_up_thru
= pg
->get_need_up_thru();
9891 same_interval_since
= pg
->get_same_interval_since();
9896 queue_want_up_thru(same_interval_since
);
9898 dispatch_context(rctx
, pg
, curmap
, &handle
);
9900 service
.send_pg_temp();
9903 void OSD::dequeue_delete(
9907 ThreadPool::TPHandle
& handle
)
9909 dequeue_peering_evt(
9913 std::make_shared
<PGPeeringEvent
>(
9921 // --------------------------------
9923 const char** OSD::get_tracked_conf_keys() const
9925 static const char* KEYS
[] = {
9926 "osd_max_backfills",
9927 "osd_min_recovery_priority",
9928 "osd_max_trimming_pgs",
9929 "osd_op_complaint_time",
9930 "osd_op_log_threshold",
9931 "osd_op_history_size",
9932 "osd_op_history_duration",
9933 "osd_op_history_slow_op_size",
9934 "osd_op_history_slow_op_threshold",
9935 "osd_enable_op_tracker",
9936 "osd_map_cache_size",
9937 "osd_pg_epoch_max_lag_factor",
9938 "osd_pg_epoch_persisted_max_stale",
9939 // clog & admin clog
9942 "clog_to_syslog_facility",
9943 "clog_to_syslog_level",
9944 "osd_objectstore_fuse",
9946 "clog_to_graylog_host",
9947 "clog_to_graylog_port",
9950 "osd_recovery_delay_start",
9951 "osd_client_message_size_cap",
9952 "osd_client_message_cap",
9953 "osd_heartbeat_min_size",
9954 "osd_heartbeat_interval",
9955 "osd_scrub_min_interval",
9956 "osd_scrub_max_interval",
9962 void OSD::handle_conf_change(const ConfigProxy
& conf
,
9963 const std::set
<std::string
> &changed
)
9965 Mutex::Locker
l(osd_lock
);
9966 if (changed
.count("osd_max_backfills")) {
9967 service
.local_reserver
.set_max(cct
->_conf
->osd_max_backfills
);
9968 service
.remote_reserver
.set_max(cct
->_conf
->osd_max_backfills
);
9970 if (changed
.count("osd_min_recovery_priority")) {
9971 service
.local_reserver
.set_min_priority(cct
->_conf
->osd_min_recovery_priority
);
9972 service
.remote_reserver
.set_min_priority(cct
->_conf
->osd_min_recovery_priority
);
9974 if (changed
.count("osd_max_trimming_pgs")) {
9975 service
.snap_reserver
.set_max(cct
->_conf
->osd_max_trimming_pgs
);
9977 if (changed
.count("osd_op_complaint_time") ||
9978 changed
.count("osd_op_log_threshold")) {
9979 op_tracker
.set_complaint_and_threshold(cct
->_conf
->osd_op_complaint_time
,
9980 cct
->_conf
->osd_op_log_threshold
);
9982 if (changed
.count("osd_op_history_size") ||
9983 changed
.count("osd_op_history_duration")) {
9984 op_tracker
.set_history_size_and_duration(cct
->_conf
->osd_op_history_size
,
9985 cct
->_conf
->osd_op_history_duration
);
9987 if (changed
.count("osd_op_history_slow_op_size") ||
9988 changed
.count("osd_op_history_slow_op_threshold")) {
9989 op_tracker
.set_history_slow_op_size_and_threshold(cct
->_conf
->osd_op_history_slow_op_size
,
9990 cct
->_conf
->osd_op_history_slow_op_threshold
);
9992 if (changed
.count("osd_enable_op_tracker")) {
9993 op_tracker
.set_tracking(cct
->_conf
->osd_enable_op_tracker
);
9995 if (changed
.count("osd_map_cache_size")) {
9996 service
.map_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
9997 service
.map_bl_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
9998 service
.map_bl_inc_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
10000 if (changed
.count("clog_to_monitors") ||
10001 changed
.count("clog_to_syslog") ||
10002 changed
.count("clog_to_syslog_level") ||
10003 changed
.count("clog_to_syslog_facility") ||
10004 changed
.count("clog_to_graylog") ||
10005 changed
.count("clog_to_graylog_host") ||
10006 changed
.count("clog_to_graylog_port") ||
10007 changed
.count("host") ||
10008 changed
.count("fsid")) {
10009 update_log_config();
10011 if (changed
.count("osd_pg_epoch_max_lag_factor")) {
10012 m_osd_pg_epoch_max_lag_factor
= conf
.get_val
<double>(
10013 "osd_pg_epoch_max_lag_factor");
10016 #ifdef HAVE_LIBFUSE
10017 if (changed
.count("osd_objectstore_fuse")) {
10019 enable_disable_fuse(false);
10024 if (changed
.count("osd_recovery_delay_start")) {
10025 service
.defer_recovery(cct
->_conf
->osd_recovery_delay_start
);
10026 service
.kick_recovery_queue();
10029 if (changed
.count("osd_client_message_cap")) {
10030 uint64_t newval
= cct
->_conf
->osd_client_message_cap
;
10031 Messenger::Policy pol
= client_messenger
->get_policy(entity_name_t::TYPE_CLIENT
);
10032 if (pol
.throttler_messages
&& newval
> 0) {
10033 pol
.throttler_messages
->reset_max(newval
);
10036 if (changed
.count("osd_client_message_size_cap")) {
10037 uint64_t newval
= cct
->_conf
->osd_client_message_size_cap
;
10038 Messenger::Policy pol
= client_messenger
->get_policy(entity_name_t::TYPE_CLIENT
);
10039 if (pol
.throttler_bytes
&& newval
> 0) {
10040 pol
.throttler_bytes
->reset_max(newval
);
10044 if (changed
.count("osd_scrub_min_interval") ||
10045 changed
.count("osd_scrub_max_interval")) {
10046 resched_all_scrubs();
10047 dout(0) << __func__
<< ": scrub interval change" << dendl
;
10052 void OSD::update_log_config()
10054 map
<string
,string
> log_to_monitors
;
10055 map
<string
,string
> log_to_syslog
;
10056 map
<string
,string
> log_channel
;
10057 map
<string
,string
> log_prio
;
10058 map
<string
,string
> log_to_graylog
;
10059 map
<string
,string
> log_to_graylog_host
;
10060 map
<string
,string
> log_to_graylog_port
;
10064 if (parse_log_client_options(cct
, log_to_monitors
, log_to_syslog
,
10065 log_channel
, log_prio
, log_to_graylog
,
10066 log_to_graylog_host
, log_to_graylog_port
,
10068 clog
->update_config(log_to_monitors
, log_to_syslog
,
10069 log_channel
, log_prio
, log_to_graylog
,
10070 log_to_graylog_host
, log_to_graylog_port
,
10072 derr
<< "log_to_monitors " << log_to_monitors
<< dendl
;
10075 void OSD::check_config()
10077 // some sanity checks
10078 if (cct
->_conf
->osd_map_cache_size
<= (int)cct
->_conf
->osd_pg_epoch_persisted_max_stale
+ 2) {
10079 clog
->warn() << "osd_map_cache_size (" << cct
->_conf
->osd_map_cache_size
<< ")"
10080 << " is not > osd_pg_epoch_persisted_max_stale ("
10081 << cct
->_conf
->osd_pg_epoch_persisted_max_stale
<< ")";
10085 // --------------------------------
10087 void OSD::get_latest_osdmap()
10089 dout(10) << __func__
<< " -- start" << dendl
;
10092 service
.objecter
->wait_for_latest_osdmap(&cond
);
10095 dout(10) << __func__
<< " -- finish" << dendl
;
10098 // --------------------------------
10100 int OSD::init_op_flags(OpRequestRef
& op
)
10102 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
10103 vector
<OSDOp
>::const_iterator iter
;
10105 // client flags have no bearing on whether an op is a read, write, etc.
10108 if (m
->has_flag(CEPH_OSD_FLAG_RWORDERED
)) {
10109 op
->set_force_rwordered();
10112 // set bits based on op codes, called methods.
10113 for (iter
= m
->ops
.begin(); iter
!= m
->ops
.end(); ++iter
) {
10114 if ((iter
->op
.op
== CEPH_OSD_OP_WATCH
&&
10115 iter
->op
.watch
.op
== CEPH_OSD_WATCH_OP_PING
)) {
10116 /* This a bit odd. PING isn't actually a write. It can't
10117 * result in an update to the object_info. PINGs also aren't
10118 * resent, so there's no reason to write out a log entry.
10120 * However, we pipeline them behind writes, so let's force
10121 * the write_ordered flag.
10123 op
->set_force_rwordered();
10125 if (ceph_osd_op_mode_modify(iter
->op
.op
))
10128 if (ceph_osd_op_mode_read(iter
->op
.op
))
10131 // set READ flag if there are src_oids
10132 if (iter
->soid
.oid
.name
.length())
10135 // set PGOP flag if there are PG ops
10136 if (ceph_osd_op_type_pg(iter
->op
.op
))
10139 if (ceph_osd_op_mode_cache(iter
->op
.op
))
10142 // check for ec base pool
10143 int64_t poolid
= m
->get_pg().pool();
10144 const pg_pool_t
*pool
= osdmap
->get_pg_pool(poolid
);
10145 if (pool
&& pool
->is_tier()) {
10146 const pg_pool_t
*base_pool
= osdmap
->get_pg_pool(pool
->tier_of
);
10147 if (base_pool
&& base_pool
->require_rollback()) {
10148 if ((iter
->op
.op
!= CEPH_OSD_OP_READ
) &&
10149 (iter
->op
.op
!= CEPH_OSD_OP_CHECKSUM
) &&
10150 (iter
->op
.op
!= CEPH_OSD_OP_CMPEXT
) &&
10151 (iter
->op
.op
!= CEPH_OSD_OP_STAT
) &&
10152 (iter
->op
.op
!= CEPH_OSD_OP_ISDIRTY
) &&
10153 (iter
->op
.op
!= CEPH_OSD_OP_UNDIRTY
) &&
10154 (iter
->op
.op
!= CEPH_OSD_OP_GETXATTR
) &&
10155 (iter
->op
.op
!= CEPH_OSD_OP_GETXATTRS
) &&
10156 (iter
->op
.op
!= CEPH_OSD_OP_CMPXATTR
) &&
10157 (iter
->op
.op
!= CEPH_OSD_OP_ASSERT_VER
) &&
10158 (iter
->op
.op
!= CEPH_OSD_OP_LIST_WATCHERS
) &&
10159 (iter
->op
.op
!= CEPH_OSD_OP_LIST_SNAPS
) &&
10160 (iter
->op
.op
!= CEPH_OSD_OP_SETALLOCHINT
) &&
10161 (iter
->op
.op
!= CEPH_OSD_OP_WRITEFULL
) &&
10162 (iter
->op
.op
!= CEPH_OSD_OP_ROLLBACK
) &&
10163 (iter
->op
.op
!= CEPH_OSD_OP_CREATE
) &&
10164 (iter
->op
.op
!= CEPH_OSD_OP_DELETE
) &&
10165 (iter
->op
.op
!= CEPH_OSD_OP_SETXATTR
) &&
10166 (iter
->op
.op
!= CEPH_OSD_OP_RMXATTR
) &&
10167 (iter
->op
.op
!= CEPH_OSD_OP_STARTSYNC
) &&
10168 (iter
->op
.op
!= CEPH_OSD_OP_COPY_GET
) &&
10169 (iter
->op
.op
!= CEPH_OSD_OP_COPY_FROM
)) {
10175 switch (iter
->op
.op
) {
10176 case CEPH_OSD_OP_CALL
:
10178 bufferlist::iterator bp
= const_cast<bufferlist
&>(iter
->indata
).begin();
10179 int is_write
, is_read
;
10180 string cname
, mname
;
10181 bp
.copy(iter
->op
.cls
.class_len
, cname
);
10182 bp
.copy(iter
->op
.cls
.method_len
, mname
);
10184 ClassHandler::ClassData
*cls
;
10185 int r
= class_handler
->open_class(cname
, &cls
);
10187 derr
<< "class " << cname
<< " open got " << cpp_strerror(r
) << dendl
;
10190 else if (r
!= -EPERM
) // propagate permission errors
10194 int flags
= cls
->get_method_flags(mname
.c_str());
10196 if (flags
== -ENOENT
)
10202 is_read
= flags
& CLS_METHOD_RD
;
10203 is_write
= flags
& CLS_METHOD_WR
;
10204 bool is_promote
= flags
& CLS_METHOD_PROMOTE
;
10206 dout(10) << "class " << cname
<< " method " << mname
<< " "
10207 << "flags=" << (is_read
? "r" : "")
10208 << (is_write
? "w" : "")
10209 << (is_promote
? "p" : "")
10212 op
->set_class_read();
10214 op
->set_class_write();
10217 op
->add_class(std::move(cname
), std::move(mname
), is_read
, is_write
,
10222 case CEPH_OSD_OP_WATCH
:
10223 // force the read bit for watch since it is depends on previous
10224 // watch state (and may return early if the watch exists) or, in
10225 // the case of ping, is simply a read op.
10228 case CEPH_OSD_OP_NOTIFY
:
10229 case CEPH_OSD_OP_NOTIFY_ACK
:
10235 case CEPH_OSD_OP_DELETE
:
10236 // if we get a delete with FAILOK we can skip handle cache. without
10237 // FAILOK we still need to promote (or do something smarter) to
10238 // determine whether to return ENOENT or 0.
10239 if (iter
== m
->ops
.begin() &&
10240 iter
->op
.flags
== CEPH_OSD_OP_FLAG_FAILOK
) {
10241 op
->set_skip_handle_cache();
10243 // skip promotion when proxying a delete op
10244 if (m
->ops
.size() == 1) {
10245 op
->set_skip_promote();
10249 case CEPH_OSD_OP_CACHE_TRY_FLUSH
:
10250 case CEPH_OSD_OP_CACHE_FLUSH
:
10251 case CEPH_OSD_OP_CACHE_EVICT
:
10252 // If try_flush/flush/evict is the only op, can skip handle cache.
10253 if (m
->ops
.size() == 1) {
10254 op
->set_skip_handle_cache();
10258 case CEPH_OSD_OP_READ
:
10259 case CEPH_OSD_OP_SYNC_READ
:
10260 case CEPH_OSD_OP_SPARSE_READ
:
10261 case CEPH_OSD_OP_CHECKSUM
:
10262 case CEPH_OSD_OP_WRITEFULL
:
10263 if (m
->ops
.size() == 1 &&
10264 (iter
->op
.flags
& CEPH_OSD_OP_FLAG_FADVISE_NOCACHE
||
10265 iter
->op
.flags
& CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
)) {
10266 op
->set_skip_promote();
10270 // force promotion when pin an object in cache tier
10271 case CEPH_OSD_OP_CACHE_PIN
:
10280 if (op
->rmw_flags
== 0)
10286 void OSD::set_perf_queries(
10287 const std::map
<OSDPerfMetricQuery
, OSDPerfMetricLimits
> &queries
) {
10288 dout(10) << "setting " << queries
.size() << " queries" << dendl
;
10290 std::list
<OSDPerfMetricQuery
> supported_queries
;
10291 for (auto &it
: queries
) {
10292 auto &query
= it
.first
;
10293 if (!query
.key_descriptor
.empty()) {
10294 supported_queries
.push_back(query
);
10297 if (supported_queries
.size() < queries
.size()) {
10298 dout(1) << queries
.size() - supported_queries
.size()
10299 << " unsupported queries" << dendl
;
10303 Mutex::Locker
locker(m_perf_queries_lock
);
10304 m_perf_queries
= supported_queries
;
10305 m_perf_limits
= queries
;
10308 std::vector
<PGRef
> pgs
;
10310 for (auto& pg
: pgs
) {
10311 if (pg
->is_primary()) {
10313 pg
->set_dynamic_perf_stats_queries(supported_queries
);
10319 void OSD::get_perf_reports(
10320 std::map
<OSDPerfMetricQuery
, OSDPerfMetricReport
> *reports
) {
10321 std::vector
<PGRef
> pgs
;
10323 DynamicPerfStats dps
;
10324 for (auto& pg
: pgs
) {
10325 if (pg
->is_primary()) {
10326 // m_perf_queries can be modified only in set_perf_queries by mgr client
10327 // request, and it is protected by by mgr client's lock, which is held
10328 // when set_perf_queries/get_perf_reports are called, so we may not hold
10329 // m_perf_queries_lock here.
10330 DynamicPerfStats
pg_dps(m_perf_queries
);
10332 pg
->get_dynamic_perf_stats(&pg_dps
);
10337 dps
.add_to_reports(m_perf_limits
, reports
);
10338 dout(20) << "reports for " << reports
->size() << " queries" << dendl
;
10341 // =============================================================
10343 #undef dout_context
10344 #define dout_context cct
10346 #define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
// Attach a PG to its shard slot: wire up the back-pointers, bump the
// OSD's PG count, and index the slot in pg_slots_by_epoch by the PG's
// current osdmap epoch.  Leading underscore suggests the caller must
// already hold shard_lock — TODO confirm at call sites.
10348 void OSDShard::_attach_pg(OSDShardPGSlot
*slot
, PG
*pg
)
10350 dout(10) << pg
->pg_id
<< " " << pg
<< dendl
;
10352 pg
->osd_shard
= this;
10353 pg
->pg_slot
= slot
;
10354 osd
->inc_num_pgs();
// Record the PG's map epoch on the slot and insert it into the
// intrusive epoch-ordered set used by get_min_pg_epoch().
10356 slot
->epoch
= pg
->get_osdmap_epoch();
10357 pg_slots_by_epoch
.insert(*slot
);
10360 void OSDShard::_detach_pg(OSDShardPGSlot
*slot
)
10362 dout(10) << slot
->pg
->pg_id
<< " " << slot
->pg
<< dendl
;
10363 slot
->pg
->osd_shard
= nullptr;
10364 slot
->pg
->pg_slot
= nullptr;
10365 slot
->pg
= nullptr;
10366 osd
->dec_num_pgs();
10368 pg_slots_by_epoch
.erase(pg_slots_by_epoch
.iterator_to(*slot
));
10370 if (waiting_for_min_pg_epoch
) {
10371 min_pg_epoch_cond
.notify_all();
// Re-index a slot under a new osdmap epoch `e`: remove it from the
// epoch-ordered set, update (between erase and insert — the assignment
// line is not visible in this extract), and re-insert.  Notifies
// min-epoch waiters since the minimum may have changed.
10375 void OSDShard::update_pg_epoch(OSDShardPGSlot
*slot
, epoch_t e
)
10377 std::lock_guard
l(shard_lock
);
10378 dout(30) << "min was " << pg_slots_by_epoch
.begin()->epoch
10379 << " on " << pg_slots_by_epoch
.begin()->pg
->pg_id
<< dendl
;
// Erase via iterator_to before mutating the key used for ordering.
10380 pg_slots_by_epoch
.erase(pg_slots_by_epoch
.iterator_to(*slot
));
10381 dout(20) << slot
->pg
->pg_id
<< " " << slot
->epoch
<< " -> " << e
<< dendl
;
10383 pg_slots_by_epoch
.insert(*slot
);
10384 dout(30) << "min is now " << pg_slots_by_epoch
.begin()->epoch
10385 << " on " << pg_slots_by_epoch
.begin()->pg
->pg_id
<< dendl
;
10386 if (waiting_for_min_pg_epoch
) {
10387 min_pg_epoch_cond
.notify_all();
// Return the smallest osdmap epoch among this shard's attached PGs.
// The begin() of the epoch-ordered intrusive set is the minimum; the
// return statements (empty-set case vs. p->epoch) fall on lines not
// visible in this extract.
10391 epoch_t
OSDShard::get_min_pg_epoch()
10393 std::lock_guard
l(shard_lock
);
10394 auto p
= pg_slots_by_epoch
.begin();
10395 if (p
== pg_slots_by_epoch
.end()) {
// Block until every PG on this shard has caught up to osdmap epoch
// `need` (or the shard has no PGs).  Uses the predicate overload of
// condition_variable::wait; waiting_for_min_pg_epoch counts waiters so
// notify_all is only issued when someone is waiting.
10401 void OSDShard::wait_min_pg_epoch(epoch_t need
)
10403 std::unique_lock l
{shard_lock
};
10404 ++waiting_for_min_pg_epoch
;
10405 min_pg_epoch_cond
.wait(l
, [need
, this] {
// Predicate: done when there are no PGs, or the minimum epoch has
// reached `need` (the `return true/false` lines are not visible in
// this extract).
10406 if (pg_slots_by_epoch
.empty()) {
10408 } else if (pg_slots_by_epoch
.begin()->epoch
>= need
) {
10411 dout(10) << need
<< " waiting on "
10412 << pg_slots_by_epoch
.begin()->epoch
<< dendl
;
10416 --waiting_for_min_pg_epoch
;
// Return the largest epoch any queued peering event on this shard is
// waiting for.  waiting_peering is keyed by epoch, so rbegin()->first
// is the per-slot maximum; `r` is accumulated across slots (its
// declaration/return lines are not visible in this extract).
10419 epoch_t
OSDShard::get_max_waiting_epoch()
10421 std::lock_guard
l(shard_lock
);
10423 for (auto& i
: pg_slots
) {
10424 if (!i
.second
->waiting_peering
.empty()) {
10425 r
= std::max(r
, i
.second
->waiting_peering
.rbegin()->first
);
// Install a new osdmap on this shard and reconcile every pg slot with
// it: requeue peering waiters whose epoch has arrived, drop stale
// waiting items for PGs that no longer map here (crediting their
// reserved pushes back via *pushes_to_free), and prune slots that have
// become completely empty.  Finally pokes one worker thread.
// NOTE(review): several interior lines (loop increments, `continue`s,
// closing braces) are missing from this extract; comments describe the
// visible statements only.
10431 void OSDShard::consume_map(
10432 OSDMapRef
& new_osdmap
,
10433 unsigned *pushes_to_free
)
10435 std::lock_guard
l(shard_lock
);
10436 OSDMapRef old_osdmap
;
// Swap in the new map under the (nested) osdmap_lock; keep the old
// ref alive until after logging.
10438 std::lock_guard
l(osdmap_lock
);
10439 old_osdmap
= std::move(shard_osdmap
);
10440 shard_osdmap
= new_osdmap
;
10442 dout(10) << new_osdmap
->get_epoch()
10443 << " (was " << (old_osdmap
? old_osdmap
->get_epoch() : 0) << ")"
10445 bool queued
= false;
// Walk every slot; `p` is advanced/erased inside the loop.
10448 auto p
= pg_slots
.begin();
10449 while (p
!= pg_slots
.end()) {
10450 OSDShardPGSlot
*slot
= p
->second
.get();
10451 const spg_t
& pgid
= p
->first
;
10452 dout(20) << __func__
<< " " << pgid
<< dendl
;
// Slots still waiting for a split or a future merge epoch are left
// alone for now.
10453 if (!slot
->waiting_for_split
.empty()) {
10454 dout(20) << __func__
<< " " << pgid
10455 << " waiting for split " << slot
->waiting_for_split
<< dendl
;
10459 if (slot
->waiting_for_merge_epoch
> new_osdmap
->get_epoch()) {
10460 dout(20) << __func__
<< " " << pgid
10461 << " waiting for merge by epoch " << slot
->waiting_for_merge_epoch
// Peering waiters become runnable once the map for their first
// pending epoch has arrived: requeue the whole slot.
10466 if (!slot
->waiting_peering
.empty()) {
10467 epoch_t first
= slot
->waiting_peering
.begin()->first
;
10468 if (first
<= new_osdmap
->get_epoch()) {
10469 dout(20) << __func__
<< " " << pgid
10470 << " pending_peering first epoch " << first
10471 << " <= " << new_osdmap
->get_epoch() << ", requeueing" << dendl
;
10472 _wake_pg_slot(pgid
, slot
);
// Non-peering waiters: keep them if the PG still maps to this OSD in
// the new map; otherwise drop every item at or below the new epoch,
// returning its reserved recovery pushes to the caller.
10478 if (!slot
->waiting
.empty()) {
10479 if (new_osdmap
->is_up_acting_osd_shard(pgid
, osd
->get_nodeid())) {
10480 dout(20) << __func__
<< " " << pgid
<< " maps to us, keeping"
10485 while (!slot
->waiting
.empty() &&
10486 slot
->waiting
.front().get_map_epoch() <= new_osdmap
->get_epoch()) {
10487 auto& qi
= slot
->waiting
.front();
10488 dout(20) << __func__
<< " " << pgid
10489 << " waiting item " << qi
10490 << " epoch " << qi
.get_map_epoch()
10491 << " <= " << new_osdmap
->get_epoch()
10493 << (qi
.get_map_epoch() < new_osdmap
->get_epoch() ? "stale" :
10495 << ", dropping" << dendl
;
10496 *pushes_to_free
+= qi
.get_reserved_pushes();
10497 slot
->waiting
.pop_front();
// Prune a slot with no waiters, no running work, and no pending
// split — nothing references it anymore.
10500 if (slot
->waiting
.empty() &&
10501 slot
->num_running
== 0 &&
10502 slot
->waiting_for_split
.empty() &&
10504 dout(20) << __func__
<< " " << pgid
<< " empty, pruning" << dendl
;
10505 p
= pg_slots
.erase(p
);
// Wake a worker so requeued items get processed.
10512 std::lock_guard l
{sdata_wait_lock
};
10513 sdata_cond
.notify_one();
// Requeue everything parked on a pg slot back onto the front of the
// work queue: to_process, waiting, and all waiting_peering items.
// Iteration is in reverse so that repeated _enqueue_front calls
// preserve the original ordering.  Bumps requeue_seq so racing
// _process invocations can detect the requeue.
10517 void OSDShard::_wake_pg_slot(
10519 OSDShardPGSlot
*slot
)
10521 dout(20) << __func__
<< " " << pgid
10522 << " to_process " << slot
->to_process
10523 << " waiting " << slot
->waiting
10524 << " waiting_peering " << slot
->waiting_peering
<< dendl
;
10525 for (auto i
= slot
->to_process
.rbegin();
10526 i
!= slot
->to_process
.rend();
10528 _enqueue_front(std::move(*i
), osd
->op_prio_cutoff
);
10530 slot
->to_process
.clear();
10531 for (auto i
= slot
->waiting
.rbegin();
10532 i
!= slot
->waiting
.rend();
10534 _enqueue_front(std::move(*i
), osd
->op_prio_cutoff
);
10536 slot
->waiting
.clear();
10537 for (auto i
= slot
->waiting_peering
.rbegin();
10538 i
!= slot
->waiting_peering
.rend();
10540 // this is overkill; we requeue everything, even if some of these
10541 // items are waiting for maps we don't have yet. FIXME, maybe,
10542 // someday, if we decide this inefficiency matters
10543 for (auto j
= i
->second
.rbegin(); j
!= i
->second
.rend(); ++j
) {
10544 _enqueue_front(std::move(*j
), osd
->op_prio_cutoff
);
10547 slot
->waiting_peering
.clear();
// Invalidate the requeue snapshot taken by any in-flight _process.
10548 ++slot
->requeue_seq
;
// For every pg slot on this shard, ask the OSD service to compute the
// splits and merges implied by moving from the shard's current map to
// as_of_osdmap, accumulating into *split_pgs / *merge_pgs.  Slots with
// no attached PG but a pending split still contribute split children
// (merges are skipped for those — nullptr).
10551 void OSDShard::identify_splits_and_merges(
10552 const OSDMapRef
& as_of_osdmap
,
10553 set
<pair
<spg_t
,epoch_t
>> *split_pgs
,
10554 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
10556 std::lock_guard
l(shard_lock
);
// Without a shard osdmap there is no "from" epoch to compare against.
10557 if (shard_osdmap
) {
10558 for (auto& i
: pg_slots
) {
10559 const spg_t
& pgid
= i
.first
;
10560 auto *slot
= i
.second
.get();
// (The branch condition on slot->pg falls on a line not visible in
// this extract.)
10562 osd
->service
.identify_splits_and_merges(
10563 shard_osdmap
, as_of_osdmap
, pgid
,
10564 split_pgs
, merge_pgs
);
10565 } else if (!slot
->waiting_for_split
.empty()) {
10566 osd
->service
.identify_splits_and_merges(
10567 shard_osdmap
, as_of_osdmap
, pgid
,
10568 split_pgs
, nullptr);
10570 dout(20) << __func__
<< " slot " << pgid
10571 << " has no pg and waiting_for_split "
10572 << slot
->waiting_for_split
<< dendl
;
// Pre-create ("prime") pg slots for the split children in *pgids.  If
// this shard's osdmap is already ahead of as_of_osdmap, also compute
// and prime any grandchildren produced between the two epochs, since
// this shard will never consume the intermediate maps.
10578 void OSDShard::prime_splits(const OSDMapRef
& as_of_osdmap
,
10579 set
<pair
<spg_t
,epoch_t
>> *pgids
)
10581 std::lock_guard
l(shard_lock
);
10582 _prime_splits(pgids
);
10583 if (shard_osdmap
->get_epoch() > as_of_osdmap
->get_epoch()) {
10584 set
<pair
<spg_t
,epoch_t
>> newer_children
;
10585 for (auto i
: *pgids
) {
10586 osd
->service
.identify_splits_and_merges(
10587 as_of_osdmap
, shard_osdmap
, i
.first
,
10588 &newer_children
, nullptr);
// Include the original children too, then prime the union.
10590 newer_children
.insert(pgids
->begin(), pgids
->end());
10591 dout(10) << "as_of_osdmap " << as_of_osdmap
->get_epoch() << " < shard "
10592 << shard_osdmap
->get_epoch() << ", new children " << newer_children
10594 _prime_splits(&newer_children
);
10595 // note: we don't care what is left over here for other shards.
10596 // if this shard is ahead of us and one isn't, e.g., one thread is
10597 // calling into prime_splits via _process (due to a newly created
10598 // pg) and this shard has a newer map due to a racing consume_map,
10599 // then any grandchildren left here will be identified (or were
10600 // identified) when the slower shard's osdmap is advanced.
10601 // _prime_splits() will tolerate the case where the pgid is
// Prime slots for split children that hash to THIS shard, consuming
// them from *pgids (entries for other shards are left in the set for
// the caller to distribute).  A missing slot is created; either way the
// split epoch is recorded in waiting_for_split.  Caller holds
// shard_lock (underscore-prefixed helper) — confirm at call sites.
10606 void OSDShard::_prime_splits(set
<pair
<spg_t
,epoch_t
>> *pgids
)
10608 dout(10) << *pgids
<< dendl
;
10609 auto p
= pgids
->begin();
10610 while (p
!= pgids
->end()) {
10611 unsigned shard_index
= p
->first
.hash_to_shard(osd
->num_shards
);
10612 if (shard_index
== shard_id
) {
// emplace with nullptr: creates the entry only if absent; r.second
// tells us which case we hit (the branch line itself is not visible
// in this extract).
10613 auto r
= pg_slots
.emplace(p
->first
, nullptr);
10615 dout(10) << "priming slot " << p
->first
<< " e" << p
->second
<< dendl
;
10616 r
.first
->second
= make_unique
<OSDShardPGSlot
>();
10617 r
.first
->second
->waiting_for_split
.insert(p
->second
);
// Existing-slot path: just record the split epoch.
10620 ceph_assert(q
!= pg_slots
.end());
10621 dout(10) << "priming (existing) slot " << p
->first
<< " e" << p
->second
10623 q
->second
->waiting_for_split
.insert(p
->second
);
// Consumed by this shard — remove from the caller's set.
10625 p
= pgids
->erase(p
);
// Ensure every merge participant in *merge_pgs that hashes to this
// shard has a slot and (if possible) an instantiated PG, consuming
// those entries from the set.  Participants that don't exist yet get an
// empty placeholder PG created (unless a pending split at an earlier
// epoch must complete first); all participants get
// waiting_for_merge_epoch stamped so consume_map holds their work
// until the merge epoch arrives.
10632 void OSDShard::prime_merges(const OSDMapRef
& as_of_osdmap
,
10633 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
10635 std::lock_guard
l(shard_lock
);
10636 dout(20) << __func__
<< " checking shard " << shard_id
10637 << " for remaining merge pgs " << merge_pgs
<< dendl
;
10638 auto p
= merge_pgs
->begin();
10639 while (p
!= merge_pgs
->end()) {
10640 spg_t pgid
= p
->first
;
10641 epoch_t epoch
= p
->second
;
10642 unsigned shard_index
= pgid
.hash_to_shard(osd
->num_shards
);
// Not ours — leave it in the set for the owning shard.
10643 if (shard_index
!= shard_id
) {
10647 OSDShardPGSlot
*slot
;
10648 auto r
= pg_slots
.emplace(pgid
, nullptr);
10650 r
.first
->second
= make_unique
<OSDShardPGSlot
>();
10652 slot
= r
.first
->second
.get();
// Branch 1 (condition line not visible): the participant PG already
// exists in the slot.
10655 dout(20) << __func__
<< " have merge participant pg " << pgid
10656 << " " << slot
->pg
<< dendl
;
// Branch 2: a split to this pgid at an earlier epoch is still
// pending — let the split finish; don't create a placeholder.
10657 } else if (!slot
->waiting_for_split
.empty() &&
10658 *slot
->waiting_for_split
.begin() < epoch
) {
10659 dout(20) << __func__
<< " pending split on merge participant pg " << pgid
10660 << " " << slot
->waiting_for_split
<< dendl
;
// Branch 3: fabricate an empty participant so the merge source
// exists; created as of (epoch - 1), i.e. just before the merge.
10662 dout(20) << __func__
<< " creating empty merge participant " << pgid
10663 << " for merge in " << epoch
<< dendl
;
10664 // leave history zeroed; PG::merge_from() will fill it in.
10665 pg_history_t history
;
10666 PGCreateInfo
cinfo(pgid
, epoch
- 1,
10667 history
, PastIntervals(), false);
10668 PGRef pg
= osd
->handle_pg_create_info(shard_osdmap
, &cinfo
);
10669 _attach_pg(r
.first
->second
.get(), pg
.get());
10670 _wake_pg_slot(pgid
, slot
);
10673 // mark slot for merge
10674 dout(20) << __func__
<< " marking merge participant " << pgid
<< dendl
;
10675 slot
->waiting_for_merge_epoch
= epoch
;
// Consumed by this shard.
10676 p
= merge_pgs
->erase(p
);
// A split child PG has been instantiated: attach it to its primed slot,
// clear the corresponding waiting_for_split epoch, and — once no more
// splits are pending for it — requeue the slot's parked work and queue
// a peering event so the child advances to the latest osdmap.
10680 void OSDShard::register_and_wake_split_child(PG
*pg
)
10684 std::lock_guard
l(shard_lock
);
10685 dout(10) << pg
->pg_id
<< " " << pg
<< dendl
;
// The slot must already exist (primed by _prime_splits) and must not
// yet hold a PG.
10686 auto p
= pg_slots
.find(pg
->pg_id
);
10687 ceph_assert(p
!= pg_slots
.end());
10688 auto *slot
= p
->second
.get();
10689 dout(20) << pg
->pg_id
<< " waiting_for_split " << slot
->waiting_for_split
10691 ceph_assert(!slot
->pg
);
10692 ceph_assert(!slot
->waiting_for_split
.empty());
10693 _attach_pg(slot
, pg
);
// This child's creation epoch must be one of the primed split epochs.
10695 epoch
= pg
->get_osdmap_epoch();
10696 ceph_assert(slot
->waiting_for_split
.count(epoch
));
10697 slot
->waiting_for_split
.erase(epoch
);
10698 if (slot
->waiting_for_split
.empty()) {
10699 _wake_pg_slot(pg
->pg_id
, slot
);
10701 dout(10) << __func__
<< " still waiting for split on "
10702 << slot
->waiting_for_split
<< dendl
;
10706 // kick child to ensure it pulls up to the latest osdmap
10707 osd
->enqueue_peering_evt(
10710 std::make_shared
<PGPeeringEvent
>(
// Wake a worker to pick up the requeued/enqueued work.
10715 std::lock_guard l
{sdata_wait_lock
};
10716 sdata_cond
.notify_one();
// Cancel primed split-child slots of `parent` (as computed against
// old_pg_num): requeue whatever was parked on each child slot, then
// erase the slot.  Erasure is deferred via to_delete so we don't
// invalidate the pg_slots iterator mid-loop.
10719 void OSDShard::unprime_split_children(spg_t parent
, unsigned old_pg_num
)
10721 std::lock_guard
l(shard_lock
);
10722 vector
<spg_t
> to_delete
;
10723 for (auto& i
: pg_slots
) {
// A slot is a child of `parent` if its ancestor at old_pg_num is the
// parent (and it isn't the parent itself).
10724 if (i
.first
!= parent
&&
10725 i
.first
.get_ancestor(old_pg_num
) == parent
) {
10726 dout(10) << __func__
<< " parent " << parent
<< " clearing " << i
.first
10728 _wake_pg_slot(i
.first
, i
.second
.get());
10729 to_delete
.push_back(i
.first
);
10732 for (auto pgid
: to_delete
) {
10733 pg_slots
.erase(pgid
);
10738 // =============================================================
10740 #undef dout_context
10741 #define dout_context osd->cct
10743 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
// Park a queue item on its pg slot until the map/pg it needs appears:
// peering items go into waiting_peering bucketed by the epoch they
// require; everything else goes onto the ordered waiting list.
10745 void OSD::ShardedOpWQ::_add_slot_waiter(
10747 OSDShardPGSlot
*slot
,
10750 if (qi
.is_peering()) {
10751 dout(20) << __func__
<< " " << pgid
10752 << " peering, item epoch is "
10753 << qi
.get_map_epoch()
10754 << ", will wait on " << qi
<< dendl
;
10755 slot
->waiting_peering
[qi
.get_map_epoch()].push_back(std::move(qi
));
10757 dout(20) << __func__
<< " " << pgid
10758 << " item epoch is "
10759 << qi
.get_map_epoch()
10760 << ", will wait on " << qi
<< dendl
;
10761 slot
->waiting
.push_back(std::move(qi
));
10766 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
// Worker-thread main loop body for the sharded op work queue.  One
// invocation: pick the shard for this thread, wait (if idle) for work,
// dequeue one OpQueueItem, resolve it against the pg slot / current
// shard osdmap (creating the PG when the item carries create info and
// this OSD is primary), then run it under a TPHandle.  Context-queue
// oncommits are drained by the lowest-indexed thread per shard to keep
// them ordered.
// NOTE(review): this extract is missing many interior lines (returns,
// continues, braces, the pg->lock() call, etc.); the comments below
// annotate only the visible statements and are intentionally sparse.
10768 void OSD::ShardedOpWQ::_process(uint32_t thread_index
, heartbeat_handle_d
*hb
)
10770 uint32_t shard_index
= thread_index
% osd
->num_shards
;
10771 auto& sdata
= osd
->shards
[shard_index
];
10772 ceph_assert(sdata
);
10774 // If all threads of shards do oncommits, there is a out-of-order
10775 // problem. So we choose the thread which has the smallest
10776 // thread_index(thread_index < num_shards) of shard to do oncommit
10778 bool is_smallest_thread_index
= thread_index
< osd
->num_shards
;
// --- idle path: sleep on sdata_cond until work or a context arrives.
10781 sdata
->shard_lock
.lock();
10782 if (sdata
->pqueue
->empty() &&
10783 (!is_smallest_thread_index
|| sdata
->context_queue
.empty())) {
10784 std::unique_lock wait_lock
{sdata
->sdata_wait_lock
};
10785 if (is_smallest_thread_index
&& !sdata
->context_queue
.empty()) {
10786 // we raced with a context_queue addition, don't wait
10787 wait_lock
.unlock();
10788 } else if (!sdata
->stop_waiting
) {
10789 dout(20) << __func__
<< " empty q, waiting" << dendl
;
// Suspend the heartbeat timeout while blocked — an idle wait is not
// a hang.
10790 osd
->cct
->get_heartbeat_map()->clear_timeout(hb
);
10791 sdata
->shard_lock
.unlock();
10792 sdata
->sdata_cond
.wait(wait_lock
);
10793 wait_lock
.unlock();
10794 sdata
->shard_lock
.lock();
10795 if (sdata
->pqueue
->empty() &&
10796 !(is_smallest_thread_index
&& !sdata
->context_queue
.empty())) {
10797 sdata
->shard_lock
.unlock();
10800 osd
->cct
->get_heartbeat_map()->reset_timeout(hb
,
10801 osd
->cct
->_conf
->threadpool_default_timeout
, 0);
// stop_waiting set (shutdown/drain): bail out immediately.
10803 dout(20) << __func__
<< " need return immediately" << dendl
;
10804 wait_lock
.unlock();
10805 sdata
->shard_lock
.unlock();
// --- grab pending oncommit contexts (smallest-index thread only).
10810 list
<Context
*> oncommits
;
10811 if (is_smallest_thread_index
&& !sdata
->context_queue
.empty()) {
10812 sdata
->context_queue
.swap(oncommits
);
10815 if (sdata
->pqueue
->empty()) {
10816 if (osd
->is_stopping()) {
10817 sdata
->shard_lock
.unlock();
10818 for (auto c
: oncommits
) {
10819 dout(10) << __func__
<< " discarding in-flight oncommit " << c
<< dendl
;
10822 return; // OSD shutdown, discard.
10824 sdata
->shard_lock
.unlock();
10825 handle_oncommits(oncommits
);
// --- dequeue one item.
10829 OpQueueItem item
= sdata
->pqueue
->dequeue();
10830 if (osd
->is_stopping()) {
10831 sdata
->shard_lock
.unlock();
10832 for (auto c
: oncommits
) {
10833 dout(10) << __func__
<< " discarding in-flight oncommit " << c
<< dendl
;
10836 return; // OSD shutdown, discard.
// --- find/create the pg slot for the item's ordering token and stage
// the item on to_process.
10839 const auto token
= item
.get_ordering_token();
10840 auto r
= sdata
->pg_slots
.emplace(token
, nullptr);
10842 r
.first
->second
= make_unique
<OSDShardPGSlot
>();
10844 OSDShardPGSlot
*slot
= r
.first
->second
.get();
10845 dout(20) << __func__
<< " " << token
10846 << (r
.second
? " (new)" : "")
10847 << " to_process " << slot
->to_process
10848 << " waiting " << slot
->waiting
10849 << " waiting_peering " << slot
->waiting_peering
10851 slot
->to_process
.push_back(std::move(item
));
10852 dout(20) << __func__
<< " " << slot
->to_process
.back()
10853 << " queued" << dendl
;
10856 PGRef pg
= slot
->pg
;
10858 // lock pg (if we have it)
10860 // note the requeue seq now...
10861 uint64_t requeue_seq
= slot
->requeue_seq
;
10862 ++slot
->num_running
;
// Drop shard_lock while (not visible here) taking the pg lock, then
// re-validate the slot — it may have been removed or requeued.
10864 sdata
->shard_lock
.unlock();
10865 osd
->service
.maybe_inject_dispatch_delay();
10867 osd
->service
.maybe_inject_dispatch_delay();
10868 sdata
->shard_lock
.lock();
10870 auto q
= sdata
->pg_slots
.find(token
);
10871 if (q
== sdata
->pg_slots
.end()) {
10872 // this can happen if we race with pg removal.
10873 dout(20) << __func__
<< " slot " << token
<< " no longer there" << dendl
;
10875 sdata
->shard_lock
.unlock();
10876 handle_oncommits(oncommits
);
10879 slot
= q
->second
.get();
10880 --slot
->num_running
;
10882 if (slot
->to_process
.empty()) {
10883 // raced with _wake_pg_slot or consume_map
10884 dout(20) << __func__
<< " " << token
10885 << " nothing queued" << dendl
;
10887 sdata
->shard_lock
.unlock();
10888 handle_oncommits(oncommits
);
10891 if (requeue_seq
!= slot
->requeue_seq
) {
10892 dout(20) << __func__
<< " " << token
10893 << " requeue_seq " << slot
->requeue_seq
<< " > our "
10894 << requeue_seq
<< ", we raced with _wake_pg_slot"
10897 sdata
->shard_lock
.unlock();
10898 handle_oncommits(oncommits
);
10901 if (slot
->pg
!= pg
) {
10902 // this can happen if we race with pg removal.
10903 dout(20) << __func__
<< " slot " << token
<< " no longer attached to "
10910 dout(20) << __func__
<< " " << token
10911 << " to_process " << slot
->to_process
10912 << " waiting " << slot
->waiting
10913 << " waiting_peering " << slot
->waiting_peering
<< dendl
;
10915 ThreadPool::TPHandle
tp_handle(osd
->cct
, hb
, timeout_interval
,
// --- take the front item and decide what to do with it.
10919 auto qi
= std::move(slot
->to_process
.front());
10920 slot
->to_process
.pop_front();
10921 dout(20) << __func__
<< " " << qi
<< " pg " << pg
<< dendl
;
10922 set
<pair
<spg_t
,epoch_t
>> new_children
;
10926 // should this pg shard exist on this osd in this (or a later) epoch?
10927 osdmap
= sdata
->shard_osdmap
;
10928 const PGCreateInfo
*create_info
= qi
.creates_pg();
10929 if (!slot
->waiting_for_split
.empty()) {
10930 dout(20) << __func__
<< " " << token
10931 << " splitting " << slot
->waiting_for_split
<< dendl
;
10932 _add_slot_waiter(token
, slot
, std::move(qi
));
10933 } else if (qi
.get_map_epoch() > osdmap
->get_epoch()) {
10934 dout(20) << __func__
<< " " << token
10935 << " map " << qi
.get_map_epoch() << " > "
10936 << osdmap
->get_epoch() << dendl
;
10937 _add_slot_waiter(token
, slot
, std::move(qi
));
10938 } else if (qi
.is_peering()) {
10939 if (!qi
.peering_requires_pg()) {
10940 // for pg-less events, we run them under the ordering lock, since
10941 // we don't have the pg lock to keep them ordered.
10942 qi
.run(osd
, sdata
, pg
, tp_handle
);
10943 } else if (osdmap
->is_up_acting_osd_shard(token
, osd
->whoami
)) {
// Item wants to create the pg; ignore a mon-initiated create if we
// are no longer the acting primary in the current map.
10945 if (create_info
->by_mon
&&
10946 osdmap
->get_pg_acting_primary(token
.pgid
) != osd
->whoami
) {
10947 dout(20) << __func__
<< " " << token
10948 << " no pg, no longer primary, ignoring mon create on "
10951 dout(20) << __func__
<< " " << token
10952 << " no pg, should create on " << qi
<< dendl
;
10953 pg
= osd
->handle_pg_create_info(osdmap
, create_info
);
10955 // we created the pg! drop out and continue "normally"!
10956 sdata
->_attach_pg(slot
, pg
.get());
10957 sdata
->_wake_pg_slot(token
, slot
);
10959 // identify split children between create epoch and shard epoch.
10960 osd
->service
.identify_splits_and_merges(
10961 pg
->get_osdmap(), osdmap
, pg
->pg_id
, &new_children
, nullptr);
10962 sdata
->_prime_splits(&new_children
);
10963 // distribute remaining split children to other shards below!
10966 dout(20) << __func__
<< " ignored create on " << qi
<< dendl
;
10969 dout(20) << __func__
<< " " << token
10970 << " no pg, peering, !create, discarding " << qi
<< dendl
;
10973 dout(20) << __func__
<< " " << token
10974 << " no pg, peering, doesn't map here e" << osdmap
->get_epoch()
10975 << ", discarding " << qi
10978 } else if (osdmap
->is_up_acting_osd_shard(token
, osd
->whoami
)) {
// Non-peering item for a pg that should exist here but doesn't yet:
// park it until the pg appears.
10979 dout(20) << __func__
<< " " << token
10980 << " no pg, should exist e" << osdmap
->get_epoch()
10981 << ", will wait on " << qi
<< dendl
;
10982 _add_slot_waiter(token
, slot
, std::move(qi
));
10984 dout(20) << __func__
<< " " << token
10985 << " no pg, shouldn't exist e" << osdmap
->get_epoch()
10986 << ", dropping " << qi
<< dendl
;
10987 // share map with client?
10988 if (boost::optional
<OpRequestRef
> _op
= qi
.maybe_get_op()) {
10989 auto priv
= (*_op
)->get_req()->get_connection()->get_priv();
10990 if (auto session
= static_cast<Session
*>(priv
.get()); session
) {
10991 osd
->maybe_share_map(session
, *_op
, sdata
->shard_osdmap
);
// Dropped item: release any recovery pushes it had reserved.
10994 unsigned pushes_to_free
= qi
.get_reserved_pushes();
10995 if (pushes_to_free
> 0) {
10996 sdata
->shard_lock
.unlock();
10997 osd
->service
.release_reserved_pushes(pushes_to_free
);
10998 handle_oncommits(oncommits
);
11002 sdata
->shard_lock
.unlock();
11003 handle_oncommits(oncommits
);
// Re-check peering items against the (possibly updated) shard map.
11006 if (qi
.is_peering()) {
11007 OSDMapRef osdmap
= sdata
->shard_osdmap
;
11008 if (qi
.get_map_epoch() > osdmap
->get_epoch()) {
11009 _add_slot_waiter(token
, slot
, std::move(qi
));
11010 sdata
->shard_lock
.unlock();
11012 handle_oncommits(oncommits
);
11016 sdata
->shard_lock
.unlock();
// Hand leftover split children (for other shards) to every shard.
11018 if (!new_children
.empty()) {
11019 for (auto shard
: osd
->shards
) {
11020 shard
->prime_splits(osdmap
, &new_children
);
11022 ceph_assert(new_children
.empty());
11025 // osd_opwq_process marks the point at which an operation has been dequeued
11026 // and will begin to be handled by a worker thread.
11030 if (boost::optional
<OpRequestRef
> _op
= qi
.maybe_get_op()) {
11031 reqid
= (*_op
)->get_reqid();
11034 tracepoint(osd
, opwq_process_start
, reqid
.name
._type
,
11035 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
// Debug-only dump of the queue state at dequeue time.
11038 lgeneric_subdout(osd
->cct
, osd
, 30) << "dequeue status: ";
11039 Formatter
*f
= Formatter::create("json");
11040 f
->open_object_section("q");
11042 f
->close_section();
// Finally run the item (with pg lock held — acquisition not visible
// in this extract).
11047 qi
.run(osd
, sdata
, pg
, tp_handle
);
11052 if (boost::optional
<OpRequestRef
> _op
= qi
.maybe_get_op()) {
11053 reqid
= (*_op
)->get_reqid();
11056 tracepoint(osd
, opwq_process_finish
, reqid
.name
._type
,
11057 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
11060 handle_oncommits(oncommits
);
// Enqueue an item on the shard selected by hashing its ordering token.
// Items at or above op_prio_cutoff go into the strict-priority queue;
// the rest are cost-weighted.  A sleeping worker is then woken.
11063 void OSD::ShardedOpWQ::_enqueue(OpQueueItem
&& item
) {
11064 uint32_t shard_index
=
11065 item
.get_ordering_token().hash_to_shard(osd
->shards
.size());
11067 OSDShard
* sdata
= osd
->shards
[shard_index
];
11068 assert (NULL
!= sdata
);
11069 unsigned priority
= item
.get_priority();
11070 unsigned cost
= item
.get_cost();
11071 sdata
->shard_lock
.lock();
11073 dout(20) << __func__
<< " " << item
<< dendl
;
11074 if (priority
>= osd
->op_prio_cutoff
)
11075 sdata
->pqueue
->enqueue_strict(
11076 item
.get_owner(), priority
, std::move(item
));
11078 sdata
->pqueue
->enqueue(
11079 item
.get_owner(), priority
, cost
, std::move(item
));
11080 sdata
->shard_lock
.unlock();
// Notify outside shard_lock, under the dedicated wait lock.
11082 std::lock_guard l
{sdata
->sdata_wait_lock
};
11083 sdata
->sdata_cond
.notify_one();
// Requeue an item at the FRONT of its shard's queue, preserving
// ordering against any item a racing _process has already staged on
// the slot's to_process list: in that case the old item is pushed to
// the front of to_process and the newest staged item is bumped back
// into the pqueue instead.
11086 void OSD::ShardedOpWQ::_enqueue_front(OpQueueItem
&& item
)
11088 auto shard_index
= item
.get_ordering_token().hash_to_shard(osd
->shards
.size());
11089 auto& sdata
= osd
->shards
[shard_index
];
11090 ceph_assert(sdata
);
11091 sdata
->shard_lock
.lock();
11092 auto p
= sdata
->pg_slots
.find(item
.get_ordering_token());
11093 if (p
!= sdata
->pg_slots
.end() &&
11094 !p
->second
->to_process
.empty()) {
11095 // we may be racing with _process, which has dequeued a new item
11096 // from pqueue, put it on to_process, and is now busy taking the
11097 // pg lock. ensure this old requeued item is ordered before any
11098 // such newer item in to_process.
11099 p
->second
->to_process
.push_front(std::move(item
));
// Swap roles: the back (newest) to_process item goes back to pqueue.
11100 item
= std::move(p
->second
->to_process
.back());
11101 p
->second
->to_process
.pop_back();
11102 dout(20) << __func__
11103 << " " << p
->second
->to_process
.front()
11104 << " shuffled w/ " << item
<< dendl
;
11106 dout(20) << __func__
<< " " << item
<< dendl
;
11108 sdata
->_enqueue_front(std::move(item
), osd
->op_prio_cutoff
);
11109 sdata
->shard_lock
.unlock();
11110 std::lock_guard l
{sdata
->sdata_wait_lock
};
11111 sdata
->sdata_cond
.notify_one();
// Admin-socket "heap" command: forward a tcmalloc heap-profiler
// subcommand (and optional value) to ceph_heap_profiler_handle_command.
// Fails with -EOPNOTSUPP when the build does not use tcmalloc.
// NOTE(review): several interior lines (return statements, `cmd`/`val`
// declarations) are missing from this extract.
11115 namespace osd_cmds
{
11117 int heap(CephContext
& cct
, const cmdmap_t
& cmdmap
, Formatter
& f
,
11120 if (!ceph_using_tcmalloc()) {
11121 os
<< "could not issue heap profiler command -- not using tcmalloc!";
11122 return -EOPNOTSUPP
;
// Required "heapcmd" argument (e.g. dump/start_profiler/...).
11126 if (!cmd_getval(&cct
, cmdmap
, "heapcmd", cmd
)) {
11127 os
<< "unable to get value for command \"" << cmd
<< "\"";
11131 std::vector
<std::string
> cmd_vec
;
11132 get_str_vec(cmd
, cmd_vec
);
// Optional "value" argument is appended as an extra token.
11135 if (cmd_getval(&cct
, cmdmap
, "value", val
)) {
11136 cmd_vec
.push_back(val
);
11139 ceph_heap_profiler_handle_command(cmd_vec
, os
);
11144 }} // namespace ceph::osd_cmds
11147 std::ostream
& operator<<(std::ostream
& out
, const io_queue
& q
) {
11149 case io_queue::prioritized
:
11150 out
<< "prioritized";
11152 case io_queue::weightedpriority
:
11153 out
<< "weightedpriority";
11155 case io_queue::mclock_opclass
:
11156 out
<< "mclock_opclass";
11158 case io_queue::mclock_client
:
11159 out
<< "mclock_client";