1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
23 #include <boost/scoped_ptr.hpp>
25 #ifdef HAVE_SYS_PARAM_H
26 #include <sys/param.h>
29 #ifdef HAVE_SYS_MOUNT_H
30 #include <sys/mount.h>
35 #include "include/types.h"
36 #include "include/compat.h"
41 #include "osdc/Objecter.h"
43 #include "common/errno.h"
44 #include "common/ceph_argparse.h"
45 #include "common/version.h"
46 #include "common/io_priority.h"
48 #include "os/ObjectStore.h"
50 #include "os/FuseStore.h"
53 #include "PrimaryLogPG.h"
56 #include "msg/Messenger.h"
57 #include "msg/Message.h"
59 #include "mon/MonClient.h"
61 #include "messages/MLog.h"
63 #include "messages/MGenericMessage.h"
64 #include "messages/MPing.h"
65 #include "messages/MOSDPing.h"
66 #include "messages/MOSDFailure.h"
67 #include "messages/MOSDMarkMeDown.h"
68 #include "messages/MOSDFull.h"
69 #include "messages/MOSDOp.h"
70 #include "messages/MOSDOpReply.h"
71 #include "messages/MOSDBackoff.h"
72 #include "messages/MOSDBeacon.h"
73 #include "messages/MOSDRepOp.h"
74 #include "messages/MOSDRepOpReply.h"
75 #include "messages/MOSDBoot.h"
76 #include "messages/MOSDPGTemp.h"
78 #include "messages/MOSDMap.h"
79 #include "messages/MMonGetOSDMap.h"
80 #include "messages/MOSDPGNotify.h"
81 #include "messages/MOSDPGQuery.h"
82 #include "messages/MOSDPGLog.h"
83 #include "messages/MOSDPGRemove.h"
84 #include "messages/MOSDPGInfo.h"
85 #include "messages/MOSDPGCreate.h"
86 #include "messages/MOSDPGTrim.h"
87 #include "messages/MOSDPGScan.h"
88 #include "messages/MOSDPGBackfill.h"
89 #include "messages/MBackfillReserve.h"
90 #include "messages/MRecoveryReserve.h"
91 #include "messages/MOSDECSubOpWrite.h"
92 #include "messages/MOSDECSubOpWriteReply.h"
93 #include "messages/MOSDECSubOpRead.h"
94 #include "messages/MOSDECSubOpReadReply.h"
95 #include "messages/MOSDPGCreated.h"
96 #include "messages/MOSDPGUpdateLogMissing.h"
97 #include "messages/MOSDPGUpdateLogMissingReply.h"
99 #include "messages/MOSDAlive.h"
101 #include "messages/MOSDScrub.h"
102 #include "messages/MOSDScrubReserve.h"
103 #include "messages/MOSDRepScrub.h"
105 #include "messages/MMonCommand.h"
106 #include "messages/MCommand.h"
107 #include "messages/MCommandReply.h"
109 #include "messages/MPGStats.h"
110 #include "messages/MPGStatsAck.h"
112 #include "messages/MWatchNotify.h"
113 #include "messages/MOSDPGPush.h"
114 #include "messages/MOSDPGPushReply.h"
115 #include "messages/MOSDPGPull.h"
117 #include "common/perf_counters.h"
118 #include "common/Timer.h"
119 #include "common/LogClient.h"
120 #include "common/AsyncReserver.h"
121 #include "common/HeartbeatMap.h"
122 #include "common/admin_socket.h"
123 #include "common/ceph_context.h"
125 #include "global/signal_handler.h"
126 #include "global/pidfile.h"
128 #include "include/color.h"
129 #include "perfglue/cpu_profiler.h"
130 #include "perfglue/heap_profiler.h"
132 #include "osd/OpRequest.h"
134 #include "auth/AuthAuthorizeHandler.h"
135 #include "auth/RotatingKeyRing.h"
136 #include "common/errno.h"
138 #include "objclass/objclass.h"
140 #include "common/cmdparse.h"
141 #include "include/str_list.h"
142 #include "include/util.h"
144 #include "include/assert.h"
145 #include "common/config.h"
146 #include "common/EventTrace.h"
149 #define TRACEPOINT_DEFINE
150 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
151 #include "tracing/osd.h"
152 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
153 #undef TRACEPOINT_DEFINE
155 #define tracepoint(...)
158 #define dout_context cct
159 #define dout_subsys ceph_subsys_osd
161 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
// Period (seconds) of the OSD's main tick; also used below to size the
// per-interval promote throttles (see promote_throttle_recalibrate).
163 const double OSD::OSD_TICK_INTERVAL
= 1.0;
// Build the dout/derr log prefix "osd.<id> <epoch> " used by the
// dout_prefix macro above. (Closing brace elided in this extract.)
165 static ostream
& _prefix(std::ostream
* _dout
, int whoami
, epoch_t epoch
) {
166 return *_dout
<< "osd." << whoami
<< " " << epoch
<< " ";
// RunVis visitor: a queued client/replica op — hand it to the OSD's
// dequeue_op for this PG, passing along the thread-pool handle.
169 void PGQueueable::RunVis::operator()(const OpRequestRef
&op
) {
170 return osd
->dequeue_op(pg
, op
, handle
);
// RunVis visitor: a queued snap-trim item — run the PG's snap trimmer
// at the epoch the item was queued.
173 void PGQueueable::RunVis::operator()(const PGSnapTrim
&op
) {
174 return pg
->snap_trimmer(op
.epoch_queued
);
// RunVis visitor: a queued scrub item — scrub the PG at the queued
// epoch, passing the thread-pool handle for heartbeat purposes.
177 void PGQueueable::RunVis::operator()(const PGScrub
&op
) {
178 return pg
->scrub(op
.epoch_queued
, handle
);
// RunVis visitor: a queued recovery item — let the OSD drive recovery on
// the raw PG pointer with the number of pushes reserved for this round.
181 void PGQueueable::RunVis::operator()(const PGRecovery
&op
) {
182 return osd
->do_recovery(pg
.get(), op
.epoch_queued
, op
.reserved_pushes
, handle
);
185 //Initial features in new superblock.
186 //Features here are also automatically upgraded
// Build the CompatSet written into a freshly created OSD superblock:
// compat and ro-compat sets are left empty; every incompat feature this
// code knows about (BASE through FASTINFO) is marked required.
187 CompatSet
OSD::get_osd_initial_compat_set() {
188 CompatSet::FeatureSet ceph_osd_feature_compat
;
189 CompatSet::FeatureSet ceph_osd_feature_ro_compat
;
190 CompatSet::FeatureSet ceph_osd_feature_incompat
;
191 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE
);
192 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO
);
193 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC
);
194 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC
);
195 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES
);
196 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL
);
197 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO
);
198 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO
);
199 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG
);
200 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER
);
201 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS
);
202 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA
);
203 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING
);
204 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO
);
205 return CompatSet(ceph_osd_feature_compat
, ceph_osd_feature_ro_compat
,
206 ceph_osd_feature_incompat
);
209 //Features are added here that this OSD supports.
// Runtime compat set: start from the initial-superblock features, then add
// features (here SHARDS) that may be enabled by code but are never written
// into a brand-new superblock. (Return statement elided in this extract.)
210 CompatSet
OSD::get_osd_compat_set() {
211 CompatSet compat
= get_osd_initial_compat_set();
212 //Any features here can be set in code, but not in initial superblock
213 compat
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
);
217 OSDService::OSDService(OSD
*osd
) :
220 meta_osr(new ObjectStore::Sequencer("meta")),
221 whoami(osd
->whoami
), store(osd
->store
),
222 log_client(osd
->log_client
), clog(osd
->clog
),
223 pg_recovery_stats(osd
->pg_recovery_stats
),
224 cluster_messenger(osd
->cluster_messenger
),
225 client_messenger(osd
->client_messenger
),
227 recoverystate_perf(osd
->recoverystate_perf
),
229 peering_wq(osd
->peering_wq
),
230 recovery_gen_wq("recovery_gen_wq", cct
->_conf
->osd_recovery_thread_timeout
,
232 class_handler(osd
->class_handler
),
233 pg_epoch_lock("OSDService::pg_epoch_lock"),
234 publish_lock("OSDService::publish_lock"),
235 pre_publish_lock("OSDService::pre_publish_lock"),
237 peer_map_epoch_lock("OSDService::peer_map_epoch_lock"),
238 sched_scrub_lock("OSDService::sched_scrub_lock"), scrubs_pending(0),
240 agent_lock("OSDService::agent_lock"),
241 agent_valid_iterator(false),
243 flush_mode_high_count(0),
246 agent_stop_flag(false),
247 agent_timer_lock("OSDService::agent_timer_lock"),
248 agent_timer(osd
->client_messenger
->cct
, agent_timer_lock
),
249 last_recalibrate(ceph_clock_now()),
250 promote_max_objects(0),
251 promote_max_bytes(0),
252 objecter(new Objecter(osd
->client_messenger
->cct
, osd
->objecter_messenger
, osd
->monc
, NULL
, 0, 0)),
253 objecter_finisher(osd
->client_messenger
->cct
),
254 watch_lock("OSDService::watch_lock"),
255 watch_timer(osd
->client_messenger
->cct
, watch_lock
),
257 recovery_request_lock("OSDService::recovery_request_lock"),
258 recovery_request_timer(cct
, recovery_request_lock
, false),
259 reserver_finisher(cct
),
260 local_reserver(&reserver_finisher
, cct
->_conf
->osd_max_backfills
,
261 cct
->_conf
->osd_min_recovery_priority
),
262 remote_reserver(&reserver_finisher
, cct
->_conf
->osd_max_backfills
,
263 cct
->_conf
->osd_min_recovery_priority
),
264 pg_temp_lock("OSDService::pg_temp_lock"),
265 snap_sleep_lock("OSDService::snap_sleep_lock"),
267 osd
->client_messenger
->cct
, snap_sleep_lock
, false /* relax locking */),
268 snap_reserver(&reserver_finisher
,
269 cct
->_conf
->osd_max_trimming_pgs
),
270 recovery_lock("OSDService::recovery_lock"),
271 recovery_ops_active(0),
272 recovery_ops_reserved(0),
273 recovery_paused(false),
274 map_cache_lock("OSDService::map_cache_lock"),
275 map_cache(cct
, cct
->_conf
->osd_map_cache_size
),
276 map_bl_cache(cct
->_conf
->osd_map_cache_size
),
277 map_bl_inc_cache(cct
->_conf
->osd_map_cache_size
),
278 in_progress_split_lock("OSDService::in_progress_split_lock"),
279 stat_lock("OSDService::stat_lock"),
280 full_status_lock("OSDService::full_status_lock"),
283 epoch_lock("OSDService::epoch_lock"),
284 boot_epoch(0), up_epoch(0), bind_epoch(0),
285 is_stopping_lock("OSDService::is_stopping_lock")
287 , pgid_lock("OSDService::pgid_lock")
293 OSDService::~OSDService()
298 void OSDService::_start_split(spg_t parent
, const set
<spg_t
> &children
)
300 for (set
<spg_t
>::const_iterator i
= children
.begin();
303 dout(10) << __func__
<< ": Starting split on pg " << *i
304 << ", parent=" << parent
<< dendl
;
305 assert(!pending_splits
.count(*i
));
306 assert(!in_progress_splits
.count(*i
));
307 pending_splits
.insert(make_pair(*i
, parent
));
309 assert(!rev_pending_splits
[parent
].count(*i
));
310 rev_pending_splits
[parent
].insert(*i
);
314 void OSDService::mark_split_in_progress(spg_t parent
, const set
<spg_t
> &children
)
316 Mutex::Locker
l(in_progress_split_lock
);
317 map
<spg_t
, set
<spg_t
> >::iterator piter
= rev_pending_splits
.find(parent
);
318 assert(piter
!= rev_pending_splits
.end());
319 for (set
<spg_t
>::const_iterator i
= children
.begin();
322 assert(piter
->second
.count(*i
));
323 assert(pending_splits
.count(*i
));
324 assert(!in_progress_splits
.count(*i
));
325 assert(pending_splits
[*i
] == parent
);
327 pending_splits
.erase(*i
);
328 piter
->second
.erase(*i
);
329 in_progress_splits
.insert(*i
);
331 if (piter
->second
.empty())
332 rev_pending_splits
.erase(piter
);
// Public entry point: take in_progress_split_lock, then delegate to the
// recursive _cancel_pending_splits_for_parent helper.
335 void OSDService::cancel_pending_splits_for_parent(spg_t parent
)
337 Mutex::Locker
l(in_progress_split_lock
);
338 _cancel_pending_splits_for_parent(parent
);
341 void OSDService::_cancel_pending_splits_for_parent(spg_t parent
)
343 map
<spg_t
, set
<spg_t
> >::iterator piter
= rev_pending_splits
.find(parent
);
344 if (piter
== rev_pending_splits
.end())
347 for (set
<spg_t
>::iterator i
= piter
->second
.begin();
348 i
!= piter
->second
.end();
350 assert(pending_splits
.count(*i
));
351 assert(!in_progress_splits
.count(*i
));
352 pending_splits
.erase(*i
);
353 dout(10) << __func__
<< ": Completing split on pg " << *i
354 << " for parent: " << parent
<< dendl
;
355 _cancel_pending_splits_for_parent(*i
);
357 rev_pending_splits
.erase(piter
);
360 void OSDService::_maybe_split_pgid(OSDMapRef old_map
,
364 assert(old_map
->have_pg_pool(pgid
.pool()));
365 int old_pgnum
= old_map
->get_pg_num(pgid
.pool());
366 if (pgid
.ps() < static_cast<unsigned>(old_pgnum
)) {
368 if (pgid
.is_split(old_pgnum
,
369 new_map
->get_pg_num(pgid
.pool()), &children
)) {
370 _start_split(pgid
, children
); }
372 assert(pgid
.ps() < static_cast<unsigned>(new_map
->get_pg_num(pgid
.pool())));
376 void OSDService::init_splits_between(spg_t pgid
,
380 // First, check whether we can avoid this potentially expensive check
381 if (tomap
->have_pg_pool(pgid
.pool()) &&
383 frommap
->get_pg_num(pgid
.pool()),
384 tomap
->get_pg_num(pgid
.pool()),
386 // Ok, a split happened, so we need to walk the osdmaps
387 set
<spg_t
> new_pgs
; // pgs to scan on each map
388 new_pgs
.insert(pgid
);
389 OSDMapRef
curmap(get_map(frommap
->get_epoch()));
390 for (epoch_t e
= frommap
->get_epoch() + 1;
391 e
<= tomap
->get_epoch();
393 OSDMapRef
nextmap(try_get_map(e
));
396 set
<spg_t
> even_newer_pgs
; // pgs added in this loop
397 for (set
<spg_t
>::iterator i
= new_pgs
.begin(); i
!= new_pgs
.end(); ++i
) {
398 set
<spg_t
> split_pgs
;
399 if (i
->is_split(curmap
->get_pg_num(i
->pool()),
400 nextmap
->get_pg_num(i
->pool()),
402 start_split(*i
, split_pgs
);
403 even_newer_pgs
.insert(split_pgs
.begin(), split_pgs
.end());
406 new_pgs
.insert(even_newer_pgs
.begin(), even_newer_pgs
.end());
409 assert(curmap
== tomap
); // we must have had both frommap and tomap
413 void OSDService::expand_pg_num(OSDMapRef old_map
,
416 Mutex::Locker
l(in_progress_split_lock
);
417 for (set
<spg_t
>::iterator i
= in_progress_splits
.begin();
418 i
!= in_progress_splits
.end();
420 if (!new_map
->have_pg_pool(i
->pool())) {
421 in_progress_splits
.erase(i
++);
423 _maybe_split_pgid(old_map
, new_map
, *i
);
427 for (map
<spg_t
, spg_t
>::iterator i
= pending_splits
.begin();
428 i
!= pending_splits
.end();
430 if (!new_map
->have_pg_pool(i
->first
.pool())) {
431 rev_pending_splits
.erase(i
->second
);
432 pending_splits
.erase(i
++);
434 _maybe_split_pgid(old_map
, new_map
, i
->first
);
// True while pgid is anywhere in the split pipeline — either still
// pending or already in progress. Guarded by in_progress_split_lock.
440 bool OSDService::splitting(spg_t pgid
)
442 Mutex::Locker
l(in_progress_split_lock
);
443 return in_progress_splits
.count(pgid
) ||
444 pending_splits
.count(pgid
);
// Mark each child PG's split finished: remove it from in_progress_splits.
// Asserts that the pg had already moved past "pending" into "in progress".
// (Loop increment/closing lines elided in this extract.)
447 void OSDService::complete_split(const set
<spg_t
> &pgs
)
449 Mutex::Locker
l(in_progress_split_lock
);
450 for (set
<spg_t
>::const_iterator i
= pgs
.begin();
453 dout(10) << __func__
<< ": Completing split on pg " << *i
<< dendl
;
454 assert(!pending_splits
.count(*i
));
455 assert(in_progress_splits
.count(*i
));
456 in_progress_splits
.erase(*i
);
// Thin forwarder: ask the owning OSD to refresh its heartbeat peer set.
460 void OSDService::need_heartbeat_peer_update()
462 osd
->need_heartbeat_peer_update();
// Thin forwarder: queue this PG on the owning OSD for stat reporting.
465 void OSDService::pg_stat_queue_enqueue(PG
*pg
)
467 osd
->pg_stat_queue_enqueue(pg
);
// Thin forwarder: remove this PG from the owning OSD's stat queue.
470 void OSDService::pg_stat_queue_dequeue(PG
*pg
)
472 osd
->pg_stat_queue_dequeue(pg
);
// First phase of shutdown: stop the tiering agent's retry timer under its
// own lock (full teardown continues in shutdown() below).
475 void OSDService::start_shutdown()
478 Mutex::Locker
l(agent_timer_lock
);
479 agent_timer
.shutdown();
483 void OSDService::shutdown()
485 reserver_finisher
.wait_for_empty();
486 reserver_finisher
.stop();
488 Mutex::Locker
l(watch_lock
);
489 watch_timer
.shutdown();
492 objecter
->shutdown();
493 objecter_finisher
.wait_for_empty();
494 objecter_finisher
.stop();
497 Mutex::Locker
l(recovery_request_lock
);
498 recovery_request_timer
.shutdown();
502 Mutex::Locker
l(snap_sleep_lock
);
503 snap_sleep_timer
.shutdown();
506 osdmap
= OSDMapRef();
507 next_osdmap
= OSDMapRef();
510 void OSDService::init()
512 reserver_finisher
.start();
513 objecter_finisher
.start();
514 objecter
->set_client_incarnation(0);
516 // deprioritize objecter in daemonperf output
517 objecter
->get_logger()->set_prio_adjust(-3);
521 snap_sleep_timer
.init();
523 agent_thread
.create("osd_srv_agent");
525 if (cct
->_conf
->osd_recovery_delay_start
)
526 defer_recovery(cct
->_conf
->osd_recovery_delay_start
);
529 void OSDService::final_init()
531 objecter
->start(osdmap
.get());
534 void OSDService::activate_map()
536 // wake/unwake the tiering agent
539 !osdmap
->test_flag(CEPH_OSDMAP_NOTIERAGENT
) &&
545 class AgentTimeoutCB
: public Context
{
548 explicit AgentTimeoutCB(PGRef _pg
) : pg(_pg
) {}
549 void finish(int) override
{
550 pg
->agent_choose_mode_restart();
554 void OSDService::agent_entry()
556 dout(10) << __func__
<< " start" << dendl
;
559 while (!agent_stop_flag
) {
560 if (agent_queue
.empty()) {
561 dout(20) << __func__
<< " empty queue" << dendl
;
562 agent_cond
.Wait(agent_lock
);
565 uint64_t level
= agent_queue
.rbegin()->first
;
566 set
<PGRef
>& top
= agent_queue
.rbegin()->second
;
568 << " tiers " << agent_queue
.size()
569 << ", top is " << level
570 << " with pgs " << top
.size()
571 << ", ops " << agent_ops
<< "/"
572 << cct
->_conf
->osd_agent_max_ops
573 << (agent_active
? " active" : " NOT ACTIVE")
575 dout(20) << __func__
<< " oids " << agent_oids
<< dendl
;
576 int max
= cct
->_conf
->osd_agent_max_ops
- agent_ops
;
577 int agent_flush_quota
= max
;
578 if (!flush_mode_high_count
)
579 agent_flush_quota
= cct
->_conf
->osd_agent_max_low_ops
- agent_ops
;
580 if (agent_flush_quota
<= 0 || top
.empty() || !agent_active
) {
581 agent_cond
.Wait(agent_lock
);
585 if (!agent_valid_iterator
|| agent_queue_pos
== top
.end()) {
586 agent_queue_pos
= top
.begin();
587 agent_valid_iterator
= true;
589 PGRef pg
= *agent_queue_pos
;
590 dout(10) << "high_count " << flush_mode_high_count
591 << " agent_ops " << agent_ops
592 << " flush_quota " << agent_flush_quota
<< dendl
;
594 if (!pg
->agent_work(max
, agent_flush_quota
)) {
595 dout(10) << __func__
<< " " << pg
->get_pgid()
596 << " no agent_work, delay for " << cct
->_conf
->osd_agent_delay_time
597 << " seconds" << dendl
;
599 osd
->logger
->inc(l_osd_tier_delay
);
600 // Queue a timer to call agent_choose_mode for this pg in 5 seconds
601 agent_timer_lock
.Lock();
602 Context
*cb
= new AgentTimeoutCB(pg
);
603 agent_timer
.add_event_after(cct
->_conf
->osd_agent_delay_time
, cb
);
604 agent_timer_lock
.Unlock();
609 dout(10) << __func__
<< " finish" << dendl
;
612 void OSDService::agent_stop()
615 Mutex::Locker
l(agent_lock
);
617 // By this time all ops should be cancelled
618 assert(agent_ops
== 0);
619 // By this time all PGs are shutdown and dequeued
620 if (!agent_queue
.empty()) {
621 set
<PGRef
>& top
= agent_queue
.rbegin()->second
;
622 derr
<< "agent queue not empty, for example " << (*top
.begin())->info
.pgid
<< dendl
;
623 assert(0 == "agent queue not empty");
626 agent_stop_flag
= true;
632 // -------------------------------------
634 void OSDService::promote_throttle_recalibrate()
636 utime_t now
= ceph_clock_now();
637 double dur
= now
- last_recalibrate
;
638 last_recalibrate
= now
;
639 unsigned prob
= promote_probability_millis
;
641 uint64_t target_obj_sec
= cct
->_conf
->osd_tier_promote_max_objects_sec
;
642 uint64_t target_bytes_sec
= cct
->_conf
->osd_tier_promote_max_bytes_sec
;
644 unsigned min_prob
= 1;
646 uint64_t attempts
, obj
, bytes
;
647 promote_counter
.sample_and_attenuate(&attempts
, &obj
, &bytes
);
648 dout(10) << __func__
<< " " << attempts
<< " attempts, promoted "
649 << obj
<< " objects and " << pretty_si_t(bytes
) << " bytes; target "
650 << target_obj_sec
<< " obj/sec or "
651 << pretty_si_t(target_bytes_sec
) << " bytes/sec"
654 // calculate what the probability *should* be, given the targets
656 if (attempts
&& dur
> 0) {
657 uint64_t avg_size
= 1;
659 avg_size
= MAX(bytes
/ obj
, 1);
660 unsigned po
= (double)target_obj_sec
* dur
* 1000.0 / (double)attempts
;
661 unsigned pb
= (double)target_bytes_sec
/ (double)avg_size
* dur
* 1000.0
663 dout(20) << __func__
<< " po " << po
<< " pb " << pb
<< " avg_size "
664 << avg_size
<< dendl
;
665 if (target_obj_sec
&& target_bytes_sec
)
666 new_prob
= MIN(po
, pb
);
667 else if (target_obj_sec
)
669 else if (target_bytes_sec
)
676 dout(20) << __func__
<< " new_prob " << new_prob
<< dendl
;
678 // correct for persistent skew between target rate and actual rate, adjust
681 if (attempts
&& obj
) {
682 actual
= obj
* 1000 / attempts
;
683 ratio
= (double)actual
/ (double)prob
;
684 new_prob
= (double)new_prob
/ ratio
;
686 new_prob
= MAX(new_prob
, min_prob
);
687 new_prob
= MIN(new_prob
, 1000);
690 prob
= (prob
+ new_prob
) / 2;
691 prob
= MAX(prob
, min_prob
);
692 prob
= MIN(prob
, 1000);
693 dout(10) << __func__
<< " actual " << actual
694 << ", actual/prob ratio " << ratio
695 << ", adjusted new_prob " << new_prob
696 << ", prob " << promote_probability_millis
<< " -> " << prob
698 promote_probability_millis
= prob
;
700 // set hard limits for this interval to mitigate stampedes
701 promote_max_objects
= target_obj_sec
* OSD::OSD_TICK_INTERVAL
* 2;
702 promote_max_bytes
= target_bytes_sec
* OSD::OSD_TICK_INTERVAL
* 2;
705 // -------------------------------------
// Read osd_failsafe_full_ratio from config; values given as percentages
// (> 1.0) are normalized to a fraction. (The return statement is elided
// in this extract — presumably returns full_ratio.)
707 float OSDService::get_failsafe_full_ratio()
709 float full_ratio
= cct
->_conf
->osd_failsafe_full_ratio
;
710 if (full_ratio
> 1.0) full_ratio
/= 100.0;
714 void OSDService::check_full_status(const osd_stat_t
&osd_stat
)
716 Mutex::Locker
l(full_status_lock
);
718 float ratio
= ((float)osd_stat
.kb_used
) / ((float)osd_stat
.kb
);
721 // The OSDMap ratios take precendence. So if the failsafe is .95 and
722 // the admin sets the cluster full to .96, the failsafe moves up to .96
723 // too. (Not that having failsafe == full is ideal, but it's better than
724 // dropping writes before the clusters appears full.)
725 OSDMapRef osdmap
= get_osdmap();
726 if (!osdmap
|| osdmap
->get_epoch() == 0) {
730 float nearfull_ratio
= osdmap
->get_nearfull_ratio();
731 float backfillfull_ratio
= std::max(osdmap
->get_backfillfull_ratio(), nearfull_ratio
);
732 float full_ratio
= std::max(osdmap
->get_full_ratio(), backfillfull_ratio
);
733 float failsafe_ratio
= std::max(get_failsafe_full_ratio(), full_ratio
);
735 if (!osdmap
->test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS
)) {
736 // use the failsafe for nearfull and full; the mon isn't using the
737 // flags anyway because we're mid-upgrade.
738 full_ratio
= failsafe_ratio
;
739 backfillfull_ratio
= failsafe_ratio
;
740 nearfull_ratio
= failsafe_ratio
;
741 } else if (full_ratio
<= 0 ||
742 backfillfull_ratio
<= 0 ||
743 nearfull_ratio
<= 0) {
744 derr
<< __func__
<< " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl
;
745 // use failsafe flag. ick. the monitor did something wrong or the user
746 // did something stupid.
747 full_ratio
= failsafe_ratio
;
748 backfillfull_ratio
= failsafe_ratio
;
749 nearfull_ratio
= failsafe_ratio
;
754 if (injectfull_state
> NONE
&& injectfull
) {
755 new_state
= injectfull_state
;
756 inject
= "(Injected)";
757 } else if (ratio
> failsafe_ratio
) {
758 new_state
= FAILSAFE
;
759 } else if (ratio
> full_ratio
) {
761 } else if (ratio
> backfillfull_ratio
) {
762 new_state
= BACKFILLFULL
;
763 } else if (ratio
> nearfull_ratio
) {
764 new_state
= NEARFULL
;
768 dout(20) << __func__
<< " cur ratio " << ratio
769 << ". nearfull_ratio " << nearfull_ratio
770 << ". backfillfull_ratio " << backfillfull_ratio
771 << ", full_ratio " << full_ratio
772 << ", failsafe_ratio " << failsafe_ratio
773 << ", new state " << get_full_state_name(new_state
)
778 if (cur_state
!= new_state
) {
779 dout(10) << __func__
<< " " << get_full_state_name(cur_state
)
780 << " -> " << get_full_state_name(new_state
) << dendl
;
781 if (new_state
== FAILSAFE
) {
782 clog
->error() << "failsafe engaged, dropping updates, now "
783 << (int)roundf(ratio
* 100) << "% full";
784 } else if (cur_state
== FAILSAFE
) {
785 clog
->error() << "failsafe disengaged, no longer dropping updates, now "
786 << (int)roundf(ratio
* 100) << "% full";
788 cur_state
= new_state
;
792 bool OSDService::need_fullness_update()
794 OSDMapRef osdmap
= get_osdmap();
796 if (osdmap
->exists(whoami
)) {
797 if (osdmap
->get_state(whoami
) & CEPH_OSD_FULL
) {
799 } else if (osdmap
->get_state(whoami
) & CEPH_OSD_BACKFILLFULL
) {
801 } else if (osdmap
->get_state(whoami
) & CEPH_OSD_NEARFULL
) {
808 else if (is_backfillfull())
810 else if (is_nearfull())
815 bool OSDService::_check_full(s_names type
, ostream
&ss
) const
817 Mutex::Locker
l(full_status_lock
);
819 if (injectfull
&& injectfull_state
>= type
) {
820 // injectfull is either a count of the number of times to return failsafe full
821 // or if -1 then always return full
824 ss
<< "Injected " << get_full_state_name(type
) << " OSD ("
825 << (injectfull
< 0 ? "set" : std::to_string(injectfull
)) << ")";
829 ss
<< "current usage is " << cur_ratio
;
830 return cur_state
>= type
;
// Convenience wrapper: _check_full at FAILSAFE severity, explanation in ss.
833 bool OSDService::check_failsafe_full(ostream
&ss
) const
835 return _check_full(FAILSAFE
, ss
);
// Convenience wrapper: _check_full at FULL severity, explanation in ss.
838 bool OSDService::check_full(ostream
&ss
) const
840 return _check_full(FULL
, ss
);
// Convenience wrapper: _check_full at BACKFILLFULL severity.
843 bool OSDService::check_backfill_full(ostream
&ss
) const
845 return _check_full(BACKFILLFULL
, ss
);
// Convenience wrapper: _check_full at NEARFULL severity.
848 bool OSDService::check_nearfull(ostream
&ss
) const
850 return _check_full(NEARFULL
, ss
);
// Exact-match test for the FAILSAFE state (note: ==, unlike the >=
// threshold checks in is_full/is_backfillfull/is_nearfull below).
// Guarded by full_status_lock.
853 bool OSDService::is_failsafe_full() const
855 Mutex::Locker
l(full_status_lock
);
856 return cur_state
== FAILSAFE
;
// True when fullness state is FULL or worse. Guarded by full_status_lock.
859 bool OSDService::is_full() const
861 Mutex::Locker
l(full_status_lock
);
862 return cur_state
>= FULL
;
// True when fullness state is BACKFILLFULL or worse. Guarded by
// full_status_lock.
865 bool OSDService::is_backfillfull() const
867 Mutex::Locker
l(full_status_lock
);
868 return cur_state
>= BACKFILLFULL
;
// True when fullness state is NEARFULL or worse. Guarded by
// full_status_lock.
871 bool OSDService::is_nearfull() const
873 Mutex::Locker
l(full_status_lock
);
874 return cur_state
>= NEARFULL
;
// Test/debug hook: force a fullness state for fault injection. Only the
// injectfull_state assignment is visible here; the use of `count`
// (presumably injectfull = count) is elided in this extract — confirm
// against the full source.
877 void OSDService::set_injectfull(s_names type
, int64_t count
)
879 Mutex::Locker
l(full_status_lock
);
880 injectfull_state
= type
;
884 void OSDService::update_osd_stat(vector
<int>& hb_peers
)
886 Mutex::Locker
lock(stat_lock
);
888 osd_stat
.hb_peers
.swap(hb_peers
);
890 osd
->op_tracker
.get_age_ms_histogram(&osd_stat
.op_queue_age_hist
);
892 // fill in osd stats too
893 struct store_statfs_t stbuf
;
894 int r
= osd
->store
->statfs(&stbuf
);
896 derr
<< "statfs() failed: " << cpp_strerror(r
) << dendl
;
900 uint64_t bytes
= stbuf
.total
;
901 uint64_t used
= bytes
- stbuf
.available
;
902 uint64_t avail
= stbuf
.available
;
904 osd_stat
.kb
= bytes
>> 10;
905 osd_stat
.kb_used
= used
>> 10;
906 osd_stat
.kb_avail
= avail
>> 10;
908 osd
->logger
->set(l_osd_stat_bytes
, bytes
);
909 osd
->logger
->set(l_osd_stat_bytes_used
, used
);
910 osd
->logger
->set(l_osd_stat_bytes_avail
, avail
);
912 dout(20) << "update_osd_stat " << osd_stat
<< dendl
;
914 check_full_status(osd_stat
);
// Scan the given shards and test whether any of their OSDs carries the
// CEPH_OSD_FULL flag in the current map. (The return-true/return-false
// lines are elided in this extract.)
917 bool OSDService::check_osdmap_full(const set
<pg_shard_t
> &missing_on
)
919 OSDMapRef osdmap
= get_osdmap();
920 for (auto shard
: missing_on
) {
921 if (osdmap
->get_state(shard
.osd
) & CEPH_OSD_FULL
)
927 void OSDService::send_message_osd_cluster(int peer
, Message
*m
, epoch_t from_epoch
)
929 OSDMapRef next_map
= get_nextmap_reserved();
930 // service map is always newer/newest
931 assert(from_epoch
<= next_map
->get_epoch());
933 if (next_map
->is_down(peer
) ||
934 next_map
->get_info(peer
).up_from
> from_epoch
) {
936 release_map(next_map
);
939 const entity_inst_t
& peer_inst
= next_map
->get_cluster_inst(peer
);
940 ConnectionRef peer_con
= osd
->cluster_messenger
->get_connection(peer_inst
);
941 share_map_peer(peer
, peer_con
.get(), next_map
);
942 peer_con
->send_message(m
);
943 release_map(next_map
);
946 ConnectionRef
OSDService::get_con_osd_cluster(int peer
, epoch_t from_epoch
)
948 OSDMapRef next_map
= get_nextmap_reserved();
949 // service map is always newer/newest
950 assert(from_epoch
<= next_map
->get_epoch());
952 if (next_map
->is_down(peer
) ||
953 next_map
->get_info(peer
).up_from
> from_epoch
) {
954 release_map(next_map
);
957 ConnectionRef con
= osd
->cluster_messenger
->get_connection(next_map
->get_cluster_inst(peer
));
958 release_map(next_map
);
962 pair
<ConnectionRef
,ConnectionRef
> OSDService::get_con_osd_hb(int peer
, epoch_t from_epoch
)
964 OSDMapRef next_map
= get_nextmap_reserved();
965 // service map is always newer/newest
966 assert(from_epoch
<= next_map
->get_epoch());
968 pair
<ConnectionRef
,ConnectionRef
> ret
;
969 if (next_map
->is_down(peer
) ||
970 next_map
->get_info(peer
).up_from
> from_epoch
) {
971 release_map(next_map
);
974 ret
.first
= osd
->hb_back_client_messenger
->get_connection(next_map
->get_hb_back_inst(peer
));
975 if (next_map
->get_hb_front_addr(peer
) != entity_addr_t())
976 ret
.second
= osd
->hb_front_client_messenger
->get_connection(next_map
->get_hb_front_inst(peer
));
977 release_map(next_map
);
982 void OSDService::queue_want_pg_temp(pg_t pgid
, vector
<int>& want
)
984 Mutex::Locker
l(pg_temp_lock
);
985 map
<pg_t
,vector
<int> >::iterator p
= pg_temp_pending
.find(pgid
);
986 if (p
== pg_temp_pending
.end() ||
988 pg_temp_wanted
[pgid
] = want
;
// Forget any pg_temp request for pgid: clear it from both the wanted and
// the pending map, under pg_temp_lock.
992 void OSDService::remove_want_pg_temp(pg_t pgid
)
994 Mutex::Locker
l(pg_temp_lock
);
995 pg_temp_wanted
.erase(pgid
);
996 pg_temp_pending
.erase(pgid
);
// After a pg_temp message has been sent: move every wanted entry into
// pg_temp_pending and clear the wanted map. Leading underscore suggests
// the caller is expected to hold pg_temp_lock — confirm against callers.
// (Loop increment/closing lines elided in this extract.)
999 void OSDService::_sent_pg_temp()
1001 for (map
<pg_t
,vector
<int> >::iterator p
= pg_temp_wanted
.begin();
1002 p
!= pg_temp_wanted
.end();
1004 pg_temp_pending
[p
->first
] = p
->second
;
1005 pg_temp_wanted
.clear();
// Re-arm previously sent (pending) pg_temp requests so they will be sent
// again: swap pending back into wanted under pg_temp_lock, logging the
// before/after counts. (An intermediate merge line between the swap and
// the size computations appears elided in this extract.)
1008 void OSDService::requeue_pg_temp()
1010 Mutex::Locker
l(pg_temp_lock
);
1011 // wanted overrides pending. note that remove_want_pg_temp
1012 // clears the item out of both.
1013 unsigned old_wanted
= pg_temp_wanted
.size();
1014 unsigned old_pending
= pg_temp_pending
.size();
1016 pg_temp_wanted
.swap(pg_temp_pending
);
1017 dout(10) << __func__
<< " " << old_wanted
<< " + " << old_pending
<< " -> "
1018 << pg_temp_wanted
.size() << dendl
;
// Send the accumulated pg_temp wishes to the monitor in one MOSDPGTemp
// stamped with the current osdmap epoch; no-op when nothing is wanted
// (the early return after the empty() check is elided in this extract,
// as is the trailing _sent_pg_temp bookkeeping — confirm in full source).
1021 void OSDService::send_pg_temp()
1023 Mutex::Locker
l(pg_temp_lock
);
1024 if (pg_temp_wanted
.empty())
1026 dout(10) << "send_pg_temp " << pg_temp_wanted
<< dendl
;
1027 MOSDPGTemp
*m
= new MOSDPGTemp(osdmap
->get_epoch());
1028 m
->pg_temp
= pg_temp_wanted
;
1029 monc
->send_mon_message(m
);
// Tell the monitor (via MOSDPGCreated) that pgid has finished creation.
1033 void OSDService::send_pg_created(pg_t pgid
)
1035 dout(20) << __func__
<< dendl
;
1036 monc
->send_mon_message(new MOSDPGCreated(pgid
));
1039 // --------------------------------------
// Look up the newest map epoch we believe an OSD peer has, from the
// peer_map_epoch cache, under peer_map_epoch_lock. (The return paths for
// the found/not-found cases are elided in this extract — presumably 0
// when unknown.)
1042 epoch_t
OSDService::get_peer_epoch(int peer
)
1044 Mutex::Locker
l(peer_map_epoch_lock
);
1045 map
<int,epoch_t
>::iterator p
= peer_map_epoch
.find(peer
);
1046 if (p
== peer_map_epoch
.end())
1051 epoch_t
OSDService::note_peer_epoch(int peer
, epoch_t e
)
1053 Mutex::Locker
l(peer_map_epoch_lock
);
1054 map
<int,epoch_t
>::iterator p
= peer_map_epoch
.find(peer
);
1055 if (p
!= peer_map_epoch
.end()) {
1056 if (p
->second
< e
) {
1057 dout(10) << "note_peer_epoch osd." << peer
<< " has " << e
<< dendl
;
1060 dout(30) << "note_peer_epoch osd." << peer
<< " has " << p
->second
<< " >= " << e
<< dendl
;
1064 dout(10) << "note_peer_epoch osd." << peer
<< " now has " << e
<< dendl
;
1065 peer_map_epoch
[peer
] = e
;
1070 void OSDService::forget_peer_epoch(int peer
, epoch_t as_of
)
1072 Mutex::Locker
l(peer_map_epoch_lock
);
1073 map
<int,epoch_t
>::iterator p
= peer_map_epoch
.find(peer
);
1074 if (p
!= peer_map_epoch
.end()) {
1075 if (p
->second
<= as_of
) {
1076 dout(10) << "forget_peer_epoch osd." << peer
<< " as_of " << as_of
1077 << " had " << p
->second
<< dendl
;
1078 peer_map_epoch
.erase(p
);
1080 dout(10) << "forget_peer_epoch osd." << peer
<< " as_of " << as_of
1081 << " has " << p
->second
<< " - not forgetting" << dendl
;
1086 bool OSDService::should_share_map(entity_name_t name
, Connection
*con
,
1087 epoch_t epoch
, const OSDMapRef
& osdmap
,
1088 const epoch_t
*sent_epoch_p
)
1090 dout(20) << "should_share_map "
1091 << name
<< " " << con
->get_peer_addr()
1092 << " " << epoch
<< dendl
;
1094 // does client have old map?
1095 if (name
.is_client()) {
1096 bool message_sendmap
= epoch
< osdmap
->get_epoch();
1097 if (message_sendmap
&& sent_epoch_p
) {
1098 dout(20) << "client session last_sent_epoch: "
1100 << " versus osdmap epoch " << osdmap
->get_epoch() << dendl
;
1101 if (*sent_epoch_p
< osdmap
->get_epoch()) {
1103 } // else we don't need to send it out again
1107 if (con
->get_messenger() == osd
->cluster_messenger
&&
1108 con
!= osd
->cluster_messenger
->get_loopback_connection() &&
1109 osdmap
->is_up(name
.num()) &&
1110 (osdmap
->get_cluster_addr(name
.num()) == con
->get_peer_addr() ||
1111 osdmap
->get_hb_back_addr(name
.num()) == con
->get_peer_addr())) {
1113 epoch_t has
= MAX(get_peer_epoch(name
.num()), epoch
);
1116 if (has
< osdmap
->get_epoch()) {
1117 dout(10) << name
<< " " << con
->get_peer_addr()
1118 << " has old map " << epoch
<< " < "
1119 << osdmap
->get_epoch() << dendl
;
// share_map(): push an incremental OSDMap to a client or cluster peer when
// should_share_map() deems it stale. Bails out early when this OSD is not
// active. For clients, records the epoch we sent in *sent_epoch_p; for
// cluster peers, records it via note_peer_epoch().
// NOTE(review): extraction dropped some lines of this span; code untouched.
1127 void OSDService::share_map(
1132 epoch_t
*sent_epoch_p
)
1134 dout(20) << "share_map "
1135 << name
<< " " << con
->get_peer_addr()
1136 << " " << epoch
<< dendl
;
1138 if (!osd
->is_active()) {
1139 /*It is safe not to proceed as OSD is not in healthy state*/
1143 bool want_shared
= should_share_map(name
, con
, epoch
,
1144 osdmap
, sent_epoch_p
);
1147 if (name
.is_client()) {
1148 dout(10) << name
<< " has old map " << epoch
1149 << " < " << osdmap
->get_epoch() << dendl
;
1150 // we know the Session is valid or we wouldn't be sending
1152 *sent_epoch_p
= osdmap
->get_epoch();
1154 send_incremental_map(epoch
, con
, osdmap
);
1155 } else if (con
->get_messenger() == osd
->cluster_messenger
&&
1156 osdmap
->is_up(name
.num()) &&
1157 (osdmap
->get_cluster_addr(name
.num()) == con
->get_peer_addr() ||
1158 osdmap
->get_hb_back_addr(name
.num()) == con
->get_peer_addr())) {
1159 dout(10) << name
<< " " << con
->get_peer_addr()
1160 << " has old map " << epoch
<< " < "
1161 << osdmap
->get_epoch() << dendl
;
1162 note_peer_epoch(name
.num(), osdmap
->get_epoch());
1163 send_incremental_map(epoch
, con
, osdmap
);
// share_map_peer(): for an OSD peer whose epoch we track, send an
// incremental map when it is behind 'map' and record the new epoch;
// otherwise just log. When we have no epoch on record, do nothing.
// NOTE(review): the enclosing 'if (pe)' line is missing from this
// extraction -- confirm control flow against upstream before editing.
1168 void OSDService::share_map_peer(int peer
, Connection
*con
, OSDMapRef map
)
1174 epoch_t pe
= get_peer_epoch(peer
);
1176 if (pe
< map
->get_epoch()) {
1177 send_incremental_map(pe
, con
, map
);
1178 note_peer_epoch(peer
, map
->get_epoch());
1180 dout(20) << "share_map_peer " << con
<< " already has epoch " << pe
<< dendl
;
1182 dout(20) << "share_map_peer " << con
<< " don't know epoch, doing nothing" << dendl
;
1183 // no idea about peer's epoch.
1184 // ??? send recent ???
// can_inc_scrubs_pending(): under sched_scrub_lock, report whether one more
// scrub reservation would keep pending+active below osd_max_scrubs.
// Does not actually reserve; see inc_scrubs_pending().
1189 bool OSDService::can_inc_scrubs_pending()
1191 bool can_inc
= false;
1192 Mutex::Locker
l(sched_scrub_lock
);
1194 if (scrubs_pending
+ scrubs_active
< cct
->_conf
->osd_max_scrubs
) {
1195 dout(20) << __func__
<< " " << scrubs_pending
<< " -> " << (scrubs_pending
+1)
1196 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", active " << scrubs_active
<< ")" << dendl
;
1199 dout(20) << __func__
<< scrubs_pending
<< " + " << scrubs_active
<< " active >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
// inc_scrubs_pending(): reserve a pending scrub slot if pending+active is
// still below osd_max_scrubs; returns whether the reservation succeeded.
// Uses explicit Lock()/Unlock() on sched_scrub_lock.
1205 bool OSDService::inc_scrubs_pending()
1207 bool result
= false;
1209 sched_scrub_lock
.Lock();
1210 if (scrubs_pending
+ scrubs_active
< cct
->_conf
->osd_max_scrubs
) {
1211 dout(20) << "inc_scrubs_pending " << scrubs_pending
<< " -> " << (scrubs_pending
+1)
1212 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", active " << scrubs_active
<< ")" << dendl
;
1216 dout(20) << "inc_scrubs_pending " << scrubs_pending
<< " + " << scrubs_active
<< " active >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1218 sched_scrub_lock
.Unlock();
// dec_scrubs_pending(): release a pending scrub reservation under
// sched_scrub_lock; asserts the counter never goes negative.
// NOTE(review): the decrement statement itself is missing from this
// extraction; only logging, assert and locking are visible.
1223 void OSDService::dec_scrubs_pending()
1225 sched_scrub_lock
.Lock();
1226 dout(20) << "dec_scrubs_pending " << scrubs_pending
<< " -> " << (scrubs_pending
-1)
1227 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", active " << scrubs_active
<< ")" << dendl
;
1229 assert(scrubs_pending
>= 0);
1230 sched_scrub_lock
.Unlock();
// inc_scrubs_active(): account a scrub as active under sched_scrub_lock.
// When 'reserved' is true the slot moves from pending to active (both
// counters adjust); otherwise only the active count grows. The two dout
// branches below correspond to those two cases.
1233 void OSDService::inc_scrubs_active(bool reserved
)
1235 sched_scrub_lock
.Lock();
1239 dout(20) << "inc_scrubs_active " << (scrubs_active
-1) << " -> " << scrubs_active
1240 << " (max " << cct
->_conf
->osd_max_scrubs
1241 << ", pending " << (scrubs_pending
+1) << " -> " << scrubs_pending
<< ")" << dendl
;
1242 assert(scrubs_pending
>= 0);
1244 dout(20) << "inc_scrubs_active " << (scrubs_active
-1) << " -> " << scrubs_active
1245 << " (max " << cct
->_conf
->osd_max_scrubs
1246 << ", pending " << scrubs_pending
<< ")" << dendl
;
1248 sched_scrub_lock
.Unlock();
// dec_scrubs_active(): release an active scrub slot under sched_scrub_lock;
// asserts the active count never goes negative.
// NOTE(review): the decrement statement is missing from this extraction.
1251 void OSDService::dec_scrubs_active()
1253 sched_scrub_lock
.Lock();
1254 dout(20) << "dec_scrubs_active " << scrubs_active
<< " -> " << (scrubs_active
-1)
1255 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", pending " << scrubs_pending
<< ")" << dendl
;
1257 assert(scrubs_active
>= 0);
1258 sched_scrub_lock
.Unlock();
// retrieve_epochs(): snapshot boot/up/bind epochs under epoch_lock into the
// caller-supplied out-params.
// NOTE(review): per-pointer null-checks appear to have been dropped by this
// extraction (original line numbers 1265/1267/1269 absent) -- confirm
// against upstream before assuming all three pointers must be non-null.
1261 void OSDService::retrieve_epochs(epoch_t
*_boot_epoch
, epoch_t
*_up_epoch
,
1262 epoch_t
*_bind_epoch
) const
1264 Mutex::Locker
l(epoch_lock
);
1266 *_boot_epoch
= boot_epoch
;
1268 *_up_epoch
= up_epoch
;
1270 *_bind_epoch
= bind_epoch
;
// set_epochs(): update boot/up/bind epochs under epoch_lock. Each supplied
// value must be 0 (reset) or monotonically non-decreasing (asserted).
// NOTE(review): the per-pointer null-guard lines are absent from this
// extraction -- confirm against upstream.
1273 void OSDService::set_epochs(const epoch_t
*_boot_epoch
, const epoch_t
*_up_epoch
,
1274 const epoch_t
*_bind_epoch
)
1276 Mutex::Locker
l(epoch_lock
);
1278 assert(*_boot_epoch
== 0 || *_boot_epoch
>= boot_epoch
);
1279 boot_epoch
= *_boot_epoch
;
1282 assert(*_up_epoch
== 0 || *_up_epoch
>= up_epoch
);
1283 up_epoch
= *_up_epoch
;
1286 assert(*_bind_epoch
== 0 || *_bind_epoch
>= bind_epoch
);
1287 bind_epoch
= *_bind_epoch
;
// prepare_to_stop(): graceful-shutdown handshake. If we are still up in the
// current map, tell the monitor we are going down (MOSDMarkMeDown), flip to
// PREPARING_TO_STOP, and wait up to osd_mon_shutdown_timeout for the ack
// (which moves us to STOPPING via got_stop_ack()); finally force STOPPING.
// Returns early when the state machine already left NOT_STOPPING.
1291 bool OSDService::prepare_to_stop()
1293 Mutex::Locker
l(is_stopping_lock
);
1294 if (get_state() != NOT_STOPPING
)
1297 OSDMapRef osdmap
= get_osdmap();
1298 if (osdmap
&& osdmap
->is_up(whoami
)) {
1299 dout(0) << __func__
<< " telling mon we are shutting down" << dendl
;
1300 set_state(PREPARING_TO_STOP
);
1301 monc
->send_mon_message(new MOSDMarkMeDown(monc
->get_fsid(),
1302 osdmap
->get_inst(whoami
),
1303 osdmap
->get_epoch(),
1306 utime_t now
= ceph_clock_now();
1308 timeout
.set_from_double(now
+ cct
->_conf
->osd_mon_shutdown_timeout
);
// Wait on the stopping condvar until the mon ack arrives or we time out.
1309 while ((ceph_clock_now() < timeout
) &&
1310 (get_state() != STOPPING
)) {
1311 is_stopping_cond
.WaitUntil(is_stopping_lock
, timeout
);
1314 dout(0) << __func__
<< " starting shutdown" << dendl
;
1315 set_state(STOPPING
);
// got_stop_ack(): the monitor acknowledged our mark-me-down. If we are in
// PREPARING_TO_STOP, advance to STOPPING and wake prepare_to_stop() via the
// condvar; any other state means the ack is stale and is only logged.
1319 void OSDService::got_stop_ack()
1321 Mutex::Locker
l(is_stopping_lock
);
1322 if (get_state() == PREPARING_TO_STOP
) {
1323 dout(0) << __func__
<< " starting shutdown" << dendl
;
1324 set_state(STOPPING
);
1325 is_stopping_cond
.Signal();
1327 dout(10) << __func__
<< " ignoring msg" << dendl
;
// build_incremental_map_msg(): assemble an MOSDMap message covering epochs
// (since, to], walking newest-to-oldest. Each epoch is filled from the
// incremental-map blob when available (and above the oldest retained map),
// else from the full map blob; if neither loads, an error is logged with
// the requested and retained ranges.
1331 MOSDMap
*OSDService::build_incremental_map_msg(epoch_t since
, epoch_t to
,
1332 OSDSuperblock
& sblock
)
1334 MOSDMap
*m
= new MOSDMap(monc
->get_fsid());
1335 m
->oldest_map
= max_oldest_map
;
1336 m
->newest_map
= sblock
.newest_map
;
1338 for (epoch_t e
= to
; e
> since
; e
--) {
1340 if (e
> m
->oldest_map
&& get_inc_map_bl(e
, bl
)) {
1341 m
->incremental_maps
[e
].claim(bl
);
1342 } else if (get_map_bl(e
, bl
)) {
1343 m
->maps
[e
].claim(bl
);
1346 derr
<< "since " << since
<< " to " << to
1347 << " oldest " << m
->oldest_map
<< " newest " << m
->newest_map
1357 void OSDService::send_map(MOSDMap
*m
, Connection
*con
)
1359 con
->send_message(m
);
// send_incremental_map(): send the maps in (since, current] to 'con'.
// If 'since' predates our oldest retained map, fall back to sending just
// the latest full map. Otherwise the requested span is clamped first by
// osd_map_share_max_epochs (only the most recent epochs are shared) and
// then by osd_map_message_max (message size bound), and the message is
// built via build_incremental_map_msg().
1362 void OSDService::send_incremental_map(epoch_t since
, Connection
*con
,
1365 epoch_t to
= osdmap
->get_epoch();
1366 dout(10) << "send_incremental_map " << since
<< " -> " << to
1367 << " to " << con
<< " " << con
->get_peer_addr() << dendl
;
1371 OSDSuperblock
sblock(get_superblock());
1372 if (since
< sblock
.oldest_map
) {
1373 // just send latest full map
1374 MOSDMap
*m
= new MOSDMap(monc
->get_fsid());
1375 m
->oldest_map
= max_oldest_map
;
1376 m
->newest_map
= sblock
.newest_map
;
1377 get_map_bl(to
, m
->maps
[to
]);
1382 if (to
> since
&& (int64_t)(to
- since
) > cct
->_conf
->osd_map_share_max_epochs
) {
1383 dout(10) << " " << (to
- since
) << " > max " << cct
->_conf
->osd_map_share_max_epochs
1384 << ", only sending most recent" << dendl
;
1385 since
= to
- cct
->_conf
->osd_map_share_max_epochs
;
1388 if (to
- since
> (epoch_t
)cct
->_conf
->osd_map_message_max
)
1389 to
= since
+ cct
->_conf
->osd_map_message_max
;
1390 m
= build_incremental_map_msg(since
, to
, sblock
);
// _get_map_bl(): fetch the full-map blob for epoch e, first from the
// in-memory cache, then (on miss) from the meta collection on disk.
// Leading underscore suggests the caller holds the map cache lock --
// confirm at call sites.
1395 bool OSDService::_get_map_bl(epoch_t e
, bufferlist
& bl
)
1397 bool found
= map_bl_cache
.lookup(e
, &bl
);
1400 found
= store
->read(coll_t::meta(),
1401 OSD::get_osdmap_pobject_name(e
), 0, 0, bl
) >= 0;
// get_inc_map_bl(): locked lookup of the incremental-map blob for epoch e;
// on a disk hit the blob is inserted back into the in-memory cache via
// _add_map_inc_bl().
1407 bool OSDService::get_inc_map_bl(epoch_t e
, bufferlist
& bl
)
1409 Mutex::Locker
l(map_cache_lock
);
1410 bool found
= map_bl_inc_cache
.lookup(e
, &bl
);
1413 found
= store
->read(coll_t::meta(),
1414 OSD::get_inc_osdmap_pobject_name(e
), 0, 0, bl
) >= 0;
1416 _add_map_inc_bl(e
, bl
);
1420 void OSDService::_add_map_bl(epoch_t e
, bufferlist
& bl
)
1422 dout(10) << "add_map_bl " << e
<< " " << bl
.length() << " bytes" << dendl
;
1423 map_bl_cache
.add(e
, bl
);
1426 void OSDService::_add_map_inc_bl(epoch_t e
, bufferlist
& bl
)
1428 dout(10) << "add_map_inc_bl " << e
<< " " << bl
.length() << " bytes" << dendl
;
1429 map_bl_inc_cache
.add(e
, bl
);
1432 void OSDService::pin_map_inc_bl(epoch_t e
, bufferlist
&bl
)
1434 Mutex::Locker
l(map_cache_lock
);
1435 map_bl_inc_cache
.pin(e
, bl
);
1438 void OSDService::pin_map_bl(epoch_t e
, bufferlist
&bl
)
1440 Mutex::Locker
l(map_cache_lock
);
1441 map_bl_cache
.pin(e
, bl
);
1444 void OSDService::clear_map_bl_cache_pins(epoch_t e
)
1446 Mutex::Locker
l(map_cache_lock
);
1447 map_bl_inc_cache
.clear_pinned(e
);
1448 map_bl_cache
.clear_pinned(e
);
// _add_map(): insert a freshly decoded OSDMap into the shared map cache,
// returning the (possibly pre-existing) cached reference. When
// osd_map_dedup is enabled, shared storage is deduplicated against the
// cached map at the nearest epoch first.
1451 OSDMapRef
OSDService::_add_map(OSDMap
*o
)
1453 epoch_t e
= o
->get_epoch();
1455 if (cct
->_conf
->osd_map_dedup
) {
1456 // Dedup against an existing map at a nearby epoch
1457 OSDMapRef for_dedup
= map_cache
.lower_bound(e
);
1459 OSDMap::dedup(for_dedup
.get(), o
);
1463 OSDMapRef l
= map_cache
.add(e
, o
, &existed
);
// try_get_map(): return the OSDMap for 'epoch', first from the map cache
// (bumping the hit counter), otherwise load the blob from disk, decode a
// new OSDMap and cache it via _add_map(). Cache misses below the cache's
// lower bound are tracked with dedicated perf counters.
// NOTE(review): the failure-path return after the load error is not
// visible in this extraction.
1470 OSDMapRef
OSDService::try_get_map(epoch_t epoch
)
1472 Mutex::Locker
l(map_cache_lock
);
1473 OSDMapRef retval
= map_cache
.lookup(epoch
);
1475 dout(30) << "get_map " << epoch
<< " -cached" << dendl
;
1477 logger
->inc(l_osd_map_cache_hit
);
1482 logger
->inc(l_osd_map_cache_miss
);
1483 epoch_t lb
= map_cache
.cached_key_lower_bound();
1485 dout(30) << "get_map " << epoch
<< " - miss, below lower bound" << dendl
;
1486 logger
->inc(l_osd_map_cache_miss_low
);
1487 logger
->inc(l_osd_map_cache_miss_low_avg
, lb
- epoch
);
1491 OSDMap
*map
= new OSDMap
;
1493 dout(20) << "get_map " << epoch
<< " - loading and decoding " << map
<< dendl
;
1495 if (!_get_map_bl(epoch
, bl
) || bl
.length() == 0) {
1496 derr
<< "failed to load OSD map for epoch " << epoch
<< ", got " << bl
.length() << " bytes" << dendl
;
1502 dout(20) << "get_map " << epoch
<< " - return initial " << map
<< dendl
;
1504 return _add_map(map
);
1510 void OSDService::reply_op_error(OpRequestRef op
, int err
)
1512 reply_op_error(op
, err
, eversion_t(), 0);
// reply_op_error() (full form): build an MOSDOpReply carrying 'err' and the
// given reply versions for a client op and send it back on the op's own
// connection, preserving the request's ACK/ONDISK flags.
1515 void OSDService::reply_op_error(OpRequestRef op
, int err
, eversion_t v
,
1518 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
1519 assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1521 flags
= m
->get_flags() & (CEPH_OSD_FLAG_ACK
|CEPH_OSD_FLAG_ONDISK
);
1523 MOSDOpReply
*reply
= new MOSDOpReply(m
, err
, osdmap
->get_epoch(), flags
,
1525 reply
->set_reply_versions(v
, uv
);
1526 m
->get_connection()->send_message(reply
);
// handle_misdirected_op(): an op was queued to a PG that is not its primary.
// For EC pools, a shard-remap race (detailed in the in-code comment below)
// is detected by re-checking the primary shard against the map the client
// sent at, and such ops are dropped silently -- the client will resend.
// Otherwise the misdirection is logged and warned to the cluster log, and,
// when osd_enxio_on_misdirected_op is set, -ENXIO is returned to the client.
1529 void OSDService::handle_misdirected_op(PG
*pg
, OpRequestRef op
)
1531 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
1532 assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1534 assert(m
->get_map_epoch() >= pg
->info
.history
.same_primary_since
);
1536 if (pg
->is_ec_pg()) {
1538 * OSD recomputes op target based on current OSDMap. With an EC pg, we
1539 * can get this result:
1540 * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
1541 * [CRUSH_ITEM_NONE, 2, 3]/3
1542 * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
1544 * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
1546 * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
1549 * We can't compute the op target based on the sending map epoch due to
1550 * splitting. The simplest thing is to detect such cases here and drop
1551 * them without an error (the client will resend anyway).
1553 assert(m
->get_map_epoch() <= superblock
.newest_map
);
1554 OSDMapRef opmap
= try_get_map(m
->get_map_epoch());
1556 dout(7) << __func__
<< ": " << *pg
<< " no longer have map for "
1557 << m
->get_map_epoch() << ", dropping" << dendl
;
1560 pg_t _pgid
= m
->get_raw_pg();
1562 if ((m
->get_flags() & CEPH_OSD_FLAG_PGOP
) == 0)
1563 _pgid
= opmap
->raw_pg_to_pg(_pgid
);
1564 if (opmap
->get_primary_shard(_pgid
, &pgid
) &&
1565 pgid
.shard
!= pg
->info
.pgid
.shard
) {
1566 dout(7) << __func__
<< ": " << *pg
<< " primary changed since "
1567 << m
->get_map_epoch() << ", dropping" << dendl
;
1572 dout(7) << *pg
<< " misdirected op in " << m
->get_map_epoch() << dendl
;
1573 clog
->warn() << m
->get_source_inst() << " misdirected " << m
->get_reqid()
1574 << " pg " << m
->get_raw_pg()
1575 << " to osd." << whoami
1576 << " not " << pg
->acting
1577 << " in e" << m
->get_map_epoch() << "/" << osdmap
->get_epoch();
1578 if (g_conf
->osd_enxio_on_misdirected_op
) {
1579 reply_op_error(op
, -ENXIO
);
1583 void OSDService::enqueue_back(spg_t pgid
, PGQueueable qi
)
1585 osd
->op_shardedwq
.queue(make_pair(pgid
, qi
));
1588 void OSDService::enqueue_front(spg_t pgid
, PGQueueable qi
)
1590 osd
->op_shardedwq
.queue_front(make_pair(pgid
, qi
));
1593 void OSDService::queue_for_peering(PG
*pg
)
1595 peering_wq
.queue(pg
);
// queue_for_snap_trim(): enqueue a PGSnapTrim work item for this PG on the
// sharded op queue, tagged with the PG's current map epoch and the
// configured snap-trim cost and priority.
// NOTE(review): the item-wrapping lines are missing from this extraction.
1598 void OSDService::queue_for_snap_trim(PG
*pg
)
1600 dout(10) << "queueing " << *pg
<< " for snaptrim" << dendl
;
1601 osd
->op_shardedwq
.queue(
1605 PGSnapTrim(pg
->get_osdmap()->get_epoch()),
1606 cct
->_conf
->osd_snap_trim_cost
,
1607 cct
->_conf
->osd_snap_trim_priority
,
1610 pg
->get_osdmap()->get_epoch())));
1614 // ====================================================================
1618 #define dout_prefix *_dout
1620 // Commands shared between OSD's console and admin console:
1622 namespace osd_cmds
{
1624 int heap(CephContext
& cct
, cmdmap_t
& cmdmap
, Formatter
& f
, std::ostream
& os
);
1626 }} // namespace ceph::osd_cmds
// OSD::mkfs(): one-time on-disk initialization of an OSD. Runs the object
// store's own mkfs, mounts it, then validates any pre-existing superblock
// (osd id and cluster fsid must match) or creates a fresh one, persists it
// in the meta collection, flushes the commit, and finally records the
// identifying meta files via write_meta().
// NOTE(review): several error-return lines were dropped by this extraction;
// code left byte-identical.
1628 int OSD::mkfs(CephContext
*cct
, ObjectStore
*store
, const string
&dev
,
1629 uuid_d fsid
, int whoami
)
1633 ceph::shared_ptr
<ObjectStore::Sequencer
> osr(
1634 new ObjectStore::Sequencer("mkfs"));
1639 // if we are fed a uuid for this osd, use it.
1640 store
->set_fsid(cct
->_conf
->osd_uuid
);
1642 ret
= store
->mkfs();
1644 derr
<< "OSD::mkfs: ObjectStore::mkfs failed with error " << ret
<< dendl
;
1648 store
->set_cache_shards(cct
->_conf
->osd_op_num_shards
);
1650 ret
= store
->mount();
1652 derr
<< "OSD::mkfs: couldn't mount ObjectStore: error " << ret
<< dendl
;
1656 ret
= store
->read(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT
, 0, 0, sbbl
);
1658 /* if we already have superblock, check content of superblock */
1659 dout(0) << " have superblock" << dendl
;
1660 bufferlist::iterator p
;
1663 if (whoami
!= sb
.whoami
) {
1664 derr
<< "provided osd id " << whoami
<< " != superblock's " << sb
.whoami
1669 if (fsid
!= sb
.cluster_fsid
) {
1670 derr
<< "provided cluster fsid " << fsid
1671 << " != superblock's " << sb
.cluster_fsid
<< dendl
;
1676 // create superblock
1677 sb
.cluster_fsid
= fsid
;
1678 sb
.osd_fsid
= store
->get_fsid();
1680 sb
.compat_features
= get_osd_initial_compat_set();
1685 ObjectStore::Transaction t
;
1686 t
.create_collection(coll_t::meta(), 0);
1687 t
.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT
, 0, bl
.length(), bl
);
1688 ret
= store
->apply_transaction(osr
.get(), std::move(t
));
1690 derr
<< "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
1691 << "apply_transaction returned " << ret
<< dendl
;
1696 if (!osr
->flush_commit(&waiter
)) {
1700 ret
= write_meta(store
, sb
.cluster_fsid
, sb
.osd_fsid
, whoami
);
1702 derr
<< "OSD::mkfs: failed to write fsid file: error " << ret
<< dendl
;
// OSD::write_meta(): persist the identifying meta entries -- ondisk magic,
// osd id ("whoami") and cluster fsid -- through the store's meta API, and
// finally write the "ready" marker that signals a completed mkfs.
1713 int OSD::write_meta(ObjectStore
*store
, uuid_d
& cluster_fsid
, uuid_d
& osd_fsid
, int whoami
)
1718 snprintf(val
, sizeof(val
), "%s", CEPH_OSD_ONDISK_MAGIC
);
1719 r
= store
->write_meta("magic", val
);
1723 snprintf(val
, sizeof(val
), "%d", whoami
);
1724 r
= store
->write_meta("whoami", val
);
1728 cluster_fsid
.print(val
);
1729 r
= store
->write_meta("ceph_fsid", val
);
1733 r
= store
->write_meta("ready", "ready");
// OSD::peek_meta(): read back the identifying meta entries written by
// write_meta() -- magic, whoami, ceph_fsid, fsid -- into the caller's
// out-params. A missing "fsid" entry yields a blank osd_fsid.
1740 int OSD::peek_meta(ObjectStore
*store
, std::string
& magic
,
1741 uuid_d
& cluster_fsid
, uuid_d
& osd_fsid
, int& whoami
)
1745 int r
= store
->read_meta("magic", &val
);
1750 r
= store
->read_meta("whoami", &val
);
1753 whoami
= atoi(val
.c_str());
1755 r
= store
->read_meta("ceph_fsid", &val
);
1758 r
= cluster_fsid
.parse(val
.c_str());
1762 r
= store
->read_meta("fsid", &val
);
1764 osd_fsid
= uuid_d();
1766 r
= osd_fsid
.parse(val
.c_str());
1776 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
1780 OSD::OSD(CephContext
*cct_
, ObjectStore
*store_
,
1782 Messenger
*internal_messenger
,
1783 Messenger
*external_messenger
,
1784 Messenger
*hb_client_front
,
1785 Messenger
*hb_client_back
,
1786 Messenger
*hb_front_serverm
,
1787 Messenger
*hb_back_serverm
,
1788 Messenger
*osdc_messenger
,
1790 const std::string
&dev
, const std::string
&jdev
) :
1792 osd_lock("OSD::osd_lock"),
1793 tick_timer(cct
, osd_lock
),
1794 tick_timer_lock("OSD::tick_timer_lock"),
1795 tick_timer_without_osd_lock(cct
, tick_timer_lock
),
1796 authorize_handler_cluster_registry(new AuthAuthorizeHandlerRegistry(cct
,
1797 cct
->_conf
->auth_supported
.empty() ?
1798 cct
->_conf
->auth_cluster_required
:
1799 cct
->_conf
->auth_supported
)),
1800 authorize_handler_service_registry(new AuthAuthorizeHandlerRegistry(cct
,
1801 cct
->_conf
->auth_supported
.empty() ?
1802 cct
->_conf
->auth_service_required
:
1803 cct
->_conf
->auth_supported
)),
1804 cluster_messenger(internal_messenger
),
1805 client_messenger(external_messenger
),
1806 objecter_messenger(osdc_messenger
),
1808 mgrc(cct_
, client_messenger
),
1810 recoverystate_perf(NULL
),
1812 log_client(cct
, client_messenger
, &mc
->monmap
, LogClient::NO_FLAGS
),
1813 clog(log_client
.create_channel()),
1815 dev_path(dev
), journal_path(jdev
),
1816 trace_endpoint("0.0.0.0", 0, "osd"),
1818 osd_compat(get_osd_compat_set()),
1819 osd_tp(cct
, "OSD::osd_tp", "tp_osd", cct
->_conf
->osd_op_threads
, "osd_op_threads"),
1820 osd_op_tp(cct
, "OSD::osd_op_tp", "tp_osd_tp",
1821 cct
->_conf
->osd_op_num_threads_per_shard
* cct
->_conf
->osd_op_num_shards
),
1822 disk_tp(cct
, "OSD::disk_tp", "tp_osd_disk", cct
->_conf
->osd_disk_threads
, "osd_disk_threads"),
1823 command_tp(cct
, "OSD::command_tp", "tp_osd_cmd", 1),
1824 session_waiting_lock("OSD::session_waiting_lock"),
1825 heartbeat_lock("OSD::heartbeat_lock"),
1826 heartbeat_stop(false),
1827 heartbeat_need_update(true),
1828 hb_front_client_messenger(hb_client_front
),
1829 hb_back_client_messenger(hb_client_back
),
1830 hb_front_server_messenger(hb_front_serverm
),
1831 hb_back_server_messenger(hb_back_serverm
),
1833 heartbeat_thread(this),
1834 heartbeat_dispatcher(this),
1835 op_tracker(cct
, cct
->_conf
->osd_enable_op_tracker
,
1836 cct
->_conf
->osd_num_op_tracker_shard
),
1837 test_ops_hook(NULL
),
1838 op_queue(get_io_queue()),
1839 op_prio_cutoff(get_io_prio_cut()),
1841 cct
->_conf
->osd_op_num_shards
,
1843 cct
->_conf
->osd_op_thread_timeout
,
1844 cct
->_conf
->osd_op_thread_suicide_timeout
,
1848 cct
->_conf
->osd_op_thread_timeout
,
1849 cct
->_conf
->osd_op_thread_suicide_timeout
,
1851 map_lock("OSD::map_lock"),
1852 pg_map_lock("OSD::pg_map_lock"),
1853 last_pg_create_epoch(0),
1854 mon_report_lock("OSD::mon_report_lock"),
1855 stats_ack_timeout(cct
->_conf
->osd_mon_ack_timeout
),
1857 requested_full_first(0),
1858 requested_full_last(0),
1859 pg_stat_queue_lock("OSD::pg_stat_queue_lock"),
1860 osd_stat_updated(false),
1861 pg_stat_tid(0), pg_stat_tid_flushed(0),
1864 cct
->_conf
->osd_command_thread_timeout
,
1865 cct
->_conf
->osd_command_thread_suicide_timeout
,
1870 cct
->_conf
->osd_remove_thread_timeout
,
1871 cct
->_conf
->osd_remove_thread_suicide_timeout
,
1875 monc
->set_messenger(client_messenger
);
1876 op_tracker
.set_complaint_and_threshold(cct
->_conf
->osd_op_complaint_time
,
1877 cct
->_conf
->osd_op_log_threshold
);
1878 op_tracker
.set_history_size_and_duration(cct
->_conf
->osd_op_history_size
,
1879 cct
->_conf
->osd_op_history_duration
);
1880 op_tracker
.set_history_slow_op_size_and_threshold(cct
->_conf
->osd_op_history_slow_op_size
,
1881 cct
->_conf
->osd_op_history_slow_op_threshold
);
1883 std::stringstream ss
;
1884 ss
<< "osd." << whoami
;
1885 trace_endpoint
.copy_name(ss
.str());
// Body of the OSD destructor (the signature line was lost in extraction):
// tear down the auth registries and class handler, and unregister/delete
// the perf counters owned by this OSD.
1891 delete authorize_handler_cluster_registry
;
1892 delete authorize_handler_service_registry
;
1893 delete class_handler
;
1894 cct
->get_perfcounters_collection()->remove(recoverystate_perf
);
1895 cct
->get_perfcounters_collection()->remove(logger
);
1896 delete recoverystate_perf
;
1901 void cls_initialize(ClassHandler
*ch
);
// OSD::handle_signal(): only SIGINT/SIGTERM are expected (asserted); the
// signal is logged. NOTE(review): the subsequent shutdown call is not
// visible in this extraction.
1903 void OSD::handle_signal(int signum
)
1905 assert(signum
== SIGINT
|| signum
== SIGTERM
);
1906 derr
<< "*** Got signal " << sig_str(signum
) << " ***" << dendl
;
// Body of OSD::pre_init() (the signature line was lost in extraction):
// under osd_lock, refuse to start when the object store is already mounted
// by another process, then register this OSD as a config observer.
1912 Mutex::Locker
lock(osd_lock
);
1916 if (store
->test_mount_in_use()) {
1917 derr
<< "OSD::pre_init: object store '" << dev_path
<< "' is "
1918 << "currently in use. (Is ceph-osd already running?)" << dendl
;
1922 cct
->_conf
->add_observer(this);
// OSDSocketHook: admin-socket adapter that forwards incoming admin commands
// to OSD::asok_command() on the owning OSD instance.
1928 class OSDSocketHook
: public AdminSocketHook
{
1931 explicit OSDSocketHook(OSD
*o
) : osd(o
) {}
1932 bool call(std::string admin_command
, cmdmap_t
& cmdmap
, std::string format
,
1933 bufferlist
& out
) override
{
1935 bool r
= osd
->asok_command(admin_command
, cmdmap
, format
, ss
);
1941 bool OSD::asok_command(string admin_command
, cmdmap_t
& cmdmap
, string format
,
1944 Formatter
*f
= Formatter::create(format
, "json-pretty", "json-pretty");
1945 if (admin_command
== "status") {
1946 f
->open_object_section("status");
1947 f
->dump_stream("cluster_fsid") << superblock
.cluster_fsid
;
1948 f
->dump_stream("osd_fsid") << superblock
.osd_fsid
;
1949 f
->dump_unsigned("whoami", superblock
.whoami
);
1950 f
->dump_string("state", get_state_name(get_state()));
1951 f
->dump_unsigned("oldest_map", superblock
.oldest_map
);
1952 f
->dump_unsigned("newest_map", superblock
.newest_map
);
1954 RWLock::RLocker
l(pg_map_lock
);
1955 f
->dump_unsigned("num_pgs", pg_map
.size());
1958 } else if (admin_command
== "flush_journal") {
1959 store
->flush_journal();
1960 } else if (admin_command
== "dump_ops_in_flight" ||
1961 admin_command
== "ops") {
1962 if (!op_tracker
.dump_ops_in_flight(f
)) {
1963 ss
<< "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
1964 Please enable \"osd_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
1966 } else if (admin_command
== "dump_blocked_ops") {
1967 if (!op_tracker
.dump_ops_in_flight(f
, true)) {
1968 ss
<< "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
1969 Please enable \"osd_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
1971 } else if (admin_command
== "dump_historic_ops") {
1972 if (!op_tracker
.dump_historic_ops(f
, false)) {
1973 ss
<< "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
1974 Please enable \"osd_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
1976 } else if (admin_command
== "dump_historic_ops_by_duration") {
1977 if (!op_tracker
.dump_historic_ops(f
, true)) {
1978 ss
<< "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
1979 Please enable \"osd_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
1981 } else if (admin_command
== "dump_historic_slow_ops") {
1982 if (!op_tracker
.dump_historic_slow_ops(f
)) {
1983 ss
<< "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
1984 Please enable \"osd_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
1986 } else if (admin_command
== "dump_op_pq_state") {
1987 f
->open_object_section("pq");
1988 op_shardedwq
.dump(f
);
1990 } else if (admin_command
== "dump_blacklist") {
1991 list
<pair
<entity_addr_t
,utime_t
> > bl
;
1992 OSDMapRef curmap
= service
.get_osdmap();
1994 f
->open_array_section("blacklist");
1995 curmap
->get_blacklist(&bl
);
1996 for (list
<pair
<entity_addr_t
,utime_t
> >::iterator it
= bl
.begin();
1997 it
!= bl
.end(); ++it
) {
1998 f
->open_array_section("entry");
1999 f
->open_object_section("entity_addr_t");
2001 f
->close_section(); //entity_addr_t
2002 it
->second
.localtime(f
->dump_stream("expire_time"));
2003 f
->close_section(); //entry
2005 f
->close_section(); //blacklist
2006 } else if (admin_command
== "dump_watchers") {
2007 list
<obj_watch_item_t
> watchers
;
2010 Mutex::Locker
l(osd_lock
);
2011 RWLock::RLocker
l2(pg_map_lock
);
2012 for (ceph::unordered_map
<spg_t
,PG
*>::iterator it
= pg_map
.begin();
2016 list
<obj_watch_item_t
> pg_watchers
;
2017 PG
*pg
= it
->second
;
2019 pg
->get_watchers(pg_watchers
);
2021 watchers
.splice(watchers
.end(), pg_watchers
);
2025 f
->open_array_section("watchers");
2026 for (list
<obj_watch_item_t
>::iterator it
= watchers
.begin();
2027 it
!= watchers
.end(); ++it
) {
2029 f
->open_array_section("watch");
2031 f
->dump_string("namespace", it
->obj
.nspace
);
2032 f
->dump_string("object", it
->obj
.oid
.name
);
2034 f
->open_object_section("entity_name");
2035 it
->wi
.name
.dump(f
);
2036 f
->close_section(); //entity_name_t
2038 f
->dump_int("cookie", it
->wi
.cookie
);
2039 f
->dump_int("timeout", it
->wi
.timeout_seconds
);
2041 f
->open_object_section("entity_addr_t");
2042 it
->wi
.addr
.dump(f
);
2043 f
->close_section(); //entity_addr_t
2045 f
->close_section(); //watch
2048 f
->close_section(); //watchers
2049 } else if (admin_command
== "dump_reservations") {
2050 f
->open_object_section("reservations");
2051 f
->open_object_section("local_reservations");
2052 service
.local_reserver
.dump(f
);
2054 f
->open_object_section("remote_reservations");
2055 service
.remote_reserver
.dump(f
);
2058 } else if (admin_command
== "get_latest_osdmap") {
2059 get_latest_osdmap();
2060 } else if (admin_command
== "heap") {
2061 auto result
= ceph::osd_cmds::heap(*cct
, cmdmap
, *f
, ss
);
2063 // Note: Failed heap profile commands won't necessarily trigger an error:
2064 f
->open_object_section("result");
2065 f
->dump_string("error", cpp_strerror(result
));
2066 f
->dump_bool("success", result
>= 0);
2068 } else if (admin_command
== "set_heap_property") {
2072 bool success
= false;
2073 if (!cmd_getval(cct
, cmdmap
, "property", property
)) {
2074 error
= "unable to get property";
2076 } else if (!cmd_getval(cct
, cmdmap
, "value", value
)) {
2077 error
= "unable to get value";
2079 } else if (value
< 0) {
2080 error
= "negative value not allowed";
2082 } else if (!ceph_heap_set_numeric_property(property
.c_str(), (size_t)value
)) {
2083 error
= "invalid property";
2088 f
->open_object_section("result");
2089 f
->dump_string("error", error
);
2090 f
->dump_bool("success", success
);
2092 } else if (admin_command
== "get_heap_property") {
2096 bool success
= false;
2097 if (!cmd_getval(cct
, cmdmap
, "property", property
)) {
2098 error
= "unable to get property";
2100 } else if (!ceph_heap_get_numeric_property(property
.c_str(), &value
)) {
2101 error
= "invalid property";
2106 f
->open_object_section("result");
2107 f
->dump_string("error", error
);
2108 f
->dump_bool("success", success
);
2109 f
->dump_int("value", value
);
2111 } else if (admin_command
== "dump_objectstore_kv_stats") {
2112 store
->get_db_statistics(f
);
2113 } else if (admin_command
== "dump_scrubs") {
2114 service
.dumps_scrub(f
);
2115 } else if (admin_command
== "calc_objectstore_db_histogram") {
2116 store
->generate_db_histogram(f
);
2117 } else if (admin_command
== "flush_store_cache") {
2118 store
->flush_cache();
2119 } else if (admin_command
== "dump_pgstate_history") {
2120 f
->open_object_section("pgstate_history");
2121 RWLock::RLocker
l2(pg_map_lock
);
2122 for (ceph::unordered_map
<spg_t
,PG
*>::iterator it
= pg_map
.begin();
2126 PG
*pg
= it
->second
;
2127 f
->dump_stream("pg") << pg
->get_pgid();
2129 pg
->pgstate_history
.dump(f
);
2134 assert(0 == "broken asok registration");
// TestOpsSocketHook: admin-socket adapter for test/debug operations;
// delegates each command to the free function test_ops() with the
// OSDService and ObjectStore it was constructed with.
2141 class TestOpsSocketHook
: public AdminSocketHook
{
2142 OSDService
*service
;
2145 TestOpsSocketHook(OSDService
*s
, ObjectStore
*st
) : service(s
), store(st
) {}
2146 bool call(std::string command
, cmdmap_t
& cmdmap
, std::string format
,
2147 bufferlist
& out
) override
{
2149 test_ops(service
, store
, command
, cmdmap
, ss
);
2153 void test_ops(OSDService
*service
, ObjectStore
*store
,
2154 const std::string
&command
, cmdmap_t
& cmdmap
, ostream
&ss
);
// OSD::C_Tick: timer callback context for the periodic OSD tick
// (presumably scheduled on tick_timer, which is constructed with osd_lock
// -- confirm against the scheduling site).
2158 class OSD::C_Tick
: public Context
{
2161 explicit C_Tick(OSD
*o
) : osd(o
) {}
2162 void finish(int r
) override
{
// OSD::C_Tick_WithoutOSDLock: timer callback context that invokes
// OSD::tick_without_osd_lock() -- periodic work that must not run while
// holding osd_lock.
2167 class OSD::C_Tick_WithoutOSDLock
: public Context
{
2170 explicit C_Tick_WithoutOSDLock(OSD
*o
) : osd(o
) {}
2171 void finish(int r
) override
{
2172 osd
->tick_without_osd_lock();
// OSD::enable_disable_fuse(): maintain the optional FuseStore debug view of
// the object store mounted under $osd_data/fuse. Tears it down when 'stop'
// is set or osd_objectstore_fuse is disabled; otherwise creates the mount
// directory (EEXIST tolerated) and starts a new FuseStore. Compiled under
// HAVE_LIBFUSE.
2176 int OSD::enable_disable_fuse(bool stop
)
2180 string mntpath
= cct
->_conf
->osd_data
+ "/fuse";
2181 if (fuse_store
&& (stop
|| !cct
->_conf
->osd_objectstore_fuse
)) {
2182 dout(1) << __func__
<< " disabling" << dendl
;
2186 r
= ::rmdir(mntpath
.c_str());
2190 derr
<< __func__
<< " failed to rmdir " << mntpath
<< dendl
;
2195 if (!fuse_store
&& cct
->_conf
->osd_objectstore_fuse
) {
2196 dout(1) << __func__
<< " enabling" << dendl
;
2197 r
= ::mkdir(mntpath
.c_str(), 0700);
2200 if (r
< 0 && r
!= -EEXIST
) {
2201 derr
<< __func__
<< " unable to create " << mntpath
<< ": "
2202 << cpp_strerror(r
) << dendl
;
2205 fuse_store
= new FuseStore(store
, mntpath
);
2206 r
= fuse_store
->start();
2208 derr
<< __func__
<< " unable to start fuse: " << cpp_strerror(r
) << dendl
;
2214 #endif // HAVE_LIBFUSE
2220 CompatSet initial
, diff
;
2221 Mutex::Locker
lock(osd_lock
);
2226 tick_timer_without_osd_lock
.init();
2227 service
.recovery_request_timer
.init();
2230 dout(2) << "mounting " << dev_path
<< " "
2231 << (journal_path
.empty() ? "(no journal)" : journal_path
) << dendl
;
2232 assert(store
); // call pre_init() first!
2234 store
->set_cache_shards(cct
->_conf
->osd_op_num_shards
);
2236 int r
= store
->mount();
2238 derr
<< "OSD:init: unable to mount object store" << dendl
;
2242 enable_disable_fuse(false);
2244 dout(2) << "boot" << dendl
;
2246 // initialize the daily loadavg with current 15min loadavg
2248 if (getloadavg(loadavgs
, 3) == 3) {
2249 daily_loadavg
= loadavgs
[2];
2251 derr
<< "OSD::init() : couldn't read loadavgs\n" << dendl
;
2252 daily_loadavg
= 1.0;
2255 int rotating_auth_attempts
= 0;
2257 // sanity check long object name handling
2260 l
.oid
.name
= string(cct
->_conf
->osd_max_object_name_len
, 'n');
2261 l
.set_key(string(cct
->_conf
->osd_max_object_name_len
, 'k'));
2262 l
.nspace
= string(cct
->_conf
->osd_max_object_namespace_len
, 's');
2263 r
= store
->validate_hobject_key(l
);
2265 derr
<< "backend (" << store
->get_type() << ") is unable to support max "
2266 << "object name[space] len" << dendl
;
2267 derr
<< " osd max object name len = "
2268 << cct
->_conf
->osd_max_object_name_len
<< dendl
;
2269 derr
<< " osd max object namespace len = "
2270 << cct
->_conf
->osd_max_object_namespace_len
<< dendl
;
2271 derr
<< cpp_strerror(r
) << dendl
;
2272 if (cct
->_conf
->osd_check_max_object_name_len_on_startup
) {
2275 derr
<< "osd_check_max_object_name_len_on_startup = false, starting anyway"
2278 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl
;
2283 r
= read_superblock();
2285 derr
<< "OSD::init() : unable to read osd superblock" << dendl
;
2290 if (osd_compat
.compare(superblock
.compat_features
) < 0) {
2291 derr
<< "The disk uses features unsupported by the executable." << dendl
;
2292 derr
<< " ondisk features " << superblock
.compat_features
<< dendl
;
2293 derr
<< " daemon features " << osd_compat
<< dendl
;
2295 if (osd_compat
.writeable(superblock
.compat_features
)) {
2296 CompatSet diff
= osd_compat
.unsupported(superblock
.compat_features
);
2297 derr
<< "it is still writeable, though. Missing features: " << diff
<< dendl
;
2302 CompatSet diff
= osd_compat
.unsupported(superblock
.compat_features
);
2303 derr
<< "Cannot write to disk! Missing features: " << diff
<< dendl
;
2309 assert_warn(whoami
== superblock
.whoami
);
2310 if (whoami
!= superblock
.whoami
) {
2311 derr
<< "OSD::init: superblock says osd"
2312 << superblock
.whoami
<< " but I am osd." << whoami
<< dendl
;
2317 initial
= get_osd_initial_compat_set();
2318 diff
= superblock
.compat_features
.unsupported(initial
);
2319 if (superblock
.compat_features
.merge(initial
)) {
2320 // We need to persist the new compat_set before we
2322 dout(5) << "Upgrading superblock adding: " << diff
<< dendl
;
2323 ObjectStore::Transaction t
;
2324 write_superblock(t
);
2325 r
= store
->apply_transaction(service
.meta_osr
.get(), std::move(t
));
2330 // make sure snap mapper object exists
2331 if (!store
->exists(coll_t::meta(), OSD::make_snapmapper_oid())) {
2332 dout(10) << "init creating/touching snapmapper object" << dendl
;
2333 ObjectStore::Transaction t
;
2334 t
.touch(coll_t::meta(), OSD::make_snapmapper_oid());
2335 r
= store
->apply_transaction(service
.meta_osr
.get(), std::move(t
));
2340 class_handler
= new ClassHandler(cct
);
2341 cls_initialize(class_handler
);
2343 if (cct
->_conf
->osd_open_classes_on_start
) {
2344 int r
= class_handler
->open_all_classes();
2346 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r
) << dendl
;
2349 // load up "current" osdmap
2350 assert_warn(!osdmap
);
2352 derr
<< "OSD::init: unable to read current osdmap" << dendl
;
2356 osdmap
= get_map(superblock
.current_epoch
);
2357 check_osdmap_features(store
);
2359 create_recoverystate_perf();
2362 epoch_t bind_epoch
= osdmap
->get_epoch();
2363 service
.set_epochs(NULL
, NULL
, &bind_epoch
);
2366 clear_temp_objects();
2368 // load up pgs (as they previously existed)
2371 dout(2) << "superblock: I am osd." << superblock
.whoami
<< dendl
;
2372 dout(0) << "using " << op_queue
<< " op queue with priority op cut off at " <<
2373 op_prio_cutoff
<< "." << dendl
;
2378 client_messenger
->add_dispatcher_head(this);
2379 cluster_messenger
->add_dispatcher_head(this);
2381 hb_front_client_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
2382 hb_back_client_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
2383 hb_front_server_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
2384 hb_back_server_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
2386 objecter_messenger
->add_dispatcher_head(service
.objecter
);
2388 monc
->set_want_keys(CEPH_ENTITY_TYPE_MON
| CEPH_ENTITY_TYPE_OSD
2389 | CEPH_ENTITY_TYPE_MGR
);
2395 * FIXME: this is a placeholder implementation that unconditionally
2396 * sends every is_primary PG's stats every time we're called, unlike
2397 * the existing mon PGStats mechanism that uses pg_stat_queue and acks.
2398 * This has equivalent cost to the existing worst case where all
2399 * PGs are busy and their stats are always enqueued for sending.
2401 mgrc
.set_pgstats_cb([this](){
2402 RWLock::RLocker
l(map_lock
);
2404 utime_t had_for
= ceph_clock_now() - had_map_since
;
2405 osd_stat_t cur_stat
= service
.get_osd_stat();
2406 cur_stat
.os_perf_stat
= store
->get_cur_stats();
2408 MPGStats
*m
= new MPGStats(monc
->get_fsid(), osdmap
->get_epoch(), had_for
);
2409 m
->osd_stat
= cur_stat
;
2411 Mutex::Locker lec
{min_last_epoch_clean_lock
};
2412 min_last_epoch_clean
= osdmap
->get_epoch();
2413 min_last_epoch_clean_pgs
.clear();
2414 RWLock::RLocker
lpg(pg_map_lock
);
2415 for (const auto &i
: pg_map
) {
2417 if (!pg
->is_primary()) {
2421 pg
->pg_stats_publish_lock
.Lock();
2422 if (pg
->pg_stats_publish_valid
) {
2423 m
->pg_stat
[pg
->info
.pgid
.pgid
] = pg
->pg_stats_publish
;
2424 const auto lec
= pg
->pg_stats_publish
.get_effective_last_epoch_clean();
2425 min_last_epoch_clean
= min(min_last_epoch_clean
, lec
);
2426 min_last_epoch_clean_pgs
.push_back(pg
->info
.pgid
.pgid
);
2428 pg
->pg_stats_publish_lock
.Unlock();
2435 client_messenger
->add_dispatcher_head(&mgrc
);
2437 // tell monc about log_client so it will know about mon session resets
2438 monc
->set_log_client(&log_client
);
2439 update_log_config();
2446 set_disk_tp_priority();
2448 // start the heartbeat
2449 heartbeat_thread
.create("osd_srv_heartbt");
2452 tick_timer
.add_event_after(cct
->_conf
->osd_heartbeat_interval
, new C_Tick(this));
2454 Mutex::Locker
l(tick_timer_lock
);
2455 tick_timer_without_osd_lock
.add_event_after(cct
->_conf
->osd_heartbeat_interval
, new C_Tick_WithoutOSDLock(this));
2459 service
.publish_map(osdmap
);
2460 service
.publish_superblock(superblock
);
2461 service
.max_oldest_map
= superblock
.oldest_map
;
2465 r
= monc
->authenticate();
2467 osd_lock
.Lock(); // locker is going to unlock this on function exit
2473 while (monc
->wait_auth_rotating(30.0) < 0) {
2474 derr
<< "unable to obtain rotating service keys; retrying" << dendl
;
2475 ++rotating_auth_attempts
;
2476 if (rotating_auth_attempts
> g_conf
->max_rotating_auth_attempts
) {
2477 osd_lock
.Lock(); // make locker happy
2478 if (!is_stopping()) {
2485 r
= update_crush_device_class();
2491 r
= update_crush_location();
2501 // start objecter *after* we have authenticated, so that we don't ignore
2502 // the OSDMaps it requests.
2503 service
.final_init();
2507 dout(10) << "ensuring pgs have consumed prior maps" << dendl
;
2511 dout(0) << "done with init, starting boot process" << dendl
;
2513 // subscribe to any pg creations
2514 monc
->sub_want("osd_pg_creates", last_pg_create_epoch
, 0);
2516 // MgrClient needs this (it doesn't have MonClient reference itself)
2517 monc
->sub_want("mgrmap", 0, 0);
2519 // we don't need to ask for an osdmap here; objecter will
2520 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
2532 enable_disable_fuse(true);
2539 void OSD::final_init()
2541 AdminSocket
*admin_socket
= cct
->get_admin_socket();
2542 asok_hook
= new OSDSocketHook(this);
2543 int r
= admin_socket
->register_command("status", "status", asok_hook
,
2544 "high-level status of OSD");
2546 r
= admin_socket
->register_command("flush_journal", "flush_journal",
2548 "flush the journal to permanent store");
2550 r
= admin_socket
->register_command("dump_ops_in_flight",
2551 "dump_ops_in_flight", asok_hook
,
2552 "show the ops currently in flight");
2554 r
= admin_socket
->register_command("ops",
2556 "show the ops currently in flight");
2558 r
= admin_socket
->register_command("dump_blocked_ops",
2559 "dump_blocked_ops", asok_hook
,
2560 "show the blocked ops currently in flight");
2562 r
= admin_socket
->register_command("dump_historic_ops", "dump_historic_ops",
2566 r
= admin_socket
->register_command("dump_historic_slow_ops", "dump_historic_slow_ops",
2568 "show slowest recent ops");
2570 r
= admin_socket
->register_command("dump_historic_ops_by_duration", "dump_historic_ops_by_duration",
2572 "show slowest recent ops, sorted by duration");
2574 r
= admin_socket
->register_command("dump_op_pq_state", "dump_op_pq_state",
2576 "dump op priority queue state");
2578 r
= admin_socket
->register_command("dump_blacklist", "dump_blacklist",
2580 "dump blacklisted clients and times");
2582 r
= admin_socket
->register_command("dump_watchers", "dump_watchers",
2584 "show clients which have active watches,"
2585 " and on which objects");
2587 r
= admin_socket
->register_command("dump_reservations", "dump_reservations",
2589 "show recovery reservations");
2591 r
= admin_socket
->register_command("get_latest_osdmap", "get_latest_osdmap",
2593 "force osd to update the latest map from "
2597 r
= admin_socket
->register_command( "heap",
2599 "name=heapcmd,type=CephString",
2601 "show heap usage info (available only if "
2602 "compiled with tcmalloc)");
2605 r
= admin_socket
->register_command("set_heap_property",
2606 "set_heap_property " \
2607 "name=property,type=CephString " \
2608 "name=value,type=CephInt",
2610 "update malloc extension heap property");
2613 r
= admin_socket
->register_command("get_heap_property",
2614 "get_heap_property " \
2615 "name=property,type=CephString",
2617 "get malloc extension heap property");
2620 r
= admin_socket
->register_command("dump_objectstore_kv_stats",
2621 "dump_objectstore_kv_stats",
2623 "print statistics of kvdb which used by bluestore");
2626 r
= admin_socket
->register_command("dump_scrubs",
2629 "print scheduled scrubs");
2632 r
= admin_socket
->register_command("calc_objectstore_db_histogram",
2633 "calc_objectstore_db_histogram",
2635 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
2638 r
= admin_socket
->register_command("flush_store_cache",
2639 "flush_store_cache",
2641 "Flush bluestore internal cache");
2643 r
= admin_socket
->register_command("dump_pgstate_history", "dump_pgstate_history",
2645 "show recent state history");
2648 test_ops_hook
= new TestOpsSocketHook(&(this->service
), this->store
);
2649 // Note: pools are CephString instead of CephPoolname because
2650 // these commands traditionally support both pool names and numbers
2651 r
= admin_socket
->register_command(
2654 "name=pool,type=CephString " \
2655 "name=objname,type=CephObjectname " \
2656 "name=key,type=CephString "\
2657 "name=val,type=CephString",
2661 r
= admin_socket
->register_command(
2664 "name=pool,type=CephString " \
2665 "name=objname,type=CephObjectname " \
2666 "name=key,type=CephString",
2670 r
= admin_socket
->register_command(
2673 "name=pool,type=CephString " \
2674 "name=objname,type=CephObjectname " \
2675 "name=header,type=CephString",
2680 r
= admin_socket
->register_command(
2683 "name=pool,type=CephString " \
2684 "name=objname,type=CephObjectname",
2686 "output entire object map");
2689 r
= admin_socket
->register_command(
2692 "name=pool,type=CephString " \
2693 "name=objname,type=CephObjectname " \
2694 "name=len,type=CephInt",
2696 "truncate object to length");
2699 r
= admin_socket
->register_command(
2702 "name=pool,type=CephString " \
2703 "name=objname,type=CephObjectname " \
2704 "name=shardid,type=CephInt,req=false,range=0|255",
2706 "inject data error to an object");
2709 r
= admin_socket
->register_command(
2712 "name=pool,type=CephString " \
2713 "name=objname,type=CephObjectname " \
2714 "name=shardid,type=CephInt,req=false,range=0|255",
2716 "inject metadata error to an object");
2718 r
= admin_socket
->register_command(
2719 "set_recovery_delay",
2720 "set_recovery_delay " \
2721 "name=utime,type=CephInt,req=false",
2723 "Delay osd recovery by specified seconds");
2725 r
= admin_socket
->register_command(
2728 "name=pgid,type=CephString ",
2730 "Trigger a scheduled scrub ");
2732 r
= admin_socket
->register_command(
2735 "name=type,type=CephString,req=false " \
2736 "name=count,type=CephInt,req=false ",
2738 "Inject a full disk (optional count times)");
2742 void OSD::create_logger()
2744 dout(10) << "create_logger" << dendl
;
2746 PerfCountersBuilder
osd_plb(cct
, "osd", l_osd_first
, l_osd_last
);
2748 // Latency axis configuration for op histograms, values are in nanoseconds
2749 PerfHistogramCommon::axis_config_d op_hist_x_axis_config
{
2751 PerfHistogramCommon::SCALE_LOG2
, ///< Latency in logarithmic scale
2753 100000, ///< Quantization unit is 100usec
2754 32, ///< Enough to cover much longer than slow requests
2757 // Op size axis configuration for op histograms, values are in bytes
2758 PerfHistogramCommon::axis_config_d op_hist_y_axis_config
{
2759 "Request size (bytes)",
2760 PerfHistogramCommon::SCALE_LOG2
, ///< Request size in logarithmic scale
2762 512, ///< Quantization unit is 512 bytes
2763 32, ///< Enough to cover requests larger than GB
2768 l_osd_op_wip
, "op_wip",
2769 "Replication operations currently being processed (primary)");
2770 osd_plb
.add_u64_counter(
2772 "Client operations",
2773 "ops", PerfCountersBuilder::PRIO_CRITICAL
);
2774 osd_plb
.add_u64_counter(
2775 l_osd_op_inb
, "op_in_bytes",
2776 "Client operations total write size",
2777 "wr", PerfCountersBuilder::PRIO_INTERESTING
);
2778 osd_plb
.add_u64_counter(
2779 l_osd_op_outb
, "op_out_bytes",
2780 "Client operations total read size",
2781 "rd", PerfCountersBuilder::PRIO_INTERESTING
);
2782 osd_plb
.add_time_avg(
2783 l_osd_op_lat
, "op_latency",
2784 "Latency of client operations (including queue time)",
2786 osd_plb
.add_time_avg(
2787 l_osd_op_process_lat
, "op_process_latency",
2788 "Latency of client operations (excluding queue time)");
2789 osd_plb
.add_time_avg(
2790 l_osd_op_prepare_lat
, "op_prepare_latency",
2791 "Latency of client operations (excluding queue time and wait for finished)");
2793 osd_plb
.add_u64_counter(
2794 l_osd_op_r
, "op_r", "Client read operations");
2795 osd_plb
.add_u64_counter(
2796 l_osd_op_r_outb
, "op_r_out_bytes", "Client data read");
2797 osd_plb
.add_time_avg(
2798 l_osd_op_r_lat
, "op_r_latency",
2799 "Latency of read operation (including queue time)");
2800 osd_plb
.add_histogram(
2801 l_osd_op_r_lat_outb_hist
, "op_r_latency_out_bytes_histogram",
2802 op_hist_x_axis_config
, op_hist_y_axis_config
,
2803 "Histogram of operation latency (including queue time) + data read");
2804 osd_plb
.add_time_avg(
2805 l_osd_op_r_process_lat
, "op_r_process_latency",
2806 "Latency of read operation (excluding queue time)");
2807 osd_plb
.add_time_avg(
2808 l_osd_op_r_prepare_lat
, "op_r_prepare_latency",
2809 "Latency of read operations (excluding queue time and wait for finished)");
2810 osd_plb
.add_u64_counter(
2811 l_osd_op_w
, "op_w", "Client write operations");
2812 osd_plb
.add_u64_counter(
2813 l_osd_op_w_inb
, "op_w_in_bytes", "Client data written");
2814 osd_plb
.add_time_avg(
2815 l_osd_op_w_lat
, "op_w_latency",
2816 "Latency of write operation (including queue time)");
2817 osd_plb
.add_histogram(
2818 l_osd_op_w_lat_inb_hist
, "op_w_latency_in_bytes_histogram",
2819 op_hist_x_axis_config
, op_hist_y_axis_config
,
2820 "Histogram of operation latency (including queue time) + data written");
2821 osd_plb
.add_time_avg(
2822 l_osd_op_w_process_lat
, "op_w_process_latency",
2823 "Latency of write operation (excluding queue time)");
2824 osd_plb
.add_time_avg(
2825 l_osd_op_w_prepare_lat
, "op_w_prepare_latency",
2826 "Latency of write operations (excluding queue time and wait for finished)");
2827 osd_plb
.add_u64_counter(
2828 l_osd_op_rw
, "op_rw",
2829 "Client read-modify-write operations");
2830 osd_plb
.add_u64_counter(
2831 l_osd_op_rw_inb
, "op_rw_in_bytes",
2832 "Client read-modify-write operations write in");
2833 osd_plb
.add_u64_counter(
2834 l_osd_op_rw_outb
,"op_rw_out_bytes",
2835 "Client read-modify-write operations read out ");
2836 osd_plb
.add_time_avg(
2837 l_osd_op_rw_lat
, "op_rw_latency",
2838 "Latency of read-modify-write operation (including queue time)");
2839 osd_plb
.add_histogram(
2840 l_osd_op_rw_lat_inb_hist
, "op_rw_latency_in_bytes_histogram",
2841 op_hist_x_axis_config
, op_hist_y_axis_config
,
2842 "Histogram of rw operation latency (including queue time) + data written");
2843 osd_plb
.add_histogram(
2844 l_osd_op_rw_lat_outb_hist
, "op_rw_latency_out_bytes_histogram",
2845 op_hist_x_axis_config
, op_hist_y_axis_config
,
2846 "Histogram of rw operation latency (including queue time) + data read");
2847 osd_plb
.add_time_avg(
2848 l_osd_op_rw_process_lat
, "op_rw_process_latency",
2849 "Latency of read-modify-write operation (excluding queue time)");
2850 osd_plb
.add_time_avg(
2851 l_osd_op_rw_prepare_lat
, "op_rw_prepare_latency",
2852 "Latency of read-modify-write operations (excluding queue time and wait for finished)");
2854 osd_plb
.add_u64_counter(
2855 l_osd_sop
, "subop", "Suboperations");
2856 osd_plb
.add_u64_counter(
2857 l_osd_sop_inb
, "subop_in_bytes", "Suboperations total size");
2858 osd_plb
.add_time_avg(l_osd_sop_lat
, "subop_latency", "Suboperations latency");
2860 osd_plb
.add_u64_counter(l_osd_sop_w
, "subop_w", "Replicated writes");
2861 osd_plb
.add_u64_counter(
2862 l_osd_sop_w_inb
, "subop_w_in_bytes", "Replicated written data size");
2863 osd_plb
.add_time_avg(
2864 l_osd_sop_w_lat
, "subop_w_latency", "Replicated writes latency");
2865 osd_plb
.add_u64_counter(
2866 l_osd_sop_pull
, "subop_pull", "Suboperations pull requests");
2867 osd_plb
.add_time_avg(
2868 l_osd_sop_pull_lat
, "subop_pull_latency", "Suboperations pull latency");
2869 osd_plb
.add_u64_counter(
2870 l_osd_sop_push
, "subop_push", "Suboperations push messages");
2871 osd_plb
.add_u64_counter(
2872 l_osd_sop_push_inb
, "subop_push_in_bytes", "Suboperations pushed size");
2873 osd_plb
.add_time_avg(
2874 l_osd_sop_push_lat
, "subop_push_latency", "Suboperations push latency");
2876 osd_plb
.add_u64_counter(l_osd_pull
, "pull", "Pull requests sent");
2877 osd_plb
.add_u64_counter(l_osd_push
, "push", "Push messages sent");
2878 osd_plb
.add_u64_counter(l_osd_push_outb
, "push_out_bytes", "Pushed size");
2880 osd_plb
.add_u64_counter(
2881 l_osd_rop
, "recovery_ops",
2882 "Started recovery operations",
2883 "rop", PerfCountersBuilder::PRIO_INTERESTING
);
2885 osd_plb
.add_u64(l_osd_loadavg
, "loadavg", "CPU load");
2886 osd_plb
.add_u64(l_osd_buf
, "buffer_bytes", "Total allocated buffer size");
2887 osd_plb
.add_u64(l_osd_history_alloc_bytes
, "history_alloc_Mbytes");
2888 osd_plb
.add_u64(l_osd_history_alloc_num
, "history_alloc_num");
2890 l_osd_cached_crc
, "cached_crc", "Total number getting crc from crc_cache");
2892 l_osd_cached_crc_adjusted
, "cached_crc_adjusted",
2893 "Total number getting crc from crc_cache with adjusting");
2894 osd_plb
.add_u64(l_osd_missed_crc
, "missed_crc",
2895 "Total number of crc cache misses");
2897 osd_plb
.add_u64(l_osd_pg
, "numpg", "Placement groups",
2898 "pgs", PerfCountersBuilder::PRIO_USEFUL
);
2900 l_osd_pg_primary
, "numpg_primary",
2901 "Placement groups for which this osd is primary");
2903 l_osd_pg_replica
, "numpg_replica",
2904 "Placement groups for which this osd is replica");
2906 l_osd_pg_stray
, "numpg_stray",
2907 "Placement groups ready to be deleted from this osd");
2909 l_osd_hb_to
, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");
2910 osd_plb
.add_u64_counter(l_osd_map
, "map_messages", "OSD map messages");
2911 osd_plb
.add_u64_counter(l_osd_mape
, "map_message_epochs", "OSD map epochs");
2912 osd_plb
.add_u64_counter(
2913 l_osd_mape_dup
, "map_message_epoch_dups", "OSD map duplicates");
2914 osd_plb
.add_u64_counter(
2915 l_osd_waiting_for_map
, "messages_delayed_for_map",
2916 "Operations waiting for OSD map");
2917 osd_plb
.add_u64_counter(
2918 l_osd_map_cache_hit
, "osd_map_cache_hit", "osdmap cache hit");
2919 osd_plb
.add_u64_counter(
2920 l_osd_map_cache_miss
, "osd_map_cache_miss", "osdmap cache miss");
2921 osd_plb
.add_u64_counter(
2922 l_osd_map_cache_miss_low
, "osd_map_cache_miss_low",
2923 "osdmap cache miss below cache lower bound");
2924 osd_plb
.add_u64_avg(
2925 l_osd_map_cache_miss_low_avg
, "osd_map_cache_miss_low_avg",
2926 "osdmap cache miss, avg distance below cache lower bound");
2928 osd_plb
.add_u64(l_osd_stat_bytes
, "stat_bytes", "OSD size");
2929 osd_plb
.add_u64(l_osd_stat_bytes_used
, "stat_bytes_used", "Used space");
2930 osd_plb
.add_u64(l_osd_stat_bytes_avail
, "stat_bytes_avail", "Available space");
2932 osd_plb
.add_u64_counter(
2933 l_osd_copyfrom
, "copyfrom", "Rados \"copy-from\" operations");
2935 osd_plb
.add_u64_counter(l_osd_tier_promote
, "tier_promote", "Tier promotions");
2936 osd_plb
.add_u64_counter(l_osd_tier_flush
, "tier_flush", "Tier flushes");
2937 osd_plb
.add_u64_counter(
2938 l_osd_tier_flush_fail
, "tier_flush_fail", "Failed tier flushes");
2939 osd_plb
.add_u64_counter(
2940 l_osd_tier_try_flush
, "tier_try_flush", "Tier flush attempts");
2941 osd_plb
.add_u64_counter(
2942 l_osd_tier_try_flush_fail
, "tier_try_flush_fail",
2943 "Failed tier flush attempts");
2944 osd_plb
.add_u64_counter(
2945 l_osd_tier_evict
, "tier_evict", "Tier evictions");
2946 osd_plb
.add_u64_counter(
2947 l_osd_tier_whiteout
, "tier_whiteout", "Tier whiteouts");
2948 osd_plb
.add_u64_counter(
2949 l_osd_tier_dirty
, "tier_dirty", "Dirty tier flag set");
2950 osd_plb
.add_u64_counter(
2951 l_osd_tier_clean
, "tier_clean", "Dirty tier flag cleaned");
2952 osd_plb
.add_u64_counter(
2953 l_osd_tier_delay
, "tier_delay", "Tier delays (agent waiting)");
2954 osd_plb
.add_u64_counter(
2955 l_osd_tier_proxy_read
, "tier_proxy_read", "Tier proxy reads");
2956 osd_plb
.add_u64_counter(
2957 l_osd_tier_proxy_write
, "tier_proxy_write", "Tier proxy writes");
2959 osd_plb
.add_u64_counter(
2960 l_osd_agent_wake
, "agent_wake", "Tiering agent wake up");
2961 osd_plb
.add_u64_counter(
2962 l_osd_agent_skip
, "agent_skip", "Objects skipped by agent");
2963 osd_plb
.add_u64_counter(
2964 l_osd_agent_flush
, "agent_flush", "Tiering agent flushes");
2965 osd_plb
.add_u64_counter(
2966 l_osd_agent_evict
, "agent_evict", "Tiering agent evictions");
2968 osd_plb
.add_u64_counter(
2969 l_osd_object_ctx_cache_hit
, "object_ctx_cache_hit", "Object context cache hits");
2970 osd_plb
.add_u64_counter(
2971 l_osd_object_ctx_cache_total
, "object_ctx_cache_total", "Object context cache lookups");
2973 osd_plb
.add_u64_counter(l_osd_op_cache_hit
, "op_cache_hit");
2974 osd_plb
.add_time_avg(
2975 l_osd_tier_flush_lat
, "osd_tier_flush_lat", "Object flush latency");
2976 osd_plb
.add_time_avg(
2977 l_osd_tier_promote_lat
, "osd_tier_promote_lat", "Object promote latency");
2978 osd_plb
.add_time_avg(
2979 l_osd_tier_r_lat
, "osd_tier_r_lat", "Object proxy read latency");
2981 osd_plb
.add_u64_counter(
2982 l_osd_pg_info
, "osd_pg_info", "PG updated its info (using any method)");
2983 osd_plb
.add_u64_counter(
2984 l_osd_pg_fastinfo
, "osd_pg_fastinfo",
2985 "PG updated its info using fastinfo attr");
2986 osd_plb
.add_u64_counter(
2987 l_osd_pg_biginfo
, "osd_pg_biginfo", "PG updated its biginfo attr");
2989 logger
= osd_plb
.create_perf_counters();
2990 cct
->get_perfcounters_collection()->add(logger
);
2993 void OSD::create_recoverystate_perf()
2995 dout(10) << "create_recoverystate_perf" << dendl
;
2997 PerfCountersBuilder
rs_perf(cct
, "recoverystate_perf", rs_first
, rs_last
);
2999 rs_perf
.add_time_avg(rs_initial_latency
, "initial_latency", "Initial recovery state latency");
3000 rs_perf
.add_time_avg(rs_started_latency
, "started_latency", "Started recovery state latency");
3001 rs_perf
.add_time_avg(rs_reset_latency
, "reset_latency", "Reset recovery state latency");
3002 rs_perf
.add_time_avg(rs_start_latency
, "start_latency", "Start recovery state latency");
3003 rs_perf
.add_time_avg(rs_primary_latency
, "primary_latency", "Primary recovery state latency");
3004 rs_perf
.add_time_avg(rs_peering_latency
, "peering_latency", "Peering recovery state latency");
3005 rs_perf
.add_time_avg(rs_backfilling_latency
, "backfilling_latency", "Backfilling recovery state latency");
3006 rs_perf
.add_time_avg(rs_waitremotebackfillreserved_latency
, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency");
3007 rs_perf
.add_time_avg(rs_waitlocalbackfillreserved_latency
, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency");
3008 rs_perf
.add_time_avg(rs_notbackfilling_latency
, "notbackfilling_latency", "Notbackfilling recovery state latency");
3009 rs_perf
.add_time_avg(rs_repnotrecovering_latency
, "repnotrecovering_latency", "Repnotrecovering recovery state latency");
3010 rs_perf
.add_time_avg(rs_repwaitrecoveryreserved_latency
, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency");
3011 rs_perf
.add_time_avg(rs_repwaitbackfillreserved_latency
, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency");
3012 rs_perf
.add_time_avg(rs_reprecovering_latency
, "reprecovering_latency", "RepRecovering recovery state latency");
3013 rs_perf
.add_time_avg(rs_activating_latency
, "activating_latency", "Activating recovery state latency");
3014 rs_perf
.add_time_avg(rs_waitlocalrecoveryreserved_latency
, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency");
3015 rs_perf
.add_time_avg(rs_waitremoterecoveryreserved_latency
, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency");
3016 rs_perf
.add_time_avg(rs_recovering_latency
, "recovering_latency", "Recovering recovery state latency");
3017 rs_perf
.add_time_avg(rs_recovered_latency
, "recovered_latency", "Recovered recovery state latency");
3018 rs_perf
.add_time_avg(rs_clean_latency
, "clean_latency", "Clean recovery state latency");
3019 rs_perf
.add_time_avg(rs_active_latency
, "active_latency", "Active recovery state latency");
3020 rs_perf
.add_time_avg(rs_replicaactive_latency
, "replicaactive_latency", "Replicaactive recovery state latency");
3021 rs_perf
.add_time_avg(rs_stray_latency
, "stray_latency", "Stray recovery state latency");
3022 rs_perf
.add_time_avg(rs_getinfo_latency
, "getinfo_latency", "Getinfo recovery state latency");
3023 rs_perf
.add_time_avg(rs_getlog_latency
, "getlog_latency", "Getlog recovery state latency");
3024 rs_perf
.add_time_avg(rs_waitactingchange_latency
, "waitactingchange_latency", "Waitactingchange recovery state latency");
3025 rs_perf
.add_time_avg(rs_incomplete_latency
, "incomplete_latency", "Incomplete recovery state latency");
3026 rs_perf
.add_time_avg(rs_down_latency
, "down_latency", "Down recovery state latency");
3027 rs_perf
.add_time_avg(rs_getmissing_latency
, "getmissing_latency", "Getmissing recovery state latency");
3028 rs_perf
.add_time_avg(rs_waitupthru_latency
, "waitupthru_latency", "Waitupthru recovery state latency");
3029 rs_perf
.add_time_avg(rs_notrecovering_latency
, "notrecovering_latency", "Notrecovering recovery state latency");
3031 recoverystate_perf
= rs_perf
.create_perf_counters();
3032 cct
->get_perfcounters_collection()->add(recoverystate_perf
);
3037 if (!service
.prepare_to_stop())
3038 return 0; // already shutting down
3040 if (is_stopping()) {
3044 derr
<< "shutdown" << dendl
;
3046 set_state(STATE_STOPPING
);
3049 cct
->_conf
->set_val("debug_osd", "100");
3050 cct
->_conf
->set_val("debug_journal", "100");
3051 cct
->_conf
->set_val("debug_filestore", "100");
3052 cct
->_conf
->set_val("debug_ms", "100");
3053 cct
->_conf
->apply_changes(NULL
);
3055 // stop MgrClient earlier as it's more like an internal consumer of OSD
3058 service
.start_shutdown();
3060 // stop sending work to pgs. this just prevents any new work in _process
3061 // from racing with on_shutdown and potentially entering the pg after.
3062 op_shardedwq
.drain();
3066 RWLock::RLocker
l(pg_map_lock
);
3067 for (ceph::unordered_map
<spg_t
, PG
*>::iterator p
= pg_map
.begin();
3070 dout(20) << " kicking pg " << p
->first
<< dendl
;
3072 p
->second
->on_shutdown();
3073 p
->second
->unlock();
3074 p
->second
->osr
->flush();
3077 clear_pg_stat_queue();
3079 // drain op queue again (in case PGs requeued something)
3080 op_shardedwq
.drain();
3082 finished
.clear(); // zap waiters (bleh, this is messy)
3085 op_shardedwq
.clear_pg_slots();
3087 // unregister commands
3088 cct
->get_admin_socket()->unregister_command("status");
3089 cct
->get_admin_socket()->unregister_command("flush_journal");
3090 cct
->get_admin_socket()->unregister_command("dump_ops_in_flight");
3091 cct
->get_admin_socket()->unregister_command("ops");
3092 cct
->get_admin_socket()->unregister_command("dump_blocked_ops");
3093 cct
->get_admin_socket()->unregister_command("dump_historic_ops");
3094 cct
->get_admin_socket()->unregister_command("dump_historic_ops_by_duration");
3095 cct
->get_admin_socket()->unregister_command("dump_historic_slow_ops");
3096 cct
->get_admin_socket()->unregister_command("dump_op_pq_state");
3097 cct
->get_admin_socket()->unregister_command("dump_blacklist");
3098 cct
->get_admin_socket()->unregister_command("dump_watchers");
3099 cct
->get_admin_socket()->unregister_command("dump_reservations");
3100 cct
->get_admin_socket()->unregister_command("get_latest_osdmap");
3101 cct
->get_admin_socket()->unregister_command("set_heap_property");
3102 cct
->get_admin_socket()->unregister_command("get_heap_property");
3103 cct
->get_admin_socket()->unregister_command("dump_objectstore_kv_stats");
3104 cct
->get_admin_socket()->unregister_command("calc_objectstore_db_histogram");
3105 cct
->get_admin_socket()->unregister_command("flush_store_cache");
3106 cct
->get_admin_socket()->unregister_command("dump_pgstate_history");
3110 cct
->get_admin_socket()->unregister_command("setomapval");
3111 cct
->get_admin_socket()->unregister_command("rmomapkey");
3112 cct
->get_admin_socket()->unregister_command("setomapheader");
3113 cct
->get_admin_socket()->unregister_command("getomap");
3114 cct
->get_admin_socket()->unregister_command("truncobj");
3115 cct
->get_admin_socket()->unregister_command("injectdataerr");
3116 cct
->get_admin_socket()->unregister_command("injectmdataerr");
3117 cct
->get_admin_socket()->unregister_command("set_recovery_delay");
3118 delete test_ops_hook
;
3119 test_ops_hook
= NULL
;
3123 heartbeat_lock
.Lock();
3124 heartbeat_stop
= true;
3125 heartbeat_cond
.Signal();
3126 heartbeat_lock
.Unlock();
3127 heartbeat_thread
.join();
3132 dout(10) << "osd tp stopped" << dendl
;
3136 dout(10) << "op sharded tp stopped" << dendl
;
3140 dout(10) << "command tp stopped" << dendl
;
3144 dout(10) << "disk tp paused (new)" << dendl
;
3146 dout(10) << "stopping agent" << dendl
;
3147 service
.agent_stop();
3151 reset_heartbeat_peers();
3153 tick_timer
.shutdown();
3156 Mutex::Locker
l(tick_timer_lock
);
3157 tick_timer_without_osd_lock
.shutdown();
3160 // note unmount epoch
3161 dout(10) << "noting clean unmount in epoch " << osdmap
->get_epoch() << dendl
;
3162 superblock
.mounted
= service
.get_boot_epoch();
3163 superblock
.clean_thru
= osdmap
->get_epoch();
3164 ObjectStore::Transaction t
;
3165 write_superblock(t
);
3166 int r
= store
->apply_transaction(service
.meta_osr
.get(), std::move(t
));
3168 derr
<< "OSD::shutdown: error writing superblock: "
3169 << cpp_strerror(r
) << dendl
;
3174 Mutex::Locker
l(pg_stat_queue_lock
);
3175 assert(pg_stat_queue
.empty());
3179 #ifdef PG_DEBUG_REFS
3180 service
.dump_live_pgids();
3183 RWLock::RLocker
l(pg_map_lock
);
3184 for (ceph::unordered_map
<spg_t
, PG
*>::iterator p
= pg_map
.begin();
3187 dout(20) << " kicking pg " << p
->first
<< dendl
;
3189 if (p
->second
->ref
!= 1) {
3190 derr
<< "pgid " << p
->first
<< " has ref count of "
3191 << p
->second
->ref
<< dendl
;
3192 #ifdef PG_DEBUG_REFS
3193 p
->second
->dump_live_ids();
3197 p
->second
->unlock();
3198 p
->second
->put("PGMap");
3202 #ifdef PG_DEBUG_REFS
3203 service
.dump_live_pgids();
3205 cct
->_conf
->remove_observer(this);
3207 dout(10) << "syncing store" << dendl
;
3208 enable_disable_fuse(true);
3210 if (cct
->_conf
->osd_journal_flush_on_shutdown
) {
3211 dout(10) << "flushing journal" << dendl
;
3212 store
->flush_journal();
3218 dout(10) << "Store synced" << dendl
;
3223 osdmap
= OSDMapRef();
3225 op_tracker
.on_shutdown();
3227 class_handler
->shutdown();
3228 client_messenger
->shutdown();
3229 cluster_messenger
->shutdown();
3230 hb_front_client_messenger
->shutdown();
3231 hb_back_client_messenger
->shutdown();
3232 objecter_messenger
->shutdown();
3233 hb_front_server_messenger
->shutdown();
3234 hb_back_server_messenger
->shutdown();
3241 int OSD::mon_cmd_maybe_osd_create(string
&cmd
)
3243 bool created
= false;
3245 dout(10) << __func__
<< " cmd: " << cmd
<< dendl
;
3246 vector
<string
> vcmd
{cmd
};
3250 monc
->start_mon_command(vcmd
, inbl
, NULL
, &outs
, &w
);
3253 if (r
== -ENOENT
&& !created
) {
3254 string newcmd
= "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami
)
3255 + ", \"uuid\": \"" + stringify(superblock
.osd_fsid
) + "\"}";
3256 vector
<string
> vnewcmd
{newcmd
};
3260 monc
->start_mon_command(vnewcmd
, inbl
, NULL
, &outs
, &w
);
3263 derr
<< __func__
<< " fail: osd does not exist and created failed: "
3264 << cpp_strerror(r
) << dendl
;
3270 derr
<< __func__
<< " fail: '" << outs
<< "': " << cpp_strerror(r
) << dendl
;
3279 int OSD::update_crush_location()
3281 if (!cct
->_conf
->osd_crush_update_on_start
) {
3282 dout(10) << __func__
<< " osd_crush_update_on_start = false" << dendl
;
3287 if (cct
->_conf
->osd_crush_initial_weight
>= 0) {
3288 snprintf(weight
, sizeof(weight
), "%.4lf", cct
->_conf
->osd_crush_initial_weight
);
3290 struct store_statfs_t st
;
3291 int r
= store
->statfs(&st
);
3293 derr
<< "statfs: " << cpp_strerror(r
) << dendl
;
3296 snprintf(weight
, sizeof(weight
), "%.4lf",
3298 (double)(st
.total
) /
3299 (double)(1ull << 40 /* TB */)));
3302 std::multimap
<string
,string
> loc
= cct
->crush_location
.get_location();
3303 dout(10) << __func__
<< " crush location is " << loc
<< dendl
;
3306 string("{\"prefix\": \"osd crush create-or-move\", ") +
3307 string("\"id\": ") + stringify(whoami
) + string(", ") +
3308 string("\"weight\":") + weight
+ string(", ") +
3309 string("\"args\": [");
3310 for (multimap
<string
,string
>::iterator p
= loc
.begin(); p
!= loc
.end(); ++p
) {
3311 if (p
!= loc
.begin())
3313 cmd
+= "\"" + p
->first
+ "=" + p
->second
+ "\"";
3317 return mon_cmd_maybe_osd_create(cmd
);
3320 int OSD::update_crush_device_class()
3322 string device_class
;
3323 int r
= store
->read_meta("crush_device_class", &device_class
);
3328 string("{\"prefix\": \"osd crush set-device-class\", ") +
3329 string("\"id\": ") + stringify(whoami
) + string(", ") +
3330 string("\"class\": \"") + device_class
+ string("\"}");
3332 return mon_cmd_maybe_osd_create(cmd
);
3335 void OSD::write_superblock(ObjectStore::Transaction
& t
)
3337 dout(10) << "write_superblock " << superblock
<< dendl
;
3339 //hack: at minimum it's using the baseline feature set
3340 if (!superblock
.compat_features
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE
))
3341 superblock
.compat_features
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE
);
3344 ::encode(superblock
, bl
);
3345 t
.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT
, 0, bl
.length(), bl
);
3348 int OSD::read_superblock()
3351 int r
= store
->read(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT
, 0, 0, bl
);
3355 bufferlist::iterator p
= bl
.begin();
3356 ::decode(superblock
, p
);
3358 dout(10) << "read_superblock " << superblock
<< dendl
;
3363 void OSD::clear_temp_objects()
3365 dout(10) << __func__
<< dendl
;
3367 store
->list_collections(ls
);
3368 for (vector
<coll_t
>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
3370 if (!p
->is_pg(&pgid
))
3373 // list temp objects
3374 dout(20) << " clearing temps in " << *p
<< " pgid " << pgid
<< dendl
;
3376 vector
<ghobject_t
> temps
;
3379 vector
<ghobject_t
> objects
;
3380 store
->collection_list(*p
, next
, ghobject_t::get_max(),
3381 store
->get_ideal_list_max(),
3383 if (objects
.empty())
3385 vector
<ghobject_t
>::iterator q
;
3386 for (q
= objects
.begin(); q
!= objects
.end(); ++q
) {
3387 // Hammer set pool for temps to -1, so check for clean-up
3388 if (q
->hobj
.is_temp() || (q
->hobj
.pool
== -1)) {
3389 temps
.push_back(*q
);
3394 // If we saw a non-temp object and hit the break above we can
3395 // break out of the while loop too.
3396 if (q
!= objects
.end())
3399 if (!temps
.empty()) {
3400 ObjectStore::Transaction t
;
3402 for (vector
<ghobject_t
>::iterator q
= temps
.begin(); q
!= temps
.end(); ++q
) {
3403 dout(20) << " removing " << *p
<< " object " << *q
<< dendl
;
3405 if (++removed
> cct
->_conf
->osd_target_transaction_size
) {
3406 store
->apply_transaction(service
.meta_osr
.get(), std::move(t
));
3407 t
= ObjectStore::Transaction();
3412 store
->apply_transaction(service
.meta_osr
.get(), std::move(t
));
3418 void OSD::recursive_remove_collection(CephContext
* cct
,
3419 ObjectStore
*store
, spg_t pgid
,
3425 make_snapmapper_oid());
3427 ceph::shared_ptr
<ObjectStore::Sequencer
> osr (std::make_shared
<
3428 ObjectStore::Sequencer
>("rm"));
3429 ObjectStore::Transaction t
;
3430 SnapMapper
mapper(cct
, &driver
, 0, 0, 0, pgid
.shard
);
3432 vector
<ghobject_t
> objects
;
3433 store
->collection_list(tmp
, ghobject_t(), ghobject_t::get_max(),
3434 INT_MAX
, &objects
, 0);
3435 generic_dout(10) << __func__
<< " " << objects
<< dendl
;
3438 for (vector
<ghobject_t
>::iterator p
= objects
.begin();
3441 OSDriver::OSTransaction
_t(driver
.get_transaction(&t
));
3442 int r
= mapper
.remove_oid(p
->hobj
, &_t
);
3443 if (r
!= 0 && r
!= -ENOENT
)
3446 if (removed
> cct
->_conf
->osd_target_transaction_size
) {
3447 int r
= store
->apply_transaction(osr
.get(), std::move(t
));
3449 t
= ObjectStore::Transaction();
3453 t
.remove_collection(tmp
);
3454 int r
= store
->apply_transaction(osr
.get(), std::move(t
));
3458 if (!osr
->flush_commit(&waiter
)) {
3464 // ======================================================
3467 PGPool
OSD::_get_pool(int id
, OSDMapRef createmap
)
3469 if (!createmap
->have_pg_pool(id
)) {
3470 dout(5) << __func__
<< ": the OSDmap does not contain a PG pool with id = "
3475 PGPool p
= PGPool(cct
, createmap
, id
);
3477 dout(10) << "_get_pool " << p
.id
<< dendl
;
3481 PG
*OSD::_open_lock_pg(
3482 OSDMapRef createmap
,
3483 spg_t pgid
, bool no_lockdep_check
)
3485 assert(osd_lock
.is_locked());
3487 PG
* pg
= _make_pg(createmap
, pgid
);
3489 RWLock::WLocker
l(pg_map_lock
);
3490 pg
->lock(no_lockdep_check
);
3492 pg
->get("PGMap"); // because it's in pg_map
3493 service
.pg_add_epoch(pg
->info
.pgid
, createmap
->get_epoch());
3499 OSDMapRef createmap
,
3502 dout(10) << "_open_lock_pg " << pgid
<< dendl
;
3503 PGPool pool
= _get_pool(pgid
.pool(), createmap
);
3507 if (createmap
->get_pg_type(pgid
.pgid
) == pg_pool_t::TYPE_REPLICATED
||
3508 createmap
->get_pg_type(pgid
.pgid
) == pg_pool_t::TYPE_ERASURE
)
3509 pg
= new PrimaryLogPG(&service
, createmap
, pool
, pgid
);
3517 void OSD::add_newly_split_pg(PG
*pg
, PG::RecoveryCtx
*rctx
)
3519 epoch_t
e(service
.get_osdmap()->get_epoch());
3520 pg
->get("PGMap"); // For pg_map
3521 pg_map
[pg
->info
.pgid
] = pg
;
3522 service
.pg_add_epoch(pg
->info
.pgid
, pg
->get_osdmap()->get_epoch());
3524 dout(10) << "Adding newly split pg " << *pg
<< dendl
;
3525 pg
->handle_loaded(rctx
);
3526 pg
->write_if_dirty(*(rctx
->transaction
));
3527 pg
->queue_null(e
, e
);
3528 map
<spg_t
, list
<PG::CephPeeringEvtRef
> >::iterator to_wake
=
3529 peering_wait_for_split
.find(pg
->info
.pgid
);
3530 if (to_wake
!= peering_wait_for_split
.end()) {
3531 for (list
<PG::CephPeeringEvtRef
>::iterator i
=
3532 to_wake
->second
.begin();
3533 i
!= to_wake
->second
.end();
3535 pg
->queue_peering_event(*i
);
3537 peering_wait_for_split
.erase(to_wake
);
3539 if (!service
.get_osdmap()->have_pg_pool(pg
->info
.pgid
.pool()))
3543 OSD::res_result
OSD::_try_resurrect_pg(
3544 OSDMapRef curmap
, spg_t pgid
, spg_t
*resurrected
, PGRef
*old_pg_state
)
3546 assert(resurrected
);
3547 assert(old_pg_state
);
3548 // find nearest ancestor
3549 DeletingStateRef df
;
3552 df
= service
.deleting_pgs
.lookup(cur
);
3557 cur
= cur
.get_parent();
3560 return RES_NONE
; // good to go
3562 df
->old_pg_state
->lock();
3563 OSDMapRef create_map
= df
->old_pg_state
->get_osdmap();
3564 df
->old_pg_state
->unlock();
3566 set
<spg_t
> children
;
3568 if (df
->try_stop_deletion()) {
3569 dout(10) << __func__
<< ": halted deletion on pg " << pgid
<< dendl
;
3571 *old_pg_state
= df
->old_pg_state
;
3572 service
.deleting_pgs
.remove(pgid
); // PG is no longer being removed!
3575 // raced, ensure we don't see DeletingStateRef when we try to
3577 service
.deleting_pgs
.remove(pgid
);
3580 } else if (cur
.is_split(create_map
->get_pg_num(cur
.pool()),
3581 curmap
->get_pg_num(cur
.pool()),
3583 children
.count(pgid
)) {
3584 if (df
->try_stop_deletion()) {
3585 dout(10) << __func__
<< ": halted deletion on ancestor pg " << pgid
3588 *old_pg_state
= df
->old_pg_state
;
3589 service
.deleting_pgs
.remove(cur
); // PG is no longer being removed!
3592 /* this is not a problem, failing to cancel proves that all objects
3593 * have been removed, so no hobject_t overlap is possible
3601 PG
*OSD::_create_lock_pg(
3602 OSDMapRef createmap
,
3607 vector
<int>& up
, int up_primary
,
3608 vector
<int>& acting
, int acting_primary
,
3609 pg_history_t history
,
3610 const PastIntervals
& pi
,
3611 ObjectStore::Transaction
& t
)
3613 assert(osd_lock
.is_locked());
3614 dout(20) << "_create_lock_pg pgid " << pgid
<< dendl
;
3616 PG
*pg
= _open_lock_pg(createmap
, pgid
, true);
3618 service
.init_splits_between(pgid
, pg
->get_osdmap(), service
.get_osdmap());
3631 dout(7) << "_create_lock_pg " << *pg
<< dendl
;
3635 PG
*OSD::_lookup_lock_pg(spg_t pgid
)
3637 RWLock::RLocker
l(pg_map_lock
);
3639 auto pg_map_entry
= pg_map
.find(pgid
);
3640 if (pg_map_entry
== pg_map
.end())
3642 PG
*pg
= pg_map_entry
->second
;
3647 PG
*OSD::_lookup_lock_pg_with_map_lock_held(spg_t pgid
)
3649 assert(pg_map
.count(pgid
));
3650 PG
*pg
= pg_map
[pgid
];
3655 void OSD::load_pgs()
3657 assert(osd_lock
.is_locked());
3658 dout(0) << "load_pgs" << dendl
;
3660 RWLock::RLocker
l(pg_map_lock
);
3661 assert(pg_map
.empty());
3665 int r
= store
->list_collections(ls
);
3667 derr
<< "failed to list pgs: " << cpp_strerror(-r
) << dendl
;
3670 bool has_upgraded
= false;
3672 for (vector
<coll_t
>::iterator it
= ls
.begin();
3676 if (it
->is_temp(&pgid
) ||
3677 (it
->is_pg(&pgid
) && PG::_has_removal_flag(store
, pgid
))) {
3678 dout(10) << "load_pgs " << *it
<< " clearing temp" << dendl
;
3679 recursive_remove_collection(cct
, store
, pgid
, *it
);
3683 if (!it
->is_pg(&pgid
)) {
3684 dout(10) << "load_pgs ignoring unrecognized " << *it
<< dendl
;
3688 if (pgid
.preferred() >= 0) {
3689 dout(10) << __func__
<< ": skipping localized PG " << pgid
<< dendl
;
3690 // FIXME: delete it too, eventually
3694 dout(10) << "pgid " << pgid
<< " coll " << coll_t(pgid
) << dendl
;
3696 epoch_t map_epoch
= 0;
3697 int r
= PG::peek_map_epoch(store
, pgid
, &map_epoch
, &bl
);
3699 derr
<< __func__
<< " unable to peek at " << pgid
<< " metadata, skipping"
3705 if (map_epoch
> 0) {
3706 OSDMapRef pgosdmap
= service
.try_get_map(map_epoch
);
3708 if (!osdmap
->have_pg_pool(pgid
.pool())) {
3709 derr
<< __func__
<< ": could not find map for epoch " << map_epoch
3710 << " on pg " << pgid
<< ", but the pool is not present in the "
3711 << "current map, so this is probably a result of bug 10617. "
3712 << "Skipping the pg for now, you can use ceph-objectstore-tool "
3713 << "to clean it up later." << dendl
;
3716 derr
<< __func__
<< ": have pgid " << pgid
<< " at epoch "
3717 << map_epoch
<< ", but missing map. Crashing."
3719 assert(0 == "Missing map in load_pgs");
3722 pg
= _open_lock_pg(pgosdmap
, pgid
);
3724 pg
= _open_lock_pg(osdmap
, pgid
);
3726 // there can be no waiters here, so we don't call wake_pg_waiters
3728 pg
->ch
= store
->open_collection(pg
->coll
);
3730 // read pg state, log
3731 pg
->read_state(store
, bl
);
3733 if (pg
->must_upgrade()) {
3734 if (!pg
->can_upgrade()) {
3735 derr
<< "PG needs upgrade, but on-disk data is too old; upgrade to"
3736 << " an older version first." << dendl
;
3737 assert(0 == "PG too old to upgrade");
3739 if (!has_upgraded
) {
3740 derr
<< "PGs are upgrading" << dendl
;
3741 has_upgraded
= true;
3743 dout(10) << "PG " << pg
->info
.pgid
3744 << " must upgrade..." << dendl
;
3748 service
.init_splits_between(pg
->info
.pgid
, pg
->get_osdmap(), osdmap
);
3750 // generate state for PG's current mapping
3751 int primary
, up_primary
;
3752 vector
<int> acting
, up
;
3753 pg
->get_osdmap()->pg_to_up_acting_osds(
3754 pgid
.pgid
, &up
, &up_primary
, &acting
, &primary
);
3755 pg
->init_primary_up_acting(
3760 int role
= OSDMap::calc_pg_role(whoami
, pg
->acting
);
3761 if (pg
->pool
.info
.is_replicated() || role
== pg
->pg_whoami
.shard
)
3766 pg
->reg_next_scrub();
3768 PG::RecoveryCtx
rctx(0, 0, 0, 0, 0, 0);
3769 pg
->handle_loaded(&rctx
);
3771 dout(10) << "load_pgs loaded " << *pg
<< " " << pg
->pg_log
.get_log() << dendl
;
3772 if (pg
->pg_log
.is_dirty()) {
3773 ObjectStore::Transaction t
;
3774 pg
->write_if_dirty(t
);
3775 store
->apply_transaction(pg
->osr
.get(), std::move(t
));
3780 RWLock::RLocker
l(pg_map_lock
);
3781 dout(0) << "load_pgs opened " << pg_map
.size() << " pgs" << dendl
;
3784 // clean up old infos object?
3785 if (has_upgraded
&& store
->exists(coll_t::meta(), OSD::make_infos_oid())) {
3786 dout(1) << __func__
<< " removing legacy infos object" << dendl
;
3787 ObjectStore::Transaction t
;
3788 t
.remove(coll_t::meta(), OSD::make_infos_oid());
3789 int r
= store
->apply_transaction(service
.meta_osr
.get(), std::move(t
));
3791 derr
<< __func__
<< ": apply_transaction returned "
3792 << cpp_strerror(r
) << dendl
;
3797 build_past_intervals_parallel();
/*
 * build past_intervals efficiently on old, degraded, and buried
 * clusters.  this is important for efficiently catching up osds that
 * are way behind on maps to the current cluster state.
 *
 * this is a parallel version of PG::generate_past_intervals().
 * follow the same logic, but do all pgs at the same time so that we
 * can make a single pass across the osdmap history.
 */
3810 void OSD::build_past_intervals_parallel()
3814 vector
<int> old_acting
, old_up
;
3815 epoch_t same_interval_since
;
3819 map
<PG
*,pistate
> pis
;
3821 // calculate junction of map range
3822 epoch_t end_epoch
= superblock
.oldest_map
;
3823 epoch_t cur_epoch
= superblock
.newest_map
;
3825 RWLock::RLocker
l(pg_map_lock
);
3826 for (ceph::unordered_map
<spg_t
, PG
*>::iterator i
= pg_map
.begin();
3831 auto rpib
= pg
->get_required_past_interval_bounds(
3833 superblock
.oldest_map
);
3834 if (rpib
.first
>= rpib
.second
&& pg
->past_intervals
.empty()) {
3835 if (pg
->info
.history
.same_interval_since
== 0) {
3836 pg
->info
.history
.same_interval_since
= rpib
.second
;
3840 auto apib
= pg
->past_intervals
.get_bounds();
3841 if (apib
.second
>= rpib
.second
&&
3842 apib
.first
<= rpib
.first
) {
3843 if (pg
->info
.history
.same_interval_since
== 0) {
3844 pg
->info
.history
.same_interval_since
= rpib
.second
;
3850 dout(10) << pg
->info
.pgid
<< " needs " << rpib
.first
<< "-"
3851 << rpib
.second
<< dendl
;
3852 pistate
& p
= pis
[pg
];
3853 p
.start
= rpib
.first
;
3854 p
.end
= rpib
.second
;
3855 p
.same_interval_since
= 0;
3857 if (rpib
.first
< cur_epoch
)
3858 cur_epoch
= rpib
.first
;
3859 if (rpib
.second
> end_epoch
)
3860 end_epoch
= rpib
.second
;
3864 dout(10) << __func__
<< " nothing to build" << dendl
;
3868 dout(1) << __func__
<< " over " << cur_epoch
<< "-" << end_epoch
<< dendl
;
3869 assert(cur_epoch
<= end_epoch
);
3871 OSDMapRef cur_map
, last_map
;
3872 for ( ; cur_epoch
<= end_epoch
; cur_epoch
++) {
3873 dout(10) << __func__
<< " epoch " << cur_epoch
<< dendl
;
3875 cur_map
= get_map(cur_epoch
);
3877 for (map
<PG
*,pistate
>::iterator i
= pis
.begin(); i
!= pis
.end(); ++i
) {
3879 pistate
& p
= i
->second
;
3881 if (cur_epoch
< p
.start
|| cur_epoch
> p
.end
)
3884 vector
<int> acting
, up
;
3887 pg_t pgid
= pg
->info
.pgid
.pgid
;
3888 if (p
.same_interval_since
&& last_map
->get_pools().count(pgid
.pool()))
3889 pgid
= pgid
.get_ancestor(last_map
->get_pg_num(pgid
.pool()));
3890 cur_map
->pg_to_up_acting_osds(
3891 pgid
, &up
, &up_primary
, &acting
, &primary
);
3893 if (p
.same_interval_since
== 0) {
3894 dout(10) << __func__
<< " epoch " << cur_epoch
<< " pg " << pg
->info
.pgid
3895 << " first map, acting " << acting
3896 << " up " << up
<< ", same_interval_since = " << cur_epoch
<< dendl
;
3897 p
.same_interval_since
= cur_epoch
;
3899 p
.old_acting
= acting
;
3900 p
.primary
= primary
;
3901 p
.up_primary
= up_primary
;
3906 boost::scoped_ptr
<IsPGRecoverablePredicate
> recoverable(
3907 pg
->get_is_recoverable_predicate());
3908 std::stringstream debug
;
3909 bool new_interval
= PastIntervals::check_new_interval(
3912 p
.old_acting
, acting
,
3916 p
.same_interval_since
,
3917 pg
->info
.history
.last_epoch_clean
,
3921 &pg
->past_intervals
,
3924 dout(10) << __func__
<< " epoch " << cur_epoch
<< " pg " << pg
->info
.pgid
3925 << " " << debug
.str() << dendl
;
3927 p
.old_acting
= acting
;
3928 p
.primary
= primary
;
3929 p
.up_primary
= up_primary
;
3930 p
.same_interval_since
= cur_epoch
;
3935 // Now that past_intervals have been recomputed let's fix the same_interval_since
3936 // if it was cleared by import.
3937 for (map
<PG
*,pistate
>::iterator i
= pis
.begin(); i
!= pis
.end(); ++i
) {
3939 pistate
& p
= i
->second
;
3941 if (pg
->info
.history
.same_interval_since
== 0) {
3942 assert(p
.same_interval_since
);
3943 dout(10) << __func__
<< " fix same_interval_since " << p
.same_interval_since
<< " pg " << *pg
<< dendl
;
3944 dout(10) << __func__
<< " past_intervals " << pg
->past_intervals
<< dendl
;
3946 pg
->info
.history
.same_interval_since
= p
.same_interval_since
;
3950 // write info only at the end. this is necessary because we check
3951 // whether the past_intervals go far enough back or forward in time,
3952 // but we don't check for holes. we could avoid it by discarding
3953 // the previous past_intervals and rebuilding from scratch, or we
3954 // can just do this and commit all our work at the end.
3955 ObjectStore::Transaction t
;
3957 for (map
<PG
*,pistate
>::iterator i
= pis
.begin(); i
!= pis
.end(); ++i
) {
3960 pg
->dirty_big_info
= true;
3961 pg
->dirty_info
= true;
3962 pg
->write_if_dirty(t
);
3965 // don't let the transaction get too big
3966 if (++num
>= cct
->_conf
->osd_target_transaction_size
) {
3967 store
->apply_transaction(service
.meta_osr
.get(), std::move(t
));
3968 t
= ObjectStore::Transaction();
3973 store
->apply_transaction(service
.meta_osr
.get(), std::move(t
));
/*
 * look up a pg.  if we have it, great.  if not, consider creating it IF the pg mapping
 * hasn't changed since the given epoch and we are the primary.
 */
3980 int OSD::handle_pg_peering_evt(
3982 const pg_history_t
& orig_history
,
3983 const PastIntervals
& pi
,
3985 PG::CephPeeringEvtRef evt
)
3987 if (service
.splitting(pgid
)) {
3988 peering_wait_for_split
[pgid
].push_back(evt
);
3992 PG
*pg
= _lookup_lock_pg(pgid
);
3995 if (!osdmap
->have_pg_pool(pgid
.pool()))
3997 int up_primary
, acting_primary
;
3998 vector
<int> up
, acting
;
3999 osdmap
->pg_to_up_acting_osds(
4000 pgid
.pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
4002 pg_history_t history
= orig_history
;
4003 bool valid_history
= project_pg_history(
4004 pgid
, history
, epoch
, up
, up_primary
, acting
, acting_primary
);
4006 if (!valid_history
|| epoch
< history
.same_interval_since
) {
4007 dout(10) << __func__
<< pgid
<< " acting changed in "
4008 << history
.same_interval_since
<< " (msg from " << epoch
<< ")"
4013 if (service
.splitting(pgid
)) {
4017 // do we need to resurrect a deleting pg?
4020 res_result result
= _try_resurrect_pg(
4021 service
.get_osdmap(),
4026 PG::RecoveryCtx rctx
= create_context();
4029 const pg_pool_t
* pp
= osdmap
->get_pg_pool(pgid
.pool());
4030 if (pp
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
) &&
4031 store
->get_type() != "bluestore") {
4032 clog
->warn() << "pg " << pgid
4033 << " is at risk of silent data corruption: "
4034 << "the pool allows ec overwrites but is not stored in "
4035 << "bluestore, so deep scrubbing will not detect bitrot";
4037 PG::_create(*rctx
.transaction
, pgid
, pgid
.get_split_bits(pp
->get_pg_num()));
4038 PG::_init(*rctx
.transaction
, pgid
, pp
);
4040 int role
= osdmap
->calc_pg_role(whoami
, acting
, acting
.size());
4041 if (!pp
->is_replicated() && role
!= pgid
.shard
)
4044 pg
= _create_lock_pg(
4049 acting
, acting_primary
,
4052 pg
->handle_create(&rctx
);
4053 pg
->write_if_dirty(*rctx
.transaction
);
4054 dispatch_context(rctx
, pg
, osdmap
);
4056 dout(10) << *pg
<< " is new" << dendl
;
4058 pg
->queue_peering_event(evt
);
4059 wake_pg_waiters(pg
);
4064 old_pg_state
->lock();
4065 OSDMapRef old_osd_map
= old_pg_state
->get_osdmap();
4066 int old_role
= old_pg_state
->role
;
4067 vector
<int> old_up
= old_pg_state
->up
;
4068 int old_up_primary
= old_pg_state
->up_primary
.osd
;
4069 vector
<int> old_acting
= old_pg_state
->acting
;
4070 int old_primary
= old_pg_state
->primary
.osd
;
4071 pg_history_t old_history
= old_pg_state
->info
.history
;
4072 PastIntervals old_past_intervals
= old_pg_state
->past_intervals
;
4073 old_pg_state
->unlock();
4074 pg
= _create_lock_pg(
4087 pg
->handle_create(&rctx
);
4088 pg
->write_if_dirty(*rctx
.transaction
);
4089 dispatch_context(rctx
, pg
, osdmap
);
4091 dout(10) << *pg
<< " is new (resurrected)" << dendl
;
4093 pg
->queue_peering_event(evt
);
4094 wake_pg_waiters(pg
);
4099 assert(old_pg_state
);
4100 old_pg_state
->lock();
4101 OSDMapRef old_osd_map
= old_pg_state
->get_osdmap();
4102 int old_role
= old_pg_state
->role
;
4103 vector
<int> old_up
= old_pg_state
->up
;
4104 int old_up_primary
= old_pg_state
->up_primary
.osd
;
4105 vector
<int> old_acting
= old_pg_state
->acting
;
4106 int old_primary
= old_pg_state
->primary
.osd
;
4107 pg_history_t old_history
= old_pg_state
->info
.history
;
4108 PastIntervals old_past_intervals
= old_pg_state
->past_intervals
;
4109 old_pg_state
->unlock();
4110 PG
*parent
= _create_lock_pg(
4124 parent
->handle_create(&rctx
);
4125 parent
->write_if_dirty(*rctx
.transaction
);
4126 dispatch_context(rctx
, parent
, osdmap
);
4128 dout(10) << *parent
<< " is new" << dendl
;
4130 assert(service
.splitting(pgid
));
4131 peering_wait_for_split
[pgid
].push_back(evt
);
4133 //parent->queue_peering_event(evt);
4134 parent
->queue_null(osdmap
->get_epoch(), osdmap
->get_epoch());
4135 wake_pg_waiters(parent
);
4144 // already had it. did the mapping change?
4145 if (epoch
< pg
->info
.history
.same_interval_since
) {
4146 dout(10) << *pg
<< __func__
<< " acting changed in "
4147 << pg
->info
.history
.same_interval_since
4148 << " (msg from " << epoch
<< ")" << dendl
;
4150 pg
->queue_peering_event(evt
);
4158 void OSD::build_initial_pg_history(
4161 utime_t created_stamp
,
4165 dout(10) << __func__
<< " " << pgid
<< " created " << created
<< dendl
;
4166 h
->epoch_created
= created
;
4167 h
->same_interval_since
= created
;
4168 h
->same_up_since
= created
;
4169 h
->same_primary_since
= created
;
4170 h
->last_scrub_stamp
= created_stamp
;
4171 h
->last_deep_scrub_stamp
= created_stamp
;
4172 h
->last_clean_scrub_stamp
= created_stamp
;
4174 OSDMapRef lastmap
= service
.get_map(created
);
4175 int up_primary
, acting_primary
;
4176 vector
<int> up
, acting
;
4177 lastmap
->pg_to_up_acting_osds(
4178 pgid
.pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
4180 ostringstream debug
;
4181 for (epoch_t e
= created
+ 1; e
<= osdmap
->get_epoch(); ++e
) {
4182 OSDMapRef osdmap
= service
.get_map(e
);
4183 int new_up_primary
, new_acting_primary
;
4184 vector
<int> new_up
, new_acting
;
4185 osdmap
->pg_to_up_acting_osds(
4186 pgid
.pgid
, &new_up
, &new_up_primary
, &new_acting
, &new_acting_primary
);
4188 // this is a bit imprecise, but sufficient?
4189 struct min_size_predicate_t
: public IsPGRecoverablePredicate
{
4190 const pg_pool_t
*pi
;
4191 bool operator()(const set
<pg_shard_t
> &have
) const {
4192 return have
.size() >= pi
->min_size
;
4194 min_size_predicate_t(const pg_pool_t
*i
) : pi(i
) {}
4195 } min_size_predicate(osdmap
->get_pg_pool(pgid
.pgid
.pool()));
4197 bool new_interval
= PastIntervals::check_new_interval(
4204 h
->same_interval_since
,
4205 h
->last_epoch_clean
,
4209 &min_size_predicate
,
4213 h
->same_interval_since
= e
;
4216 h
->same_up_since
= e
;
4218 if (acting_primary
!= new_acting_primary
) {
4219 h
->same_primary_since
= e
;
4223 dout(20) << __func__
<< " " << debug
.str() << dendl
;
4224 dout(10) << __func__
<< " " << *h
<< " " << *pi
4225 << " [" << (pi
->empty() ? pair
<epoch_t
,epoch_t
>(0,0) :
4226 pi
->get_bounds()) << ")"
/**
 * Fill in the passed history so you know same_interval_since, same_up_since,
 * and same_primary_since.
 */
4234 bool OSD::project_pg_history(spg_t pgid
, pg_history_t
& h
, epoch_t from
,
4235 const vector
<int>& currentup
,
4236 int currentupprimary
,
4237 const vector
<int>& currentacting
,
4238 int currentactingprimary
)
4240 dout(15) << "project_pg_history " << pgid
4241 << " from " << from
<< " to " << osdmap
->get_epoch()
4246 for (e
= osdmap
->get_epoch();
4249 // verify during intermediate epoch (e-1)
4250 OSDMapRef oldmap
= service
.try_get_map(e
-1);
4252 dout(15) << __func__
<< ": found map gap, returning false" << dendl
;
4255 assert(oldmap
->have_pg_pool(pgid
.pool()));
4257 int upprimary
, actingprimary
;
4258 vector
<int> up
, acting
;
4259 oldmap
->pg_to_up_acting_osds(
4266 // acting set change?
4267 if ((actingprimary
!= currentactingprimary
||
4268 upprimary
!= currentupprimary
||
4269 acting
!= currentacting
||
4270 up
!= currentup
) && e
> h
.same_interval_since
) {
4271 dout(15) << "project_pg_history " << pgid
<< " acting|up changed in " << e
4272 << " from " << acting
<< "/" << up
4273 << " " << actingprimary
<< "/" << upprimary
4274 << " -> " << currentacting
<< "/" << currentup
4275 << " " << currentactingprimary
<< "/" << currentupprimary
4277 h
.same_interval_since
= e
;
4280 if (pgid
.is_split(oldmap
->get_pg_num(pgid
.pool()),
4281 osdmap
->get_pg_num(pgid
.pool()),
4282 0) && e
> h
.same_interval_since
) {
4283 h
.same_interval_since
= e
;
4286 if ((up
!= currentup
|| upprimary
!= currentupprimary
)
4287 && e
> h
.same_up_since
) {
4288 dout(15) << "project_pg_history " << pgid
<< " up changed in " << e
4289 << " from " << up
<< " " << upprimary
4290 << " -> " << currentup
<< " " << currentupprimary
<< dendl
;
4291 h
.same_up_since
= e
;
4295 if (OSDMap::primary_changed(
4298 currentactingprimary
,
4300 e
> h
.same_primary_since
) {
4301 dout(15) << "project_pg_history " << pgid
<< " primary changed in " << e
<< dendl
;
4302 h
.same_primary_since
= e
;
4305 if (h
.same_interval_since
>= e
&& h
.same_up_since
>= e
&& h
.same_primary_since
>= e
)
4309 // base case: these floors should be the creation epoch if we didn't
4310 // find any changes.
4311 if (e
== h
.epoch_created
) {
4312 if (!h
.same_interval_since
)
4313 h
.same_interval_since
= e
;
4314 if (!h
.same_up_since
)
4315 h
.same_up_since
= e
;
4316 if (!h
.same_primary_since
)
4317 h
.same_primary_since
= e
;
4320 dout(15) << "project_pg_history end " << h
<< dendl
;
4326 void OSD::_add_heartbeat_peer(int p
)
4332 map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.find(p
);
4333 if (i
== heartbeat_peers
.end()) {
4334 pair
<ConnectionRef
,ConnectionRef
> cons
= service
.get_con_osd_hb(p
, osdmap
->get_epoch());
4337 hi
= &heartbeat_peers
[p
];
4339 HeartbeatSession
*s
= new HeartbeatSession(p
);
4340 hi
->con_back
= cons
.first
.get();
4341 hi
->con_back
->set_priv(s
->get());
4343 hi
->con_front
= cons
.second
.get();
4344 hi
->con_front
->set_priv(s
->get());
4345 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4346 << " " << hi
->con_back
->get_peer_addr()
4347 << " " << hi
->con_front
->get_peer_addr()
4350 hi
->con_front
.reset(NULL
);
4351 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4352 << " " << hi
->con_back
->get_peer_addr()
4359 hi
->epoch
= osdmap
->get_epoch();
4362 void OSD::_remove_heartbeat_peer(int n
)
4364 map
<int,HeartbeatInfo
>::iterator q
= heartbeat_peers
.find(n
);
4365 assert(q
!= heartbeat_peers
.end());
4366 dout(20) << " removing heartbeat peer osd." << n
4367 << " " << q
->second
.con_back
->get_peer_addr()
4368 << " " << (q
->second
.con_front
? q
->second
.con_front
->get_peer_addr() : entity_addr_t())
4370 q
->second
.con_back
->mark_down();
4371 if (q
->second
.con_front
) {
4372 q
->second
.con_front
->mark_down();
4374 heartbeat_peers
.erase(q
);
4377 void OSD::need_heartbeat_peer_update()
4381 dout(20) << "need_heartbeat_peer_update" << dendl
;
4382 heartbeat_set_peers_need_update();
4385 void OSD::maybe_update_heartbeat_peers()
4387 assert(osd_lock
.is_locked());
4389 if (is_waiting_for_healthy()) {
4390 utime_t now
= ceph_clock_now();
4391 if (last_heartbeat_resample
== utime_t()) {
4392 last_heartbeat_resample
= now
;
4393 heartbeat_set_peers_need_update();
4394 } else if (!heartbeat_peers_need_update()) {
4395 utime_t dur
= now
- last_heartbeat_resample
;
4396 if (dur
> cct
->_conf
->osd_heartbeat_grace
) {
4397 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur
<< " seconds" << dendl
;
4398 heartbeat_set_peers_need_update();
4399 last_heartbeat_resample
= now
;
4400 reset_heartbeat_peers(); // we want *new* peers!
4405 if (!heartbeat_peers_need_update())
4407 heartbeat_clear_peers_need_update();
4409 Mutex::Locker
l(heartbeat_lock
);
4411 dout(10) << "maybe_update_heartbeat_peers updating" << dendl
;
4414 // build heartbeat from set
4416 RWLock::RLocker
l(pg_map_lock
);
4417 for (ceph::unordered_map
<spg_t
, PG
*>::iterator i
= pg_map
.begin();
4421 pg
->heartbeat_peer_lock
.Lock();
4422 dout(20) << i
->first
<< " heartbeat_peers " << pg
->heartbeat_peers
<< dendl
;
4423 for (set
<int>::iterator p
= pg
->heartbeat_peers
.begin();
4424 p
!= pg
->heartbeat_peers
.end();
4426 if (osdmap
->is_up(*p
))
4427 _add_heartbeat_peer(*p
);
4428 for (set
<int>::iterator p
= pg
->probe_targets
.begin();
4429 p
!= pg
->probe_targets
.end();
4431 if (osdmap
->is_up(*p
))
4432 _add_heartbeat_peer(*p
);
4433 pg
->heartbeat_peer_lock
.Unlock();
4437 // include next and previous up osds to ensure we have a fully-connected set
4438 set
<int> want
, extras
;
4439 int next
= osdmap
->get_next_up_osd_after(whoami
);
4442 int prev
= osdmap
->get_previous_up_osd_before(whoami
);
4443 if (prev
>= 0 && prev
!= next
)
4446 for (set
<int>::iterator p
= want
.begin(); p
!= want
.end(); ++p
) {
4447 dout(10) << " adding neighbor peer osd." << *p
<< dendl
;
4449 _add_heartbeat_peer(*p
);
4452 // remove down peers; enumerate extras
4453 map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
4454 while (p
!= heartbeat_peers
.end()) {
4455 if (!osdmap
->is_up(p
->first
)) {
4458 _remove_heartbeat_peer(o
);
4461 if (p
->second
.epoch
< osdmap
->get_epoch()) {
4462 extras
.insert(p
->first
);
4468 int start
= osdmap
->get_next_up_osd_after(whoami
);
4469 for (int n
= start
; n
>= 0; ) {
4470 if ((int)heartbeat_peers
.size() >= cct
->_conf
->osd_heartbeat_min_peers
)
4472 if (!extras
.count(n
) && !want
.count(n
) && n
!= whoami
) {
4473 dout(10) << " adding random peer osd." << n
<< dendl
;
4475 _add_heartbeat_peer(n
);
4477 n
= osdmap
->get_next_up_osd_after(n
);
4479 break; // came full circle; stop
4483 for (set
<int>::iterator p
= extras
.begin();
4484 (int)heartbeat_peers
.size() > cct
->_conf
->osd_heartbeat_min_peers
&& p
!= extras
.end();
4488 _remove_heartbeat_peer(*p
);
4491 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers
.size() << " peers, extras " << extras
<< dendl
;
4494 void OSD::reset_heartbeat_peers()
4496 assert(osd_lock
.is_locked());
4497 dout(10) << "reset_heartbeat_peers" << dendl
;
4498 Mutex::Locker
l(heartbeat_lock
);
4499 while (!heartbeat_peers
.empty()) {
4500 HeartbeatInfo
& hi
= heartbeat_peers
.begin()->second
;
4501 hi
.con_back
->mark_down();
4503 hi
.con_front
->mark_down();
4505 heartbeat_peers
.erase(heartbeat_peers
.begin());
4507 failure_queue
.clear();
4510 void OSD::handle_osd_ping(MOSDPing
*m
)
4512 if (superblock
.cluster_fsid
!= m
->fsid
) {
4513 dout(20) << "handle_osd_ping from " << m
->get_source_inst()
4514 << " bad fsid " << m
->fsid
<< " != " << superblock
.cluster_fsid
<< dendl
;
4519 int from
= m
->get_source().num();
4521 heartbeat_lock
.Lock();
4522 if (is_stopping()) {
4523 heartbeat_lock
.Unlock();
4528 OSDMapRef curmap
= service
.get_osdmap();
4533 case MOSDPing::PING
:
4535 if (cct
->_conf
->osd_debug_drop_ping_probability
> 0) {
4536 auto heartbeat_drop
= debug_heartbeat_drops_remaining
.find(from
);
4537 if (heartbeat_drop
!= debug_heartbeat_drops_remaining
.end()) {
4538 if (heartbeat_drop
->second
== 0) {
4539 debug_heartbeat_drops_remaining
.erase(heartbeat_drop
);
4541 --heartbeat_drop
->second
;
4542 dout(5) << "Dropping heartbeat from " << from
4543 << ", " << heartbeat_drop
->second
4544 << " remaining to drop" << dendl
;
4547 } else if (cct
->_conf
->osd_debug_drop_ping_probability
>
4548 ((((double)(rand()%100))/100.0))) {
4550 debug_heartbeat_drops_remaining
.insert(std::make_pair(from
,
4551 cct
->_conf
->osd_debug_drop_ping_duration
)).first
;
4552 dout(5) << "Dropping heartbeat from " << from
4553 << ", " << heartbeat_drop
->second
4554 << " remaining to drop" << dendl
;
4559 if (!cct
->get_heartbeat_map()->is_healthy()) {
4560 dout(10) << "internal heartbeat not healthy, dropping ping request" << dendl
;
4564 Message
*r
= new MOSDPing(monc
->get_fsid(),
4565 curmap
->get_epoch(),
4566 MOSDPing::PING_REPLY
,
4568 m
->get_connection()->send_message(r
);
4570 if (curmap
->is_up(from
)) {
4571 service
.note_peer_epoch(from
, m
->map_epoch
);
4573 ConnectionRef con
= service
.get_con_osd_cluster(from
, curmap
->get_epoch());
4575 service
.share_map_peer(from
, con
.get());
4578 } else if (!curmap
->exists(from
) ||
4579 curmap
->get_down_at(from
) > m
->map_epoch
) {
4580 // tell them they have died
4581 Message
*r
= new MOSDPing(monc
->get_fsid(),
4582 curmap
->get_epoch(),
4585 m
->get_connection()->send_message(r
);
4590 case MOSDPing::PING_REPLY
:
4592 map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.find(from
);
4593 if (i
!= heartbeat_peers
.end()) {
4594 if (m
->get_connection() == i
->second
.con_back
) {
4595 dout(25) << "handle_osd_ping got reply from osd." << from
4596 << " first_tx " << i
->second
.first_tx
4597 << " last_tx " << i
->second
.last_tx
4598 << " last_rx_back " << i
->second
.last_rx_back
<< " -> " << m
->stamp
4599 << " last_rx_front " << i
->second
.last_rx_front
4601 i
->second
.last_rx_back
= m
->stamp
;
4602 // if there is no front con, set both stamps.
4603 if (i
->second
.con_front
== NULL
)
4604 i
->second
.last_rx_front
= m
->stamp
;
4605 } else if (m
->get_connection() == i
->second
.con_front
) {
4606 dout(25) << "handle_osd_ping got reply from osd." << from
4607 << " first_tx " << i
->second
.first_tx
4608 << " last_tx " << i
->second
.last_tx
4609 << " last_rx_back " << i
->second
.last_rx_back
4610 << " last_rx_front " << i
->second
.last_rx_front
<< " -> " << m
->stamp
4612 i
->second
.last_rx_front
= m
->stamp
;
4615 utime_t cutoff
= ceph_clock_now();
4616 cutoff
-= cct
->_conf
->osd_heartbeat_grace
;
4617 if (i
->second
.is_healthy(cutoff
)) {
4618 // Cancel false reports
4619 auto failure_queue_entry
= failure_queue
.find(from
);
4620 if (failure_queue_entry
!= failure_queue
.end()) {
4621 dout(10) << "handle_osd_ping canceling queued "
4622 << "failure report for osd." << from
<< dendl
;
4623 failure_queue
.erase(failure_queue_entry
);
4626 auto failure_pending_entry
= failure_pending
.find(from
);
4627 if (failure_pending_entry
!= failure_pending
.end()) {
4628 dout(10) << "handle_osd_ping canceling in-flight "
4629 << "failure report for osd." << from
<< dendl
;
4630 send_still_alive(curmap
->get_epoch(),
4631 failure_pending_entry
->second
.second
);
4632 failure_pending
.erase(failure_pending_entry
);
4638 curmap
->is_up(from
)) {
4639 service
.note_peer_epoch(from
, m
->map_epoch
);
4641 ConnectionRef con
= service
.get_con_osd_cluster(from
, curmap
->get_epoch());
4643 service
.share_map_peer(from
, con
.get());
4650 case MOSDPing::YOU_DIED
:
4651 dout(10) << "handle_osd_ping " << m
->get_source_inst()
4652 << " says i am down in " << m
->map_epoch
<< dendl
;
4653 osdmap_subscribe(curmap
->get_epoch()+1, false);
4657 heartbeat_lock
.Unlock();
4661 void OSD::heartbeat_entry()
4663 Mutex::Locker
l(heartbeat_lock
);
4666 while (!heartbeat_stop
) {
4669 double wait
= .5 + ((float)(rand() % 10)/10.0) * (float)cct
->_conf
->osd_heartbeat_interval
;
4671 w
.set_from_double(wait
);
4672 dout(30) << "heartbeat_entry sleeping for " << wait
<< dendl
;
4673 heartbeat_cond
.WaitInterval(heartbeat_lock
, w
);
4676 dout(30) << "heartbeat_entry woke up" << dendl
;
4680 void OSD::heartbeat_check()
4682 assert(heartbeat_lock
.is_locked());
4683 utime_t now
= ceph_clock_now();
4685 // check for heartbeat replies (move me elsewhere?)
4686 utime_t cutoff
= now
;
4687 cutoff
-= cct
->_conf
->osd_heartbeat_grace
;
4688 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
4689 p
!= heartbeat_peers
.end();
4692 if (p
->second
.first_tx
== utime_t()) {
4693 dout(25) << "heartbeat_check we haven't sent ping to osd." << p
->first
4694 << "yet, skipping" << dendl
;
4698 dout(25) << "heartbeat_check osd." << p
->first
4699 << " first_tx " << p
->second
.first_tx
4700 << " last_tx " << p
->second
.last_tx
4701 << " last_rx_back " << p
->second
.last_rx_back
4702 << " last_rx_front " << p
->second
.last_rx_front
4704 if (p
->second
.is_unhealthy(cutoff
)) {
4705 if (p
->second
.last_rx_back
== utime_t() ||
4706 p
->second
.last_rx_front
== utime_t()) {
4707 derr
<< "heartbeat_check: no reply from " << p
->second
.con_front
->get_peer_addr().get_sockaddr()
4708 << " osd." << p
->first
<< " ever on either front or back, first ping sent "
4709 << p
->second
.first_tx
<< " (cutoff " << cutoff
<< ")" << dendl
;
4711 failure_queue
[p
->first
] = p
->second
.last_tx
;
4713 derr
<< "heartbeat_check: no reply from " << p
->second
.con_front
->get_peer_addr().get_sockaddr()
4714 << " osd." << p
->first
<< " since back " << p
->second
.last_rx_back
4715 << " front " << p
->second
.last_rx_front
4716 << " (cutoff " << cutoff
<< ")" << dendl
;
4718 failure_queue
[p
->first
] = MIN(p
->second
.last_rx_back
, p
->second
.last_rx_front
);
4724 void OSD::heartbeat()
4726 dout(30) << "heartbeat" << dendl
;
4730 int n_samples
= 86400 / cct
->_conf
->osd_heartbeat_interval
;
4731 if (getloadavg(loadavgs
, 1) == 1) {
4732 logger
->set(l_osd_loadavg
, 100 * loadavgs
[0]);
4733 daily_loadavg
= (daily_loadavg
* (n_samples
- 1) + loadavgs
[0]) / n_samples
;
4734 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg
<< dendl
;
4737 dout(30) << "heartbeat checking stats" << dendl
;
4740 vector
<int> hb_peers
;
4741 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
4742 p
!= heartbeat_peers
.end();
4744 hb_peers
.push_back(p
->first
);
4745 service
.update_osd_stat(hb_peers
);
4747 dout(5) << "heartbeat: " << service
.get_osd_stat() << dendl
;
4749 utime_t now
= ceph_clock_now();
4752 for (map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.begin();
4753 i
!= heartbeat_peers
.end();
4755 int peer
= i
->first
;
4756 i
->second
.last_tx
= now
;
4757 if (i
->second
.first_tx
== utime_t())
4758 i
->second
.first_tx
= now
;
4759 dout(30) << "heartbeat sending ping to osd." << peer
<< dendl
;
4760 i
->second
.con_back
->send_message(new MOSDPing(monc
->get_fsid(),
4761 service
.get_osdmap()->get_epoch(),
4765 if (i
->second
.con_front
)
4766 i
->second
.con_front
->send_message(new MOSDPing(monc
->get_fsid(),
4767 service
.get_osdmap()->get_epoch(),
4772 logger
->set(l_osd_hb_to
, heartbeat_peers
.size());
4774 // hmm.. am i all alone?
4775 dout(30) << "heartbeat lonely?" << dendl
;
4776 if (heartbeat_peers
.empty()) {
4777 if (now
- last_mon_heartbeat
> cct
->_conf
->osd_mon_heartbeat_interval
&& is_active()) {
4778 last_mon_heartbeat
= now
;
4779 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl
;
4780 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
4784 dout(30) << "heartbeat done" << dendl
;
4787 bool OSD::heartbeat_reset(Connection
*con
)
4789 HeartbeatSession
*s
= static_cast<HeartbeatSession
*>(con
->get_priv());
4791 heartbeat_lock
.Lock();
4792 if (is_stopping()) {
4793 heartbeat_lock
.Unlock();
4797 map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.find(s
->peer
);
4798 if (p
!= heartbeat_peers
.end() &&
4799 (p
->second
.con_back
== con
||
4800 p
->second
.con_front
== con
)) {
4801 dout(10) << "heartbeat_reset failed hb con " << con
<< " for osd." << p
->second
.peer
4802 << ", reopening" << dendl
;
4803 if (con
!= p
->second
.con_back
) {
4804 p
->second
.con_back
->mark_down();
4806 p
->second
.con_back
.reset(NULL
);
4807 if (p
->second
.con_front
&& con
!= p
->second
.con_front
) {
4808 p
->second
.con_front
->mark_down();
4810 p
->second
.con_front
.reset(NULL
);
4811 pair
<ConnectionRef
,ConnectionRef
> newcon
= service
.get_con_osd_hb(p
->second
.peer
, p
->second
.epoch
);
4813 p
->second
.con_back
= newcon
.first
.get();
4814 p
->second
.con_back
->set_priv(s
->get());
4815 if (newcon
.second
) {
4816 p
->second
.con_front
= newcon
.second
.get();
4817 p
->second
.con_front
->set_priv(s
->get());
4820 dout(10) << "heartbeat_reset failed hb con " << con
<< " for osd." << p
->second
.peer
4821 << ", raced with osdmap update, closing out peer" << dendl
;
4822 heartbeat_peers
.erase(p
);
4825 dout(10) << "heartbeat_reset closing (old) failed hb con " << con
<< dendl
;
4827 heartbeat_lock
.Unlock();
4835 // =========================================
4839 assert(osd_lock
.is_locked());
4840 dout(10) << "tick" << dendl
;
4842 if (is_active() || is_waiting_for_healthy()) {
4843 maybe_update_heartbeat_peers();
4846 if (is_waiting_for_healthy()) {
4852 tick_timer
.add_event_after(OSD_TICK_INTERVAL
, new C_Tick(this));
4855 const auto now
= ceph::coarse_mono_clock::now();
4856 const auto elapsed
= now
- last_sent_beacon
;
4857 if (chrono::duration_cast
<chrono::seconds
>(elapsed
).count() >
4858 cct
->_conf
->osd_beacon_report_interval
) {
4864 void OSD::tick_without_osd_lock()
4866 assert(tick_timer_lock
.is_locked());
4867 dout(10) << "tick_without_osd_lock" << dendl
;
4869 logger
->set(l_osd_buf
, buffer::get_total_alloc());
4870 logger
->set(l_osd_history_alloc_bytes
, SHIFT_ROUND_UP(buffer::get_history_alloc_bytes(), 20));
4871 logger
->set(l_osd_history_alloc_num
, buffer::get_history_alloc_num());
4872 logger
->set(l_osd_cached_crc
, buffer::get_cached_crc());
4873 logger
->set(l_osd_cached_crc_adjusted
, buffer::get_cached_crc_adjusted());
4874 logger
->set(l_osd_missed_crc
, buffer::get_missed_crc());
4876 // osd_lock is not being held, which means the OSD state
4877 // might change when doing the monitor report
4878 if (is_active() || is_waiting_for_healthy()) {
4879 heartbeat_lock
.Lock();
4881 heartbeat_lock
.Unlock();
4883 map_lock
.get_read();
4884 Mutex::Locker
l(mon_report_lock
);
4888 bool report
= false;
4889 utime_t now
= ceph_clock_now();
4890 pg_stat_queue_lock
.Lock();
4891 double backoff
= stats_ack_timeout
/ cct
->_conf
->osd_mon_ack_timeout
;
4892 double adjusted_min
= cct
->_conf
->osd_mon_report_interval_min
* backoff
;
4893 // note: we shouldn't adjust max because it must remain < the
4894 // mon's mon_osd_report_timeout (which defaults to 1.5x our
4896 double max
= cct
->_conf
->osd_mon_report_interval_max
;
4897 if (!outstanding_pg_stats
.empty() &&
4898 (now
- stats_ack_timeout
) > last_pg_stats_ack
) {
4899 dout(1) << __func__
<< " mon hasn't acked PGStats in "
4900 << now
- last_pg_stats_ack
4901 << " seconds, reconnecting elsewhere" << dendl
;
4903 last_pg_stats_ack
= now
; // reset clock
4904 last_pg_stats_sent
= utime_t();
4906 MAX(cct
->_conf
->osd_mon_ack_timeout
,
4907 stats_ack_timeout
* cct
->_conf
->osd_stats_ack_timeout_factor
);
4908 outstanding_pg_stats
.clear();
4910 if (now
- last_pg_stats_sent
> max
) {
4911 osd_stat_updated
= true;
4913 } else if (service
.need_fullness_update()) {
4915 } else if ((int)outstanding_pg_stats
.size() >=
4916 cct
->_conf
->osd_mon_report_max_in_flight
) {
4917 dout(20) << __func__
<< " have max " << outstanding_pg_stats
4918 << " stats updates in flight" << dendl
;
4920 if (now
- last_mon_report
> adjusted_min
) {
4921 dout(20) << __func__
<< " stats backoff " << backoff
4922 << " adjusted_min " << adjusted_min
<< " - sending report"
4924 osd_stat_updated
= true;
4928 pg_stat_queue_lock
.Unlock();
4931 monc
->reopen_session();
4932 } else if (report
) {
4933 last_mon_report
= now
;
4935 // do any pending reports
4940 map_lock
.put_read();
4944 if (!scrub_random_backoff()) {
4947 service
.promote_throttle_recalibrate();
4950 check_ops_in_flight();
4951 service
.kick_recovery_queue();
4952 tick_timer_without_osd_lock
.add_event_after(OSD_TICK_INTERVAL
, new C_Tick_WithoutOSDLock(this));
4955 void OSD::check_ops_in_flight()
4957 vector
<string
> warnings
;
4958 if (op_tracker
.check_ops_in_flight(warnings
)) {
4959 for (vector
<string
>::iterator i
= warnings
.begin();
4960 i
!= warnings
.end();
4968 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
4969 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
4970 // setomapheader <pool-id> [namespace/]<obj-name> <header>
4971 // getomap <pool> [namespace/]<obj-name>
4972 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
4973 // injectmdataerr [namespace/]<obj-name> [shardid]
4974 // injectdataerr [namespace/]<obj-name> [shardid]
4976 // set_recovery_delay [utime]
4977 void TestOpsSocketHook::test_ops(OSDService
*service
, ObjectStore
*store
,
4978 const std::string
&command
, cmdmap_t
& cmdmap
, ostream
&ss
)
4981 //Support changing the omap on a single osd by using the Admin Socket to
4982 //directly request the osd make a change.
4983 if (command
== "setomapval" || command
== "rmomapkey" ||
4984 command
== "setomapheader" || command
== "getomap" ||
4985 command
== "truncobj" || command
== "injectmdataerr" ||
4986 command
== "injectdataerr"
4990 OSDMapRef curmap
= service
->get_osdmap();
4995 cmd_getval(service
->cct
, cmdmap
, "pool", poolstr
);
4996 pool
= curmap
->lookup_pg_pool_name(poolstr
);
4997 //If we can't find it by name then maybe id specified
4998 if (pool
< 0 && isdigit(poolstr
[0]))
4999 pool
= atoll(poolstr
.c_str());
5001 ss
<< "Invalid pool" << poolstr
;
5005 string objname
, nspace
;
5006 cmd_getval(service
->cct
, cmdmap
, "objname", objname
);
5007 std::size_t found
= objname
.find_first_of('/');
5008 if (found
!= string::npos
) {
5009 nspace
= objname
.substr(0, found
);
5010 objname
= objname
.substr(found
+1);
5012 object_locator_t
oloc(pool
, nspace
);
5013 r
= curmap
->object_locator_to_pg(object_t(objname
), oloc
, rawpg
);
5016 ss
<< "Invalid namespace/objname";
5021 cmd_getval(service
->cct
, cmdmap
, "shardid", shardid
, int64_t(shard_id_t::NO_SHARD
));
5022 hobject_t
obj(object_t(objname
), string(""), CEPH_NOSNAP
, rawpg
.ps(), pool
, nspace
);
5023 ghobject_t
gobj(obj
, ghobject_t::NO_GEN
, shard_id_t(uint8_t(shardid
)));
5024 spg_t
pgid(curmap
->raw_pg_to_pg(rawpg
), shard_id_t(shardid
));
5025 if (curmap
->pg_is_ec(rawpg
)) {
5026 if ((command
!= "injectdataerr") && (command
!= "injectmdataerr")) {
5027 ss
<< "Must not call on ec pool, except injectdataerr or injectmdataerr";
5032 ObjectStore::Transaction t
;
5034 if (command
== "setomapval") {
5035 map
<string
, bufferlist
> newattrs
;
5038 cmd_getval(service
->cct
, cmdmap
, "key", key
);
5039 cmd_getval(service
->cct
, cmdmap
, "val", valstr
);
5042 newattrs
[key
] = val
;
5043 t
.omap_setkeys(coll_t(pgid
), ghobject_t(obj
), newattrs
);
5044 r
= store
->apply_transaction(service
->meta_osr
.get(), std::move(t
));
5046 ss
<< "error=" << r
;
5049 } else if (command
== "rmomapkey") {
5052 cmd_getval(service
->cct
, cmdmap
, "key", key
);
5055 t
.omap_rmkeys(coll_t(pgid
), ghobject_t(obj
), keys
);
5056 r
= store
->apply_transaction(service
->meta_osr
.get(), std::move(t
));
5058 ss
<< "error=" << r
;
5061 } else if (command
== "setomapheader") {
5062 bufferlist newheader
;
5065 cmd_getval(service
->cct
, cmdmap
, "header", headerstr
);
5066 newheader
.append(headerstr
);
5067 t
.omap_setheader(coll_t(pgid
), ghobject_t(obj
), newheader
);
5068 r
= store
->apply_transaction(service
->meta_osr
.get(), std::move(t
));
5070 ss
<< "error=" << r
;
5073 } else if (command
== "getomap") {
5074 //Debug: Output entire omap
5076 map
<string
, bufferlist
> keyvals
;
5077 r
= store
->omap_get(coll_t(pgid
), ghobject_t(obj
), &hdrbl
, &keyvals
);
5079 ss
<< "header=" << string(hdrbl
.c_str(), hdrbl
.length());
5080 for (map
<string
, bufferlist
>::iterator it
= keyvals
.begin();
5081 it
!= keyvals
.end(); ++it
)
5082 ss
<< " key=" << (*it
).first
<< " val="
5083 << string((*it
).second
.c_str(), (*it
).second
.length());
5085 ss
<< "error=" << r
;
5087 } else if (command
== "truncobj") {
5089 cmd_getval(service
->cct
, cmdmap
, "len", trunclen
);
5090 t
.truncate(coll_t(pgid
), ghobject_t(obj
), trunclen
);
5091 r
= store
->apply_transaction(service
->meta_osr
.get(), std::move(t
));
5093 ss
<< "error=" << r
;
5096 } else if (command
== "injectdataerr") {
5097 store
->inject_data_error(gobj
);
5099 } else if (command
== "injectmdataerr") {
5100 store
->inject_mdata_error(gobj
);
5105 if (command
== "set_recovery_delay") {
5107 cmd_getval(service
->cct
, cmdmap
, "utime", delay
, (int64_t)0);
5110 int r
= service
->cct
->_conf
->set_val("osd_recovery_delay_start",
5113 ss
<< "set_recovery_delay: error setting "
5114 << "osd_recovery_delay_start to '" << delay
<< "': error "
5118 service
->cct
->_conf
->apply_changes(NULL
);
5119 ss
<< "set_recovery_delay: set osd_recovery_delay_start "
5120 << "to " << service
->cct
->_conf
->osd_recovery_delay_start
;
5123 if (command
== "trigger_scrub") {
5125 OSDMapRef curmap
= service
->get_osdmap();
5129 cmd_getval(service
->cct
, cmdmap
, "pgid", pgidstr
);
5130 if (!pgid
.parse(pgidstr
.c_str())) {
5131 ss
<< "Invalid pgid specified";
5135 PG
*pg
= service
->osd
->_lookup_lock_pg(pgid
);
5136 if (pg
== nullptr) {
5137 ss
<< "Can't find pg " << pgid
;
5141 if (pg
->is_primary()) {
5142 pg
->unreg_next_scrub();
5143 const pg_pool_t
*p
= curmap
->get_pg_pool(pgid
.pool());
5144 double pool_scrub_max_interval
= 0;
5145 p
->opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &pool_scrub_max_interval
);
5146 double scrub_max_interval
= pool_scrub_max_interval
> 0 ?
5147 pool_scrub_max_interval
: g_conf
->osd_scrub_max_interval
;
5148 // Instead of marking must_scrub force a schedule scrub
5149 utime_t stamp
= ceph_clock_now();
5150 stamp
-= scrub_max_interval
;
5151 stamp
-= 100.0; // push back last scrub more for good measure
5152 pg
->info
.history
.last_scrub_stamp
= stamp
;
5153 pg
->reg_next_scrub();
5156 ss
<< "Not primary";
5161 if (command
== "injectfull") {
5164 OSDService::s_names state
;
5165 cmd_getval(service
->cct
, cmdmap
, "type", type
, string("full"));
5166 cmd_getval(service
->cct
, cmdmap
, "count", count
, (int64_t)-1);
5167 if (type
== "none" || count
== 0) {
5171 state
= service
->get_full_state(type
);
5172 if (state
== OSDService::s_names::INVALID
) {
5173 ss
<< "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
5176 service
->set_injectfull(state
, count
);
5179 ss
<< "Internal error - command=" << command
;
5182 // =========================================
5185 ObjectStore
*store
, SnapMapper
*mapper
,
5187 ObjectStore::Sequencer
*osr
,
5188 coll_t coll
, DeletingStateRef dstate
,
5190 ThreadPool::TPHandle
&handle
)
5192 vector
<ghobject_t
> olist
;
5194 ObjectStore::Transaction t
;
5196 handle
.reset_tp_timeout();
5197 store
->collection_list(
5200 ghobject_t::get_max(),
5201 store
->get_ideal_list_max(),
5204 generic_dout(10) << __func__
<< " " << olist
<< dendl
;
5205 // default cont to true, this is safe because caller(OSD::RemoveWQ::_process())
5206 // will recheck the answer before it really goes on.
5208 for (vector
<ghobject_t
>::iterator i
= olist
.begin();
5213 OSDriver::OSTransaction
_t(osdriver
->get_transaction(&t
));
5214 int r
= mapper
->remove_oid(i
->hobj
, &_t
);
5215 if (r
!= 0 && r
!= -ENOENT
) {
5219 if (++num
>= cct
->_conf
->osd_target_transaction_size
) {
5221 store
->queue_transaction(osr
, std::move(t
), &waiter
);
5222 cont
= dstate
->pause_clearing();
5223 handle
.suspend_tp_timeout();
5225 handle
.reset_tp_timeout();
5227 cont
= dstate
->resume_clearing();
5230 t
= ObjectStore::Transaction();
5236 store
->queue_transaction(osr
, std::move(t
), &waiter
);
5237 cont
= dstate
->pause_clearing();
5238 handle
.suspend_tp_timeout();
5240 handle
.reset_tp_timeout();
5242 cont
= dstate
->resume_clearing();
5244 // whether there are more objects to remove in the collection
5245 *finished
= next
.is_max();
5249 void OSD::RemoveWQ::_process(
5250 pair
<PGRef
, DeletingStateRef
> item
,
5251 ThreadPool::TPHandle
&handle
)
5254 PGRef
pg(item
.first
);
5255 SnapMapper
&mapper
= pg
->snap_mapper
;
5256 OSDriver
&driver
= pg
->osdriver
;
5257 coll_t coll
= coll_t(pg
->info
.pgid
);
5259 bool finished
= false;
5261 if (!item
.second
->start_or_resume_clearing())
5264 bool cont
= remove_dir(
5265 pg
->cct
, store
, &mapper
, &driver
, pg
->osr
.get(), coll
, item
.second
,
5270 if (item
.second
->pause_clearing())
5275 if (!item
.second
->start_deleting())
5278 ObjectStore::Transaction t
;
5279 PGLog::clear_info_log(pg
->info
.pgid
, &t
);
5281 if (cct
->_conf
->osd_inject_failure_on_pg_removal
) {
5282 generic_derr
<< "osd_inject_failure_on_pg_removal" << dendl
;
5285 t
.remove_collection(coll
);
5287 // We need the sequencer to stick around until the op is complete
5288 store
->queue_transaction(
5293 0, // onreadable sync
5294 new ContainerContext
<PGRef
>(pg
),
5297 item
.second
->finish_deleting();
5299 // =========================================
5301 void OSD::ms_handle_connect(Connection
*con
)
5303 dout(10) << __func__
<< " con " << con
<< dendl
;
5304 if (con
->get_peer_type() == CEPH_ENTITY_TYPE_MON
) {
5305 Mutex::Locker
l(osd_lock
);
5308 dout(10) << __func__
<< " on mon" << dendl
;
5312 } else if (is_booting()) {
5313 _send_boot(); // resend boot message
5315 map_lock
.get_read();
5316 Mutex::Locker
l2(mon_report_lock
);
5318 utime_t now
= ceph_clock_now();
5319 last_mon_report
= now
;
5321 // resend everything, it's a new session
5324 service
.requeue_pg_temp();
5325 service
.send_pg_temp();
5330 map_lock
.put_read();
5332 send_beacon(ceph::coarse_mono_clock::now());
5336 // full map requests may happen while active or pre-boot
5337 if (requested_full_first
) {
5338 rerequest_full_maps();
5343 void OSD::ms_handle_fast_connect(Connection
*con
)
5345 if (con
->get_peer_type() != CEPH_ENTITY_TYPE_MON
&&
5346 con
->get_peer_type() != CEPH_ENTITY_TYPE_MGR
) {
5347 Session
*s
= static_cast<Session
*>(con
->get_priv());
5349 s
= new Session(cct
);
5350 con
->set_priv(s
->get());
5352 dout(10) << " new session (outgoing) " << s
<< " con=" << s
->con
5353 << " addr=" << s
->con
->get_peer_addr() << dendl
;
5354 // we don't connect to clients
5355 assert(con
->get_peer_type() == CEPH_ENTITY_TYPE_OSD
);
5356 s
->entity_name
.set_type(CEPH_ENTITY_TYPE_OSD
);
5362 void OSD::ms_handle_fast_accept(Connection
*con
)
5364 if (con
->get_peer_type() != CEPH_ENTITY_TYPE_MON
&&
5365 con
->get_peer_type() != CEPH_ENTITY_TYPE_MGR
) {
5366 Session
*s
= static_cast<Session
*>(con
->get_priv());
5368 s
= new Session(cct
);
5369 con
->set_priv(s
->get());
5371 dout(10) << "new session (incoming)" << s
<< " con=" << con
5372 << " addr=" << con
->get_peer_addr()
5373 << " must have raced with connect" << dendl
;
5374 assert(con
->get_peer_type() == CEPH_ENTITY_TYPE_OSD
);
5375 s
->entity_name
.set_type(CEPH_ENTITY_TYPE_OSD
);
5381 bool OSD::ms_handle_reset(Connection
*con
)
5383 Session
*session
= static_cast<Session
*>(con
->get_priv());
5384 dout(2) << "ms_handle_reset con " << con
<< " session " << session
<< dendl
;
5387 session
->wstate
.reset(con
);
5388 session
->con
.reset(NULL
); // break con <-> session ref cycle
5389 // note that we break session->con *before* the session_handle_reset
5390 // cleanup below. this avoids a race between us and
5391 // PG::add_backoff, Session::check_backoff, etc.
5392 session_handle_reset(session
);
5397 bool OSD::ms_handle_refused(Connection
*con
)
5399 if (!cct
->_conf
->osd_fast_fail_on_connection_refused
)
5402 Session
*session
= static_cast<Session
*>(con
->get_priv());
5403 dout(2) << "ms_handle_refused con " << con
<< " session " << session
<< dendl
;
5406 int type
= con
->get_peer_type();
5407 // handle only OSD failures here
5408 if (monc
&& (type
== CEPH_ENTITY_TYPE_OSD
)) {
5409 OSDMapRef osdmap
= get_osdmap();
5411 int id
= osdmap
->identify_osd_on_all_channels(con
->get_peer_addr());
5412 if (id
>= 0 && osdmap
->is_up(id
)) {
5413 // I'm cheating mon heartbeat grace logic, because we know it's not going
5414 // to respawn alone. +1 so we won't hit any boundary case.
5415 monc
->send_mon_message(new MOSDFailure(monc
->get_fsid(),
5416 osdmap
->get_inst(id
),
5417 cct
->_conf
->osd_heartbeat_grace
+ 1,
5418 osdmap
->get_epoch(),
5419 MOSDFailure::FLAG_IMMEDIATE
| MOSDFailure::FLAG_FAILED
5428 struct C_OSD_GetVersion
: public Context
{
5430 uint64_t oldest
, newest
;
5431 explicit C_OSD_GetVersion(OSD
*o
) : osd(o
), oldest(0), newest(0) {}
5432 void finish(int r
) override
{
5434 osd
->_got_mon_epochs(oldest
, newest
);
5438 void OSD::start_boot()
5440 if (!_is_healthy()) {
5441 // if we are not healthy, do not mark ourselves up (yet)
5442 dout(1) << "not healthy; waiting to boot" << dendl
;
5443 if (!is_waiting_for_healthy())
5444 start_waiting_for_healthy();
5445 // send pings sooner rather than later
5449 dout(1) << __func__
<< dendl
;
5450 set_state(STATE_PREBOOT
);
5451 dout(10) << "start_boot - have maps " << superblock
.oldest_map
5452 << ".." << superblock
.newest_map
<< dendl
;
5453 C_OSD_GetVersion
*c
= new C_OSD_GetVersion(this);
5454 monc
->get_version("osdmap", &c
->newest
, &c
->oldest
, c
);
5457 void OSD::_got_mon_epochs(epoch_t oldest
, epoch_t newest
)
5459 Mutex::Locker
l(osd_lock
);
5461 _preboot(oldest
, newest
);
5465 void OSD::_preboot(epoch_t oldest
, epoch_t newest
)
5467 assert(is_preboot());
5468 dout(10) << __func__
<< " _preboot mon has osdmaps "
5469 << oldest
<< ".." << newest
<< dendl
;
5471 // ensure our local fullness awareness is accurate
5474 // if our map within recent history, try to add ourselves to the osdmap.
5475 if (osdmap
->test_flag(CEPH_OSDMAP_NOUP
)) {
5476 derr
<< "osdmap NOUP flag is set, waiting for it to clear" << dendl
;
5477 } else if (!osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
5478 derr
<< "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
5480 } else if (!osdmap
->test_flag(CEPH_OSDMAP_REQUIRE_JEWEL
)) {
5481 derr
<< "osdmap REQUIRE_JEWEL OSDMap flag is NOT set; please set it"
5483 } else if (!monc
->monmap
.get_required_features().contains_all(
5484 ceph::features::mon::FEATURE_LUMINOUS
)) {
5485 derr
<< "monmap REQUIRE_LUMINOUS is NOT set; must upgrade all monitors to "
5486 << "Luminous or later before Luminous OSDs will boot" << dendl
;
5487 } else if (service
.need_fullness_update()) {
5488 derr
<< "osdmap fullness state needs update" << dendl
;
5490 } else if (osdmap
->get_epoch() >= oldest
- 1 &&
5491 osdmap
->get_epoch() + cct
->_conf
->osd_map_message_max
> newest
) {
5496 // get all the latest maps
5497 if (osdmap
->get_epoch() + 1 >= oldest
)
5498 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
5500 osdmap_subscribe(oldest
- 1, true);
5503 void OSD::send_full_update()
5505 if (!service
.need_fullness_update())
5508 if (service
.is_full()) {
5509 state
= CEPH_OSD_FULL
;
5510 } else if (service
.is_backfillfull()) {
5511 state
= CEPH_OSD_BACKFILLFULL
;
5512 } else if (service
.is_nearfull()) {
5513 state
= CEPH_OSD_NEARFULL
;
5516 OSDMap::calc_state_set(state
, s
);
5517 dout(10) << __func__
<< " want state " << s
<< dendl
;
5518 monc
->send_mon_message(new MOSDFull(osdmap
->get_epoch(), state
));
5521 void OSD::start_waiting_for_healthy()
5523 dout(1) << "start_waiting_for_healthy" << dendl
;
5524 set_state(STATE_WAITING_FOR_HEALTHY
);
5525 last_heartbeat_resample
= utime_t();
5528 bool OSD::_is_healthy()
5530 if (!cct
->get_heartbeat_map()->is_healthy()) {
5531 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl
;
5535 if (is_waiting_for_healthy()) {
5536 Mutex::Locker
l(heartbeat_lock
);
5537 utime_t cutoff
= ceph_clock_now();
5538 cutoff
-= cct
->_conf
->osd_heartbeat_grace
;
5539 int num
= 0, up
= 0;
5540 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5541 p
!= heartbeat_peers
.end();
5543 if (p
->second
.is_healthy(cutoff
))
5547 if ((float)up
< (float)num
* cct
->_conf
->osd_heartbeat_min_healthy_ratio
) {
5548 dout(1) << "is_healthy false -- only " << up
<< "/" << num
<< " up peers (less than "
5549 << int(cct
->_conf
->osd_heartbeat_min_healthy_ratio
* 100.0) << "%)" << dendl
;
5557 void OSD::_send_boot()
5559 dout(10) << "_send_boot" << dendl
;
5560 entity_addr_t cluster_addr
= cluster_messenger
->get_myaddr();
5561 Connection
*local_connection
= cluster_messenger
->get_loopback_connection().get();
5562 if (cluster_addr
.is_blank_ip()) {
5563 int port
= cluster_addr
.get_port();
5564 cluster_addr
= client_messenger
->get_myaddr();
5565 cluster_addr
.set_port(port
);
5566 cluster_messenger
->set_addr_unknowns(cluster_addr
);
5567 dout(10) << " assuming cluster_addr ip matches client_addr" << dendl
;
5569 Session
*s
= static_cast<Session
*>(local_connection
->get_priv());
5573 cluster_messenger
->ms_deliver_handle_fast_connect(local_connection
);
5576 entity_addr_t hb_back_addr
= hb_back_server_messenger
->get_myaddr();
5577 local_connection
= hb_back_server_messenger
->get_loopback_connection().get();
5578 if (hb_back_addr
.is_blank_ip()) {
5579 int port
= hb_back_addr
.get_port();
5580 hb_back_addr
= cluster_addr
;
5581 hb_back_addr
.set_port(port
);
5582 hb_back_server_messenger
->set_addr_unknowns(hb_back_addr
);
5583 dout(10) << " assuming hb_back_addr ip matches cluster_addr" << dendl
;
5585 Session
*s
= static_cast<Session
*>(local_connection
->get_priv());
5589 hb_back_server_messenger
->ms_deliver_handle_fast_connect(local_connection
);
5592 entity_addr_t hb_front_addr
= hb_front_server_messenger
->get_myaddr();
5593 local_connection
= hb_front_server_messenger
->get_loopback_connection().get();
5594 if (hb_front_addr
.is_blank_ip()) {
5595 int port
= hb_front_addr
.get_port();
5596 hb_front_addr
= client_messenger
->get_myaddr();
5597 hb_front_addr
.set_port(port
);
5598 hb_front_server_messenger
->set_addr_unknowns(hb_front_addr
);
5599 dout(10) << " assuming hb_front_addr ip matches client_addr" << dendl
;
5601 Session
*s
= static_cast<Session
*>(local_connection
->get_priv());
5605 hb_front_server_messenger
->ms_deliver_handle_fast_connect(local_connection
);
5608 MOSDBoot
*mboot
= new MOSDBoot(superblock
, get_osdmap_epoch(), service
.get_boot_epoch(),
5609 hb_back_addr
, hb_front_addr
, cluster_addr
,
5611 dout(10) << " client_addr " << client_messenger
->get_myaddr()
5612 << ", cluster_addr " << cluster_addr
5613 << ", hb_back_addr " << hb_back_addr
5614 << ", hb_front_addr " << hb_front_addr
5616 _collect_metadata(&mboot
->metadata
);
5617 monc
->send_mon_message(mboot
);
5618 set_state(STATE_BOOTING
);
5621 void OSD::_collect_metadata(map
<string
,string
> *pm
)
5624 (*pm
)["osd_data"] = dev_path
;
5625 (*pm
)["osd_journal"] = journal_path
;
5626 (*pm
)["front_addr"] = stringify(client_messenger
->get_myaddr());
5627 (*pm
)["back_addr"] = stringify(cluster_messenger
->get_myaddr());
5628 (*pm
)["hb_front_addr"] = stringify(hb_front_server_messenger
->get_myaddr());
5629 (*pm
)["hb_back_addr"] = stringify(hb_back_server_messenger
->get_myaddr());
5632 (*pm
)["osd_objectstore"] = store
->get_type();
5633 store
->collect_metadata(pm
);
5635 collect_sys_info(pm
, cct
);
5637 dout(10) << __func__
<< " " << *pm
<< dendl
;
5640 void OSD::queue_want_up_thru(epoch_t want
)
5642 map_lock
.get_read();
5643 epoch_t cur
= osdmap
->get_up_thru(whoami
);
5644 Mutex::Locker
l(mon_report_lock
);
5645 if (want
> up_thru_wanted
) {
5646 dout(10) << "queue_want_up_thru now " << want
<< " (was " << up_thru_wanted
<< ")"
5647 << ", currently " << cur
5649 up_thru_wanted
= want
;
5652 dout(10) << "queue_want_up_thru want " << want
<< " <= queued " << up_thru_wanted
5653 << ", currently " << cur
5656 map_lock
.put_read();
5659 void OSD::send_alive()
5661 assert(mon_report_lock
.is_locked());
5662 if (!osdmap
->exists(whoami
))
5664 epoch_t up_thru
= osdmap
->get_up_thru(whoami
);
5665 dout(10) << "send_alive up_thru currently " << up_thru
<< " want " << up_thru_wanted
<< dendl
;
5666 if (up_thru_wanted
> up_thru
) {
5667 dout(10) << "send_alive want " << up_thru_wanted
<< dendl
;
5668 monc
->send_mon_message(new MOSDAlive(osdmap
->get_epoch(), up_thru_wanted
));
5672 void OSD::request_full_map(epoch_t first
, epoch_t last
)
5674 dout(10) << __func__
<< " " << first
<< ".." << last
5675 << ", previously requested "
5676 << requested_full_first
<< ".." << requested_full_last
<< dendl
;
5677 assert(osd_lock
.is_locked());
5678 assert(first
> 0 && last
> 0);
5679 assert(first
<= last
);
5680 assert(first
>= requested_full_first
); // we shouldn't ever ask for older maps
5681 if (requested_full_first
== 0) {
5683 requested_full_first
= first
;
5684 requested_full_last
= last
;
5685 } else if (last
<= requested_full_last
) {
5689 // additional request
5690 first
= requested_full_last
+ 1;
5691 requested_full_last
= last
;
5693 MMonGetOSDMap
*req
= new MMonGetOSDMap
;
5694 req
->request_full(first
, last
);
5695 monc
->send_mon_message(req
);
5698 void OSD::got_full_map(epoch_t e
)
5700 assert(requested_full_first
<= requested_full_last
);
5701 assert(osd_lock
.is_locked());
5702 if (requested_full_first
== 0) {
5703 dout(20) << __func__
<< " " << e
<< ", nothing requested" << dendl
;
5706 if (e
< requested_full_first
) {
5707 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
5708 << ".." << requested_full_last
5709 << ", ignoring" << dendl
;
5712 if (e
>= requested_full_last
) {
5713 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
5714 << ".." << requested_full_last
<< ", resetting" << dendl
;
5715 requested_full_first
= requested_full_last
= 0;
5719 requested_full_first
= e
+ 1;
5721 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
5722 << ".." << requested_full_last
5723 << ", still need more" << dendl
;
5726 void OSD::requeue_failures()
5728 Mutex::Locker
l(heartbeat_lock
);
5729 unsigned old_queue
= failure_queue
.size();
5730 unsigned old_pending
= failure_pending
.size();
5731 for (map
<int,pair
<utime_t
,entity_inst_t
> >::iterator p
=
5732 failure_pending
.begin();
5733 p
!= failure_pending
.end(); ) {
5734 failure_queue
[p
->first
] = p
->second
.first
;
5735 failure_pending
.erase(p
++);
5737 dout(10) << __func__
<< " " << old_queue
<< " + " << old_pending
<< " -> "
5738 << failure_queue
.size() << dendl
;
5741 void OSD::send_failures()
5743 assert(map_lock
.is_locked());
5744 assert(mon_report_lock
.is_locked());
5745 Mutex::Locker
l(heartbeat_lock
);
5746 utime_t now
= ceph_clock_now();
5747 while (!failure_queue
.empty()) {
5748 int osd
= failure_queue
.begin()->first
;
5749 entity_inst_t i
= osdmap
->get_inst(osd
);
5750 if (!failure_pending
.count(osd
)) {
5751 int failed_for
= (int)(double)(now
- failure_queue
.begin()->second
);
5752 monc
->send_mon_message(new MOSDFailure(monc
->get_fsid(), i
, failed_for
,
5753 osdmap
->get_epoch()));
5754 failure_pending
[osd
] = make_pair(failure_queue
.begin()->second
, i
);
5756 failure_queue
.erase(osd
);
5760 void OSD::send_still_alive(epoch_t epoch
, const entity_inst_t
&i
)
5762 MOSDFailure
*m
= new MOSDFailure(monc
->get_fsid(), i
, 0, epoch
, MOSDFailure::FLAG_ALIVE
);
5763 monc
->send_mon_message(m
);
5766 void OSD::send_pg_stats(const utime_t
&now
)
5768 assert(map_lock
.is_locked());
5769 dout(20) << "send_pg_stats" << dendl
;
5771 osd_stat_t cur_stat
= service
.get_osd_stat();
5773 cur_stat
.os_perf_stat
= store
->get_cur_stats();
5775 pg_stat_queue_lock
.Lock();
5777 if (osd_stat_updated
|| !pg_stat_queue
.empty()) {
5778 last_pg_stats_sent
= now
;
5779 osd_stat_updated
= false;
5781 dout(10) << "send_pg_stats - " << pg_stat_queue
.size() << " pgs updated" << dendl
;
5783 utime_t
had_for(now
);
5784 had_for
-= had_map_since
;
5786 MPGStats
*m
= new MPGStats(monc
->get_fsid(), osdmap
->get_epoch(), had_for
);
5788 uint64_t tid
= ++pg_stat_tid
;
5790 m
->osd_stat
= cur_stat
;
5792 xlist
<PG
*>::iterator p
= pg_stat_queue
.begin();
5796 if (!pg
->is_primary()) { // we hold map_lock; role is stable.
5797 pg
->stat_queue_item
.remove_myself();
5798 pg
->put("pg_stat_queue");
5801 pg
->pg_stats_publish_lock
.Lock();
5802 if (pg
->pg_stats_publish_valid
) {
5803 m
->pg_stat
[pg
->info
.pgid
.pgid
] = pg
->pg_stats_publish
;
5804 dout(25) << " sending " << pg
->info
.pgid
<< " " << pg
->pg_stats_publish
.reported_epoch
<< ":"
5805 << pg
->pg_stats_publish
.reported_seq
<< dendl
;
5807 dout(25) << " NOT sending " << pg
->info
.pgid
<< " " << pg
->pg_stats_publish
.reported_epoch
<< ":"
5808 << pg
->pg_stats_publish
.reported_seq
<< ", not valid" << dendl
;
5810 pg
->pg_stats_publish_lock
.Unlock();
5813 if (last_pg_stats_ack
== utime_t() || !outstanding_pg_stats
.empty()) {
5814 last_pg_stats_ack
= ceph_clock_now();
5816 outstanding_pg_stats
.insert(tid
);
5817 dout(20) << __func__
<< " updates pending: " << outstanding_pg_stats
<< dendl
;
5819 monc
->send_mon_message(m
);
5822 pg_stat_queue_lock
.Unlock();
5825 void OSD::handle_pg_stats_ack(MPGStatsAck
*ack
)
5827 dout(10) << "handle_pg_stats_ack " << dendl
;
5829 if (!require_mon_peer(ack
)) {
5834 // NOTE: we may get replies from a previous mon even while
5835 // outstanding_pg_stats is empty if reconnecting races with replies
5838 pg_stat_queue_lock
.Lock();
5840 last_pg_stats_ack
= ceph_clock_now();
5842 // decay timeout slowly (analogous to TCP)
5844 MAX(cct
->_conf
->osd_mon_ack_timeout
,
5845 stats_ack_timeout
* cct
->_conf
->osd_stats_ack_timeout_decay
);
5846 dout(20) << __func__
<< " timeout now " << stats_ack_timeout
<< dendl
;
5848 if (ack
->get_tid() > pg_stat_tid_flushed
) {
5849 pg_stat_tid_flushed
= ack
->get_tid();
5850 pg_stat_queue_cond
.Signal();
5853 xlist
<PG
*>::iterator p
= pg_stat_queue
.begin();
5859 auto acked
= ack
->pg_stat
.find(pg
->info
.pgid
.pgid
);
5860 if (acked
!= ack
->pg_stat
.end()) {
5861 pg
->pg_stats_publish_lock
.Lock();
5862 if (acked
->second
.first
== pg
->pg_stats_publish
.reported_seq
&&
5863 acked
->second
.second
== pg
->pg_stats_publish
.reported_epoch
) {
5864 dout(25) << " ack on " << pg
->info
.pgid
<< " " << pg
->pg_stats_publish
.reported_epoch
5865 << ":" << pg
->pg_stats_publish
.reported_seq
<< dendl
;
5866 pg
->stat_queue_item
.remove_myself();
5867 pg
->put("pg_stat_queue");
5869 dout(25) << " still pending " << pg
->info
.pgid
<< " " << pg
->pg_stats_publish
.reported_epoch
5870 << ":" << pg
->pg_stats_publish
.reported_seq
<< " > acked "
5871 << acked
->second
<< dendl
;
5873 pg
->pg_stats_publish_lock
.Unlock();
5875 dout(30) << " still pending " << pg
->info
.pgid
<< " " << pg
->pg_stats_publish
.reported_epoch
5876 << ":" << pg
->pg_stats_publish
.reported_seq
<< dendl
;
5880 outstanding_pg_stats
.erase(ack
->get_tid());
5881 dout(20) << __func__
<< " still pending: " << outstanding_pg_stats
<< dendl
;
5883 pg_stat_queue_lock
.Unlock();
5888 void OSD::flush_pg_stats()
5890 dout(10) << "flush_pg_stats" << dendl
;
5892 utime_t now
= ceph_clock_now();
5893 map_lock
.get_read();
5894 mon_report_lock
.Lock();
5896 mon_report_lock
.Unlock();
5897 map_lock
.put_read();
5900 pg_stat_queue_lock
.Lock();
5901 uint64_t tid
= pg_stat_tid
;
5902 dout(10) << "flush_pg_stats waiting for stats tid " << tid
<< " to flush" << dendl
;
5903 while (tid
> pg_stat_tid_flushed
)
5904 pg_stat_queue_cond
.Wait(pg_stat_queue_lock
);
5905 dout(10) << "flush_pg_stats finished waiting for stats tid " << tid
<< " to flush" << dendl
;
5906 pg_stat_queue_lock
.Unlock();
5911 void OSD::send_beacon(const ceph::coarse_mono_clock::time_point
& now
)
5913 const auto& monmap
= monc
->monmap
;
5914 // send beacon to mon even if we are just connected, and the monmap is not
5915 // initialized yet by then.
5916 if (monmap
.epoch
> 0 &&
5917 monmap
.get_required_features().contains_all(
5918 ceph::features::mon::FEATURE_LUMINOUS
)) {
5919 dout(20) << __func__
<< " sending" << dendl
;
5920 last_sent_beacon
= now
;
5921 MOSDBeacon
* beacon
= nullptr;
5923 Mutex::Locker l
{min_last_epoch_clean_lock
};
5924 beacon
= new MOSDBeacon(osdmap
->get_epoch(), min_last_epoch_clean
);
5925 std::swap(beacon
->pgs
, min_last_epoch_clean_pgs
);
5927 monc
->send_mon_message(beacon
);
5929 dout(20) << __func__
<< " not sending" << dendl
;
5933 void OSD::handle_command(MMonCommand
*m
)
5935 if (!require_mon_peer(m
)) {
5940 Command
*c
= new Command(m
->cmd
, m
->get_tid(), m
->get_data(), NULL
);
5941 command_wq
.queue(c
);
5945 void OSD::handle_command(MCommand
*m
)
5947 ConnectionRef con
= m
->get_connection();
5948 Session
*session
= static_cast<Session
*>(con
->get_priv());
5950 con
->send_message(new MCommandReply(m
, -EPERM
));
5955 OSDCap
& caps
= session
->caps
;
5958 if (!caps
.allow_all() || m
->get_source().is_mon()) {
5959 con
->send_message(new MCommandReply(m
, -EPERM
));
5964 Command
*c
= new Command(m
->cmd
, m
->get_tid(), m
->get_data(), con
.get());
5965 command_wq
.queue(c
);
5975 string availability
;
5976 } osd_commands
[] = {
5978 #define COMMAND(parsesig, helptext, module, perm, availability) \
5979 {parsesig, helptext, module, perm, availability},
5981 // yes, these are really pg commands, but there's a limit to how
5982 // much work it's worth. The OSD returns all of them. Make this
5983 // form (pg <pgid> <cmd>) valid only for the cli.
5984 // Rest uses "tell <pgid> <cmd>"
5987 "name=pgid,type=CephPgid " \
5988 "name=cmd,type=CephChoices,strings=query", \
5989 "show details of a specific pg", "osd", "r", "cli")
5991 "name=pgid,type=CephPgid " \
5992 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
5993 "name=mulcmd,type=CephChoices,strings=revert|delete", \
5994 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
5997 "name=pgid,type=CephPgid " \
5998 "name=cmd,type=CephChoices,strings=list_missing " \
5999 "name=offset,type=CephString,req=false",
6000 "list missing objects on this pg, perhaps starting at an offset given in JSON",
6003 // new form: tell <pgid> <cmd> for both cli and rest
6006 "show details of a specific pg", "osd", "r", "cli,rest")
6007 COMMAND("mark_unfound_lost " \
6008 "name=mulcmd,type=CephChoices,strings=revert|delete", \
6009 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
6010 "osd", "rw", "cli,rest")
6011 COMMAND("list_missing " \
6012 "name=offset,type=CephString,req=false",
6013 "list missing objects on this pg, perhaps starting at an offset given in JSON",
6014 "osd", "r", "cli,rest")
6016 // tell <osd.n> commands. Validation of osd.n must be special-cased in client
6017 COMMAND("version", "report version of OSD", "osd", "r", "cli,rest")
6018 COMMAND("get_command_descriptions", "list commands descriptions", "osd", "r", "cli,rest")
6019 COMMAND("injectargs " \
6020 "name=injected_args,type=CephString,n=N",
6021 "inject configuration arguments into running OSD",
6022 "osd", "rw", "cli,rest")
6023 COMMAND("cluster_log " \
6024 "name=level,type=CephChoices,strings=error,warning,info,debug " \
6025 "name=message,type=CephString,n=N",
6026 "log a message to the cluster log",
6027 "osd", "rw", "cli,rest")
6029 "name=count,type=CephInt,req=false " \
6030 "name=size,type=CephInt,req=false " \
6031 "name=object_size,type=CephInt,req=false " \
6032 "name=object_num,type=CephInt,req=false ", \
6033 "OSD benchmark: write <count> <size>-byte objects, " \
6034 "(default 1G size 4MB). Results in log.",
6035 "osd", "rw", "cli,rest")
6036 COMMAND("flush_pg_stats", "flush pg stats", "osd", "rw", "cli,rest")
6038 "name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats", \
6039 "show heap usage info (available only if compiled with tcmalloc)", \
6040 "osd", "rw", "cli,rest")
6041 COMMAND("debug dump_missing " \
6042 "name=filename,type=CephFilepath",
6043 "dump missing objects to a named file", "osd", "r", "cli,rest")
6044 COMMAND("debug kick_recovery_wq " \
6045 "name=delay,type=CephInt,range=0",
6046 "set osd_recovery_delay_start to <val>", "osd", "rw", "cli,rest")
6047 COMMAND("cpu_profiler " \
6048 "name=arg,type=CephChoices,strings=status|flush",
6049 "run cpu profiling on daemon", "osd", "rw", "cli,rest")
6050 COMMAND("dump_pg_recovery_stats", "dump pg recovery statistics",
6051 "osd", "r", "cli,rest")
6052 COMMAND("reset_pg_recovery_stats", "reset pg recovery statistics",
6053 "osd", "rw", "cli,rest")
6056 void OSD::do_command(Connection
*con
, ceph_tid_t tid
, vector
<string
>& cmd
, bufferlist
& data
)
6059 stringstream ss
, ds
;
6063 dout(20) << "do_command tid " << tid
<< " " << cmd
<< dendl
;
6065 map
<string
, cmd_vartype
> cmdmap
;
6069 boost::scoped_ptr
<Formatter
> f
;
6072 ss
<< "no command given";
6076 if (!cmdmap_from_json(cmd
, &cmdmap
, ss
)) {
6081 cmd_getval(cct
, cmdmap
, "prefix", prefix
);
6083 if (prefix
== "get_command_descriptions") {
6085 JSONFormatter
*f
= new JSONFormatter();
6086 f
->open_object_section("command_descriptions");
6087 for (OSDCommand
*cp
= osd_commands
;
6088 cp
< &osd_commands
[ARRAY_SIZE(osd_commands
)]; cp
++) {
6090 ostringstream secname
;
6091 secname
<< "cmd" << setfill('0') << std::setw(3) << cmdnum
;
6092 dump_cmddesc_to_json(f
, secname
.str(), cp
->cmdstring
, cp
->helpstring
,
6093 cp
->module
, cp
->perm
, cp
->availability
, 0);
6096 f
->close_section(); // command_descriptions
6103 cmd_getval(cct
, cmdmap
, "format", format
);
6104 f
.reset(Formatter::create(format
));
6106 if (prefix
== "version") {
6108 f
->open_object_section("version");
6109 f
->dump_string("version", pretty_version_to_str());
6113 ds
<< pretty_version_to_str();
6117 else if (prefix
== "injectargs") {
6118 vector
<string
> argsvec
;
6119 cmd_getval(cct
, cmdmap
, "injected_args", argsvec
);
6121 if (argsvec
.empty()) {
6123 ss
<< "ignoring empty injectargs";
6126 string args
= argsvec
.front();
6127 for (vector
<string
>::iterator a
= ++argsvec
.begin(); a
!= argsvec
.end(); ++a
)
6130 r
= cct
->_conf
->injectargs(args
, &ss
);
6133 else if (prefix
== "cluster_log") {
6135 cmd_getval(cct
, cmdmap
, "message", msg
);
6138 ss
<< "ignoring empty log message";
6141 string message
= msg
.front();
6142 for (vector
<string
>::iterator a
= ++msg
.begin(); a
!= msg
.end(); ++a
)
6143 message
+= " " + *a
;
6145 cmd_getval(cct
, cmdmap
, "level", lvl
);
6146 clog_type level
= string_to_clog_type(lvl
);
6149 ss
<< "unknown level '" << lvl
<< "'";
6152 clog
->do_log(level
, message
);
6155 // either 'pg <pgid> <command>' or
6156 // 'tell <pgid>' (which comes in without any of that prefix)?
6158 else if (prefix
== "pg" ||
6159 prefix
== "query" ||
6160 prefix
== "mark_unfound_lost" ||
6161 prefix
== "list_missing"
6165 if (!cmd_getval(cct
, cmdmap
, "pgid", pgidstr
)) {
6166 ss
<< "no pgid specified";
6168 } else if (!pgid
.parse(pgidstr
.c_str())) {
6169 ss
<< "couldn't parse pgid '" << pgidstr
<< "'";
6174 if (osdmap
->get_primary_shard(pgid
, &pcand
) &&
6175 (pg
= _lookup_lock_pg(pcand
))) {
6176 if (pg
->is_primary()) {
6177 // simulate pg <pgid> cmd= for pg->do-command
6179 cmd_putval(cct
, cmdmap
, "cmd", prefix
);
6180 r
= pg
->do_command(cmdmap
, ss
, data
, odata
, con
, tid
);
6183 // don't reply, pg will do so async
6187 ss
<< "not primary for pgid " << pgid
;
6189 // send them the latest diff to ensure they realize the mapping
6191 service
.send_incremental_map(osdmap
->get_epoch() - 1, con
, osdmap
);
6193 // do not reply; they will get newer maps and realize they
6200 ss
<< "i don't have pgid " << pgid
;
6206 else if (prefix
== "bench") {
6209 int64_t osize
, onum
;
6210 // default count 1G, size 4MB
6211 cmd_getval(cct
, cmdmap
, "count", count
, (int64_t)1 << 30);
6212 cmd_getval(cct
, cmdmap
, "size", bsize
, (int64_t)4 << 20);
6213 cmd_getval(cct
, cmdmap
, "object_size", osize
, (int64_t)0);
6214 cmd_getval(cct
, cmdmap
, "object_num", onum
, (int64_t)0);
6216 ceph::shared_ptr
<ObjectStore::Sequencer
> osr (std::make_shared
<
6217 ObjectStore::Sequencer
>("bench"));
6219 uint32_t duration
= cct
->_conf
->osd_bench_duration
;
6221 if (bsize
> (int64_t) cct
->_conf
->osd_bench_max_block_size
) {
6222 // let us limit the block size because the next checks rely on it
6223 // having a sane value. If we allow any block size to be set things
6224 // can still go sideways.
6225 ss
<< "block 'size' values are capped at "
6226 << prettybyte_t(cct
->_conf
->osd_bench_max_block_size
) << ". If you wish to use"
6227 << " a higher value, please adjust 'osd_bench_max_block_size'";
6230 } else if (bsize
< (int64_t) (1 << 20)) {
6231 // entering the realm of small block sizes.
6232 // limit the count to a sane value, assuming a configurable amount of
6233 // IOPS and duration, so that the OSD doesn't get hung up on this,
6234 // preventing timeouts from going off
6236 bsize
* duration
* cct
->_conf
->osd_bench_small_size_max_iops
;
6237 if (count
> max_count
) {
6238 ss
<< "'count' values greater than " << max_count
6239 << " for a block size of " << prettybyte_t(bsize
) << ", assuming "
6240 << cct
->_conf
->osd_bench_small_size_max_iops
<< " IOPS,"
6241 << " for " << duration
<< " seconds,"
6242 << " can cause ill effects on osd. "
6243 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
6244 << " value if you wish to use a higher 'count'.";
6249 // 1MB block sizes are big enough so that we get more stuff done.
6250 // However, to avoid the osd from getting hung on this and having
6251 // timers being triggered, we are going to limit the count assuming
6252 // a configurable throughput and duration.
6253 // NOTE: max_count is the total amount of bytes that we believe we
6254 // will be able to write during 'duration' for the given
6255 // throughput. The block size hardly impacts this unless it's
6256 // way too big. Given we already check how big the block size
6257 // is, it's safe to assume everything will check out.
6259 cct
->_conf
->osd_bench_large_size_max_throughput
* duration
;
6260 if (count
> max_count
) {
6261 ss
<< "'count' values greater than " << max_count
6262 << " for a block size of " << prettybyte_t(bsize
) << ", assuming "
6263 << prettybyte_t(cct
->_conf
->osd_bench_large_size_max_throughput
) << "/s,"
6264 << " for " << duration
<< " seconds,"
6265 << " can cause ill effects on osd. "
6266 << " Please adjust 'osd_bench_large_size_max_throughput'"
6267 << " with a higher value if you wish to use a higher 'count'.";
6273 if (osize
&& bsize
> osize
)
6276 dout(1) << " bench count " << count
6277 << " bsize " << prettybyte_t(bsize
) << dendl
;
6279 ObjectStore::Transaction cleanupt
;
6281 if (osize
&& onum
) {
6283 bufferptr
bp(osize
);
6285 bl
.push_back(std::move(bp
));
6286 bl
.rebuild_page_aligned();
6287 for (int i
=0; i
<onum
; ++i
) {
6289 snprintf(nm
, sizeof(nm
), "disk_bw_test_%d", i
);
6291 hobject_t
soid(sobject_t(oid
, 0));
6292 ObjectStore::Transaction t
;
6293 t
.write(coll_t(), ghobject_t(soid
), 0, osize
, bl
);
6294 store
->queue_transaction(osr
.get(), std::move(t
), NULL
);
6295 cleanupt
.remove(coll_t(), ghobject_t(soid
));
6300 bufferptr
bp(bsize
);
6302 bl
.push_back(std::move(bp
));
6303 bl
.rebuild_page_aligned();
6307 if (!osr
->flush_commit(&waiter
)) {
6312 utime_t start
= ceph_clock_now();
6313 for (int64_t pos
= 0; pos
< count
; pos
+= bsize
) {
6315 unsigned offset
= 0;
6316 if (onum
&& osize
) {
6317 snprintf(nm
, sizeof(nm
), "disk_bw_test_%d", (int)(rand() % onum
));
6318 offset
= rand() % (osize
/ bsize
) * bsize
;
6320 snprintf(nm
, sizeof(nm
), "disk_bw_test_%lld", (long long)pos
);
6323 hobject_t
soid(sobject_t(oid
, 0));
6324 ObjectStore::Transaction t
;
6325 t
.write(coll_t::meta(), ghobject_t(soid
), offset
, bsize
, bl
);
6326 store
->queue_transaction(osr
.get(), std::move(t
), NULL
);
6327 if (!onum
|| !osize
)
6328 cleanupt
.remove(coll_t::meta(), ghobject_t(soid
));
6333 if (!osr
->flush_commit(&waiter
)) {
6337 utime_t end
= ceph_clock_now();
6340 store
->queue_transaction(osr
.get(), std::move(cleanupt
), NULL
);
6343 if (!osr
->flush_commit(&waiter
)) {
6348 uint64_t rate
= (double)count
/ (end
- start
);
6350 f
->open_object_section("osd_bench_results");
6351 f
->dump_int("bytes_written", count
);
6352 f
->dump_int("blocksize", bsize
);
6353 f
->dump_unsigned("bytes_per_sec", rate
);
6357 ss
<< "bench: wrote " << prettybyte_t(count
)
6358 << " in blocks of " << prettybyte_t(bsize
) << " in "
6359 << (end
-start
) << " sec at " << prettybyte_t(rate
) << "/sec";
6363 else if (prefix
== "flush_pg_stats") {
6367 else if (prefix
== "heap") {
6368 r
= ceph::osd_cmds::heap(*cct
, cmdmap
, *f
, ds
);
6371 else if (prefix
== "debug dump_missing") {
6373 cmd_getval(cct
, cmdmap
, "filename", file_name
);
6374 std::ofstream
fout(file_name
.c_str());
6375 if (!fout
.is_open()) {
6376 ss
<< "failed to open file '" << file_name
<< "'";
6381 fout
<< "*** osd " << whoami
<< ": dump_missing ***" << std::endl
;
6382 RWLock::RLocker
l(pg_map_lock
);
6383 for (ceph::unordered_map
<spg_t
, PG
*>::const_iterator pg_map_e
= pg_map
.begin();
6384 pg_map_e
!= pg_map
.end(); ++pg_map_e
) {
6385 PG
*pg
= pg_map_e
->second
;
6388 fout
<< *pg
<< std::endl
;
6389 std::map
<hobject_t
, pg_missing_item
>::const_iterator mend
=
6390 pg
->pg_log
.get_missing().get_items().end();
6391 std::map
<hobject_t
, pg_missing_item
>::const_iterator mi
=
6392 pg
->pg_log
.get_missing().get_items().begin();
6393 for (; mi
!= mend
; ++mi
) {
6394 fout
<< mi
->first
<< " -> " << mi
->second
<< std::endl
;
6395 if (!pg
->missing_loc
.needs_recovery(mi
->first
))
6397 if (pg
->missing_loc
.is_unfound(mi
->first
))
6398 fout
<< " unfound ";
6399 const set
<pg_shard_t
> &mls(pg
->missing_loc
.get_locations(mi
->first
));
6402 fout
<< "missing_loc: " << mls
<< std::endl
;
6410 else if (prefix
== "debug kick_recovery_wq") {
6412 cmd_getval(cct
, cmdmap
, "delay", delay
);
6415 r
= cct
->_conf
->set_val("osd_recovery_delay_start", oss
.str().c_str());
6417 ss
<< "kick_recovery_wq: error setting "
6418 << "osd_recovery_delay_start to '" << delay
<< "': error "
6422 cct
->_conf
->apply_changes(NULL
);
6423 ss
<< "kicking recovery queue. set osd_recovery_delay_start "
6424 << "to " << cct
->_conf
->osd_recovery_delay_start
;
6427 else if (prefix
== "cpu_profiler") {
6429 cmd_getval(cct
, cmdmap
, "arg", arg
);
6430 vector
<string
> argvec
;
6431 get_str_vec(arg
, argvec
);
6432 cpu_profiler_handle_command(argvec
, ds
);
6435 else if (prefix
== "dump_pg_recovery_stats") {
6438 pg_recovery_stats
.dump_formatted(f
.get());
6441 pg_recovery_stats
.dump(s
);
6442 ds
<< "dump pg recovery stats: " << s
.str();
6446 else if (prefix
== "reset_pg_recovery_stats") {
6447 ss
<< "reset pg recovery stats";
6448 pg_recovery_stats
.reset();
6452 ss
<< "unrecognized command! " << cmd
;
6459 dout(0) << "do_command r=" << r
<< " " << rs
<< dendl
;
6462 MCommandReply
*reply
= new MCommandReply(r
, rs
);
6463 reply
->set_tid(tid
);
6464 reply
->set_data(odata
);
6465 con
->send_message(reply
);
6469 bool OSD::heartbeat_dispatch(Message
*m
)
6471 dout(30) << "heartbeat_dispatch " << m
<< dendl
;
6472 switch (m
->get_type()) {
6475 dout(10) << "ping from " << m
->get_source_inst() << dendl
;
6480 handle_osd_ping(static_cast<MOSDPing
*>(m
));
6484 dout(0) << "dropping unexpected message " << *m
<< " from " << m
->get_source_inst() << dendl
;
6491 bool OSD::ms_dispatch(Message
*m
)
6493 dout(20) << "OSD::ms_dispatch: " << *m
<< dendl
;
6494 if (m
->get_type() == MSG_OSD_MARK_ME_DOWN
) {
6495 service
.got_stop_ack();
6503 if (is_stopping()) {
6517 void OSD::maybe_share_map(
6522 if (!op
->check_send_map
) {
6525 epoch_t last_sent_epoch
= 0;
6527 session
->sent_epoch_lock
.lock();
6528 last_sent_epoch
= session
->last_sent_epoch
;
6529 session
->sent_epoch_lock
.unlock();
6531 const Message
*m
= op
->get_req();
6534 m
->get_connection().get(),
6537 session
? &last_sent_epoch
: NULL
);
6539 session
->sent_epoch_lock
.lock();
6540 if (session
->last_sent_epoch
< last_sent_epoch
) {
6541 session
->last_sent_epoch
= last_sent_epoch
;
6543 session
->sent_epoch_lock
.unlock();
6545 op
->check_send_map
= false;
6548 void OSD::dispatch_session_waiting(Session
*session
, OSDMapRef osdmap
)
6550 assert(session
->session_dispatch_lock
.is_locked());
6552 auto i
= session
->waiting_on_map
.begin();
6553 while (i
!= session
->waiting_on_map
.end()) {
6554 OpRequestRef op
= &(*i
);
6555 assert(ms_can_fast_dispatch(op
->get_req()));
6556 const MOSDFastDispatchOp
*m
= static_cast<const MOSDFastDispatchOp
*>(
6558 if (m
->get_min_epoch() > osdmap
->get_epoch()) {
6561 session
->waiting_on_map
.erase(i
++);
6565 if (m
->get_type() == CEPH_MSG_OSD_OP
) {
6566 pg_t actual_pgid
= osdmap
->raw_pg_to_pg(
6567 static_cast<const MOSDOp
*>(m
)->get_pg());
6568 if (!osdmap
->get_primary_shard(actual_pgid
, &pgid
)) {
6572 pgid
= m
->get_spg();
6574 enqueue_op(pgid
, op
, m
->get_map_epoch());
6577 if (session
->waiting_on_map
.empty()) {
6578 clear_session_waiting_on_map(session
);
6580 register_session_waiting_on_map(session
);
6584 void OSD::ms_fast_dispatch(Message
*m
)
6587 if (service
.is_stopping()) {
6591 OpRequestRef op
= op_tracker
.create_request
<OpRequest
, Message
*>(m
);
6594 osd_reqid_t reqid
= op
->get_reqid();
6596 tracepoint(osd
, ms_fast_dispatch
, reqid
.name
._type
,
6597 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
6601 op
->osd_trace
.init("osd op", &trace_endpoint
, &m
->trace
);
6603 // note sender epoch, min req'd epoch
6604 op
->sent_epoch
= static_cast<MOSDFastDispatchOp
*>(m
)->get_map_epoch();
6605 op
->min_epoch
= static_cast<MOSDFastDispatchOp
*>(m
)->get_min_epoch();
6606 assert(op
->min_epoch
<= op
->sent_epoch
); // sanity check!
6608 service
.maybe_inject_dispatch_delay();
6610 if (m
->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT
) ||
6611 m
->get_type() != CEPH_MSG_OSD_OP
) {
6612 // queue it directly
6614 static_cast<MOSDFastDispatchOp
*>(m
)->get_spg(),
6616 static_cast<MOSDFastDispatchOp
*>(m
)->get_map_epoch());
6618 // legacy client, and this is an MOSDOp (the *only* fast dispatch
6619 // message that didn't have an explicit spg_t); we need to map
6620 // them to an spg_t while preserving delivery order.
6621 Session
*session
= static_cast<Session
*>(m
->get_connection()->get_priv());
6624 Mutex::Locker
l(session
->session_dispatch_lock
);
6626 session
->waiting_on_map
.push_back(*op
);
6627 OSDMapRef nextmap
= service
.get_nextmap_reserved();
6628 dispatch_session_waiting(session
, nextmap
);
6629 service
.release_map(nextmap
);
6634 OID_EVENT_TRACE_WITH_MSG(m
, "MS_FAST_DISPATCH_END", false);
6637 void OSD::ms_fast_preprocess(Message
*m
)
6639 if (m
->get_connection()->get_peer_type() == CEPH_ENTITY_TYPE_OSD
) {
6640 if (m
->get_type() == CEPH_MSG_OSD_MAP
) {
6641 MOSDMap
*mm
= static_cast<MOSDMap
*>(m
);
6642 Session
*s
= static_cast<Session
*>(m
->get_connection()->get_priv());
6644 s
->received_map_lock
.lock();
6645 s
->received_map_epoch
= mm
->get_last();
6646 s
->received_map_lock
.unlock();
6653 bool OSD::ms_get_authorizer(int dest_type
, AuthAuthorizer
**authorizer
, bool force_new
)
6655 dout(10) << "OSD::ms_get_authorizer type=" << ceph_entity_type_name(dest_type
) << dendl
;
6657 if (dest_type
== CEPH_ENTITY_TYPE_MON
)
6661 /* the MonClient checks keys every tick(), so we should just wait for that cycle
6663 if (monc
->wait_auth_rotating(10) < 0) {
6664 derr
<< "OSD::ms_get_authorizer wait_auth_rotating failed" << dendl
;
6669 *authorizer
= monc
->build_authorizer(dest_type
);
6670 return *authorizer
!= NULL
;
6674 bool OSD::ms_verify_authorizer(Connection
*con
, int peer_type
,
6675 int protocol
, bufferlist
& authorizer_data
, bufferlist
& authorizer_reply
,
6676 bool& isvalid
, CryptoKey
& session_key
)
6678 AuthAuthorizeHandler
*authorize_handler
= 0;
6679 switch (peer_type
) {
6680 case CEPH_ENTITY_TYPE_MDS
:
6682 * note: mds is technically a client from our perspective, but
6683 * this makes the 'cluster' consistent w/ monitor's usage.
6685 case CEPH_ENTITY_TYPE_OSD
:
6686 case CEPH_ENTITY_TYPE_MGR
:
6687 authorize_handler
= authorize_handler_cluster_registry
->get_handler(protocol
);
6690 authorize_handler
= authorize_handler_service_registry
->get_handler(protocol
);
6692 if (!authorize_handler
) {
6693 dout(0) << "No AuthAuthorizeHandler found for protocol " << protocol
<< dendl
;
6698 AuthCapsInfo caps_info
;
6701 uint64_t auid
= CEPH_AUTH_UID_DEFAULT
;
6703 isvalid
= authorize_handler
->verify_authorizer(
6704 cct
, monc
->rotating_secrets
.get(),
6705 authorizer_data
, authorizer_reply
, name
, global_id
, caps_info
, session_key
,
6709 Session
*s
= static_cast<Session
*>(con
->get_priv());
6711 s
= new Session(cct
);
6712 con
->set_priv(s
->get());
6714 dout(10) << " new session " << s
<< " con=" << s
->con
<< " addr=" << s
->con
->get_peer_addr() << dendl
;
6717 s
->entity_name
= name
;
6718 if (caps_info
.allow_all
)
6719 s
->caps
.set_allow_all();
6722 if (caps_info
.caps
.length() > 0) {
6723 bufferlist::iterator p
= caps_info
.caps
.begin();
6728 catch (buffer::error
& e
) {
6730 bool success
= s
->caps
.parse(str
);
6732 dout(10) << " session " << s
<< " " << s
->entity_name
<< " has caps " << s
->caps
<< " '" << str
<< "'" << dendl
;
6734 dout(10) << " session " << s
<< " " << s
->entity_name
<< " failed to parse caps '" << str
<< "'" << dendl
;
6742 void OSD::do_waiters()
6744 assert(osd_lock
.is_locked());
6746 dout(10) << "do_waiters -- start" << dendl
;
6747 while (!finished
.empty()) {
6748 OpRequestRef next
= finished
.front();
6749 finished
.pop_front();
6752 dout(10) << "do_waiters -- finish" << dendl
;
6755 void OSD::dispatch_op(OpRequestRef op
)
6757 switch (op
->get_req()->get_type()) {
6759 case MSG_OSD_PG_CREATE
:
6760 handle_pg_create(op
);
6762 case MSG_OSD_PG_NOTIFY
:
6763 handle_pg_notify(op
);
6765 case MSG_OSD_PG_QUERY
:
6766 handle_pg_query(op
);
6768 case MSG_OSD_PG_LOG
:
6771 case MSG_OSD_PG_REMOVE
:
6772 handle_pg_remove(op
);
6774 case MSG_OSD_PG_INFO
:
6777 case MSG_OSD_PG_TRIM
:
6780 case MSG_OSD_BACKFILL_RESERVE
:
6781 handle_pg_backfill_reserve(op
);
6783 case MSG_OSD_RECOVERY_RESERVE
:
6784 handle_pg_recovery_reserve(op
);
6789 void OSD::_dispatch(Message
*m
)
6791 assert(osd_lock
.is_locked());
6792 dout(20) << "_dispatch " << m
<< " " << *m
<< dendl
;
6794 switch (m
->get_type()) {
6796 // -- don't need lock --
6798 dout(10) << "ping from " << m
->get_source() << dendl
;
6802 // -- don't need OSDMap --
6804 // map and replication
6805 case CEPH_MSG_OSD_MAP
:
6806 handle_osd_map(static_cast<MOSDMap
*>(m
));
6810 case MSG_PGSTATSACK
:
6811 handle_pg_stats_ack(static_cast<MPGStatsAck
*>(m
));
6814 case MSG_MON_COMMAND
:
6815 handle_command(static_cast<MMonCommand
*>(m
));
6818 handle_command(static_cast<MCommand
*>(m
));
6822 handle_scrub(static_cast<MOSDScrub
*>(m
));
6825 // -- need OSDMap --
6827 case MSG_OSD_PG_CREATE
:
6828 case MSG_OSD_PG_NOTIFY
:
6829 case MSG_OSD_PG_QUERY
:
6830 case MSG_OSD_PG_LOG
:
6831 case MSG_OSD_PG_REMOVE
:
6832 case MSG_OSD_PG_INFO
:
6833 case MSG_OSD_PG_TRIM
:
6834 case MSG_OSD_BACKFILL_RESERVE
:
6835 case MSG_OSD_RECOVERY_RESERVE
:
6837 OpRequestRef op
= op_tracker
.create_request
<OpRequest
, Message
*>(m
);
6839 op
->osd_trace
.init("osd op", &trace_endpoint
, &m
->trace
);
6840 // no map? starting up?
6842 dout(7) << "no OSDMap, not booted" << dendl
;
6843 logger
->inc(l_osd_waiting_for_map
);
6844 waiting_for_osdmap
.push_back(op
);
6845 op
->mark_delayed("no osdmap");
6855 void OSD::handle_pg_scrub(MOSDScrub
*m
, PG
*pg
)
6858 if (pg
->is_primary()) {
6859 pg
->unreg_next_scrub();
6860 pg
->scrubber
.must_scrub
= true;
6861 pg
->scrubber
.must_deep_scrub
= m
->deep
|| m
->repair
;
6862 pg
->scrubber
.must_repair
= m
->repair
;
6863 pg
->reg_next_scrub();
6864 dout(10) << "marking " << *pg
<< " for scrub" << dendl
;
6869 void OSD::handle_scrub(MOSDScrub
*m
)
6871 dout(10) << "handle_scrub " << *m
<< dendl
;
6872 if (!require_mon_or_mgr_peer(m
)) {
6876 if (m
->fsid
!= monc
->get_fsid()) {
6877 dout(0) << "handle_scrub fsid " << m
->fsid
<< " != " << monc
->get_fsid() << dendl
;
6882 RWLock::RLocker
l(pg_map_lock
);
6883 if (m
->scrub_pgs
.empty()) {
6884 for (ceph::unordered_map
<spg_t
, PG
*>::iterator p
= pg_map
.begin();
6887 handle_pg_scrub(m
, p
->second
);
6889 for (vector
<pg_t
>::iterator p
= m
->scrub_pgs
.begin();
6890 p
!= m
->scrub_pgs
.end();
6893 if (osdmap
->get_primary_shard(*p
, &pcand
)) {
6894 auto pg_map_entry
= pg_map
.find(pcand
);
6895 if (pg_map_entry
!= pg_map
.end()) {
6896 handle_pg_scrub(m
, pg_map_entry
->second
);
6905 bool OSD::scrub_random_backoff()
6907 bool coin_flip
= (rand() / (double)RAND_MAX
>=
6908 cct
->_conf
->osd_scrub_backoff_ratio
);
6910 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl
;
6916 OSDService::ScrubJob::ScrubJob(CephContext
* cct
,
6917 const spg_t
& pg
, const utime_t
& timestamp
,
6918 double pool_scrub_min_interval
,
6919 double pool_scrub_max_interval
, bool must
)
6922 sched_time(timestamp
),
6925 // if not explicitly requested, postpone the scrub with a random delay
6927 double scrub_min_interval
= pool_scrub_min_interval
> 0 ?
6928 pool_scrub_min_interval
: cct
->_conf
->osd_scrub_min_interval
;
6929 double scrub_max_interval
= pool_scrub_max_interval
> 0 ?
6930 pool_scrub_max_interval
: cct
->_conf
->osd_scrub_max_interval
;
6932 sched_time
+= scrub_min_interval
;
6933 double r
= rand() / (double)RAND_MAX
;
6935 scrub_min_interval
* cct
->_conf
->osd_scrub_interval_randomize_ratio
* r
;
6936 deadline
+= scrub_max_interval
;
6940 bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob
& rhs
) const {
6941 if (sched_time
< rhs
.sched_time
)
6943 if (sched_time
> rhs
.sched_time
)
6945 return pgid
< rhs
.pgid
;
6948 bool OSD::scrub_time_permit(utime_t now
)
6951 time_t tt
= now
.sec();
6952 localtime_r(&tt
, &bdt
);
6953 bool time_permit
= false;
6954 if (cct
->_conf
->osd_scrub_begin_hour
< cct
->_conf
->osd_scrub_end_hour
) {
6955 if (bdt
.tm_hour
>= cct
->_conf
->osd_scrub_begin_hour
&& bdt
.tm_hour
< cct
->_conf
->osd_scrub_end_hour
) {
6959 if (bdt
.tm_hour
>= cct
->_conf
->osd_scrub_begin_hour
|| bdt
.tm_hour
< cct
->_conf
->osd_scrub_end_hour
) {
6964 dout(20) << __func__
<< " should run between " << cct
->_conf
->osd_scrub_begin_hour
6965 << " - " << cct
->_conf
->osd_scrub_end_hour
6966 << " now " << bdt
.tm_hour
<< " = no" << dendl
;
6968 dout(20) << __func__
<< " should run between " << cct
->_conf
->osd_scrub_begin_hour
6969 << " - " << cct
->_conf
->osd_scrub_end_hour
6970 << " now " << bdt
.tm_hour
<< " = yes" << dendl
;
6975 bool OSD::scrub_load_below_threshold()
6978 if (getloadavg(loadavgs
, 3) != 3) {
6979 dout(10) << __func__
<< " couldn't read loadavgs\n" << dendl
;
6983 // allow scrub if below configured threshold
6984 if (loadavgs
[0] < cct
->_conf
->osd_scrub_load_threshold
) {
6985 dout(20) << __func__
<< " loadavg " << loadavgs
[0]
6986 << " < max " << cct
->_conf
->osd_scrub_load_threshold
6987 << " = yes" << dendl
;
6991 // allow scrub if below daily avg and currently decreasing
6992 if (loadavgs
[0] < daily_loadavg
&& loadavgs
[0] < loadavgs
[2]) {
6993 dout(20) << __func__
<< " loadavg " << loadavgs
[0]
6994 << " < daily_loadavg " << daily_loadavg
6995 << " and < 15m avg " << loadavgs
[2]
6996 << " = yes" << dendl
;
7000 dout(20) << __func__
<< " loadavg " << loadavgs
[0]
7001 << " >= max " << cct
->_conf
->osd_scrub_load_threshold
7002 << " and ( >= daily_loadavg " << daily_loadavg
7003 << " or >= 15m avg " << loadavgs
[2]
7004 << ") = no" << dendl
;
7008 void OSD::sched_scrub()
7010 // if not permitted, fail fast
7011 if (!service
.can_inc_scrubs_pending()) {
7015 utime_t now
= ceph_clock_now();
7016 bool time_permit
= scrub_time_permit(now
);
7017 bool load_is_low
= scrub_load_below_threshold();
7018 dout(20) << "sched_scrub load_is_low=" << (int)load_is_low
<< dendl
;
7020 OSDService::ScrubJob scrub
;
7021 if (service
.first_scrub_stamp(&scrub
)) {
7023 dout(30) << "sched_scrub examine " << scrub
.pgid
<< " at " << scrub
.sched_time
<< dendl
;
7025 if (scrub
.sched_time
> now
) {
7026 // save ourselves some effort
7027 dout(10) << "sched_scrub " << scrub
.pgid
<< " scheduled at " << scrub
.sched_time
7028 << " > " << now
<< dendl
;
7032 if (!cct
->_conf
->osd_scrub_during_recovery
&& service
.is_recovery_active()) {
7033 dout(10) << __func__
<< "not scheduling scrub of " << scrub
.pgid
<< " due to active recovery ops" << dendl
;
7037 if ((scrub
.deadline
>= now
) && !(time_permit
&& load_is_low
)) {
7038 dout(10) << __func__
<< " not scheduling scrub for " << scrub
.pgid
<< " due to "
7039 << (!time_permit
? "time not permit" : "high load") << dendl
;
7043 PG
*pg
= _lookup_lock_pg(scrub
.pgid
);
7046 if (pg
->get_pgbackend()->scrub_supported() && pg
->is_active()) {
7047 dout(10) << "sched_scrub scrubbing " << scrub
.pgid
<< " at " << scrub
.sched_time
7048 << (pg
->scrubber
.must_scrub
? ", explicitly requested" :
7049 (load_is_low
? ", load_is_low" : " deadline < now"))
7051 if (pg
->sched_scrub()) {
7057 } while (service
.next_scrub_stamp(scrub
, &scrub
));
7059 dout(20) << "sched_scrub done" << dendl
;
7064 // =====================================================
7067 void OSD::wait_for_new_map(OpRequestRef op
)
7070 if (waiting_for_osdmap
.empty()) {
7071 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
7074 logger
->inc(l_osd_waiting_for_map
);
7075 waiting_for_osdmap
.push_back(op
);
7076 op
->mark_delayed("wait for new map");
7081 * assimilate new OSDMap(s). scan pgs, etc.
7084 void OSD::note_down_osd(int peer
)
7086 assert(osd_lock
.is_locked());
7087 cluster_messenger
->mark_down(osdmap
->get_cluster_addr(peer
));
7089 heartbeat_lock
.Lock();
7090 failure_queue
.erase(peer
);
7091 failure_pending
.erase(peer
);
7092 map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.find(peer
);
7093 if (p
!= heartbeat_peers
.end()) {
7094 p
->second
.con_back
->mark_down();
7095 if (p
->second
.con_front
) {
7096 p
->second
.con_front
->mark_down();
7098 heartbeat_peers
.erase(p
);
7100 heartbeat_lock
.Unlock();
7103 void OSD::note_up_osd(int peer
)
7105 service
.forget_peer_epoch(peer
, osdmap
->get_epoch() - 1);
7106 heartbeat_set_peers_need_update();
7109 struct C_OnMapCommit
: public Context
{
7111 epoch_t first
, last
;
7113 C_OnMapCommit(OSD
*o
, epoch_t f
, epoch_t l
, MOSDMap
*m
)
7114 : osd(o
), first(f
), last(l
), msg(m
) {}
7115 void finish(int r
) override
{
7116 osd
->_committed_osd_maps(first
, last
, msg
);
7121 struct C_OnMapApply
: public Context
{
7122 OSDService
*service
;
7123 list
<OSDMapRef
> pinned_maps
;
7125 C_OnMapApply(OSDService
*service
,
7126 const list
<OSDMapRef
> &pinned_maps
,
7128 : service(service
), pinned_maps(pinned_maps
), e(e
) {}
7129 void finish(int r
) override
{
7130 service
->clear_map_bl_cache_pins(e
);
7134 void OSD::osdmap_subscribe(version_t epoch
, bool force_request
)
7136 OSDMapRef osdmap
= service
.get_osdmap();
7137 if (osdmap
->get_epoch() >= epoch
)
7140 if (monc
->sub_want_increment("osdmap", epoch
, CEPH_SUBSCRIBE_ONETIME
) ||
7146 void OSD::trim_maps(epoch_t oldest
, int nreceived
, bool skip_maps
)
7148 epoch_t min
= std::min(oldest
, service
.map_cache
.cached_key_lower_bound());
7149 if (min
<= superblock
.oldest_map
)
7153 ObjectStore::Transaction t
;
7154 for (epoch_t e
= superblock
.oldest_map
; e
< min
; ++e
) {
7155 dout(20) << " removing old osdmap epoch " << e
<< dendl
;
7156 t
.remove(coll_t::meta(), get_osdmap_pobject_name(e
));
7157 t
.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e
));
7158 superblock
.oldest_map
= e
+ 1;
7160 if (num
>= cct
->_conf
->osd_target_transaction_size
&& num
>= nreceived
) {
7161 service
.publish_superblock(superblock
);
7162 write_superblock(t
);
7163 int tr
= store
->queue_transaction(service
.meta_osr
.get(), std::move(t
), nullptr);
7167 // skip_maps leaves us with a range of old maps if we fail to remove all
7168 // of them before moving superblock.oldest_map forward to the first map
7169 // in the incoming MOSDMap msg. so we should continue removing them in
7170 // this case, even we could do huge series of delete transactions all at
7177 service
.publish_superblock(superblock
);
7178 write_superblock(t
);
7179 store
->queue_transaction(service
.meta_osr
.get(), std::move(t
), nullptr);
7181 // we should not remove the cached maps
7182 assert(min
<= service
.map_cache
.cached_key_lower_bound());
7185 void OSD::handle_osd_map(MOSDMap
*m
)
7187 assert(osd_lock
.is_locked());
7188 // Keep a ref in the list until we get the newly received map written
7189 // onto disk. This is important because as long as the refs are alive,
7190 // the OSDMaps will be pinned in the cache and we won't try to read it
7191 // off of disk. Otherwise these maps will probably not stay in the cache,
7192 // and reading those OSDMaps before they are actually written can result
7194 list
<OSDMapRef
> pinned_maps
;
7195 if (m
->fsid
!= monc
->get_fsid()) {
7196 dout(0) << "handle_osd_map fsid " << m
->fsid
<< " != "
7197 << monc
->get_fsid() << dendl
;
7201 if (is_initializing()) {
7202 dout(0) << "ignoring osdmap until we have initialized" << dendl
;
7207 Session
*session
= static_cast<Session
*>(m
->get_connection()->get_priv());
7208 if (session
&& !(session
->entity_name
.is_mon() ||
7209 session
->entity_name
.is_osd())) {
7211 dout(10) << "got osd map from Session " << session
7212 << " which we can't take maps from (not a mon or osd)" << dendl
;
7220 // share with the objecter
7222 service
.objecter
->handle_osd_map(m
);
7224 epoch_t first
= m
->get_first();
7225 epoch_t last
= m
->get_last();
7226 dout(3) << "handle_osd_map epochs [" << first
<< "," << last
<< "], i have "
7227 << superblock
.newest_map
7228 << ", src has [" << m
->oldest_map
<< "," << m
->newest_map
<< "]"
7231 logger
->inc(l_osd_map
);
7232 logger
->inc(l_osd_mape
, last
- first
+ 1);
7233 if (first
<= superblock
.newest_map
)
7234 logger
->inc(l_osd_mape_dup
, superblock
.newest_map
- first
+ 1);
7235 if (service
.max_oldest_map
< m
->oldest_map
) {
7236 service
.max_oldest_map
= m
->oldest_map
;
7237 assert(service
.max_oldest_map
>= superblock
.oldest_map
);
7240 // make sure there is something new, here, before we bother flushing
7241 // the queues and such
7242 if (last
<= superblock
.newest_map
) {
7243 dout(10) << " no new maps here, dropping" << dendl
;
7249 bool skip_maps
= false;
7250 if (first
> superblock
.newest_map
+ 1) {
7251 dout(10) << "handle_osd_map message skips epochs "
7252 << superblock
.newest_map
+ 1 << ".." << (first
-1) << dendl
;
7253 if (m
->oldest_map
<= superblock
.newest_map
+ 1) {
7254 osdmap_subscribe(superblock
.newest_map
+ 1, false);
7258 // always try to get the full range of maps--as many as we can. this
7259 // 1- is good to have
7260 // 2- is at present the only way to ensure that we get a *full* map as
7262 if (m
->oldest_map
< first
) {
7263 osdmap_subscribe(m
->oldest_map
- 1, true);
7270 ObjectStore::Transaction t
;
7271 uint64_t txn_size
= 0;
7273 // store new maps: queue for disk and put in the osdmap cache
7274 epoch_t start
= MAX(superblock
.newest_map
+ 1, first
);
7275 for (epoch_t e
= start
; e
<= last
; e
++) {
7276 if (txn_size
>= t
.get_num_bytes()) {
7277 derr
<< __func__
<< " transaction size overflowed" << dendl
;
7278 assert(txn_size
< t
.get_num_bytes());
7280 txn_size
= t
.get_num_bytes();
7281 map
<epoch_t
,bufferlist
>::iterator p
;
7282 p
= m
->maps
.find(e
);
7283 if (p
!= m
->maps
.end()) {
7284 dout(10) << "handle_osd_map got full map for epoch " << e
<< dendl
;
7285 OSDMap
*o
= new OSDMap
;
7286 bufferlist
& bl
= p
->second
;
7290 ghobject_t fulloid
= get_osdmap_pobject_name(e
);
7291 t
.write(coll_t::meta(), fulloid
, 0, bl
.length(), bl
);
7293 pinned_maps
.push_back(add_map(o
));
7299 p
= m
->incremental_maps
.find(e
);
7300 if (p
!= m
->incremental_maps
.end()) {
7301 dout(10) << "handle_osd_map got inc map for epoch " << e
<< dendl
;
7302 bufferlist
& bl
= p
->second
;
7303 ghobject_t oid
= get_inc_osdmap_pobject_name(e
);
7304 t
.write(coll_t::meta(), oid
, 0, bl
.length(), bl
);
7305 pin_map_inc_bl(e
, bl
);
7307 OSDMap
*o
= new OSDMap
;
7310 bool got
= get_map_bl(e
- 1, obl
);
7315 OSDMap::Incremental inc
;
7316 bufferlist::iterator p
= bl
.begin();
7318 if (o
->apply_incremental(inc
) < 0) {
7319 derr
<< "ERROR: bad fsid? i have " << osdmap
->get_fsid() << " and inc has " << inc
.fsid
<< dendl
;
7320 assert(0 == "bad fsid");
7324 o
->encode(fbl
, inc
.encode_features
| CEPH_FEATURE_RESERVED
);
7326 bool injected_failure
= false;
7327 if (cct
->_conf
->osd_inject_bad_map_crc_probability
> 0 &&
7328 (rand() % 10000) < cct
->_conf
->osd_inject_bad_map_crc_probability
*10000.0) {
7329 derr
<< __func__
<< " injecting map crc failure" << dendl
;
7330 injected_failure
= true;
7333 if ((inc
.have_crc
&& o
->get_crc() != inc
.full_crc
) || injected_failure
) {
7334 dout(2) << "got incremental " << e
7335 << " but failed to encode full with correct crc; requesting"
7337 clog
->warn() << "failed to encode map e" << e
<< " with expected crc";
7338 dout(20) << "my encoded map was:\n";
7339 fbl
.hexdump(*_dout
);
7342 request_full_map(e
, last
);
7348 ghobject_t fulloid
= get_osdmap_pobject_name(e
);
7349 t
.write(coll_t::meta(), fulloid
, 0, fbl
.length(), fbl
);
7351 pinned_maps
.push_back(add_map(o
));
7355 assert(0 == "MOSDMap lied about what maps it had?");
7358 // even if this map isn't from a mon, we may have satisfied our subscription
7359 monc
->sub_got("osdmap", last
);
7361 if (!m
->maps
.empty() && requested_full_first
) {
7362 dout(10) << __func__
<< " still missing full maps " << requested_full_first
7363 << ".." << requested_full_last
<< dendl
;
7364 rerequest_full_maps();
7367 if (last
<= superblock
.newest_map
) {
7368 dout(10) << " no new maps here, dropping" << dendl
;
7373 if (superblock
.oldest_map
) {
7374 // make sure we at least keep pace with incoming maps
7375 trim_maps(m
->oldest_map
, last
- first
+ 1, skip_maps
);
7378 if (!superblock
.oldest_map
|| skip_maps
)
7379 superblock
.oldest_map
= first
;
7380 superblock
.newest_map
= last
;
7381 superblock
.current_epoch
= last
;
7383 // note in the superblock that we were clean thru the prior epoch
7384 epoch_t boot_epoch
= service
.get_boot_epoch();
7385 if (boot_epoch
&& boot_epoch
>= superblock
.mounted
) {
7386 superblock
.mounted
= boot_epoch
;
7387 superblock
.clean_thru
= last
;
7390 // superblock and commit
7391 write_superblock(t
);
7392 store
->queue_transaction(
7393 service
.meta_osr
.get(),
7395 new C_OnMapApply(&service
, pinned_maps
, last
),
7396 new C_OnMapCommit(this, start
, last
, m
), 0);
7397 service
.publish_superblock(superblock
);
7400 void OSD::_committed_osd_maps(epoch_t first
, epoch_t last
, MOSDMap
*m
)
7402 dout(10) << __func__
<< " " << first
<< ".." << last
<< dendl
;
7403 if (is_stopping()) {
7404 dout(10) << __func__
<< " bailing, we are shutting down" << dendl
;
7407 Mutex::Locker
l(osd_lock
);
7408 map_lock
.get_write();
7410 bool do_shutdown
= false;
7411 bool do_restart
= false;
7412 bool network_error
= false;
7414 // advance through the new maps
7415 for (epoch_t cur
= first
; cur
<= last
; cur
++) {
7416 dout(10) << " advance to epoch " << cur
7417 << " (<= last " << last
7418 << " <= newest_map " << superblock
.newest_map
7421 OSDMapRef newmap
= get_map(cur
);
7422 assert(newmap
); // we just cached it above!
7424 // start blacklisting messages sent to peers that go down.
7425 service
.pre_publish_map(newmap
);
7427 // kill connections to newly down osds
7428 bool waited_for_reservations
= false;
7430 osdmap
->get_all_osds(old
);
7431 for (set
<int>::iterator p
= old
.begin(); p
!= old
.end(); ++p
) {
7433 osdmap
->is_up(*p
) && // in old map
7434 newmap
->is_down(*p
)) { // but not the new one
7435 if (!waited_for_reservations
) {
7436 service
.await_reserved_maps();
7437 waited_for_reservations
= true;
7440 } else if (*p
!= whoami
&&
7441 osdmap
->is_down(*p
) &&
7442 newmap
->is_up(*p
)) {
7447 if (osdmap
->test_flag(CEPH_OSDMAP_NOUP
) !=
7448 newmap
->test_flag(CEPH_OSDMAP_NOUP
)) {
7449 dout(10) << __func__
<< " NOUP flag changed in " << newmap
->get_epoch()
7452 // this captures the case where we sent the boot message while
7453 // NOUP was being set on the mon and our boot request was
7454 // dropped, and then later it is cleared. it imperfectly
7455 // handles the case where our original boot message was not
7456 // dropped and we restart even though we might have booted, but
7457 // that is harmless (boot will just take slightly longer).
7465 service
.retrieve_epochs(&boot_epoch
, &up_epoch
, NULL
);
7467 osdmap
->is_up(whoami
) &&
7468 osdmap
->get_inst(whoami
) == client_messenger
->get_myinst()) {
7469 up_epoch
= osdmap
->get_epoch();
7470 dout(10) << "up_epoch is " << up_epoch
<< dendl
;
7472 boot_epoch
= osdmap
->get_epoch();
7473 dout(10) << "boot_epoch is " << boot_epoch
<< dendl
;
7475 service
.set_epochs(&boot_epoch
, &up_epoch
, NULL
);
7479 had_map_since
= ceph_clock_now();
7481 epoch_t _bind_epoch
= service
.get_bind_epoch();
7482 if (osdmap
->is_up(whoami
) &&
7483 osdmap
->get_addr(whoami
) == client_messenger
->get_myaddr() &&
7484 _bind_epoch
< osdmap
->get_up_from(whoami
)) {
7487 dout(1) << "state: booting -> active" << dendl
;
7488 set_state(STATE_ACTIVE
);
7490 // set incarnation so that osd_reqid_t's we generate for our
7491 // objecter requests are unique across restarts.
7492 service
.objecter
->set_client_incarnation(osdmap
->get_epoch());
7496 if (osdmap
->get_epoch() > 0 &&
7498 if (!osdmap
->exists(whoami
)) {
7499 dout(0) << "map says i do not exist. shutting down." << dendl
;
7500 do_shutdown
= true; // don't call shutdown() while we have
7501 // everything paused
7502 } else if (!osdmap
->is_up(whoami
) ||
7503 !osdmap
->get_addr(whoami
).probably_equals(
7504 client_messenger
->get_myaddr()) ||
7505 !osdmap
->get_cluster_addr(whoami
).probably_equals(
7506 cluster_messenger
->get_myaddr()) ||
7507 !osdmap
->get_hb_back_addr(whoami
).probably_equals(
7508 hb_back_server_messenger
->get_myaddr()) ||
7509 (osdmap
->get_hb_front_addr(whoami
) != entity_addr_t() &&
7510 !osdmap
->get_hb_front_addr(whoami
).probably_equals(
7511 hb_front_server_messenger
->get_myaddr()))) {
7512 if (!osdmap
->is_up(whoami
)) {
7513 if (service
.is_preparing_to_stop() || service
.is_stopping()) {
7514 service
.got_stop_ack();
7516 clog
->warn() << "map e" << osdmap
->get_epoch()
7517 << " wrongly marked me down at e"
7518 << osdmap
->get_down_at(whoami
);
7520 } else if (!osdmap
->get_addr(whoami
).probably_equals(
7521 client_messenger
->get_myaddr())) {
7522 clog
->error() << "map e" << osdmap
->get_epoch()
7523 << " had wrong client addr (" << osdmap
->get_addr(whoami
)
7524 << " != my " << client_messenger
->get_myaddr() << ")";
7525 } else if (!osdmap
->get_cluster_addr(whoami
).probably_equals(
7526 cluster_messenger
->get_myaddr())) {
7527 clog
->error() << "map e" << osdmap
->get_epoch()
7528 << " had wrong cluster addr ("
7529 << osdmap
->get_cluster_addr(whoami
)
7530 << " != my " << cluster_messenger
->get_myaddr() << ")";
7531 } else if (!osdmap
->get_hb_back_addr(whoami
).probably_equals(
7532 hb_back_server_messenger
->get_myaddr())) {
7533 clog
->error() << "map e" << osdmap
->get_epoch()
7534 << " had wrong hb back addr ("
7535 << osdmap
->get_hb_back_addr(whoami
)
7536 << " != my " << hb_back_server_messenger
->get_myaddr()
7538 } else if (osdmap
->get_hb_front_addr(whoami
) != entity_addr_t() &&
7539 !osdmap
->get_hb_front_addr(whoami
).probably_equals(
7540 hb_front_server_messenger
->get_myaddr())) {
7541 clog
->error() << "map e" << osdmap
->get_epoch()
7542 << " had wrong hb front addr ("
7543 << osdmap
->get_hb_front_addr(whoami
)
7544 << " != my " << hb_front_server_messenger
->get_myaddr()
7548 if (!service
.is_stopping()) {
7549 epoch_t up_epoch
= 0;
7550 epoch_t bind_epoch
= osdmap
->get_epoch();
7551 service
.set_epochs(NULL
,&up_epoch
, &bind_epoch
);
7555 utime_t now
= ceph_clock_now();
7556 utime_t grace
= utime_t(cct
->_conf
->osd_max_markdown_period
, 0);
7557 osd_markdown_log
.push_back(now
);
7558 //clear all out-of-date log
7559 while (!osd_markdown_log
.empty() &&
7560 osd_markdown_log
.front() + grace
< now
)
7561 osd_markdown_log
.pop_front();
7562 if ((int)osd_markdown_log
.size() > cct
->_conf
->osd_max_markdown_count
) {
7563 dout(0) << __func__
<< " marked down "
7564 << osd_markdown_log
.size()
7565 << " > osd_max_markdown_count "
7566 << cct
->_conf
->osd_max_markdown_count
7567 << " in last " << grace
<< " seconds, shutting down"
7573 start_waiting_for_healthy();
7575 set
<int> avoid_ports
;
7576 #if defined(__FreeBSD__)
7577 // prevent FreeBSD from grabbing the client_messenger port during
7578 // rebinding. In which case a cluster_meesneger will connect also
7580 avoid_ports
.insert(client_messenger
->get_myaddr().get_port());
7582 avoid_ports
.insert(cluster_messenger
->get_myaddr().get_port());
7583 avoid_ports
.insert(hb_back_server_messenger
->get_myaddr().get_port());
7584 avoid_ports
.insert(hb_front_server_messenger
->get_myaddr().get_port());
7586 int r
= cluster_messenger
->rebind(avoid_ports
);
7588 do_shutdown
= true; // FIXME: do_restart?
7589 network_error
= true;
7590 dout(0) << __func__
<< " marked down:"
7591 << " rebind cluster_messenger failed" << dendl
;
7594 r
= hb_back_server_messenger
->rebind(avoid_ports
);
7596 do_shutdown
= true; // FIXME: do_restart?
7597 network_error
= true;
7598 dout(0) << __func__
<< " marked down:"
7599 << " rebind hb_back_server_messenger failed" << dendl
;
7602 r
= hb_front_server_messenger
->rebind(avoid_ports
);
7604 do_shutdown
= true; // FIXME: do_restart?
7605 network_error
= true;
7606 dout(0) << __func__
<< " marked down:"
7607 << " rebind hb_front_server_messenger failed" << dendl
;
7610 hb_front_client_messenger
->mark_down_all();
7611 hb_back_client_messenger
->mark_down_all();
7613 reset_heartbeat_peers();
7618 map_lock
.put_write();
7620 check_osdmap_features(store
);
7625 if (is_active() || is_waiting_for_healthy())
7626 maybe_update_heartbeat_peers();
7629 dout(10) << " not yet active; waiting for peering wq to drain" << dendl
;
7635 if (m
->newest_map
&& m
->newest_map
> last
) {
7636 dout(10) << " msg say newest map is " << m
->newest_map
7637 << ", requesting more" << dendl
;
7638 osdmap_subscribe(osdmap
->get_epoch()+1, false);
7640 else if (do_shutdown
) {
7641 if (network_error
) {
7642 Mutex::Locker
l(heartbeat_lock
);
7643 map
<int,pair
<utime_t
,entity_inst_t
>>::iterator it
=
7644 failure_pending
.begin();
7645 while (it
!= failure_pending
.end()) {
7646 dout(10) << "handle_osd_ping canceling in-flight failure report for osd."
7647 << it
->first
<< dendl
;
7648 send_still_alive(osdmap
->get_epoch(), it
->second
.second
);
7649 failure_pending
.erase(it
++);
7652 // trigger shutdown in a different thread
7653 dout(0) << __func__
<< " shutdown OSD via async signal" << dendl
;
7654 queue_async_signal(SIGINT
);
7656 else if (is_preboot()) {
7657 if (m
->get_source().is_mon())
7658 _preboot(m
->oldest_map
, m
->newest_map
);
7662 else if (do_restart
)
7667 void OSD::check_osdmap_features(ObjectStore
*fs
)
7669 // adjust required feature bits?
7671 // we have to be a bit careful here, because we are accessing the
7672 // Policy structures without taking any lock. in particular, only
7673 // modify integer values that can safely be read by a racing CPU.
7674 // since we are only accessing existing Policy structures a their
7675 // current memory location, and setting or clearing bits in integer
7676 // fields, and we are the only writer, this is not a problem.
7679 Messenger::Policy p
= client_messenger
->get_default_policy();
7681 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_CLIENT
, &mask
);
7682 if ((p
.features_required
& mask
) != features
) {
7683 dout(0) << "crush map has features " << features
7684 << ", adjusting msgr requires for clients" << dendl
;
7685 p
.features_required
= (p
.features_required
& ~mask
) | features
;
7686 client_messenger
->set_default_policy(p
);
7690 Messenger::Policy p
= client_messenger
->get_policy(entity_name_t::TYPE_MON
);
7692 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_MON
, &mask
);
7693 if ((p
.features_required
& mask
) != features
) {
7694 dout(0) << "crush map has features " << features
7695 << " was " << p
.features_required
7696 << ", adjusting msgr requires for mons" << dendl
;
7697 p
.features_required
= (p
.features_required
& ~mask
) | features
;
7698 client_messenger
->set_policy(entity_name_t::TYPE_MON
, p
);
7702 Messenger::Policy p
= cluster_messenger
->get_policy(entity_name_t::TYPE_OSD
);
7704 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_OSD
, &mask
);
7706 if ((p
.features_required
& mask
) != features
) {
7707 dout(0) << "crush map has features " << features
7708 << ", adjusting msgr requires for osds" << dendl
;
7709 p
.features_required
= (p
.features_required
& ~mask
) | features
;
7710 cluster_messenger
->set_policy(entity_name_t::TYPE_OSD
, p
);
7713 if ((features
& CEPH_FEATURE_OSD_ERASURE_CODES
) &&
7714 !superblock
.compat_features
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
)) {
7715 dout(0) << __func__
<< " enabling on-disk ERASURE CODES compat feature" << dendl
;
7716 superblock
.compat_features
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
);
7717 ObjectStore::Transaction t
;
7718 write_superblock(t
);
7719 int err
= store
->queue_transaction(service
.meta_osr
.get(), std::move(t
), NULL
);
7725 bool OSD::advance_pg(
7726 epoch_t osd_epoch
, PG
*pg
,
7727 ThreadPool::TPHandle
&handle
,
7728 PG::RecoveryCtx
*rctx
,
7729 set
<boost::intrusive_ptr
<PG
> > *new_pgs
)
7731 assert(pg
->is_locked());
7732 epoch_t next_epoch
= pg
->get_osdmap()->get_epoch() + 1;
7733 OSDMapRef lastmap
= pg
->get_osdmap();
7735 if (lastmap
->get_epoch() == osd_epoch
)
7737 assert(lastmap
->get_epoch() < osd_epoch
);
7739 epoch_t min_epoch
= service
.get_min_pg_epoch();
7742 max
= min_epoch
+ cct
->_conf
->osd_map_max_advance
;
7744 max
= next_epoch
+ cct
->_conf
->osd_map_max_advance
;
7748 next_epoch
<= osd_epoch
&& next_epoch
<= max
;
7750 OSDMapRef nextmap
= service
.try_get_map(next_epoch
);
7752 dout(20) << __func__
<< " missing map " << next_epoch
<< dendl
;
7753 // make sure max is bumped up so that we can get past any
7755 max
= MAX(max
, next_epoch
+ cct
->_conf
->osd_map_max_advance
);
7759 vector
<int> newup
, newacting
;
7760 int up_primary
, acting_primary
;
7761 nextmap
->pg_to_up_acting_osds(
7763 &newup
, &up_primary
,
7764 &newacting
, &acting_primary
);
7765 pg
->handle_advance_map(
7766 nextmap
, lastmap
, newup
, up_primary
,
7767 newacting
, acting_primary
, rctx
);
7770 set
<spg_t
> children
;
7771 spg_t
parent(pg
->info
.pgid
);
7772 if (parent
.is_split(
7773 lastmap
->get_pg_num(pg
->pool
.id
),
7774 nextmap
->get_pg_num(pg
->pool
.id
),
7776 service
.mark_split_in_progress(pg
->info
.pgid
, children
);
7778 pg
, children
, new_pgs
, lastmap
, nextmap
,
7783 handle
.reset_tp_timeout();
7785 service
.pg_update_epoch(pg
->info
.pgid
, lastmap
->get_epoch());
7786 pg
->handle_activate_map(rctx
);
7787 if (next_epoch
<= osd_epoch
) {
7788 dout(10) << __func__
<< " advanced to max " << max
7789 << " past min epoch " << min_epoch
7790 << " ... will requeue " << *pg
<< dendl
;
7796 void OSD::consume_map()
7798 assert(osd_lock
.is_locked());
7799 dout(7) << "consume_map version " << osdmap
->get_epoch() << dendl
;
7801 int num_pg_primary
= 0, num_pg_replica
= 0, num_pg_stray
= 0;
7802 list
<PGRef
> to_remove
;
7806 RWLock::RLocker
l(pg_map_lock
);
7807 for (ceph::unordered_map
<spg_t
,PG
*>::iterator it
= pg_map
.begin();
7810 PG
*pg
= it
->second
;
7812 if (pg
->is_primary())
7814 else if (pg
->is_replica())
7819 if (!osdmap
->have_pg_pool(pg
->info
.pgid
.pool())) {
7821 to_remove
.push_back(PGRef(pg
));
7823 service
.init_splits_between(it
->first
, service
.get_osdmap(), osdmap
);
7830 for (list
<PGRef
>::iterator i
= to_remove
.begin();
7831 i
!= to_remove
.end();
7832 to_remove
.erase(i
++)) {
7833 RWLock::WLocker
locker(pg_map_lock
);
7839 service
.expand_pg_num(service
.get_osdmap(), osdmap
);
7841 service
.pre_publish_map(osdmap
);
7842 service
.await_reserved_maps();
7843 service
.publish_map(osdmap
);
7845 service
.maybe_inject_dispatch_delay();
7847 dispatch_sessions_waiting_on_map();
7849 service
.maybe_inject_dispatch_delay();
7851 // remove any PGs which we no longer host from the session waiting_for_pg lists
7852 dout(20) << __func__
<< " checking waiting_for_pg" << dendl
;
7853 op_shardedwq
.prune_pg_waiters(osdmap
, whoami
);
7855 service
.maybe_inject_dispatch_delay();
7859 RWLock::RLocker
l(pg_map_lock
);
7860 for (ceph::unordered_map
<spg_t
,PG
*>::iterator it
= pg_map
.begin();
7863 PG
*pg
= it
->second
;
7865 pg
->queue_null(osdmap
->get_epoch(), osdmap
->get_epoch());
7869 logger
->set(l_osd_pg
, pg_map
.size());
7871 logger
->set(l_osd_pg_primary
, num_pg_primary
);
7872 logger
->set(l_osd_pg_replica
, num_pg_replica
);
7873 logger
->set(l_osd_pg_stray
, num_pg_stray
);
7876 void OSD::activate_map()
7878 assert(osd_lock
.is_locked());
7880 dout(7) << "activate_map version " << osdmap
->get_epoch() << dendl
;
7882 if (!osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
7883 derr
<< __func__
<< " SORTBITWISE flag is not set" << dendl
;
7887 if (osdmap
->test_flag(CEPH_OSDMAP_FULL
)) {
7888 dout(10) << " osdmap flagged full, doing onetime osdmap subscribe" << dendl
;
7889 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
7893 if (osdmap
->test_flag(CEPH_OSDMAP_NORECOVER
)) {
7894 if (!service
.recovery_is_paused()) {
7895 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl
;
7896 service
.pause_recovery();
7899 if (service
.recovery_is_paused()) {
7900 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl
;
7901 service
.unpause_recovery();
7905 service
.activate_map();
7908 take_waiters(waiting_for_osdmap
);
7911 bool OSD::require_mon_peer(const Message
*m
)
7913 if (!m
->get_connection()->peer_is_mon()) {
7914 dout(0) << "require_mon_peer received from non-mon "
7915 << m
->get_connection()->get_peer_addr()
7916 << " " << *m
<< dendl
;
7922 bool OSD::require_mon_or_mgr_peer(const Message
*m
)
7924 if (!m
->get_connection()->peer_is_mon() &&
7925 !m
->get_connection()->peer_is_mgr()) {
7926 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
7927 << m
->get_connection()->get_peer_addr()
7928 << " " << *m
<< dendl
;
7934 bool OSD::require_osd_peer(const Message
*m
)
7936 if (!m
->get_connection()->peer_is_osd()) {
7937 dout(0) << "require_osd_peer received from non-osd "
7938 << m
->get_connection()->get_peer_addr()
7939 << " " << *m
<< dendl
;
7945 bool OSD::require_self_aliveness(const Message
*m
, epoch_t epoch
)
7947 epoch_t up_epoch
= service
.get_up_epoch();
7948 if (epoch
< up_epoch
) {
7949 dout(7) << "from pre-up epoch " << epoch
<< " < " << up_epoch
<< dendl
;
7954 dout(7) << "still in boot state, dropping message " << *m
<< dendl
;
7961 bool OSD::require_same_peer_instance(const Message
*m
, OSDMapRef
& map
,
7962 bool is_fast_dispatch
)
7964 int from
= m
->get_source().num();
7966 if (map
->is_down(from
) ||
7967 (map
->get_cluster_addr(from
) != m
->get_source_inst().addr
)) {
7968 dout(5) << "from dead osd." << from
<< ", marking down, "
7969 << " msg was " << m
->get_source_inst().addr
7970 << " expected " << (map
->is_up(from
) ?
7971 map
->get_cluster_addr(from
) : entity_addr_t())
7973 ConnectionRef con
= m
->get_connection();
7975 Session
*s
= static_cast<Session
*>(con
->get_priv());
7977 if (!is_fast_dispatch
)
7978 s
->session_dispatch_lock
.Lock();
7979 clear_session_waiting_on_map(s
);
7980 con
->set_priv(NULL
); // break ref <-> session cycle, if any
7981 if (!is_fast_dispatch
)
7982 s
->session_dispatch_lock
.Unlock();
7992 * require that we have same (or newer) map, and that
7993 * the source is the pg primary.
7995 bool OSD::require_same_or_newer_map(OpRequestRef
& op
, epoch_t epoch
,
7996 bool is_fast_dispatch
)
7998 const Message
*m
= op
->get_req();
7999 dout(15) << "require_same_or_newer_map " << epoch
8000 << " (i am " << osdmap
->get_epoch() << ") " << m
<< dendl
;
8002 assert(osd_lock
.is_locked());
8004 // do they have a newer map?
8005 if (epoch
> osdmap
->get_epoch()) {
8006 dout(7) << "waiting for newer map epoch " << epoch
8007 << " > my " << osdmap
->get_epoch() << " with " << m
<< dendl
;
8008 wait_for_new_map(op
);
8012 if (!require_self_aliveness(op
->get_req(), epoch
)) {
8016 // ok, our map is same or newer.. do they still exist?
8017 if (m
->get_connection()->get_messenger() == cluster_messenger
&&
8018 !require_same_peer_instance(op
->get_req(), osdmap
, is_fast_dispatch
)) {
8029 // ----------------------------------------
8032 void OSD::split_pgs(
8034 const set
<spg_t
> &childpgids
, set
<boost::intrusive_ptr
<PG
> > *out_pgs
,
8037 PG::RecoveryCtx
*rctx
)
8039 unsigned pg_num
= nextmap
->get_pg_num(
8041 parent
->update_snap_mapper_bits(
8042 parent
->info
.pgid
.get_split_bits(pg_num
)
8045 vector
<object_stat_sum_t
> updated_stats(childpgids
.size() + 1);
8046 parent
->info
.stats
.stats
.sum
.split(updated_stats
);
8048 vector
<object_stat_sum_t
>::iterator stat_iter
= updated_stats
.begin();
8049 for (set
<spg_t
>::const_iterator i
= childpgids
.begin();
8050 i
!= childpgids
.end();
8052 assert(stat_iter
!= updated_stats
.end());
8053 dout(10) << "Splitting " << *parent
<< " into " << *i
<< dendl
;
8054 assert(service
.splitting(*i
));
8055 PG
* child
= _make_pg(nextmap
, *i
);
8057 out_pgs
->insert(child
);
8058 rctx
->created_pgs
.insert(child
);
8060 unsigned split_bits
= i
->get_split_bits(pg_num
);
8061 dout(10) << "pg_num is " << pg_num
<< dendl
;
8062 dout(10) << "m_seed " << i
->ps() << dendl
;
8063 dout(10) << "split_bits is " << split_bits
<< dendl
;
8065 parent
->split_colls(
8075 child
->info
.stats
.stats
.sum
= *stat_iter
;
8077 child
->write_if_dirty(*(rctx
->transaction
));
8080 assert(stat_iter
!= updated_stats
.end());
8081 parent
->info
.stats
.stats
.sum
= *stat_iter
;
8082 parent
->write_if_dirty(*(rctx
->transaction
));
8088 void OSD::handle_pg_create(OpRequestRef op
)
8090 const MOSDPGCreate
*m
= static_cast<const MOSDPGCreate
*>(op
->get_req());
8091 assert(m
->get_type() == MSG_OSD_PG_CREATE
);
8093 dout(10) << "handle_pg_create " << *m
<< dendl
;
8095 if (!require_mon_peer(op
->get_req())) {
8099 if (!require_same_or_newer_map(op
, m
->epoch
, false))
8104 map
<pg_t
,utime_t
>::const_iterator ci
= m
->ctimes
.begin();
8105 for (map
<pg_t
,pg_create_t
>::const_iterator p
= m
->mkpg
.begin();
8108 assert(ci
!= m
->ctimes
.end() && ci
->first
== p
->first
);
8109 epoch_t created
= p
->second
.created
;
8110 if (p
->second
.split_bits
) // Skip split pgs
8114 if (on
.preferred() >= 0) {
8115 dout(20) << "ignoring localized pg " << on
<< dendl
;
8119 if (!osdmap
->have_pg_pool(on
.pool())) {
8120 dout(20) << "ignoring pg on deleted pool " << on
<< dendl
;
8124 dout(20) << "mkpg " << on
<< " e" << created
<< "@" << ci
->second
<< dendl
;
8126 // is it still ours?
8127 vector
<int> up
, acting
;
8128 int up_primary
= -1;
8129 int acting_primary
= -1;
8130 osdmap
->pg_to_up_acting_osds(on
, &up
, &up_primary
, &acting
, &acting_primary
);
8131 int role
= osdmap
->calc_pg_role(whoami
, acting
, acting
.size());
8133 if (acting_primary
!= whoami
) {
8134 dout(10) << "mkpg " << on
<< " not acting_primary (" << acting_primary
8135 << "), my role=" << role
<< ", skipping" << dendl
;
8140 bool mapped
= osdmap
->get_primary_shard(on
, &pgid
);
8144 osdmap
->get_pools().at(pgid
.pool()).ec_pool(),
8146 pg_history_t history
;
8147 build_initial_pg_history(pgid
, created
, ci
->second
, &history
, &pi
);
8149 // The mon won't resend unless the primary changed, so
8150 // we ignore same_interval_since. We'll pass this history
8151 // to handle_pg_peering_evt with the current epoch as the
8152 // event -- the project_pg_history check in
8153 // handle_pg_peering_evt will be a noop.
8154 if (history
.same_primary_since
> m
->epoch
) {
8155 dout(10) << __func__
<< ": got obsolete pg create on pgid "
8156 << pgid
<< " from epoch " << m
->epoch
8157 << ", primary changed in " << history
.same_primary_since
8162 if (handle_pg_peering_evt(
8166 osdmap
->get_epoch(),
8167 PG::CephPeeringEvtRef(
8168 new PG::CephPeeringEvt(
8169 osdmap
->get_epoch(),
8170 osdmap
->get_epoch(),
8173 service
.send_pg_created(pgid
.pgid
);
8176 last_pg_create_epoch
= m
->epoch
;
8178 maybe_update_heartbeat_peers();
8182 // ----------------------------------------
8183 // peering and recovery
// Build a fresh PG::RecoveryCtx: a new ObjectStore transaction,
// apply/commit callback lists, and empty query/notify/info maps that
// the peering machinery fills in and dispatch_context() later
// sends/frees.  All six pieces are heap-allocated here and owned by
// the returned ctx.
8185 PG::RecoveryCtx
OSD::create_context()
8187 ObjectStore::Transaction
*t
= new ObjectStore::Transaction
;
// Contexts fired when the transaction is applied / committed safely.
8188 C_Contexts
*on_applied
= new C_Contexts(cct
);
8189 C_Contexts
*on_safe
= new C_Contexts(cct
);
// Per-peer pg_query_t batches, keyed by target osd id.
8190 map
<int, map
<spg_t
,pg_query_t
> > *query_map
=
8191 new map
<int, map
<spg_t
, pg_query_t
> >;
// Per-peer notify and info batches (pg_notify_t + PastIntervals).
8192 map
<int,vector
<pair
<pg_notify_t
, PastIntervals
> > > *notify_list
=
8193 new map
<int, vector
<pair
<pg_notify_t
, PastIntervals
> > >;
8194 map
<int,vector
<pair
<pg_notify_t
, PastIntervals
> > > *info_map
=
8195 new map
<int,vector
<pair
<pg_notify_t
, PastIntervals
> > >;
8196 PG::RecoveryCtx
rctx(query_map
, info_map
, notify_list
,
8197 on_applied
, on_safe
, t
);
8201 struct C_OpenPGs
: public Context
{
8205 C_OpenPGs(set
<PGRef
>& p
, ObjectStore
*s
, OSD
* o
) : store(s
), osd(o
) {
8208 void finish(int r
) override
{
8209 RWLock::RLocker
l(osd
->pg_map_lock
);
8210 for (auto p
: pgs
) {
8211 if (osd
->pg_map
.count(p
->info
.pgid
)) {
8212 p
->ch
= store
->open_collection(p
->coll
);
// Queue the ctx's accumulated transaction (if non-empty) to the object
// store and replace the consumed transaction/callback members with
// fresh ones so the caller can keep using the same RecoveryCtx.
// NOTE(review): elided extract -- closing braces are not visible here.
8219 void OSD::dispatch_context_transaction(PG::RecoveryCtx
&ctx
, PG
*pg
,
8220 ThreadPool::TPHandle
*handle
)
8222 if (!ctx
.transaction
->empty()) {
// Newly created PGs need their collections opened once the
// transaction applies; C_OpenPGs does that via on_applied.
8223 if (!ctx
.created_pgs
.empty()) {
8224 ctx
.on_applied
->add(new C_OpenPGs(ctx
.created_pgs
, store
, this));
8226 int tr
= store
->queue_transaction(
8228 std::move(*ctx
.transaction
), ctx
.on_applied
, ctx
.on_safe
, NULL
,
8229 TrackedOpRef(), handle
);
// The old transaction was moved-from; delete it and install fresh
// transaction + callback lists for subsequent work.
8230 delete (ctx
.transaction
);
8232 ctx
.transaction
= new ObjectStore::Transaction
;
8233 ctx
.on_applied
= new C_Contexts(cct
);
8234 ctx
.on_safe
= new C_Contexts(cct
);
8238 void OSD::dispatch_context(PG::RecoveryCtx
&ctx
, PG
*pg
, OSDMapRef curmap
,
8239 ThreadPool::TPHandle
*handle
)
8241 if (service
.get_osdmap()->is_up(whoami
) &&
8243 do_notifies(*ctx
.notify_list
, curmap
);
8244 do_queries(*ctx
.query_map
, curmap
);
8245 do_infos(*ctx
.info_map
, curmap
);
8247 delete ctx
.notify_list
;
8248 delete ctx
.query_map
;
8249 delete ctx
.info_map
;
8250 if ((ctx
.on_applied
->empty() &&
8251 ctx
.on_safe
->empty() &&
8252 ctx
.transaction
->empty() &&
8253 ctx
.created_pgs
.empty()) || !pg
) {
8254 delete ctx
.transaction
;
8255 delete ctx
.on_applied
;
8257 assert(ctx
.created_pgs
.empty());
8259 if (!ctx
.created_pgs
.empty()) {
8260 ctx
.on_applied
->add(new C_OpenPGs(ctx
.created_pgs
, store
, this));
8262 int tr
= store
->queue_transaction(
8264 std::move(*ctx
.transaction
), ctx
.on_applied
, ctx
.on_safe
, NULL
, TrackedOpRef(),
8266 delete (ctx
.transaction
);
8272 * Send an MOSDPGNotify to a primary, with a list of PGs that I have
8273 * content for, and they are primary for.
8276 void OSD::do_notifies(
8277 map
<int,vector
<pair
<pg_notify_t
,PastIntervals
> > >& notify_list
,
8281 vector
<pair
<pg_notify_t
,PastIntervals
> > >::iterator it
=
8282 notify_list
.begin();
8283 it
!= notify_list
.end();
8285 if (!curmap
->is_up(it
->first
)) {
8286 dout(20) << __func__
<< " skipping down osd." << it
->first
<< dendl
;
8289 ConnectionRef con
= service
.get_con_osd_cluster(
8290 it
->first
, curmap
->get_epoch());
8292 dout(20) << __func__
<< " skipping osd." << it
->first
8293 << " (NULL con)" << dendl
;
8296 service
.share_map_peer(it
->first
, con
.get(), curmap
);
8297 dout(7) << __func__
<< " osd " << it
->first
8298 << " on " << it
->second
.size() << " PGs" << dendl
;
8299 MOSDPGNotify
*m
= new MOSDPGNotify(curmap
->get_epoch(),
8301 con
->send_message(m
);
// Send each peer OSD its batch of pending pg_query_t messages,
// skipping peers that are down or unreachable, and sharing our map
// with each peer before the queries.
// NOTE(review): elided extract -- loop increment, NULL-con check
// branches and closing braces are not fully visible here.
8307 * send out pending queries for info | summaries
8309 void OSD::do_queries(map
<int, map
<spg_t
,pg_query_t
> >& query_map
,
8312 for (map
<int, map
<spg_t
,pg_query_t
> >::iterator pit
= query_map
.begin();
8313 pit
!= query_map
.end();
// Skip peers the current map says are down.
8315 if (!curmap
->is_up(pit
->first
)) {
8316 dout(20) << __func__
<< " skipping down osd." << pit
->first
<< dendl
;
8319 int who
= pit
->first
;
8320 ConnectionRef con
= service
.get_con_osd_cluster(who
, curmap
->get_epoch());
8322 dout(20) << __func__
<< " skipping osd." << who
8323 << " (NULL con)" << dendl
;
// Make sure the peer has our map before it sees the queries.
8326 service
.share_map_peer(who
, con
.get(), curmap
);
8327 dout(7) << __func__
<< " querying osd." << who
8328 << " on " << pit
->second
.size() << " PGs" << dendl
;
8329 MOSDPGQuery
*m
= new MOSDPGQuery(curmap
->get_epoch(), pit
->second
);
8330 con
->send_message(m
);
8335 void OSD::do_infos(map
<int,
8336 vector
<pair
<pg_notify_t
, PastIntervals
> > >& info_map
,
8340 vector
<pair
<pg_notify_t
, PastIntervals
> > >::iterator p
=
8342 p
!= info_map
.end();
8344 if (!curmap
->is_up(p
->first
)) {
8345 dout(20) << __func__
<< " skipping down osd." << p
->first
<< dendl
;
8348 for (vector
<pair
<pg_notify_t
,PastIntervals
> >::iterator i
= p
->second
.begin();
8349 i
!= p
->second
.end();
8351 dout(20) << __func__
<< " sending info " << i
->first
.info
8352 << " to shard " << p
->first
<< dendl
;
8354 ConnectionRef con
= service
.get_con_osd_cluster(
8355 p
->first
, curmap
->get_epoch());
8357 dout(20) << __func__
<< " skipping osd." << p
->first
8358 << " (NULL con)" << dendl
;
8361 service
.share_map_peer(p
->first
, con
.get(), curmap
);
8362 MOSDPGInfo
*m
= new MOSDPGInfo(curmap
->get_epoch());
8363 m
->pg_list
= p
->second
;
8364 con
->send_message(m
);
8371 * from non-primary to primary
8372 * includes pg_info_t.
8373 * NOTE: called with opqueue active.
8375 void OSD::handle_pg_notify(OpRequestRef op
)
8377 const MOSDPGNotify
*m
= static_cast<const MOSDPGNotify
*>(op
->get_req());
8378 assert(m
->get_type() == MSG_OSD_PG_NOTIFY
);
8380 dout(7) << "handle_pg_notify from " << m
->get_source() << dendl
;
8381 int from
= m
->get_source().num();
8383 if (!require_osd_peer(op
->get_req()))
8386 if (!require_same_or_newer_map(op
, m
->get_epoch(), false))
8391 for (auto it
= m
->get_pg_list().begin();
8392 it
!= m
->get_pg_list().end();
8394 if (it
->first
.info
.pgid
.preferred() >= 0) {
8395 dout(20) << "ignoring localized pg " << it
->first
.info
.pgid
<< dendl
;
8399 handle_pg_peering_evt(
8400 spg_t(it
->first
.info
.pgid
.pgid
, it
->first
.to
),
8401 it
->first
.info
.history
, it
->second
,
8402 it
->first
.query_epoch
,
8403 PG::CephPeeringEvtRef(
8404 new PG::CephPeeringEvt(
8405 it
->first
.epoch_sent
, it
->first
.query_epoch
,
8406 PG::MNotifyRec(pg_shard_t(from
, it
->first
.from
), it
->first
,
8407 op
->get_req()->get_connection()->get_features())))
8412 void OSD::handle_pg_log(OpRequestRef op
)
8414 MOSDPGLog
*m
= static_cast<MOSDPGLog
*>(op
->get_nonconst_req());
8415 assert(m
->get_type() == MSG_OSD_PG_LOG
);
8416 dout(7) << "handle_pg_log " << *m
<< " from " << m
->get_source() << dendl
;
8418 if (!require_osd_peer(op
->get_req()))
8421 int from
= m
->get_source().num();
8422 if (!require_same_or_newer_map(op
, m
->get_epoch(), false))
8425 if (m
->info
.pgid
.preferred() >= 0) {
8426 dout(10) << "ignoring localized pg " << m
->info
.pgid
<< dendl
;
8431 handle_pg_peering_evt(
8432 spg_t(m
->info
.pgid
.pgid
, m
->to
),
8433 m
->info
.history
, m
->past_intervals
, m
->get_epoch(),
8434 PG::CephPeeringEvtRef(
8435 new PG::CephPeeringEvt(
8436 m
->get_epoch(), m
->get_query_epoch(),
8437 PG::MLogRec(pg_shard_t(from
, m
->from
), m
)))
8441 void OSD::handle_pg_info(OpRequestRef op
)
8443 const MOSDPGInfo
*m
= static_cast<const MOSDPGInfo
*>(op
->get_req());
8444 assert(m
->get_type() == MSG_OSD_PG_INFO
);
8445 dout(7) << "handle_pg_info " << *m
<< " from " << m
->get_source() << dendl
;
8447 if (!require_osd_peer(op
->get_req()))
8450 int from
= m
->get_source().num();
8451 if (!require_same_or_newer_map(op
, m
->get_epoch(), false))
8456 for (auto p
= m
->pg_list
.begin();
8457 p
!= m
->pg_list
.end();
8459 if (p
->first
.info
.pgid
.preferred() >= 0) {
8460 dout(10) << "ignoring localized pg " << p
->first
.info
.pgid
<< dendl
;
8464 handle_pg_peering_evt(
8465 spg_t(p
->first
.info
.pgid
.pgid
, p
->first
.to
),
8466 p
->first
.info
.history
, p
->second
, p
->first
.epoch_sent
,
8467 PG::CephPeeringEvtRef(
8468 new PG::CephPeeringEvt(
8469 p
->first
.epoch_sent
, p
->first
.query_epoch
,
8472 from
, p
->first
.from
), p
->first
.info
, p
->first
.epoch_sent
)))
// Handle MOSDPGTrim.  On the primary this records the replica's
// last_complete_ondisk and recomputes the minimum; on a replica it
// trims the local pg log up to the primary-supplied trim_to and
// persists the dirtied info.
// NOTE(review): elided extract -- early returns, pg unlock and closing
// braces from the original are not visible here.
8477 void OSD::handle_pg_trim(OpRequestRef op
)
8479 const MOSDPGTrim
*m
= static_cast<const MOSDPGTrim
*>(op
->get_req());
8480 assert(m
->get_type() == MSG_OSD_PG_TRIM
);
8482 dout(7) << "handle_pg_trim " << *m
<< " from " << m
->get_source() << dendl
;
// Must come from another OSD, with our map at least as new as m->epoch.
8484 if (!require_osd_peer(op
->get_req()))
8487 int from
= m
->get_source().num();
8488 if (!require_same_or_newer_map(op
, m
->epoch
, false))
8491 if (m
->pgid
.preferred() >= 0) {
8492 dout(10) << "ignoring localized pg " << m
->pgid
<< dendl
;
8498 PG
*pg
= _lookup_lock_pg(m
->pgid
);
8500 dout(10) << " don't have pg " << m
->pgid
<< dendl
;
// Stale trim from a previous interval: ignore it.
8504 if (m
->epoch
< pg
->info
.history
.same_interval_since
) {
8505 dout(10) << *pg
<< " got old trim to " << m
->trim_to
<< ", ignoring" << dendl
;
8510 if (pg
->is_primary()) {
8511 // peer is informing us of their last_complete_ondisk
8512 dout(10) << *pg
<< " replica osd." << from
<< " lcod " << m
->trim_to
<< dendl
;
8513 pg
->peer_last_complete_ondisk
[pg_shard_t(from
, m
->pgid
.shard
)] =
8515 // trim log when the pg is recovered
8516 pg
->calc_min_last_complete_ondisk();
8518 // primary is instructing us to trim
8519 ObjectStore::Transaction t
;
8520 pg
->pg_log
.trim(m
->trim_to
, pg
->info
);
8521 pg
->dirty_info
= true;
8522 pg
->write_if_dirty(t
);
// Persist the trimmed log/info asynchronously.
8523 int tr
= store
->queue_transaction(pg
->osr
.get(), std::move(t
), NULL
);
8529 void OSD::handle_pg_backfill_reserve(OpRequestRef op
)
8531 const MBackfillReserve
*m
= static_cast<const MBackfillReserve
*>(op
->get_req());
8532 assert(m
->get_type() == MSG_OSD_BACKFILL_RESERVE
);
8534 if (!require_osd_peer(op
->get_req()))
8536 if (!require_same_or_newer_map(op
, m
->query_epoch
, false))
8539 PG::CephPeeringEvtRef evt
;
8540 if (m
->type
== MBackfillReserve::REQUEST
) {
8541 evt
= PG::CephPeeringEvtRef(
8542 new PG::CephPeeringEvt(
8545 PG::RequestBackfillPrio(m
->priority
)));
8546 } else if (m
->type
== MBackfillReserve::GRANT
) {
8547 evt
= PG::CephPeeringEvtRef(
8548 new PG::CephPeeringEvt(
8551 PG::RemoteBackfillReserved()));
8552 } else if (m
->type
== MBackfillReserve::REJECT
) {
8553 evt
= PG::CephPeeringEvtRef(
8554 new PG::CephPeeringEvt(
8557 PG::RemoteReservationRejected()));
8562 if (service
.splitting(m
->pgid
)) {
8563 peering_wait_for_split
[m
->pgid
].push_back(evt
);
8567 PG
*pg
= _lookup_lock_pg(m
->pgid
);
8569 dout(10) << " don't have pg " << m
->pgid
<< dendl
;
8573 pg
->queue_peering_event(evt
);
8577 void OSD::handle_pg_recovery_reserve(OpRequestRef op
)
8579 const MRecoveryReserve
*m
= static_cast<const MRecoveryReserve
*>(op
->get_req());
8580 assert(m
->get_type() == MSG_OSD_RECOVERY_RESERVE
);
8582 if (!require_osd_peer(op
->get_req()))
8584 if (!require_same_or_newer_map(op
, m
->query_epoch
, false))
8587 PG::CephPeeringEvtRef evt
;
8588 if (m
->type
== MRecoveryReserve::REQUEST
) {
8589 evt
= PG::CephPeeringEvtRef(
8590 new PG::CephPeeringEvt(
8593 PG::RequestRecovery()));
8594 } else if (m
->type
== MRecoveryReserve::GRANT
) {
8595 evt
= PG::CephPeeringEvtRef(
8596 new PG::CephPeeringEvt(
8599 PG::RemoteRecoveryReserved()));
8600 } else if (m
->type
== MRecoveryReserve::RELEASE
) {
8601 evt
= PG::CephPeeringEvtRef(
8602 new PG::CephPeeringEvt(
8605 PG::RecoveryDone()));
8610 if (service
.splitting(m
->pgid
)) {
8611 peering_wait_for_split
[m
->pgid
].push_back(evt
);
8615 PG
*pg
= _lookup_lock_pg(m
->pgid
);
8617 dout(10) << " don't have pg " << m
->pgid
<< dendl
;
8621 pg
->queue_peering_event(evt
);
8627 * from primary to replica | stray
8628 * NOTE: called with opqueue active.
8630 void OSD::handle_pg_query(OpRequestRef op
)
8632 assert(osd_lock
.is_locked());
8634 const MOSDPGQuery
*m
= static_cast<const MOSDPGQuery
*>(op
->get_req());
8635 assert(m
->get_type() == MSG_OSD_PG_QUERY
);
8637 if (!require_osd_peer(op
->get_req()))
8640 dout(7) << "handle_pg_query from " << m
->get_source() << " epoch " << m
->get_epoch() << dendl
;
8641 int from
= m
->get_source().num();
8643 if (!require_same_or_newer_map(op
, m
->get_epoch(), false))
8648 map
< int, vector
<pair
<pg_notify_t
, PastIntervals
> > > notify_list
;
8650 for (auto it
= m
->pg_list
.begin();
8651 it
!= m
->pg_list
.end();
8653 spg_t pgid
= it
->first
;
8655 if (pgid
.preferred() >= 0) {
8656 dout(10) << "ignoring localized pg " << pgid
<< dendl
;
8660 if (service
.splitting(pgid
)) {
8661 peering_wait_for_split
[pgid
].push_back(
8662 PG::CephPeeringEvtRef(
8663 new PG::CephPeeringEvt(
8664 it
->second
.epoch_sent
, it
->second
.epoch_sent
,
8665 PG::MQuery(pg_shard_t(from
, it
->second
.from
),
8666 it
->second
, it
->second
.epoch_sent
))));
8671 RWLock::RLocker
l(pg_map_lock
);
8672 if (pg_map
.count(pgid
)) {
8674 pg
= _lookup_lock_pg_with_map_lock_held(pgid
);
8676 it
->second
.epoch_sent
, it
->second
.epoch_sent
,
8677 pg_shard_t(from
, it
->second
.from
), it
->second
);
8683 if (!osdmap
->have_pg_pool(pgid
.pool()))
8686 // get active crush mapping
8687 int up_primary
, acting_primary
;
8688 vector
<int> up
, acting
;
8689 osdmap
->pg_to_up_acting_osds(
8690 pgid
.pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
8693 pg_history_t history
= it
->second
.history
;
8694 bool valid_history
= project_pg_history(
8695 pgid
, history
, it
->second
.epoch_sent
,
8696 up
, up_primary
, acting
, acting_primary
);
8698 if (!valid_history
||
8699 it
->second
.epoch_sent
< history
.same_interval_since
) {
8700 dout(10) << " pg " << pgid
<< " dne, and pg has changed in "
8701 << history
.same_interval_since
8702 << " (msg from " << it
->second
.epoch_sent
<< ")" << dendl
;
8706 dout(10) << " pg " << pgid
<< " dne" << dendl
;
8707 pg_info_t
empty(spg_t(pgid
.pgid
, it
->second
.to
));
8708 /* This is racy, but that should be ok: if we complete the deletion
8709 * before the pg is recreated, we'll just start it off backfilling
8710 * instead of just empty */
8711 if (service
.deleting_pgs
.lookup(pgid
))
8712 empty
.set_last_backfill(hobject_t());
8713 if (it
->second
.type
== pg_query_t::LOG
||
8714 it
->second
.type
== pg_query_t::FULLLOG
) {
8715 ConnectionRef con
= service
.get_con_osd_cluster(from
, osdmap
->get_epoch());
8717 MOSDPGLog
*mlog
= new MOSDPGLog(
8718 it
->second
.from
, it
->second
.to
,
8719 osdmap
->get_epoch(), empty
,
8720 it
->second
.epoch_sent
);
8721 service
.share_map_peer(from
, con
.get(), osdmap
);
8722 con
->send_message(mlog
);
8725 notify_list
[from
].push_back(
8728 it
->second
.from
, it
->second
.to
,
8729 it
->second
.epoch_sent
,
8730 osdmap
->get_epoch(),
8733 osdmap
->get_pools().at(pgid
.pool()).ec_pool(),
8737 do_notifies(notify_list
, osdmap
);
8741 void OSD::handle_pg_remove(OpRequestRef op
)
8743 const MOSDPGRemove
*m
= static_cast<const MOSDPGRemove
*>(op
->get_req());
8744 assert(m
->get_type() == MSG_OSD_PG_REMOVE
);
8745 assert(osd_lock
.is_locked());
8747 if (!require_osd_peer(op
->get_req()))
8750 dout(7) << "handle_pg_remove from " << m
->get_source() << " on "
8751 << m
->pg_list
.size() << " pgs" << dendl
;
8753 if (!require_same_or_newer_map(op
, m
->get_epoch(), false))
8758 for (auto it
= m
->pg_list
.begin();
8759 it
!= m
->pg_list
.end();
8762 if (pgid
.preferred() >= 0) {
8763 dout(10) << "ignoring localized pg " << pgid
<< dendl
;
8767 RWLock::WLocker
l(pg_map_lock
);
8768 if (pg_map
.count(pgid
) == 0) {
8769 dout(10) << " don't have pg " << pgid
<< dendl
;
8772 dout(5) << "queue_pg_for_deletion: " << pgid
<< dendl
;
8773 PG
*pg
= _lookup_lock_pg_with_map_lock_held(pgid
);
8774 pg_history_t history
= pg
->info
.history
;
8775 int up_primary
, acting_primary
;
8776 vector
<int> up
, acting
;
8777 osdmap
->pg_to_up_acting_osds(
8778 pgid
.pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
8779 bool valid_history
= project_pg_history(
8780 pg
->info
.pgid
, history
, pg
->get_osdmap()->get_epoch(),
8781 up
, up_primary
, acting
, acting_primary
);
8782 if (valid_history
&&
8783 history
.same_interval_since
<= m
->get_epoch()) {
8784 assert(pg
->get_primary().osd
== m
->get_source().num());
8789 dout(10) << *pg
<< " ignoring remove request, pg changed in epoch "
8790 << history
.same_interval_since
8791 << " > " << m
->get_epoch() << dendl
;
// Tear down a PG that is being removed from this OSD: run its
// on_removal cleanup in a transaction, cancel pending splits, queue
// the on-disk deletion, and drop it from all OSD-side indexes.
// NOTE(review): elided extract -- the lookup_or_create arguments and
// some surrounding lines are not visible here.
8797 void OSD::_remove_pg(PG
*pg
)
8799 ObjectStore::Transaction rmt
;
8801 // on_removal, which calls remove_watchers_and_notifies, and the erasure from
8802 // the pg_map must be done together without unlocking the pg lock,
8803 // to avoid racing with watcher cleanup in ms_handle_reset
8804 // and handle_notify_timeout
8805 pg
->on_removal(&rmt
);
// A PG slated for removal cannot be a split parent any more.
8807 service
.cancel_pending_splits_for_parent(pg
->info
.pgid
);
// ContainerContext keeps the Sequencer alive until the transaction
// completes.
8808 int tr
= store
->queue_transaction(
8809 pg
->osr
.get(), std::move(rmt
), NULL
,
8810 new ContainerContext
<
8811 SequencerRef
>(pg
->osr
));
// Register the deletion and hand the PG to the removal work queue.
8814 DeletingStateRef deleting
= service
.deleting_pgs
.lookup_or_create(
8820 remove_wq
.queue(make_pair(PGRef(pg
), deleting
));
8822 service
.pg_remove_epoch(pg
->info
.pgid
);
8824 // dereference from op_wq
8825 op_shardedwq
.clear_pg_pointer(pg
->info
.pgid
);
// Remove from the pg_map and drop the map's reference.
8828 pg_map
.erase(pg
->info
.pgid
);
8829 pg
->put("PGMap"); // since we've taken it out of map
8833 // =========================================================
// While the recovery throttle allows it, move PGs from the
// awaiting_throttle queue into active recovery, reserving up to
// osd_recovery_max_single_start pushes per PG.  Caller must hold
// recovery_lock (asserted).
// NOTE(review): elided extract -- the first MIN() argument and the
// closing braces are not visible here.
8836 void OSDService::_maybe_queue_recovery() {
8837 assert(recovery_lock
.is_locked_by_me());
8838 uint64_t available_pushes
;
8839 while (!awaiting_throttle
.empty() &&
8840 _recover_now(&available_pushes
)) {
// Cap how many pushes this PG may start in one go.
8841 uint64_t to_start
= MIN(
8843 cct
->_conf
->osd_recovery_max_single_start
);
8844 _queue_for_recovery(awaiting_throttle
.front(), to_start
);
8845 awaiting_throttle
.pop_front();
// Account the reservation so _recover_now sees it next iteration.
8846 recovery_ops_reserved
+= to_start
;
// Decide whether new recovery work may start now.  Recovery is blocked
// while deferred, paused, or when active+reserved ops have reached
// osd_recovery_max_active; otherwise the remaining budget is reported
// via *available_pushes (if provided).
// NOTE(review): elided extract -- the explicit return statements and
// closing braces are not visible here.
8850 bool OSDService::_recover_now(uint64_t *available_pushes
)
8852 if (available_pushes
)
8853 *available_pushes
= 0;
// Recovery deferred until a configured point in time.
8855 if (ceph_clock_now() < defer_recovery_until
) {
8856 dout(15) << __func__
<< " defer until " << defer_recovery_until
<< dendl
;
8860 if (recovery_paused
) {
8861 dout(15) << __func__
<< " paused" << dendl
;
// Throttle: active plus already-reserved ops may not exceed max.
8865 uint64_t max
= cct
->_conf
->osd_recovery_max_active
;
8866 if (max
<= recovery_ops_active
+ recovery_ops_reserved
) {
8867 dout(15) << __func__
<< " active " << recovery_ops_active
8868 << " + reserved " << recovery_ops_reserved
8869 << " >= max " << max
<< dendl
;
8873 if (available_pushes
)
8874 *available_pushes
= max
- recovery_ops_active
- recovery_ops_reserved
;
8879 void OSD::do_recovery(
8880 PG
*pg
, epoch_t queued
, uint64_t reserved_pushes
,
8881 ThreadPool::TPHandle
&handle
)
8883 uint64_t started
= 0;
8884 if (cct
->_conf
->osd_recovery_sleep
> 0) {
8885 handle
.suspend_tp_timeout();
8888 t
.set_from_double(cct
->_conf
->osd_recovery_sleep
);
8890 dout(20) << __func__
<< " slept for " << t
<< dendl
;
8892 handle
.reset_tp_timeout();
8896 if (pg
->pg_has_reset_since(queued
)) {
8900 assert(!pg
->deleting
);
8901 assert(pg
->is_peered() && pg
->is_primary());
8903 assert(pg
->recovery_queued
);
8904 pg
->recovery_queued
= false;
8906 dout(10) << "do_recovery starting " << reserved_pushes
<< " " << *pg
<< dendl
;
8907 #ifdef DEBUG_RECOVERY_OIDS
8908 dout(20) << " active was " << service
.recovery_oids
[pg
->info
.pgid
] << dendl
;
8911 bool more
= pg
->start_recovery_ops(reserved_pushes
, handle
, &started
);
8912 dout(10) << "do_recovery started " << started
<< "/" << reserved_pushes
8913 << " on " << *pg
<< dendl
;
8915 // If no recovery op is started, don't bother to manipulate the RecoveryCtx
8916 if (!started
&& (more
|| !pg
->have_unfound())) {
8920 PG::RecoveryCtx rctx
= create_context();
8921 rctx
.handle
= &handle
;
8924 * if we couldn't start any recovery ops and things are still
8925 * unfound, see if we can discover more missing object locations.
8926 * It may be that our initial locations were bad and we errored
8927 * out while trying to pull.
8929 if (!more
&& pg
->have_unfound()) {
8930 pg
->discover_all_missing(*rctx
.query_map
);
8931 if (rctx
.query_map
->empty()) {
8932 dout(10) << "do_recovery no luck, giving up on this pg for now" << dendl
;
8934 dout(10) << "do_recovery no luck, giving up on this pg for now" << dendl
;
8935 pg
->queue_recovery();
8939 pg
->write_if_dirty(*rctx
.transaction
);
8940 OSDMapRef curmap
= pg
->get_osdmap();
8941 dispatch_context(rctx
, pg
, curmap
);
8945 assert(started
<= reserved_pushes
);
8946 service
.release_reserved_pushes(reserved_pushes
);
8949 void OSDService::start_recovery_op(PG
*pg
, const hobject_t
& soid
)
8951 Mutex::Locker
l(recovery_lock
);
8952 dout(10) << "start_recovery_op " << *pg
<< " " << soid
8953 << " (" << recovery_ops_active
<< "/"
8954 << cct
->_conf
->osd_recovery_max_active
<< " rops)"
8956 recovery_ops_active
++;
8958 #ifdef DEBUG_RECOVERY_OIDS
8959 dout(20) << " active was " << recovery_oids
[pg
->info
.pgid
] << dendl
;
8960 assert(recovery_oids
[pg
->info
.pgid
].count(soid
) == 0);
8961 recovery_oids
[pg
->info
.pgid
].insert(soid
);
8965 void OSDService::finish_recovery_op(PG
*pg
, const hobject_t
& soid
, bool dequeue
)
8967 Mutex::Locker
l(recovery_lock
);
8968 dout(10) << "finish_recovery_op " << *pg
<< " " << soid
8969 << " dequeue=" << dequeue
8970 << " (" << recovery_ops_active
<< "/" << cct
->_conf
->osd_recovery_max_active
<< " rops)"
8974 assert(recovery_ops_active
> 0);
8975 recovery_ops_active
--;
8977 #ifdef DEBUG_RECOVERY_OIDS
8978 dout(20) << " active oids was " << recovery_oids
[pg
->info
.pgid
] << dendl
;
8979 assert(recovery_oids
[pg
->info
.pgid
].count(soid
));
8980 recovery_oids
[pg
->info
.pgid
].erase(soid
);
8983 _maybe_queue_recovery();
8986 bool OSDService::is_recovery_active()
8988 if (recovery_ops_active
> 0)
8994 // =========================================================
// A client op can be dropped when its connection is gone: the client
// can never receive the reply, so doing the work would be wasted.
// NOTE(review): elided extract -- the return statements are not
// visible here.
8997 bool OSD::op_is_discardable(const MOSDOp
*op
)
8999 // drop client request if they are not connected and can't get the
9001 if (!op
->get_connection()->is_connected()) {
// Queue an op for its PG on the sharded op work queue, recording
// queueing latency and trace events along the way.
9007 void OSD::enqueue_op(spg_t pg
, OpRequestRef
& op
, epoch_t epoch
)
// Time spent between receive and enqueue, for the debug log.
9009 utime_t latency
= ceph_clock_now() - op
->get_req()->get_recv_stamp();
9010 dout(15) << "enqueue_op " << op
<< " prio " << op
->get_req()->get_priority()
9011 << " cost " << op
->get_req()->get_cost()
9012 << " latency " << latency
9013 << " epoch " << epoch
9014 << " " << *(op
->get_req()) << dendl
;
// Tracing breadcrumbs for this op.
9015 op
->osd_trace
.event("enqueue op");
9016 op
->osd_trace
.keyval("priority", op
->get_req()->get_priority());
9017 op
->osd_trace
.keyval("cost", op
->get_req()->get_cost());
9018 op
->mark_queued_for_pg();
// Hand off to the sharded work queue, keyed by pg.
9019 op_shardedwq
.queue(make_pair(pg
, PGQueueable(op
, epoch
)));
9025 * NOTE: dequeue called in worker thread, with pg lock
9027 void OSD::dequeue_op(
9028 PGRef pg
, OpRequestRef op
,
9029 ThreadPool::TPHandle
&handle
)
9032 OID_EVENT_TRACE_WITH_MSG(op
->get_req(), "DEQUEUE_OP_BEGIN", false);
9034 utime_t now
= ceph_clock_now();
9035 op
->set_dequeued_time(now
);
9036 utime_t latency
= now
- op
->get_req()->get_recv_stamp();
9037 dout(10) << "dequeue_op " << op
<< " prio " << op
->get_req()->get_priority()
9038 << " cost " << op
->get_req()->get_cost()
9039 << " latency " << latency
9040 << " " << *(op
->get_req())
9041 << " pg " << *pg
<< dendl
;
9043 Session
*session
= static_cast<Session
*>(
9044 op
->get_req()->get_connection()->get_priv());
9046 maybe_share_map(session
, op
, pg
->get_osdmap());
9053 op
->mark_reached_pg();
9054 op
->osd_trace
.event("dequeue_op");
9056 pg
->do_request(op
, handle
);
9059 dout(10) << "dequeue_op " << op
<< " finish" << dendl
;
9060 OID_EVENT_TRACE_WITH_MSG(op
->get_req(), "DEQUEUE_OP_END", false);
9064 struct C_CompleteSplits
: public Context
{
9066 set
<boost::intrusive_ptr
<PG
> > pgs
;
9067 C_CompleteSplits(OSD
*osd
, const set
<boost::intrusive_ptr
<PG
> > &in
)
9068 : osd(osd
), pgs(in
) {}
9069 void finish(int r
) override
{
9070 Mutex::Locker
l(osd
->osd_lock
);
9071 if (osd
->is_stopping())
9073 PG::RecoveryCtx rctx
= osd
->create_context();
9074 for (set
<boost::intrusive_ptr
<PG
> >::iterator i
= pgs
.begin();
9077 osd
->pg_map_lock
.get_write();
9079 osd
->add_newly_split_pg(&**i
, &rctx
);
9080 if (!((*i
)->deleting
)) {
9081 set
<spg_t
> to_complete
;
9082 to_complete
.insert((*i
)->info
.pgid
);
9083 osd
->service
.complete_split(to_complete
);
9085 osd
->pg_map_lock
.put_write();
9086 osd
->dispatch_context_transaction(rctx
, &**i
);
9087 osd
->wake_pg_waiters(*i
);
9091 osd
->dispatch_context(rctx
, 0, osd
->service
.get_osdmap());
9095 void OSD::process_peering_events(
9096 const list
<PG
*> &pgs
,
9097 ThreadPool::TPHandle
&handle
9100 bool need_up_thru
= false;
9101 epoch_t same_interval_since
= 0;
9103 PG::RecoveryCtx rctx
= create_context();
9104 rctx
.handle
= &handle
;
9105 for (list
<PG
*>::const_iterator i
= pgs
.begin();
9108 set
<boost::intrusive_ptr
<PG
> > split_pgs
;
9110 pg
->lock_suspend_timeout(handle
);
9111 curmap
= service
.get_osdmap();
9116 if (!advance_pg(curmap
->get_epoch(), pg
, handle
, &rctx
, &split_pgs
)) {
9117 // we need to requeue the PG explicitly since we didn't actually
9119 peering_wq
.queue(pg
);
9121 assert(!pg
->peering_queue
.empty());
9122 PG::CephPeeringEvtRef evt
= pg
->peering_queue
.front();
9123 pg
->peering_queue
.pop_front();
9124 pg
->handle_peering_event(evt
, &rctx
);
9126 need_up_thru
= pg
->need_up_thru
|| need_up_thru
;
9127 same_interval_since
= MAX(pg
->info
.history
.same_interval_since
,
9128 same_interval_since
);
9129 pg
->write_if_dirty(*rctx
.transaction
);
9130 if (!split_pgs
.empty()) {
9131 rctx
.on_applied
->add(new C_CompleteSplits(this, split_pgs
));
9134 dispatch_context_transaction(rctx
, pg
, &handle
);
9138 queue_want_up_thru(same_interval_since
);
9139 dispatch_context(rctx
, 0, curmap
, &handle
);
9141 service
.send_pg_temp();
9144 // --------------------------------
9146 const char** OSD::get_tracked_conf_keys() const
9148 static const char* KEYS
[] = {
9149 "osd_max_backfills",
9150 "osd_min_recovery_priority",
9151 "osd_op_complaint_time", "osd_op_log_threshold",
9152 "osd_op_history_size", "osd_op_history_duration",
9153 "osd_enable_op_tracker",
9154 "osd_map_cache_size",
9155 "osd_map_max_advance",
9156 "osd_pg_epoch_persisted_max_stale",
9157 "osd_disk_thread_ioprio_class",
9158 "osd_disk_thread_ioprio_priority",
9159 // clog & admin clog
9162 "clog_to_syslog_facility",
9163 "clog_to_syslog_level",
9164 "osd_objectstore_fuse",
9166 "clog_to_graylog_host",
9167 "clog_to_graylog_port",
9170 "osd_recovery_delay_start",
9171 "osd_client_message_size_cap",
9172 "osd_client_message_cap",
9178 void OSD::handle_conf_change(const struct md_config_t
*conf
,
9179 const std::set
<std::string
> &changed
)
9181 if (changed
.count("osd_max_backfills")) {
9182 service
.local_reserver
.set_max(cct
->_conf
->osd_max_backfills
);
9183 service
.remote_reserver
.set_max(cct
->_conf
->osd_max_backfills
);
9185 if (changed
.count("osd_min_recovery_priority")) {
9186 service
.local_reserver
.set_min_priority(cct
->_conf
->osd_min_recovery_priority
);
9187 service
.remote_reserver
.set_min_priority(cct
->_conf
->osd_min_recovery_priority
);
9189 if (changed
.count("osd_max_trimming_pgs")) {
9190 service
.snap_reserver
.set_max(cct
->_conf
->osd_max_trimming_pgs
);
9192 if (changed
.count("osd_op_complaint_time") ||
9193 changed
.count("osd_op_log_threshold")) {
9194 op_tracker
.set_complaint_and_threshold(cct
->_conf
->osd_op_complaint_time
,
9195 cct
->_conf
->osd_op_log_threshold
);
9197 if (changed
.count("osd_op_history_size") ||
9198 changed
.count("osd_op_history_duration")) {
9199 op_tracker
.set_history_size_and_duration(cct
->_conf
->osd_op_history_size
,
9200 cct
->_conf
->osd_op_history_duration
);
9202 if (changed
.count("osd_op_history_slow_op_size") ||
9203 changed
.count("osd_op_history_slow_op_threshold")) {
9204 op_tracker
.set_history_slow_op_size_and_threshold(cct
->_conf
->osd_op_history_slow_op_size
,
9205 cct
->_conf
->osd_op_history_slow_op_threshold
);
9207 if (changed
.count("osd_enable_op_tracker")) {
9208 op_tracker
.set_tracking(cct
->_conf
->osd_enable_op_tracker
);
9210 if (changed
.count("osd_disk_thread_ioprio_class") ||
9211 changed
.count("osd_disk_thread_ioprio_priority")) {
9212 set_disk_tp_priority();
9214 if (changed
.count("osd_map_cache_size")) {
9215 service
.map_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
9216 service
.map_bl_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
9217 service
.map_bl_inc_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
9219 if (changed
.count("clog_to_monitors") ||
9220 changed
.count("clog_to_syslog") ||
9221 changed
.count("clog_to_syslog_level") ||
9222 changed
.count("clog_to_syslog_facility") ||
9223 changed
.count("clog_to_graylog") ||
9224 changed
.count("clog_to_graylog_host") ||
9225 changed
.count("clog_to_graylog_port") ||
9226 changed
.count("host") ||
9227 changed
.count("fsid")) {
9228 update_log_config();
9232 if (changed
.count("osd_objectstore_fuse")) {
9234 enable_disable_fuse(false);
9239 if (changed
.count("osd_recovery_delay_start")) {
9240 service
.defer_recovery(cct
->_conf
->osd_recovery_delay_start
);
9241 service
.kick_recovery_queue();
9244 if (changed
.count("osd_client_message_cap")) {
9245 uint64_t newval
= cct
->_conf
->osd_client_message_cap
;
9246 Messenger::Policy pol
= client_messenger
->get_policy(entity_name_t::TYPE_CLIENT
);
9247 if (pol
.throttler_messages
&& newval
> 0) {
9248 pol
.throttler_messages
->reset_max(newval
);
9251 if (changed
.count("osd_client_message_size_cap")) {
9252 uint64_t newval
= cct
->_conf
->osd_client_message_size_cap
;
9253 Messenger::Policy pol
= client_messenger
->get_policy(entity_name_t::TYPE_CLIENT
);
9254 if (pol
.throttler_bytes
&& newval
> 0) {
9255 pol
.throttler_bytes
->reset_max(newval
);
9262 void OSD::update_log_config()
9264 map
<string
,string
> log_to_monitors
;
9265 map
<string
,string
> log_to_syslog
;
9266 map
<string
,string
> log_channel
;
9267 map
<string
,string
> log_prio
;
9268 map
<string
,string
> log_to_graylog
;
9269 map
<string
,string
> log_to_graylog_host
;
9270 map
<string
,string
> log_to_graylog_port
;
9274 if (parse_log_client_options(cct
, log_to_monitors
, log_to_syslog
,
9275 log_channel
, log_prio
, log_to_graylog
,
9276 log_to_graylog_host
, log_to_graylog_port
,
9278 clog
->update_config(log_to_monitors
, log_to_syslog
,
9279 log_channel
, log_prio
, log_to_graylog
,
9280 log_to_graylog_host
, log_to_graylog_port
,
9282 derr
<< "log_to_monitors " << log_to_monitors
<< dendl
;
9285 void OSD::check_config()
9287 // some sanity checks
9288 if (cct
->_conf
->osd_map_cache_size
<= cct
->_conf
->osd_map_max_advance
+ 2) {
9289 clog
->warn() << "osd_map_cache_size (" << cct
->_conf
->osd_map_cache_size
<< ")"
9290 << " is not > osd_map_max_advance ("
9291 << cct
->_conf
->osd_map_max_advance
<< ")";
9293 if (cct
->_conf
->osd_map_cache_size
<= (int)cct
->_conf
->osd_pg_epoch_persisted_max_stale
+ 2) {
9294 clog
->warn() << "osd_map_cache_size (" << cct
->_conf
->osd_map_cache_size
<< ")"
9295 << " is not > osd_pg_epoch_persisted_max_stale ("
9296 << cct
->_conf
->osd_pg_epoch_persisted_max_stale
<< ")";
9300 void OSD::set_disk_tp_priority()
9302 dout(10) << __func__
9303 << " class " << cct
->_conf
->osd_disk_thread_ioprio_class
9304 << " priority " << cct
->_conf
->osd_disk_thread_ioprio_priority
9306 if (cct
->_conf
->osd_disk_thread_ioprio_class
.empty() ||
9307 cct
->_conf
->osd_disk_thread_ioprio_priority
< 0)
9310 ceph_ioprio_string_to_class(cct
->_conf
->osd_disk_thread_ioprio_class
);
9312 derr
<< __func__
<< cpp_strerror(cls
) << ": "
9313 << "osd_disk_thread_ioprio_class is " << cct
->_conf
->osd_disk_thread_ioprio_class
9314 << " but only the following values are allowed: idle, be or rt" << dendl
;
9316 disk_tp
.set_ioprio(cls
, cct
->_conf
->osd_disk_thread_ioprio_priority
);
9319 // --------------------------------
9321 void OSD::get_latest_osdmap()
9323 dout(10) << __func__
<< " -- start" << dendl
;
9326 service
.objecter
->wait_for_latest_osdmap(&cond
);
9329 dout(10) << __func__
<< " -- finish" << dendl
;
9332 // --------------------------------
9334 int OSD::init_op_flags(OpRequestRef
& op
)
9336 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
9337 vector
<OSDOp
>::const_iterator iter
;
9339 // client flags have no bearing on whether an op is a read, write, etc.
9342 if (m
->has_flag(CEPH_OSD_FLAG_RWORDERED
)) {
9343 op
->set_force_rwordered();
9346 // set bits based on op codes, called methods.
9347 for (iter
= m
->ops
.begin(); iter
!= m
->ops
.end(); ++iter
) {
9348 if ((iter
->op
.op
== CEPH_OSD_OP_WATCH
&&
9349 iter
->op
.watch
.op
== CEPH_OSD_WATCH_OP_PING
)) {
9350 /* This a bit odd. PING isn't actually a write. It can't
9351 * result in an update to the object_info. PINGs also aren'ty
9352 * resent, so there's no reason to write out a log entry
9354 * However, we pipeline them behind writes, so let's force
9355 * the write_ordered flag.
9357 op
->set_force_rwordered();
9359 if (ceph_osd_op_mode_modify(iter
->op
.op
))
9362 if (ceph_osd_op_mode_read(iter
->op
.op
))
9365 // set READ flag if there are src_oids
9366 if (iter
->soid
.oid
.name
.length())
9369 // set PGOP flag if there are PG ops
9370 if (ceph_osd_op_type_pg(iter
->op
.op
))
9373 if (ceph_osd_op_mode_cache(iter
->op
.op
))
9376 // check for ec base pool
9377 int64_t poolid
= m
->get_pg().pool();
9378 const pg_pool_t
*pool
= osdmap
->get_pg_pool(poolid
);
9379 if (pool
&& pool
->is_tier()) {
9380 const pg_pool_t
*base_pool
= osdmap
->get_pg_pool(pool
->tier_of
);
9381 if (base_pool
&& base_pool
->require_rollback()) {
9382 if ((iter
->op
.op
!= CEPH_OSD_OP_READ
) &&
9383 (iter
->op
.op
!= CEPH_OSD_OP_CHECKSUM
) &&
9384 (iter
->op
.op
!= CEPH_OSD_OP_STAT
) &&
9385 (iter
->op
.op
!= CEPH_OSD_OP_ISDIRTY
) &&
9386 (iter
->op
.op
!= CEPH_OSD_OP_UNDIRTY
) &&
9387 (iter
->op
.op
!= CEPH_OSD_OP_GETXATTR
) &&
9388 (iter
->op
.op
!= CEPH_OSD_OP_GETXATTRS
) &&
9389 (iter
->op
.op
!= CEPH_OSD_OP_CMPXATTR
) &&
9390 (iter
->op
.op
!= CEPH_OSD_OP_ASSERT_VER
) &&
9391 (iter
->op
.op
!= CEPH_OSD_OP_LIST_WATCHERS
) &&
9392 (iter
->op
.op
!= CEPH_OSD_OP_LIST_SNAPS
) &&
9393 (iter
->op
.op
!= CEPH_OSD_OP_SETALLOCHINT
) &&
9394 (iter
->op
.op
!= CEPH_OSD_OP_WRITEFULL
) &&
9395 (iter
->op
.op
!= CEPH_OSD_OP_ROLLBACK
) &&
9396 (iter
->op
.op
!= CEPH_OSD_OP_CREATE
) &&
9397 (iter
->op
.op
!= CEPH_OSD_OP_DELETE
) &&
9398 (iter
->op
.op
!= CEPH_OSD_OP_SETXATTR
) &&
9399 (iter
->op
.op
!= CEPH_OSD_OP_RMXATTR
) &&
9400 (iter
->op
.op
!= CEPH_OSD_OP_STARTSYNC
) &&
9401 (iter
->op
.op
!= CEPH_OSD_OP_COPY_GET
) &&
9402 (iter
->op
.op
!= CEPH_OSD_OP_COPY_FROM
)) {
9408 switch (iter
->op
.op
) {
9409 case CEPH_OSD_OP_CALL
:
9411 bufferlist::iterator bp
= const_cast<bufferlist
&>(iter
->indata
).begin();
9412 int is_write
, is_read
;
9413 string cname
, mname
;
9414 bp
.copy(iter
->op
.cls
.class_len
, cname
);
9415 bp
.copy(iter
->op
.cls
.method_len
, mname
);
9417 ClassHandler::ClassData
*cls
;
9418 int r
= class_handler
->open_class(cname
, &cls
);
9420 derr
<< "class " << cname
<< " open got " << cpp_strerror(r
) << dendl
;
9423 else if (r
!= -EPERM
) // propagate permission errors
9427 int flags
= cls
->get_method_flags(mname
.c_str());
9429 if (flags
== -ENOENT
)
9435 is_read
= flags
& CLS_METHOD_RD
;
9436 is_write
= flags
& CLS_METHOD_WR
;
9437 bool is_promote
= flags
& CLS_METHOD_PROMOTE
;
9439 dout(10) << "class " << cname
<< " method " << mname
<< " "
9440 << "flags=" << (is_read
? "r" : "")
9441 << (is_write
? "w" : "")
9442 << (is_promote
? "p" : "")
9445 op
->set_class_read();
9447 op
->set_class_write();
9450 op
->add_class(cname
, is_read
, is_write
, cls
->whitelisted
);
9454 case CEPH_OSD_OP_WATCH
:
9455 // force the read bit for watch since it is depends on previous
9456 // watch state (and may return early if the watch exists) or, in
9457 // the case of ping, is simply a read op.
9460 case CEPH_OSD_OP_NOTIFY
:
9461 case CEPH_OSD_OP_NOTIFY_ACK
:
9467 case CEPH_OSD_OP_DELETE
:
9468 // if we get a delete with FAILOK we can skip handle cache. without
9469 // FAILOK we still need to promote (or do something smarter) to
9470 // determine whether to return ENOENT or 0.
9471 if (iter
== m
->ops
.begin() &&
9472 iter
->op
.flags
== CEPH_OSD_OP_FLAG_FAILOK
) {
9473 op
->set_skip_handle_cache();
9475 // skip promotion when proxying a delete op
9476 if (m
->ops
.size() == 1) {
9477 op
->set_skip_promote();
9481 case CEPH_OSD_OP_CACHE_TRY_FLUSH
:
9482 case CEPH_OSD_OP_CACHE_FLUSH
:
9483 case CEPH_OSD_OP_CACHE_EVICT
:
9484 // If try_flush/flush/evict is the only op, can skip handle cache.
9485 if (m
->ops
.size() == 1) {
9486 op
->set_skip_handle_cache();
9490 case CEPH_OSD_OP_READ
:
9491 case CEPH_OSD_OP_SYNC_READ
:
9492 case CEPH_OSD_OP_SPARSE_READ
:
9493 case CEPH_OSD_OP_CHECKSUM
:
9494 case CEPH_OSD_OP_WRITEFULL
:
9495 if (m
->ops
.size() == 1 &&
9496 (iter
->op
.flags
& CEPH_OSD_OP_FLAG_FADVISE_NOCACHE
||
9497 iter
->op
.flags
& CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
)) {
9498 op
->set_skip_promote();
9502 // force promotion when pin an object in cache tier
9503 case CEPH_OSD_OP_CACHE_PIN
:
9512 if (op
->rmw_flags
== 0)
9518 void OSD::PeeringWQ::_dequeue(list
<PG
*> *out
) {
9519 for (list
<PG
*>::iterator i
= peering_queue
.begin();
9520 i
!= peering_queue
.end() &&
9521 out
->size() < osd
->cct
->_conf
->osd_peering_wq_batch_size
;
9523 if (in_use
.count(*i
)) {
9527 peering_queue
.erase(i
++);
9530 in_use
.insert(out
->begin(), out
->end());
9533 // =============================================================
9536 #define dout_context osd->cct
9538 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
9540 void OSD::ShardedOpWQ::wake_pg_waiters(spg_t pgid
)
9542 uint32_t shard_index
= pgid
.hash_to_shard(shard_list
.size());
9543 auto sdata
= shard_list
[shard_index
];
9544 bool queued
= false;
9545 unsigned pushes_to_free
= 0;
9547 Mutex::Locker
l(sdata
->sdata_op_ordering_lock
);
9548 auto p
= sdata
->pg_slots
.find(pgid
);
9549 if (p
!= sdata
->pg_slots
.end()) {
9550 dout(20) << __func__
<< " " << pgid
9551 << " to_process " << p
->second
.to_process
9552 << " waiting_for_pg=" << (int)p
->second
.waiting_for_pg
<< dendl
;
9553 for (auto i
= p
->second
.to_process
.rbegin();
9554 i
!= p
->second
.to_process
.rend();
9556 sdata
->_enqueue_front(make_pair(pgid
, *i
), osd
->op_prio_cutoff
);
9558 for (auto& q
: p
->second
.to_process
) {
9559 pushes_to_free
+= q
.get_reserved_pushes();
9561 p
->second
.to_process
.clear();
9562 p
->second
.waiting_for_pg
= false;
9563 ++p
->second
.requeue_seq
;
9567 if (pushes_to_free
> 0) {
9568 osd
->service
.release_reserved_pushes(pushes_to_free
);
9571 sdata
->sdata_lock
.Lock();
9572 sdata
->sdata_cond
.SignalOne();
9573 sdata
->sdata_lock
.Unlock();
9577 void OSD::ShardedOpWQ::prune_pg_waiters(OSDMapRef osdmap
, int whoami
)
9579 unsigned pushes_to_free
= 0;
9580 for (auto sdata
: shard_list
) {
9581 Mutex::Locker
l(sdata
->sdata_op_ordering_lock
);
9582 sdata
->waiting_for_pg_osdmap
= osdmap
;
9583 auto p
= sdata
->pg_slots
.begin();
9584 while (p
!= sdata
->pg_slots
.end()) {
9585 ShardData::pg_slot
& slot
= p
->second
;
9586 if (!slot
.to_process
.empty() && slot
.num_running
== 0) {
9587 if (osdmap
->is_up_acting_osd_shard(p
->first
, whoami
)) {
9588 dout(20) << __func__
<< " " << p
->first
<< " maps to us, keeping"
9593 while (!slot
.to_process
.empty() &&
9594 slot
.to_process
.front().get_map_epoch() <= osdmap
->get_epoch()) {
9595 auto& qi
= slot
.to_process
.front();
9596 dout(20) << __func__
<< " " << p
->first
9598 << " epoch " << qi
.get_map_epoch()
9599 << " <= " << osdmap
->get_epoch()
9600 << ", stale, dropping" << dendl
;
9601 pushes_to_free
+= qi
.get_reserved_pushes();
9602 slot
.to_process
.pop_front();
9605 if (slot
.to_process
.empty() &&
9606 slot
.num_running
== 0 &&
9608 dout(20) << __func__
<< " " << p
->first
<< " empty, pruning" << dendl
;
9609 p
= sdata
->pg_slots
.erase(p
);
9615 if (pushes_to_free
> 0) {
9616 osd
->service
.release_reserved_pushes(pushes_to_free
);
9620 void OSD::ShardedOpWQ::clear_pg_pointer(spg_t pgid
)
9622 uint32_t shard_index
= pgid
.hash_to_shard(shard_list
.size());
9623 auto sdata
= shard_list
[shard_index
];
9624 Mutex::Locker
l(sdata
->sdata_op_ordering_lock
);
9625 auto p
= sdata
->pg_slots
.find(pgid
);
9626 if (p
!= sdata
->pg_slots
.end()) {
9627 auto& slot
= p
->second
;
9628 dout(20) << __func__
<< " " << pgid
<< " pg " << slot
.pg
<< dendl
;
9629 assert(!slot
.pg
|| slot
.pg
->deleting
);
9634 void OSD::ShardedOpWQ::clear_pg_slots()
9636 for (auto sdata
: shard_list
) {
9637 Mutex::Locker
l(sdata
->sdata_op_ordering_lock
);
9638 sdata
->pg_slots
.clear();
9639 sdata
->waiting_for_pg_osdmap
.reset();
9640 // don't bother with reserved pushes; we are shutting down
9645 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
9647 void OSD::ShardedOpWQ::_process(uint32_t thread_index
, heartbeat_handle_d
*hb
)
9649 uint32_t shard_index
= thread_index
% num_shards
;
9650 ShardData
*sdata
= shard_list
[shard_index
];
9651 assert(NULL
!= sdata
);
9654 sdata
->sdata_op_ordering_lock
.Lock();
9655 if (sdata
->pqueue
->empty()) {
9656 dout(20) << __func__
<< " empty q, waiting" << dendl
;
9657 // optimistically sleep a moment; maybe another work item will come along.
9658 sdata
->sdata_op_ordering_lock
.Unlock();
9659 osd
->cct
->get_heartbeat_map()->reset_timeout(hb
,
9660 osd
->cct
->_conf
->threadpool_default_timeout
, 0);
9661 sdata
->sdata_lock
.Lock();
9662 sdata
->sdata_cond
.WaitInterval(sdata
->sdata_lock
,
9663 utime_t(osd
->cct
->_conf
->threadpool_empty_queue_max_wait
, 0));
9664 sdata
->sdata_lock
.Unlock();
9665 sdata
->sdata_op_ordering_lock
.Lock();
9666 if (sdata
->pqueue
->empty()) {
9667 sdata
->sdata_op_ordering_lock
.Unlock();
9671 pair
<spg_t
, PGQueueable
> item
= sdata
->pqueue
->dequeue();
9672 if (osd
->is_stopping()) {
9673 sdata
->sdata_op_ordering_lock
.Unlock();
9674 return; // OSD shutdown, discard.
9677 uint64_t requeue_seq
;
9679 auto& slot
= sdata
->pg_slots
[item
.first
];
9680 dout(30) << __func__
<< " " << item
.first
9681 << " to_process " << slot
.to_process
9682 << " waiting_for_pg=" << (int)slot
.waiting_for_pg
<< dendl
;
9683 slot
.to_process
.push_back(item
.second
);
9684 // note the requeue seq now...
9685 requeue_seq
= slot
.requeue_seq
;
9686 if (slot
.waiting_for_pg
) {
9687 // save ourselves a bit of effort
9688 dout(20) << __func__
<< " " << item
.first
<< " item " << item
.second
9689 << " queued, waiting_for_pg" << dendl
;
9690 sdata
->sdata_op_ordering_lock
.Unlock();
9694 dout(20) << __func__
<< " " << item
.first
<< " item " << item
.second
9695 << " queued" << dendl
;
9698 sdata
->sdata_op_ordering_lock
.Unlock();
9700 osd
->service
.maybe_inject_dispatch_delay();
9702 // [lookup +] lock pg (if we have it)
9704 pg
= osd
->_lookup_lock_pg(item
.first
);
9709 osd
->service
.maybe_inject_dispatch_delay();
9711 boost::optional
<PGQueueable
> qi
;
9713 // we don't use a Mutex::Locker here because of the
9714 // osd->service.release_reserved_pushes() call below
9715 sdata
->sdata_op_ordering_lock
.Lock();
9717 auto q
= sdata
->pg_slots
.find(item
.first
);
9718 assert(q
!= sdata
->pg_slots
.end());
9719 auto& slot
= q
->second
;
9722 if (slot
.to_process
.empty()) {
9723 // raced with wake_pg_waiters or prune_pg_waiters
9724 dout(20) << __func__
<< " " << item
.first
<< " nothing queued" << dendl
;
9728 sdata
->sdata_op_ordering_lock
.Unlock();
9731 if (requeue_seq
!= slot
.requeue_seq
) {
9732 dout(20) << __func__
<< " " << item
.first
9733 << " requeue_seq " << slot
.requeue_seq
<< " > our "
9734 << requeue_seq
<< ", we raced with wake_pg_waiters"
9739 sdata
->sdata_op_ordering_lock
.Unlock();
9742 if (pg
&& !slot
.pg
&& !pg
->deleting
) {
9743 dout(20) << __func__
<< " " << item
.first
<< " set pg to " << pg
<< dendl
;
9746 dout(30) << __func__
<< " " << item
.first
<< " to_process " << slot
.to_process
9747 << " waiting_for_pg=" << (int)slot
.waiting_for_pg
<< dendl
;
9749 // make sure we're not already waiting for this pg
9750 if (slot
.waiting_for_pg
) {
9751 dout(20) << __func__
<< " " << item
.first
<< " item " << item
.second
9752 << " slot is waiting_for_pg" << dendl
;
9756 sdata
->sdata_op_ordering_lock
.Unlock();
9761 qi
= slot
.to_process
.front();
9762 slot
.to_process
.pop_front();
9763 dout(20) << __func__
<< " " << item
.first
<< " item " << *qi
9764 << " pg " << pg
<< dendl
;
9767 // should this pg shard exist on this osd in this (or a later) epoch?
9768 OSDMapRef osdmap
= sdata
->waiting_for_pg_osdmap
;
9769 if (osdmap
->is_up_acting_osd_shard(item
.first
, osd
->whoami
)) {
9770 dout(20) << __func__
<< " " << item
.first
9771 << " no pg, should exist, will wait" << " on " << *qi
<< dendl
;
9772 slot
.to_process
.push_front(*qi
);
9773 slot
.waiting_for_pg
= true;
9774 } else if (qi
->get_map_epoch() > osdmap
->get_epoch()) {
9775 dout(20) << __func__
<< " " << item
.first
<< " no pg, item epoch is "
9776 << qi
->get_map_epoch() << " > " << osdmap
->get_epoch()
9777 << ", will wait on " << *qi
<< dendl
;
9778 slot
.to_process
.push_front(*qi
);
9779 slot
.waiting_for_pg
= true;
9781 dout(20) << __func__
<< " " << item
.first
<< " no pg, shouldn't exist,"
9782 << " dropping " << *qi
<< dendl
;
9783 // share map with client?
9784 if (boost::optional
<OpRequestRef
> _op
= qi
->maybe_get_op()) {
9785 Session
*session
= static_cast<Session
*>(
9786 (*_op
)->get_req()->get_connection()->get_priv());
9788 osd
->maybe_share_map(session
, *_op
, sdata
->waiting_for_pg_osdmap
);
9792 unsigned pushes_to_free
= qi
->get_reserved_pushes();
9793 if (pushes_to_free
> 0) {
9794 sdata
->sdata_op_ordering_lock
.Unlock();
9795 osd
->service
.release_reserved_pushes(pushes_to_free
);
9799 sdata
->sdata_op_ordering_lock
.Unlock();
9802 sdata
->sdata_op_ordering_lock
.Unlock();
9805 // osd_opwq_process marks the point at which an operation has been dequeued
9806 // and will begin to be handled by a worker thread.
9810 if (boost::optional
<OpRequestRef
> _op
= qi
->maybe_get_op()) {
9811 reqid
= (*_op
)->get_reqid();
9814 tracepoint(osd
, opwq_process_start
, reqid
.name
._type
,
9815 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
9818 lgeneric_subdout(osd
->cct
, osd
, 30) << "dequeue status: ";
9819 Formatter
*f
= Formatter::create("json");
9820 f
->open_object_section("q");
9827 ThreadPool::TPHandle
tp_handle(osd
->cct
, hb
, timeout_interval
,
9829 qi
->run(osd
, pg
, tp_handle
);
9834 if (boost::optional
<OpRequestRef
> _op
= qi
->maybe_get_op()) {
9835 reqid
= (*_op
)->get_reqid();
9838 tracepoint(osd
, opwq_process_finish
, reqid
.name
._type
,
9839 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
9845 void OSD::ShardedOpWQ::_enqueue(pair
<spg_t
, PGQueueable
> item
) {
9846 uint32_t shard_index
=
9847 item
.first
.hash_to_shard(shard_list
.size());
9849 ShardData
* sdata
= shard_list
[shard_index
];
9850 assert (NULL
!= sdata
);
9851 unsigned priority
= item
.second
.get_priority();
9852 unsigned cost
= item
.second
.get_cost();
9853 sdata
->sdata_op_ordering_lock
.Lock();
9855 dout(20) << __func__
<< " " << item
.first
<< " " << item
.second
<< dendl
;
9856 if (priority
>= osd
->op_prio_cutoff
)
9857 sdata
->pqueue
->enqueue_strict(
9858 item
.second
.get_owner(), priority
, item
);
9860 sdata
->pqueue
->enqueue(
9861 item
.second
.get_owner(),
9862 priority
, cost
, item
);
9863 sdata
->sdata_op_ordering_lock
.Unlock();
9865 sdata
->sdata_lock
.Lock();
9866 sdata
->sdata_cond
.SignalOne();
9867 sdata
->sdata_lock
.Unlock();
9871 void OSD::ShardedOpWQ::_enqueue_front(pair
<spg_t
, PGQueueable
> item
)
9873 uint32_t shard_index
= item
.first
.hash_to_shard(shard_list
.size());
9874 ShardData
* sdata
= shard_list
[shard_index
];
9875 assert (NULL
!= sdata
);
9876 sdata
->sdata_op_ordering_lock
.Lock();
9877 auto p
= sdata
->pg_slots
.find(item
.first
);
9878 if (p
!= sdata
->pg_slots
.end() && !p
->second
.to_process
.empty()) {
9879 // we may be racing with _process, which has dequeued a new item
9880 // from pqueue, put it on to_process, and is now busy taking the
9881 // pg lock. ensure this old requeued item is ordered before any
9882 // such newer item in to_process.
9883 p
->second
.to_process
.push_front(item
.second
);
9884 item
.second
= p
->second
.to_process
.back();
9885 p
->second
.to_process
.pop_back();
9886 dout(20) << __func__
<< " " << item
.first
9887 << " " << p
->second
.to_process
.front()
9888 << " shuffled w/ " << item
.second
<< dendl
;
9890 dout(20) << __func__
<< " " << item
.first
<< " " << item
.second
<< dendl
;
9892 sdata
->_enqueue_front(item
, osd
->op_prio_cutoff
);
9893 sdata
->sdata_op_ordering_lock
.Unlock();
9894 sdata
->sdata_lock
.Lock();
9895 sdata
->sdata_cond
.SignalOne();
9896 sdata
->sdata_lock
.Unlock();
9900 namespace osd_cmds
{
9902 int heap(CephContext
& cct
, cmdmap_t
& cmdmap
, Formatter
& f
, std::ostream
& os
)
9904 if (!ceph_using_tcmalloc()) {
9905 os
<< "could not issue heap profiler command -- not using tcmalloc!";
9910 if (!cmd_getval(&cct
, cmdmap
, "heapcmd", cmd
)) {
9911 os
<< "unable to get value for command \"" << cmd
<< "\"";
9915 std::vector
<std::string
> cmd_vec
;
9916 get_str_vec(cmd
, cmd_vec
);
9918 ceph_heap_profiler_handle_command(cmd_vec
, os
);
9923 }} // namespace ceph::osd_cmds