1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2017 OVH
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */
23 #include <boost/scoped_ptr.hpp>
26 #ifdef HAVE_SYS_PARAM_H
27 #include <sys/param.h>
30 #ifdef HAVE_SYS_MOUNT_H
31 #include <sys/mount.h>
36 #include "include/types.h"
37 #include "include/compat.h"
42 #include "osdc/Objecter.h"
44 #include "common/errno.h"
45 #include "common/ceph_argparse.h"
46 #include "common/ceph_time.h"
47 #include "common/version.h"
48 #include "common/io_priority.h"
49 #include "common/pick_address.h"
51 #include "os/ObjectStore.h"
53 #include "os/FuseStore.h"
56 #include "PrimaryLogPG.h"
59 #include "msg/Messenger.h"
60 #include "msg/Message.h"
62 #include "mon/MonClient.h"
64 #include "messages/MLog.h"
66 #include "messages/MGenericMessage.h"
67 #include "messages/MOSDPing.h"
68 #include "messages/MOSDFailure.h"
69 #include "messages/MOSDMarkMeDown.h"
70 #include "messages/MOSDFull.h"
71 #include "messages/MOSDOp.h"
72 #include "messages/MOSDOpReply.h"
73 #include "messages/MOSDBackoff.h"
74 #include "messages/MOSDBeacon.h"
75 #include "messages/MOSDRepOp.h"
76 #include "messages/MOSDRepOpReply.h"
77 #include "messages/MOSDBoot.h"
78 #include "messages/MOSDPGTemp.h"
80 #include "messages/MOSDMap.h"
81 #include "messages/MMonGetOSDMap.h"
82 #include "messages/MOSDPGNotify.h"
83 #include "messages/MOSDPGQuery.h"
84 #include "messages/MOSDPGLog.h"
85 #include "messages/MOSDPGRemove.h"
86 #include "messages/MOSDPGInfo.h"
87 #include "messages/MOSDPGCreate.h"
88 #include "messages/MOSDPGTrim.h"
89 #include "messages/MOSDPGScan.h"
90 #include "messages/MOSDPGBackfill.h"
91 #include "messages/MBackfillReserve.h"
92 #include "messages/MRecoveryReserve.h"
93 #include "messages/MOSDForceRecovery.h"
94 #include "messages/MOSDECSubOpWrite.h"
95 #include "messages/MOSDECSubOpWriteReply.h"
96 #include "messages/MOSDECSubOpRead.h"
97 #include "messages/MOSDECSubOpReadReply.h"
98 #include "messages/MOSDPGCreated.h"
99 #include "messages/MOSDPGUpdateLogMissing.h"
100 #include "messages/MOSDPGUpdateLogMissingReply.h"
102 #include "messages/MOSDAlive.h"
104 #include "messages/MOSDScrub.h"
105 #include "messages/MOSDScrubReserve.h"
106 #include "messages/MOSDRepScrub.h"
108 #include "messages/MMonCommand.h"
109 #include "messages/MCommand.h"
110 #include "messages/MCommandReply.h"
112 #include "messages/MPGStats.h"
113 #include "messages/MPGStatsAck.h"
115 #include "messages/MWatchNotify.h"
116 #include "messages/MOSDPGPush.h"
117 #include "messages/MOSDPGPushReply.h"
118 #include "messages/MOSDPGPull.h"
120 #include "common/perf_counters.h"
121 #include "common/Timer.h"
122 #include "common/LogClient.h"
123 #include "common/AsyncReserver.h"
124 #include "common/HeartbeatMap.h"
125 #include "common/admin_socket.h"
126 #include "common/ceph_context.h"
128 #include "global/signal_handler.h"
129 #include "global/pidfile.h"
131 #include "include/color.h"
132 #include "perfglue/cpu_profiler.h"
133 #include "perfglue/heap_profiler.h"
135 #include "osd/OpRequest.h"
137 #include "auth/AuthAuthorizeHandler.h"
138 #include "auth/RotatingKeyRing.h"
139 #include "common/errno.h"
141 #include "objclass/objclass.h"
143 #include "common/cmdparse.h"
144 #include "include/str_list.h"
145 #include "include/util.h"
147 #include "include/assert.h"
148 #include "common/config.h"
149 #include "common/EventTrace.h"
152 #define TRACEPOINT_DEFINE
153 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
154 #include "tracing/osd.h"
155 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
156 #undef TRACEPOINT_DEFINE
158 #define tracepoint(...)
161 #define dout_context cct
162 #define dout_subsys ceph_subsys_osd
164 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
167 static ostream
& _prefix(std::ostream
* _dout
, int whoami
, epoch_t epoch
) {
168 return *_dout
<< "osd." << whoami
<< " " << epoch
<< " ";
171 //Initial features in new superblock.
172 //Features here are also automatically upgraded
173 CompatSet
OSD::get_osd_initial_compat_set() {
174 CompatSet::FeatureSet ceph_osd_feature_compat
;
175 CompatSet::FeatureSet ceph_osd_feature_ro_compat
;
176 CompatSet::FeatureSet ceph_osd_feature_incompat
;
177 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE
);
178 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO
);
179 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC
);
180 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC
);
181 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES
);
182 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL
);
183 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO
);
184 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO
);
185 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG
);
186 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER
);
187 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS
);
188 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA
);
189 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING
);
190 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO
);
191 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES
);
192 return CompatSet(ceph_osd_feature_compat
, ceph_osd_feature_ro_compat
,
193 ceph_osd_feature_incompat
);
196 //Features are added here that this OSD supports.
197 CompatSet
OSD::get_osd_compat_set() {
198 CompatSet compat
= get_osd_initial_compat_set();
199 //Any features here can be set in code, but not in initial superblock
200 compat
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
);
204 OSDService::OSDService(OSD
*osd
) :
207 meta_osr(new ObjectStore::Sequencer("meta")),
208 whoami(osd
->whoami
), store(osd
->store
),
209 log_client(osd
->log_client
), clog(osd
->clog
),
210 pg_recovery_stats(osd
->pg_recovery_stats
),
211 cluster_messenger(osd
->cluster_messenger
),
212 client_messenger(osd
->client_messenger
),
214 recoverystate_perf(osd
->recoverystate_perf
),
216 peering_wq(osd
->peering_wq
),
217 recovery_gen_wq("recovery_gen_wq", cct
->_conf
->osd_recovery_thread_timeout
,
219 class_handler(osd
->class_handler
),
220 pg_epoch_lock("OSDService::pg_epoch_lock"),
221 publish_lock("OSDService::publish_lock"),
222 pre_publish_lock("OSDService::pre_publish_lock"),
224 peer_map_epoch_lock("OSDService::peer_map_epoch_lock"),
225 sched_scrub_lock("OSDService::sched_scrub_lock"), scrubs_pending(0),
227 agent_lock("OSDService::agent_lock"),
228 agent_valid_iterator(false),
230 flush_mode_high_count(0),
233 agent_stop_flag(false),
234 agent_timer_lock("OSDService::agent_timer_lock"),
235 agent_timer(osd
->client_messenger
->cct
, agent_timer_lock
),
236 last_recalibrate(ceph_clock_now()),
237 promote_max_objects(0),
238 promote_max_bytes(0),
239 objecter(new Objecter(osd
->client_messenger
->cct
, osd
->objecter_messenger
, osd
->monc
, NULL
, 0, 0)),
240 objecter_finisher(osd
->client_messenger
->cct
),
241 watch_lock("OSDService::watch_lock"),
242 watch_timer(osd
->client_messenger
->cct
, watch_lock
),
244 recovery_request_lock("OSDService::recovery_request_lock"),
245 recovery_request_timer(cct
, recovery_request_lock
, false),
246 recovery_sleep_lock("OSDService::recovery_sleep_lock"),
247 recovery_sleep_timer(cct
, recovery_sleep_lock
, false),
248 reserver_finisher(cct
),
249 local_reserver(cct
, &reserver_finisher
, cct
->_conf
->osd_max_backfills
,
250 cct
->_conf
->osd_min_recovery_priority
),
251 remote_reserver(cct
, &reserver_finisher
, cct
->_conf
->osd_max_backfills
,
252 cct
->_conf
->osd_min_recovery_priority
),
253 pg_temp_lock("OSDService::pg_temp_lock"),
254 snap_sleep_lock("OSDService::snap_sleep_lock"),
256 osd
->client_messenger
->cct
, snap_sleep_lock
, false /* relax locking */),
257 scrub_sleep_lock("OSDService::scrub_sleep_lock"),
259 osd
->client_messenger
->cct
, scrub_sleep_lock
, false /* relax locking */),
260 snap_reserver(cct
, &reserver_finisher
,
261 cct
->_conf
->osd_max_trimming_pgs
),
262 recovery_lock("OSDService::recovery_lock"),
263 recovery_ops_active(0),
264 recovery_ops_reserved(0),
265 recovery_paused(false),
266 map_cache_lock("OSDService::map_cache_lock"),
267 map_cache(cct
, cct
->_conf
->osd_map_cache_size
),
268 map_bl_cache(cct
->_conf
->osd_map_cache_size
),
269 map_bl_inc_cache(cct
->_conf
->osd_map_cache_size
),
270 in_progress_split_lock("OSDService::in_progress_split_lock"),
271 stat_lock("OSDService::stat_lock"),
272 full_status_lock("OSDService::full_status_lock"),
275 epoch_lock("OSDService::epoch_lock"),
276 boot_epoch(0), up_epoch(0), bind_epoch(0),
277 is_stopping_lock("OSDService::is_stopping_lock")
279 , pgid_lock("OSDService::pgid_lock")
285 OSDService::~OSDService()
293 void OSDService::add_pgid(spg_t pgid
, PG
*pg
){
294 Mutex::Locker
l(pgid_lock
);
295 if (!pgid_tracker
.count(pgid
)) {
298 pgid_tracker
[pgid
]++;
300 void OSDService::remove_pgid(spg_t pgid
, PG
*pg
)
302 Mutex::Locker
l(pgid_lock
);
303 assert(pgid_tracker
.count(pgid
));
304 assert(pgid_tracker
[pgid
] > 0);
305 pgid_tracker
[pgid
]--;
306 if (pgid_tracker
[pgid
] == 0) {
307 pgid_tracker
.erase(pgid
);
308 live_pgs
.erase(pgid
);
311 void OSDService::dump_live_pgids()
313 Mutex::Locker
l(pgid_lock
);
314 derr
<< "live pgids:" << dendl
;
315 for (map
<spg_t
, int>::const_iterator i
= pgid_tracker
.cbegin();
316 i
!= pgid_tracker
.cend();
318 derr
<< "\t" << *i
<< dendl
;
319 live_pgs
[i
->first
]->dump_live_ids();
325 void OSDService::_start_split(spg_t parent
, const set
<spg_t
> &children
)
327 for (set
<spg_t
>::const_iterator i
= children
.begin();
330 dout(10) << __func__
<< ": Starting split on pg " << *i
331 << ", parent=" << parent
<< dendl
;
332 assert(!pending_splits
.count(*i
));
333 assert(!in_progress_splits
.count(*i
));
334 pending_splits
.insert(make_pair(*i
, parent
));
336 assert(!rev_pending_splits
[parent
].count(*i
));
337 rev_pending_splits
[parent
].insert(*i
);
341 void OSDService::mark_split_in_progress(spg_t parent
, const set
<spg_t
> &children
)
343 Mutex::Locker
l(in_progress_split_lock
);
344 map
<spg_t
, set
<spg_t
> >::iterator piter
= rev_pending_splits
.find(parent
);
345 assert(piter
!= rev_pending_splits
.end());
346 for (set
<spg_t
>::const_iterator i
= children
.begin();
349 assert(piter
->second
.count(*i
));
350 assert(pending_splits
.count(*i
));
351 assert(!in_progress_splits
.count(*i
));
352 assert(pending_splits
[*i
] == parent
);
354 pending_splits
.erase(*i
);
355 piter
->second
.erase(*i
);
356 in_progress_splits
.insert(*i
);
358 if (piter
->second
.empty())
359 rev_pending_splits
.erase(piter
);
362 void OSDService::cancel_pending_splits_for_parent(spg_t parent
)
364 Mutex::Locker
l(in_progress_split_lock
);
365 _cancel_pending_splits_for_parent(parent
);
368 void OSDService::_cancel_pending_splits_for_parent(spg_t parent
)
370 map
<spg_t
, set
<spg_t
> >::iterator piter
= rev_pending_splits
.find(parent
);
371 if (piter
== rev_pending_splits
.end())
374 for (set
<spg_t
>::iterator i
= piter
->second
.begin();
375 i
!= piter
->second
.end();
377 assert(pending_splits
.count(*i
));
378 assert(!in_progress_splits
.count(*i
));
379 pending_splits
.erase(*i
);
380 dout(10) << __func__
<< ": Completing split on pg " << *i
381 << " for parent: " << parent
<< dendl
;
382 _cancel_pending_splits_for_parent(*i
);
384 rev_pending_splits
.erase(piter
);
387 void OSDService::_maybe_split_pgid(OSDMapRef old_map
,
391 assert(old_map
->have_pg_pool(pgid
.pool()));
392 int old_pgnum
= old_map
->get_pg_num(pgid
.pool());
393 if (pgid
.ps() < static_cast<unsigned>(old_pgnum
)) {
395 if (pgid
.is_split(old_pgnum
,
396 new_map
->get_pg_num(pgid
.pool()), &children
)) {
397 _start_split(pgid
, children
); }
399 assert(pgid
.ps() < static_cast<unsigned>(new_map
->get_pg_num(pgid
.pool())));
403 void OSDService::init_splits_between(spg_t pgid
,
407 // First, check whether we can avoid this potentially expensive check
408 if (tomap
->have_pg_pool(pgid
.pool()) &&
410 frommap
->get_pg_num(pgid
.pool()),
411 tomap
->get_pg_num(pgid
.pool()),
413 // Ok, a split happened, so we need to walk the osdmaps
414 set
<spg_t
> new_pgs
; // pgs to scan on each map
415 new_pgs
.insert(pgid
);
416 OSDMapRef
curmap(get_map(frommap
->get_epoch()));
417 for (epoch_t e
= frommap
->get_epoch() + 1;
418 e
<= tomap
->get_epoch();
420 OSDMapRef
nextmap(try_get_map(e
));
423 set
<spg_t
> even_newer_pgs
; // pgs added in this loop
424 for (set
<spg_t
>::iterator i
= new_pgs
.begin(); i
!= new_pgs
.end(); ++i
) {
425 set
<spg_t
> split_pgs
;
426 if (i
->is_split(curmap
->get_pg_num(i
->pool()),
427 nextmap
->get_pg_num(i
->pool()),
429 start_split(*i
, split_pgs
);
430 even_newer_pgs
.insert(split_pgs
.begin(), split_pgs
.end());
433 new_pgs
.insert(even_newer_pgs
.begin(), even_newer_pgs
.end());
436 assert(curmap
== tomap
); // we must have had both frommap and tomap
440 void OSDService::expand_pg_num(OSDMapRef old_map
,
443 Mutex::Locker
l(in_progress_split_lock
);
444 for (set
<spg_t
>::iterator i
= in_progress_splits
.begin();
445 i
!= in_progress_splits
.end();
447 if (!new_map
->have_pg_pool(i
->pool())) {
448 in_progress_splits
.erase(i
++);
450 _maybe_split_pgid(old_map
, new_map
, *i
);
454 for (map
<spg_t
, spg_t
>::iterator i
= pending_splits
.begin();
455 i
!= pending_splits
.end();
457 if (!new_map
->have_pg_pool(i
->first
.pool())) {
458 rev_pending_splits
.erase(i
->second
);
459 pending_splits
.erase(i
++);
461 _maybe_split_pgid(old_map
, new_map
, i
->first
);
467 bool OSDService::splitting(spg_t pgid
)
469 Mutex::Locker
l(in_progress_split_lock
);
470 return in_progress_splits
.count(pgid
) ||
471 pending_splits
.count(pgid
);
474 void OSDService::complete_split(const set
<spg_t
> &pgs
)
476 Mutex::Locker
l(in_progress_split_lock
);
477 for (set
<spg_t
>::const_iterator i
= pgs
.begin();
480 dout(10) << __func__
<< ": Completing split on pg " << *i
<< dendl
;
481 assert(!pending_splits
.count(*i
));
482 assert(in_progress_splits
.count(*i
));
483 in_progress_splits
.erase(*i
);
487 void OSDService::need_heartbeat_peer_update()
489 osd
->need_heartbeat_peer_update();
492 void OSDService::pg_stat_queue_enqueue(PG
*pg
)
494 osd
->pg_stat_queue_enqueue(pg
);
497 void OSDService::pg_stat_queue_dequeue(PG
*pg
)
499 osd
->pg_stat_queue_dequeue(pg
);
502 void OSDService::start_shutdown()
505 Mutex::Locker
l(agent_timer_lock
);
506 agent_timer
.shutdown();
510 Mutex::Locker
l(recovery_sleep_lock
);
511 recovery_sleep_timer
.shutdown();
515 void OSDService::shutdown_reserver()
517 reserver_finisher
.wait_for_empty();
518 reserver_finisher
.stop();
521 void OSDService::shutdown()
524 Mutex::Locker
l(watch_lock
);
525 watch_timer
.shutdown();
528 objecter
->shutdown();
529 objecter_finisher
.wait_for_empty();
530 objecter_finisher
.stop();
533 Mutex::Locker
l(recovery_request_lock
);
534 recovery_request_timer
.shutdown();
538 Mutex::Locker
l(snap_sleep_lock
);
539 snap_sleep_timer
.shutdown();
543 Mutex::Locker
l(scrub_sleep_lock
);
544 scrub_sleep_timer
.shutdown();
547 osdmap
= OSDMapRef();
548 next_osdmap
= OSDMapRef();
551 void OSDService::init()
553 reserver_finisher
.start();
554 objecter_finisher
.start();
555 objecter
->set_client_incarnation(0);
557 // deprioritize objecter in daemonperf output
558 objecter
->get_logger()->set_prio_adjust(-3);
562 snap_sleep_timer
.init();
563 scrub_sleep_timer
.init();
565 agent_thread
.create("osd_srv_agent");
567 if (cct
->_conf
->osd_recovery_delay_start
)
568 defer_recovery(cct
->_conf
->osd_recovery_delay_start
);
571 void OSDService::final_init()
573 objecter
->start(osdmap
.get());
576 void OSDService::activate_map()
578 // wake/unwake the tiering agent
581 !osdmap
->test_flag(CEPH_OSDMAP_NOTIERAGENT
) &&
587 void OSDService::request_osdmap_update(epoch_t e
)
589 osd
->osdmap_subscribe(e
, false);
592 class AgentTimeoutCB
: public Context
{
595 explicit AgentTimeoutCB(PGRef _pg
) : pg(_pg
) {}
596 void finish(int) override
{
597 pg
->agent_choose_mode_restart();
601 void OSDService::agent_entry()
603 dout(10) << __func__
<< " start" << dendl
;
606 while (!agent_stop_flag
) {
607 if (agent_queue
.empty()) {
608 dout(20) << __func__
<< " empty queue" << dendl
;
609 agent_cond
.Wait(agent_lock
);
612 uint64_t level
= agent_queue
.rbegin()->first
;
613 set
<PGRef
>& top
= agent_queue
.rbegin()->second
;
615 << " tiers " << agent_queue
.size()
616 << ", top is " << level
617 << " with pgs " << top
.size()
618 << ", ops " << agent_ops
<< "/"
619 << cct
->_conf
->osd_agent_max_ops
620 << (agent_active
? " active" : " NOT ACTIVE")
622 dout(20) << __func__
<< " oids " << agent_oids
<< dendl
;
623 int max
= cct
->_conf
->osd_agent_max_ops
- agent_ops
;
624 int agent_flush_quota
= max
;
625 if (!flush_mode_high_count
)
626 agent_flush_quota
= cct
->_conf
->osd_agent_max_low_ops
- agent_ops
;
627 if (agent_flush_quota
<= 0 || top
.empty() || !agent_active
) {
628 agent_cond
.Wait(agent_lock
);
632 if (!agent_valid_iterator
|| agent_queue_pos
== top
.end()) {
633 agent_queue_pos
= top
.begin();
634 agent_valid_iterator
= true;
636 PGRef pg
= *agent_queue_pos
;
637 dout(10) << "high_count " << flush_mode_high_count
638 << " agent_ops " << agent_ops
639 << " flush_quota " << agent_flush_quota
<< dendl
;
641 if (!pg
->agent_work(max
, agent_flush_quota
)) {
642 dout(10) << __func__
<< " " << pg
->get_pgid()
643 << " no agent_work, delay for " << cct
->_conf
->osd_agent_delay_time
644 << " seconds" << dendl
;
646 osd
->logger
->inc(l_osd_tier_delay
);
647 // Queue a timer to call agent_choose_mode for this pg in 5 seconds
648 agent_timer_lock
.Lock();
649 Context
*cb
= new AgentTimeoutCB(pg
);
650 agent_timer
.add_event_after(cct
->_conf
->osd_agent_delay_time
, cb
);
651 agent_timer_lock
.Unlock();
656 dout(10) << __func__
<< " finish" << dendl
;
659 void OSDService::agent_stop()
662 Mutex::Locker
l(agent_lock
);
664 // By this time all ops should be cancelled
665 assert(agent_ops
== 0);
666 // By this time all PGs are shutdown and dequeued
667 if (!agent_queue
.empty()) {
668 set
<PGRef
>& top
= agent_queue
.rbegin()->second
;
669 derr
<< "agent queue not empty, for example " << (*top
.begin())->info
.pgid
<< dendl
;
670 assert(0 == "agent queue not empty");
673 agent_stop_flag
= true;
679 // -------------------------------------
681 void OSDService::promote_throttle_recalibrate()
683 utime_t now
= ceph_clock_now();
684 double dur
= now
- last_recalibrate
;
685 last_recalibrate
= now
;
686 unsigned prob
= promote_probability_millis
;
688 uint64_t target_obj_sec
= cct
->_conf
->osd_tier_promote_max_objects_sec
;
689 uint64_t target_bytes_sec
= cct
->_conf
->osd_tier_promote_max_bytes_sec
;
691 unsigned min_prob
= 1;
693 uint64_t attempts
, obj
, bytes
;
694 promote_counter
.sample_and_attenuate(&attempts
, &obj
, &bytes
);
695 dout(10) << __func__
<< " " << attempts
<< " attempts, promoted "
696 << obj
<< " objects and " << byte_u_t(bytes
) << "; target "
697 << target_obj_sec
<< " obj/sec or "
698 << byte_u_t(target_bytes_sec
) << "/sec"
701 // calculate what the probability *should* be, given the targets
703 if (attempts
&& dur
> 0) {
704 uint64_t avg_size
= 1;
706 avg_size
= MAX(bytes
/ obj
, 1);
707 unsigned po
= (double)target_obj_sec
* dur
* 1000.0 / (double)attempts
;
708 unsigned pb
= (double)target_bytes_sec
/ (double)avg_size
* dur
* 1000.0
710 dout(20) << __func__
<< " po " << po
<< " pb " << pb
<< " avg_size "
711 << avg_size
<< dendl
;
712 if (target_obj_sec
&& target_bytes_sec
)
713 new_prob
= MIN(po
, pb
);
714 else if (target_obj_sec
)
716 else if (target_bytes_sec
)
723 dout(20) << __func__
<< " new_prob " << new_prob
<< dendl
;
725 // correct for persistent skew between target rate and actual rate, adjust
728 if (attempts
&& obj
) {
729 actual
= obj
* 1000 / attempts
;
730 ratio
= (double)actual
/ (double)prob
;
731 new_prob
= (double)new_prob
/ ratio
;
733 new_prob
= MAX(new_prob
, min_prob
);
734 new_prob
= MIN(new_prob
, 1000);
737 prob
= (prob
+ new_prob
) / 2;
738 prob
= MAX(prob
, min_prob
);
739 prob
= MIN(prob
, 1000);
740 dout(10) << __func__
<< " actual " << actual
741 << ", actual/prob ratio " << ratio
742 << ", adjusted new_prob " << new_prob
743 << ", prob " << promote_probability_millis
<< " -> " << prob
745 promote_probability_millis
= prob
;
747 // set hard limits for this interval to mitigate stampedes
748 promote_max_objects
= target_obj_sec
* osd
->OSD_TICK_INTERVAL
* 2;
749 promote_max_bytes
= target_bytes_sec
* osd
->OSD_TICK_INTERVAL
* 2;
752 // -------------------------------------
754 float OSDService::get_failsafe_full_ratio()
756 float full_ratio
= cct
->_conf
->osd_failsafe_full_ratio
;
757 if (full_ratio
> 1.0) full_ratio
/= 100.0;
761 void OSDService::check_full_status(float ratio
)
763 Mutex::Locker
l(full_status_lock
);
767 // The OSDMap ratios take precendence. So if the failsafe is .95 and
768 // the admin sets the cluster full to .96, the failsafe moves up to .96
769 // too. (Not that having failsafe == full is ideal, but it's better than
770 // dropping writes before the clusters appears full.)
771 OSDMapRef osdmap
= get_osdmap();
772 if (!osdmap
|| osdmap
->get_epoch() == 0) {
776 float nearfull_ratio
= osdmap
->get_nearfull_ratio();
777 float backfillfull_ratio
= std::max(osdmap
->get_backfillfull_ratio(), nearfull_ratio
);
778 float full_ratio
= std::max(osdmap
->get_full_ratio(), backfillfull_ratio
);
779 float failsafe_ratio
= std::max(get_failsafe_full_ratio(), full_ratio
);
781 if (osdmap
->require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
782 // use the failsafe for nearfull and full; the mon isn't using the
783 // flags anyway because we're mid-upgrade.
784 full_ratio
= failsafe_ratio
;
785 backfillfull_ratio
= failsafe_ratio
;
786 nearfull_ratio
= failsafe_ratio
;
787 } else if (full_ratio
<= 0 ||
788 backfillfull_ratio
<= 0 ||
789 nearfull_ratio
<= 0) {
790 derr
<< __func__
<< " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl
;
791 // use failsafe flag. ick. the monitor did something wrong or the user
792 // did something stupid.
793 full_ratio
= failsafe_ratio
;
794 backfillfull_ratio
= failsafe_ratio
;
795 nearfull_ratio
= failsafe_ratio
;
800 if (injectfull_state
> NONE
&& injectfull
) {
801 new_state
= injectfull_state
;
802 inject
= "(Injected)";
803 } else if (ratio
> failsafe_ratio
) {
804 new_state
= FAILSAFE
;
805 } else if (ratio
> full_ratio
) {
807 } else if (ratio
> backfillfull_ratio
) {
808 new_state
= BACKFILLFULL
;
809 } else if (ratio
> nearfull_ratio
) {
810 new_state
= NEARFULL
;
814 dout(20) << __func__
<< " cur ratio " << ratio
815 << ". nearfull_ratio " << nearfull_ratio
816 << ". backfillfull_ratio " << backfillfull_ratio
817 << ", full_ratio " << full_ratio
818 << ", failsafe_ratio " << failsafe_ratio
819 << ", new state " << get_full_state_name(new_state
)
824 if (cur_state
!= new_state
) {
825 dout(10) << __func__
<< " " << get_full_state_name(cur_state
)
826 << " -> " << get_full_state_name(new_state
) << dendl
;
827 if (new_state
== FAILSAFE
) {
828 clog
->error() << "full status failsafe engaged, dropping updates, now "
829 << (int)roundf(ratio
* 100) << "% full";
830 } else if (cur_state
== FAILSAFE
) {
831 clog
->error() << "full status failsafe disengaged, no longer dropping "
832 << "updates, now " << (int)roundf(ratio
* 100) << "% full";
834 cur_state
= new_state
;
838 bool OSDService::need_fullness_update()
840 OSDMapRef osdmap
= get_osdmap();
842 if (osdmap
->exists(whoami
)) {
843 if (osdmap
->get_state(whoami
) & CEPH_OSD_FULL
) {
845 } else if (osdmap
->get_state(whoami
) & CEPH_OSD_BACKFILLFULL
) {
847 } else if (osdmap
->get_state(whoami
) & CEPH_OSD_NEARFULL
) {
854 else if (is_backfillfull())
856 else if (is_nearfull())
861 bool OSDService::_check_full(s_names type
, ostream
&ss
) const
863 Mutex::Locker
l(full_status_lock
);
865 if (injectfull
&& injectfull_state
>= type
) {
866 // injectfull is either a count of the number of times to return failsafe full
867 // or if -1 then always return full
870 ss
<< "Injected " << get_full_state_name(type
) << " OSD ("
871 << (injectfull
< 0 ? "set" : std::to_string(injectfull
)) << ")";
875 ss
<< "current usage is " << cur_ratio
;
876 return cur_state
>= type
;
879 bool OSDService::check_failsafe_full(ostream
&ss
) const
881 return _check_full(FAILSAFE
, ss
);
884 bool OSDService::check_full(ostream
&ss
) const
886 return _check_full(FULL
, ss
);
889 bool OSDService::check_backfill_full(ostream
&ss
) const
891 return _check_full(BACKFILLFULL
, ss
);
894 bool OSDService::check_nearfull(ostream
&ss
) const
896 return _check_full(NEARFULL
, ss
);
899 bool OSDService::is_failsafe_full() const
901 Mutex::Locker
l(full_status_lock
);
902 return cur_state
== FAILSAFE
;
905 bool OSDService::is_full() const
907 Mutex::Locker
l(full_status_lock
);
908 return cur_state
>= FULL
;
911 bool OSDService::is_backfillfull() const
913 Mutex::Locker
l(full_status_lock
);
914 return cur_state
>= BACKFILLFULL
;
917 bool OSDService::is_nearfull() const
919 Mutex::Locker
l(full_status_lock
);
920 return cur_state
>= NEARFULL
;
923 void OSDService::set_injectfull(s_names type
, int64_t count
)
925 Mutex::Locker
l(full_status_lock
);
926 injectfull_state
= type
;
930 osd_stat_t
OSDService::set_osd_stat(const struct store_statfs_t
&stbuf
,
931 vector
<int>& hb_peers
,
934 uint64_t bytes
= stbuf
.total
;
935 uint64_t used
= bytes
- stbuf
.available
;
936 uint64_t avail
= stbuf
.available
;
938 osd
->logger
->set(l_osd_stat_bytes
, bytes
);
939 osd
->logger
->set(l_osd_stat_bytes_used
, used
);
940 osd
->logger
->set(l_osd_stat_bytes_avail
, avail
);
943 Mutex::Locker
l(stat_lock
);
944 osd_stat
.hb_peers
.swap(hb_peers
);
945 osd
->op_tracker
.get_age_ms_histogram(&osd_stat
.op_queue_age_hist
);
946 osd_stat
.kb
= bytes
>> 10;
947 osd_stat
.kb_used
= used
>> 10;
948 osd_stat
.kb_avail
= avail
>> 10;
949 osd_stat
.num_pgs
= num_pgs
;
954 void OSDService::update_osd_stat(vector
<int>& hb_peers
)
956 // load osd stats first
957 struct store_statfs_t stbuf
;
958 int r
= osd
->store
->statfs(&stbuf
);
960 derr
<< "statfs() failed: " << cpp_strerror(r
) << dendl
;
964 auto new_stat
= set_osd_stat(stbuf
, hb_peers
, osd
->get_num_pgs());
965 dout(20) << "update_osd_stat " << new_stat
<< dendl
;
967 float ratio
= ((float)new_stat
.kb_used
) / ((float)new_stat
.kb
);
968 check_full_status(ratio
);
971 bool OSDService::check_osdmap_full(const set
<pg_shard_t
> &missing_on
)
973 OSDMapRef osdmap
= get_osdmap();
974 for (auto shard
: missing_on
) {
975 if (osdmap
->get_state(shard
.osd
) & CEPH_OSD_FULL
)
981 void OSDService::send_message_osd_cluster(int peer
, Message
*m
, epoch_t from_epoch
)
983 OSDMapRef next_map
= get_nextmap_reserved();
984 // service map is always newer/newest
985 assert(from_epoch
<= next_map
->get_epoch());
987 if (next_map
->is_down(peer
) ||
988 next_map
->get_info(peer
).up_from
> from_epoch
) {
990 release_map(next_map
);
993 const entity_inst_t
& peer_inst
= next_map
->get_cluster_inst(peer
);
994 ConnectionRef peer_con
= osd
->cluster_messenger
->get_connection(peer_inst
);
995 share_map_peer(peer
, peer_con
.get(), next_map
);
996 peer_con
->send_message(m
);
997 release_map(next_map
);
1000 ConnectionRef
OSDService::get_con_osd_cluster(int peer
, epoch_t from_epoch
)
1002 OSDMapRef next_map
= get_nextmap_reserved();
1003 // service map is always newer/newest
1004 assert(from_epoch
<= next_map
->get_epoch());
1006 if (next_map
->is_down(peer
) ||
1007 next_map
->get_info(peer
).up_from
> from_epoch
) {
1008 release_map(next_map
);
1011 ConnectionRef con
= osd
->cluster_messenger
->get_connection(next_map
->get_cluster_inst(peer
));
1012 release_map(next_map
);
1016 pair
<ConnectionRef
,ConnectionRef
> OSDService::get_con_osd_hb(int peer
, epoch_t from_epoch
)
1018 OSDMapRef next_map
= get_nextmap_reserved();
1019 // service map is always newer/newest
1020 assert(from_epoch
<= next_map
->get_epoch());
1022 pair
<ConnectionRef
,ConnectionRef
> ret
;
1023 if (next_map
->is_down(peer
) ||
1024 next_map
->get_info(peer
).up_from
> from_epoch
) {
1025 release_map(next_map
);
1028 ret
.first
= osd
->hb_back_client_messenger
->get_connection(next_map
->get_hb_back_inst(peer
));
1029 if (next_map
->get_hb_front_addr(peer
) != entity_addr_t())
1030 ret
.second
= osd
->hb_front_client_messenger
->get_connection(next_map
->get_hb_front_inst(peer
));
1031 release_map(next_map
);
1036 void OSDService::queue_want_pg_temp(pg_t pgid
,
1037 const vector
<int>& want
,
1040 Mutex::Locker
l(pg_temp_lock
);
1041 auto p
= pg_temp_pending
.find(pgid
);
1042 if (p
== pg_temp_pending
.end() ||
1043 p
->second
.acting
!= want
||
1045 pg_temp_wanted
[pgid
] = pg_temp_t
{want
, forced
};
1049 void OSDService::remove_want_pg_temp(pg_t pgid
)
1051 Mutex::Locker
l(pg_temp_lock
);
1052 pg_temp_wanted
.erase(pgid
);
1053 pg_temp_pending
.erase(pgid
);
1056 void OSDService::_sent_pg_temp()
1058 pg_temp_pending
.insert(make_move_iterator(begin(pg_temp_wanted
)),
1059 make_move_iterator(end(pg_temp_wanted
)));
1060 pg_temp_wanted
.clear();
1063 void OSDService::requeue_pg_temp()
1065 Mutex::Locker
l(pg_temp_lock
);
1066 // wanted overrides pending. note that remove_want_pg_temp
1067 // clears the item out of both.
1068 unsigned old_wanted
= pg_temp_wanted
.size();
1069 unsigned old_pending
= pg_temp_pending
.size();
1071 pg_temp_wanted
.swap(pg_temp_pending
);
1072 dout(10) << __func__
<< " " << old_wanted
<< " + " << old_pending
<< " -> "
1073 << pg_temp_wanted
.size() << dendl
;
1076 std::ostream
& operator<<(std::ostream
& out
,
1077 const OSDService::pg_temp_t
& pg_temp
)
1079 out
<< pg_temp
.acting
;
1080 if (pg_temp
.forced
) {
1086 void OSDService::send_pg_temp()
1088 Mutex::Locker
l(pg_temp_lock
);
1089 if (pg_temp_wanted
.empty())
1091 dout(10) << "send_pg_temp " << pg_temp_wanted
<< dendl
;
1092 MOSDPGTemp
*ms
[2] = {nullptr, nullptr};
1093 for (auto& pg_temp
: pg_temp_wanted
) {
1094 auto& m
= ms
[pg_temp
.second
.forced
];
1096 m
= new MOSDPGTemp(osdmap
->get_epoch());
1097 m
->forced
= pg_temp
.second
.forced
;
1099 m
->pg_temp
.emplace(pg_temp
.first
,
1100 pg_temp
.second
.acting
);
1104 monc
->send_mon_message(m
);
1110 void OSDService::send_pg_created(pg_t pgid
)
1112 dout(20) << __func__
<< dendl
;
1113 if (osdmap
->require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
1114 monc
->send_mon_message(new MOSDPGCreated(pgid
));
1118 // --------------------------------------
1121 epoch_t
OSDService::get_peer_epoch(int peer
)
1123 Mutex::Locker
l(peer_map_epoch_lock
);
1124 map
<int,epoch_t
>::iterator p
= peer_map_epoch
.find(peer
);
1125 if (p
== peer_map_epoch
.end())
1130 epoch_t
OSDService::note_peer_epoch(int peer
, epoch_t e
)
1132 Mutex::Locker
l(peer_map_epoch_lock
);
1133 map
<int,epoch_t
>::iterator p
= peer_map_epoch
.find(peer
);
1134 if (p
!= peer_map_epoch
.end()) {
1135 if (p
->second
< e
) {
1136 dout(10) << "note_peer_epoch osd." << peer
<< " has " << e
<< dendl
;
1139 dout(30) << "note_peer_epoch osd." << peer
<< " has " << p
->second
<< " >= " << e
<< dendl
;
1143 dout(10) << "note_peer_epoch osd." << peer
<< " now has " << e
<< dendl
;
1144 peer_map_epoch
[peer
] = e
;
1149 void OSDService::forget_peer_epoch(int peer
, epoch_t as_of
)
1151 Mutex::Locker
l(peer_map_epoch_lock
);
1152 map
<int,epoch_t
>::iterator p
= peer_map_epoch
.find(peer
);
1153 if (p
!= peer_map_epoch
.end()) {
1154 if (p
->second
<= as_of
) {
1155 dout(10) << "forget_peer_epoch osd." << peer
<< " as_of " << as_of
1156 << " had " << p
->second
<< dendl
;
1157 peer_map_epoch
.erase(p
);
1159 dout(10) << "forget_peer_epoch osd." << peer
<< " as_of " << as_of
1160 << " has " << p
->second
<< " - not forgetting" << dendl
;
1165 bool OSDService::should_share_map(entity_name_t name
, Connection
*con
,
1166 epoch_t epoch
, const OSDMapRef
& osdmap
,
1167 const epoch_t
*sent_epoch_p
)
1169 dout(20) << "should_share_map "
1170 << name
<< " " << con
->get_peer_addr()
1171 << " " << epoch
<< dendl
;
1173 // does client have old map?
1174 if (name
.is_client()) {
1175 bool message_sendmap
= epoch
< osdmap
->get_epoch();
1176 if (message_sendmap
&& sent_epoch_p
) {
1177 dout(20) << "client session last_sent_epoch: "
1179 << " versus osdmap epoch " << osdmap
->get_epoch() << dendl
;
1180 if (*sent_epoch_p
< osdmap
->get_epoch()) {
1182 } // else we don't need to send it out again
1186 if (con
->get_messenger() == osd
->cluster_messenger
&&
1187 con
!= osd
->cluster_messenger
->get_loopback_connection() &&
1188 osdmap
->is_up(name
.num()) &&
1189 (osdmap
->get_cluster_addr(name
.num()) == con
->get_peer_addr() ||
1190 osdmap
->get_hb_back_addr(name
.num()) == con
->get_peer_addr())) {
1192 epoch_t has
= MAX(get_peer_epoch(name
.num()), epoch
);
1195 if (has
< osdmap
->get_epoch()) {
1196 dout(10) << name
<< " " << con
->get_peer_addr()
1197 << " has old map " << epoch
<< " < "
1198 << osdmap
->get_epoch() << dendl
;
1206 void OSDService::share_map(
1211 epoch_t
*sent_epoch_p
)
1213 dout(20) << "share_map "
1214 << name
<< " " << con
->get_peer_addr()
1215 << " " << epoch
<< dendl
;
1217 if (!osd
->is_active()) {
1218 /*It is safe not to proceed as OSD is not in healthy state*/
1222 bool want_shared
= should_share_map(name
, con
, epoch
,
1223 osdmap
, sent_epoch_p
);
1226 if (name
.is_client()) {
1227 dout(10) << name
<< " has old map " << epoch
1228 << " < " << osdmap
->get_epoch() << dendl
;
1229 // we know the Session is valid or we wouldn't be sending
1231 *sent_epoch_p
= osdmap
->get_epoch();
1233 send_incremental_map(epoch
, con
, osdmap
);
1234 } else if (con
->get_messenger() == osd
->cluster_messenger
&&
1235 osdmap
->is_up(name
.num()) &&
1236 (osdmap
->get_cluster_addr(name
.num()) == con
->get_peer_addr() ||
1237 osdmap
->get_hb_back_addr(name
.num()) == con
->get_peer_addr())) {
1238 dout(10) << name
<< " " << con
->get_peer_addr()
1239 << " has old map " << epoch
<< " < "
1240 << osdmap
->get_epoch() << dendl
;
1241 note_peer_epoch(name
.num(), osdmap
->get_epoch());
1242 send_incremental_map(epoch
, con
, osdmap
);
1247 void OSDService::share_map_peer(int peer
, Connection
*con
, OSDMapRef map
)
1253 epoch_t pe
= get_peer_epoch(peer
);
1255 if (pe
< map
->get_epoch()) {
1256 send_incremental_map(pe
, con
, map
);
1257 note_peer_epoch(peer
, map
->get_epoch());
1259 dout(20) << "share_map_peer " << con
<< " already has epoch " << pe
<< dendl
;
1261 dout(20) << "share_map_peer " << con
<< " don't know epoch, doing nothing" << dendl
;
1262 // no idea about peer's epoch.
1263 // ??? send recent ???
1268 bool OSDService::can_inc_scrubs_pending()
1270 bool can_inc
= false;
1271 Mutex::Locker
l(sched_scrub_lock
);
1273 if (scrubs_pending
+ scrubs_active
< cct
->_conf
->osd_max_scrubs
) {
1274 dout(20) << __func__
<< " " << scrubs_pending
<< " -> " << (scrubs_pending
+1)
1275 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", active " << scrubs_active
1279 dout(20) << __func__
<< " " << scrubs_pending
<< " + " << scrubs_active
1280 << " active >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1286 bool OSDService::inc_scrubs_pending()
1288 bool result
= false;
1290 sched_scrub_lock
.Lock();
1291 if (scrubs_pending
+ scrubs_active
< cct
->_conf
->osd_max_scrubs
) {
1292 dout(20) << "inc_scrubs_pending " << scrubs_pending
<< " -> " << (scrubs_pending
+1)
1293 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", active " << scrubs_active
<< ")" << dendl
;
1297 dout(20) << "inc_scrubs_pending " << scrubs_pending
<< " + " << scrubs_active
<< " active >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1299 sched_scrub_lock
.Unlock();
1304 void OSDService::dec_scrubs_pending()
1306 sched_scrub_lock
.Lock();
1307 dout(20) << "dec_scrubs_pending " << scrubs_pending
<< " -> " << (scrubs_pending
-1)
1308 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", active " << scrubs_active
<< ")" << dendl
;
1310 assert(scrubs_pending
>= 0);
1311 sched_scrub_lock
.Unlock();
1314 void OSDService::inc_scrubs_active(bool reserved
)
1316 sched_scrub_lock
.Lock();
1320 dout(20) << "inc_scrubs_active " << (scrubs_active
-1) << " -> " << scrubs_active
1321 << " (max " << cct
->_conf
->osd_max_scrubs
1322 << ", pending " << (scrubs_pending
+1) << " -> " << scrubs_pending
<< ")" << dendl
;
1323 assert(scrubs_pending
>= 0);
1325 dout(20) << "inc_scrubs_active " << (scrubs_active
-1) << " -> " << scrubs_active
1326 << " (max " << cct
->_conf
->osd_max_scrubs
1327 << ", pending " << scrubs_pending
<< ")" << dendl
;
1329 sched_scrub_lock
.Unlock();
1332 void OSDService::dec_scrubs_active()
1334 sched_scrub_lock
.Lock();
1335 dout(20) << "dec_scrubs_active " << scrubs_active
<< " -> " << (scrubs_active
-1)
1336 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", pending " << scrubs_pending
<< ")" << dendl
;
1338 assert(scrubs_active
>= 0);
1339 sched_scrub_lock
.Unlock();
1342 void OSDService::retrieve_epochs(epoch_t
*_boot_epoch
, epoch_t
*_up_epoch
,
1343 epoch_t
*_bind_epoch
) const
1345 Mutex::Locker
l(epoch_lock
);
1347 *_boot_epoch
= boot_epoch
;
1349 *_up_epoch
= up_epoch
;
1351 *_bind_epoch
= bind_epoch
;
1354 void OSDService::set_epochs(const epoch_t
*_boot_epoch
, const epoch_t
*_up_epoch
,
1355 const epoch_t
*_bind_epoch
)
1357 Mutex::Locker
l(epoch_lock
);
1359 assert(*_boot_epoch
== 0 || *_boot_epoch
>= boot_epoch
);
1360 boot_epoch
= *_boot_epoch
;
1363 assert(*_up_epoch
== 0 || *_up_epoch
>= up_epoch
);
1364 up_epoch
= *_up_epoch
;
1367 assert(*_bind_epoch
== 0 || *_bind_epoch
>= bind_epoch
);
1368 bind_epoch
= *_bind_epoch
;
1372 bool OSDService::prepare_to_stop()
1374 Mutex::Locker
l(is_stopping_lock
);
1375 if (get_state() != NOT_STOPPING
)
1378 OSDMapRef osdmap
= get_osdmap();
1379 if (osdmap
&& osdmap
->is_up(whoami
)) {
1380 dout(0) << __func__
<< " telling mon we are shutting down" << dendl
;
1381 set_state(PREPARING_TO_STOP
);
1382 monc
->send_mon_message(new MOSDMarkMeDown(monc
->get_fsid(),
1383 osdmap
->get_inst(whoami
),
1384 osdmap
->get_epoch(),
1387 utime_t now
= ceph_clock_now();
1389 timeout
.set_from_double(now
+ cct
->_conf
->osd_mon_shutdown_timeout
);
1390 while ((ceph_clock_now() < timeout
) &&
1391 (get_state() != STOPPING
)) {
1392 is_stopping_cond
.WaitUntil(is_stopping_lock
, timeout
);
1395 dout(0) << __func__
<< " starting shutdown" << dendl
;
1396 set_state(STOPPING
);
1400 void OSDService::got_stop_ack()
1402 Mutex::Locker
l(is_stopping_lock
);
1403 if (get_state() == PREPARING_TO_STOP
) {
1404 dout(0) << __func__
<< " starting shutdown" << dendl
;
1405 set_state(STOPPING
);
1406 is_stopping_cond
.Signal();
1408 dout(10) << __func__
<< " ignoring msg" << dendl
;
1412 MOSDMap
*OSDService::build_incremental_map_msg(epoch_t since
, epoch_t to
,
1413 OSDSuperblock
& sblock
)
1415 MOSDMap
*m
= new MOSDMap(monc
->get_fsid(),
1416 osdmap
->get_encoding_features());
1417 m
->oldest_map
= max_oldest_map
;
1418 m
->newest_map
= sblock
.newest_map
;
1420 for (epoch_t e
= to
; e
> since
; e
--) {
1422 if (e
> m
->oldest_map
&& get_inc_map_bl(e
, bl
)) {
1423 m
->incremental_maps
[e
].claim(bl
);
1424 } else if (get_map_bl(e
, bl
)) {
1425 m
->maps
[e
].claim(bl
);
1428 derr
<< "since " << since
<< " to " << to
1429 << " oldest " << m
->oldest_map
<< " newest " << m
->newest_map
1439 void OSDService::send_map(MOSDMap
*m
, Connection
*con
)
1441 con
->send_message(m
);
1444 void OSDService::send_incremental_map(epoch_t since
, Connection
*con
,
1447 epoch_t to
= osdmap
->get_epoch();
1448 dout(10) << "send_incremental_map " << since
<< " -> " << to
1449 << " to " << con
<< " " << con
->get_peer_addr() << dendl
;
1453 OSDSuperblock
sblock(get_superblock());
1454 if (since
< sblock
.oldest_map
) {
1455 // just send latest full map
1456 MOSDMap
*m
= new MOSDMap(monc
->get_fsid(),
1457 osdmap
->get_encoding_features());
1458 m
->oldest_map
= max_oldest_map
;
1459 m
->newest_map
= sblock
.newest_map
;
1460 get_map_bl(to
, m
->maps
[to
]);
1465 if (to
> since
&& (int64_t)(to
- since
) > cct
->_conf
->osd_map_share_max_epochs
) {
1466 dout(10) << " " << (to
- since
) << " > max " << cct
->_conf
->osd_map_share_max_epochs
1467 << ", only sending most recent" << dendl
;
1468 since
= to
- cct
->_conf
->osd_map_share_max_epochs
;
1471 if (to
- since
> (epoch_t
)cct
->_conf
->osd_map_message_max
)
1472 to
= since
+ cct
->_conf
->osd_map_message_max
;
1473 m
= build_incremental_map_msg(since
, to
, sblock
);
1478 bool OSDService::_get_map_bl(epoch_t e
, bufferlist
& bl
)
1480 bool found
= map_bl_cache
.lookup(e
, &bl
);
1483 logger
->inc(l_osd_map_bl_cache_hit
);
1487 logger
->inc(l_osd_map_bl_cache_miss
);
1488 found
= store
->read(coll_t::meta(),
1489 OSD::get_osdmap_pobject_name(e
), 0, 0, bl
,
1490 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
) >= 0;
1497 bool OSDService::get_inc_map_bl(epoch_t e
, bufferlist
& bl
)
1499 Mutex::Locker
l(map_cache_lock
);
1500 bool found
= map_bl_inc_cache
.lookup(e
, &bl
);
1503 logger
->inc(l_osd_map_bl_cache_hit
);
1507 logger
->inc(l_osd_map_bl_cache_miss
);
1508 found
= store
->read(coll_t::meta(),
1509 OSD::get_inc_osdmap_pobject_name(e
), 0, 0, bl
,
1510 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
) >= 0;
1512 _add_map_inc_bl(e
, bl
);
1517 void OSDService::_add_map_bl(epoch_t e
, bufferlist
& bl
)
1519 dout(10) << "add_map_bl " << e
<< " " << bl
.length() << " bytes" << dendl
;
1520 // cache a contiguous buffer
1521 if (bl
.get_num_buffers() > 1) {
1524 bl
.try_assign_to_mempool(mempool::mempool_osd_mapbl
);
1525 map_bl_cache
.add(e
, bl
);
1528 void OSDService::_add_map_inc_bl(epoch_t e
, bufferlist
& bl
)
1530 dout(10) << "add_map_inc_bl " << e
<< " " << bl
.length() << " bytes" << dendl
;
1531 // cache a contiguous buffer
1532 if (bl
.get_num_buffers() > 1) {
1535 bl
.try_assign_to_mempool(mempool::mempool_osd_mapbl
);
1536 map_bl_inc_cache
.add(e
, bl
);
1539 void OSDService::pin_map_inc_bl(epoch_t e
, bufferlist
&bl
)
1541 Mutex::Locker
l(map_cache_lock
);
1542 // cache a contiguous buffer
1543 if (bl
.get_num_buffers() > 1) {
1546 map_bl_inc_cache
.pin(e
, bl
);
1549 void OSDService::pin_map_bl(epoch_t e
, bufferlist
&bl
)
1551 Mutex::Locker
l(map_cache_lock
);
1552 // cache a contiguous buffer
1553 if (bl
.get_num_buffers() > 1) {
1556 map_bl_cache
.pin(e
, bl
);
1559 void OSDService::clear_map_bl_cache_pins(epoch_t e
)
1561 Mutex::Locker
l(map_cache_lock
);
1562 map_bl_inc_cache
.clear_pinned(e
);
1563 map_bl_cache
.clear_pinned(e
);
1566 OSDMapRef
OSDService::_add_map(OSDMap
*o
)
1568 epoch_t e
= o
->get_epoch();
1570 if (cct
->_conf
->osd_map_dedup
) {
1571 // Dedup against an existing map at a nearby epoch
1572 OSDMapRef for_dedup
= map_cache
.lower_bound(e
);
1574 OSDMap::dedup(for_dedup
.get(), o
);
1578 OSDMapRef l
= map_cache
.add(e
, o
, &existed
);
1585 OSDMapRef
OSDService::try_get_map(epoch_t epoch
)
1587 Mutex::Locker
l(map_cache_lock
);
1588 OSDMapRef retval
= map_cache
.lookup(epoch
);
1590 dout(30) << "get_map " << epoch
<< " -cached" << dendl
;
1592 logger
->inc(l_osd_map_cache_hit
);
1597 logger
->inc(l_osd_map_cache_miss
);
1598 epoch_t lb
= map_cache
.cached_key_lower_bound();
1600 dout(30) << "get_map " << epoch
<< " - miss, below lower bound" << dendl
;
1601 logger
->inc(l_osd_map_cache_miss_low
);
1602 logger
->inc(l_osd_map_cache_miss_low_avg
, lb
- epoch
);
1606 OSDMap
*map
= new OSDMap
;
1608 dout(20) << "get_map " << epoch
<< " - loading and decoding " << map
<< dendl
;
1610 if (!_get_map_bl(epoch
, bl
) || bl
.length() == 0) {
1611 derr
<< "failed to load OSD map for epoch " << epoch
<< ", got " << bl
.length() << " bytes" << dendl
;
1617 dout(20) << "get_map " << epoch
<< " - return initial " << map
<< dendl
;
1619 return _add_map(map
);
1625 void OSDService::reply_op_error(OpRequestRef op
, int err
)
1627 reply_op_error(op
, err
, eversion_t(), 0);
1630 void OSDService::reply_op_error(OpRequestRef op
, int err
, eversion_t v
,
1633 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
1634 assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1636 flags
= m
->get_flags() & (CEPH_OSD_FLAG_ACK
|CEPH_OSD_FLAG_ONDISK
);
1638 MOSDOpReply
*reply
= new MOSDOpReply(m
, err
, osdmap
->get_epoch(), flags
,
1640 reply
->set_reply_versions(v
, uv
);
1641 m
->get_connection()->send_message(reply
);
1644 void OSDService::handle_misdirected_op(PG
*pg
, OpRequestRef op
)
1646 if (!cct
->_conf
->osd_debug_misdirected_ops
) {
1650 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
1651 assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1653 assert(m
->get_map_epoch() >= pg
->info
.history
.same_primary_since
);
1655 if (pg
->is_ec_pg()) {
1657 * OSD recomputes op target based on current OSDMap. With an EC pg, we
1658 * can get this result:
1659 * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
1660 * [CRUSH_ITEM_NONE, 2, 3]/3
1661 * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
1663 * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
1665 * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
1668 * We can't compute the op target based on the sending map epoch due to
1669 * splitting. The simplest thing is to detect such cases here and drop
1670 * them without an error (the client will resend anyway).
1672 assert(m
->get_map_epoch() <= superblock
.newest_map
);
1673 OSDMapRef opmap
= try_get_map(m
->get_map_epoch());
1675 dout(7) << __func__
<< ": " << *pg
<< " no longer have map for "
1676 << m
->get_map_epoch() << ", dropping" << dendl
;
1679 pg_t _pgid
= m
->get_raw_pg();
1681 if ((m
->get_flags() & CEPH_OSD_FLAG_PGOP
) == 0)
1682 _pgid
= opmap
->raw_pg_to_pg(_pgid
);
1683 if (opmap
->get_primary_shard(_pgid
, &pgid
) &&
1684 pgid
.shard
!= pg
->info
.pgid
.shard
) {
1685 dout(7) << __func__
<< ": " << *pg
<< " primary changed since "
1686 << m
->get_map_epoch() << ", dropping" << dendl
;
1691 dout(7) << *pg
<< " misdirected op in " << m
->get_map_epoch() << dendl
;
1692 clog
->warn() << m
->get_source_inst() << " misdirected " << m
->get_reqid()
1693 << " pg " << m
->get_raw_pg()
1694 << " to osd." << whoami
1695 << " not " << pg
->acting
1696 << " in e" << m
->get_map_epoch() << "/" << osdmap
->get_epoch();
1699 void OSDService::enqueue_back(spg_t pgid
, PGQueueable qi
)
1701 osd
->op_shardedwq
.queue(make_pair(pgid
, qi
));
1704 void OSDService::enqueue_front(spg_t pgid
, PGQueueable qi
)
1706 osd
->op_shardedwq
.queue_front(make_pair(pgid
, qi
));
1709 void OSDService::queue_for_peering(PG
*pg
)
1711 peering_wq
.queue(pg
);
1714 void OSDService::queue_for_snap_trim(PG
*pg
)
1716 dout(10) << "queueing " << *pg
<< " for snaptrim" << dendl
;
1717 osd
->op_shardedwq
.queue(
1721 PGSnapTrim(pg
->get_osdmap()->get_epoch()),
1722 cct
->_conf
->osd_snap_trim_cost
,
1723 cct
->_conf
->osd_snap_trim_priority
,
1726 pg
->get_osdmap()->get_epoch())));
1730 // ====================================================================
1734 #define dout_prefix *_dout
1736 // Commands shared between OSD's console and admin console:
1738 namespace osd_cmds
{
1740 int heap(CephContext
& cct
, cmdmap_t
& cmdmap
, Formatter
& f
, std::ostream
& os
);
1742 }} // namespace ceph::osd_cmds
1744 int OSD::mkfs(CephContext
*cct
, ObjectStore
*store
, const string
&dev
,
1745 uuid_d fsid
, int whoami
)
1749 ceph::shared_ptr
<ObjectStore::Sequencer
> osr(
1750 new ObjectStore::Sequencer("mkfs"));
1755 // if we are fed a uuid for this osd, use it.
1756 store
->set_fsid(cct
->_conf
->osd_uuid
);
1758 ret
= store
->mkfs();
1760 derr
<< "OSD::mkfs: ObjectStore::mkfs failed with error "
1761 << cpp_strerror(ret
) << dendl
;
1765 store
->set_cache_shards(1); // doesn't matter for mkfs!
1767 ret
= store
->mount();
1769 derr
<< "OSD::mkfs: couldn't mount ObjectStore: error "
1770 << cpp_strerror(ret
) << dendl
;
1774 ret
= store
->read(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT
, 0, 0, sbbl
);
1776 /* if we already have superblock, check content of superblock */
1777 dout(0) << " have superblock" << dendl
;
1778 bufferlist::iterator p
;
1781 if (whoami
!= sb
.whoami
) {
1782 derr
<< "provided osd id " << whoami
<< " != superblock's " << sb
.whoami
1787 if (fsid
!= sb
.cluster_fsid
) {
1788 derr
<< "provided cluster fsid " << fsid
1789 << " != superblock's " << sb
.cluster_fsid
<< dendl
;
1794 // create superblock
1795 sb
.cluster_fsid
= fsid
;
1796 sb
.osd_fsid
= store
->get_fsid();
1798 sb
.compat_features
= get_osd_initial_compat_set();
1803 ObjectStore::Transaction t
;
1804 t
.create_collection(coll_t::meta(), 0);
1805 t
.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT
, 0, bl
.length(), bl
);
1806 ret
= store
->apply_transaction(osr
.get(), std::move(t
));
1808 derr
<< "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
1809 << "apply_transaction returned " << cpp_strerror(ret
) << dendl
;
1814 if (!osr
->flush_commit(&waiter
)) {
1818 ret
= write_meta(cct
, store
, sb
.cluster_fsid
, sb
.osd_fsid
, whoami
);
1820 derr
<< "OSD::mkfs: failed to write fsid file: error "
1821 << cpp_strerror(ret
) << dendl
;
1832 int OSD::write_meta(CephContext
*cct
, ObjectStore
*store
, uuid_d
& cluster_fsid
, uuid_d
& osd_fsid
, int whoami
)
1837 snprintf(val
, sizeof(val
), "%s", CEPH_OSD_ONDISK_MAGIC
);
1838 r
= store
->write_meta("magic", val
);
1842 snprintf(val
, sizeof(val
), "%d", whoami
);
1843 r
= store
->write_meta("whoami", val
);
1847 cluster_fsid
.print(val
);
1848 r
= store
->write_meta("ceph_fsid", val
);
1852 string key
= cct
->_conf
->get_val
<string
>("key");
1854 r
= store
->write_meta("osd_key", key
);
1858 string keyfile
= cct
->_conf
->get_val
<string
>("keyfile");
1859 if (!keyfile
.empty()) {
1862 if (keyfile
== "-") {
1863 static_assert(1024 * 1024 >
1864 (sizeof(CryptoKey
) - sizeof(bufferptr
) +
1865 sizeof(__u16
) + 16 /* AES_KEY_LEN */ + 3 - 1) / 3. * 4.,
1866 "1MB should be enough for a base64 encoded CryptoKey");
1867 r
= keybl
.read_fd(STDIN_FILENO
, 1024 * 1024);
1869 r
= keybl
.read_file(keyfile
.c_str(), &err
);
1872 derr
<< __func__
<< " failed to read keyfile " << keyfile
<< ": "
1873 << err
<< ": " << cpp_strerror(r
) << dendl
;
1876 r
= store
->write_meta("osd_key", keybl
.to_str());
1882 r
= store
->write_meta("ready", "ready");
1889 int OSD::peek_meta(ObjectStore
*store
, std::string
& magic
,
1890 uuid_d
& cluster_fsid
, uuid_d
& osd_fsid
, int& whoami
)
1894 int r
= store
->read_meta("magic", &val
);
1899 r
= store
->read_meta("whoami", &val
);
1902 whoami
= atoi(val
.c_str());
1904 r
= store
->read_meta("ceph_fsid", &val
);
1907 r
= cluster_fsid
.parse(val
.c_str());
1911 r
= store
->read_meta("fsid", &val
);
1913 osd_fsid
= uuid_d();
1915 r
= osd_fsid
.parse(val
.c_str());
1925 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
1929 OSD::OSD(CephContext
*cct_
, ObjectStore
*store_
,
1931 Messenger
*internal_messenger
,
1932 Messenger
*external_messenger
,
1933 Messenger
*hb_client_front
,
1934 Messenger
*hb_client_back
,
1935 Messenger
*hb_front_serverm
,
1936 Messenger
*hb_back_serverm
,
1937 Messenger
*osdc_messenger
,
1939 const std::string
&dev
, const std::string
&jdev
) :
1941 osd_lock("OSD::osd_lock"),
1942 tick_timer(cct
, osd_lock
),
1943 tick_timer_lock("OSD::tick_timer_lock"),
1944 tick_timer_without_osd_lock(cct
, tick_timer_lock
),
1945 authorize_handler_cluster_registry(new AuthAuthorizeHandlerRegistry(cct
,
1946 cct
->_conf
->auth_supported
.empty() ?
1947 cct
->_conf
->auth_cluster_required
:
1948 cct
->_conf
->auth_supported
)),
1949 authorize_handler_service_registry(new AuthAuthorizeHandlerRegistry(cct
,
1950 cct
->_conf
->auth_supported
.empty() ?
1951 cct
->_conf
->auth_service_required
:
1952 cct
->_conf
->auth_supported
)),
1953 cluster_messenger(internal_messenger
),
1954 client_messenger(external_messenger
),
1955 objecter_messenger(osdc_messenger
),
1957 mgrc(cct_
, client_messenger
),
1959 recoverystate_perf(NULL
),
1961 log_client(cct
, client_messenger
, &mc
->monmap
, LogClient::NO_FLAGS
),
1962 clog(log_client
.create_channel()),
1964 dev_path(dev
), journal_path(jdev
),
1965 store_is_rotational(store
->is_rotational()),
1966 trace_endpoint("0.0.0.0", 0, "osd"),
1968 osd_compat(get_osd_compat_set()),
1969 peering_tp(cct
, "OSD::peering_tp", "tp_peering",
1970 cct
->_conf
->osd_peering_wq_threads
,
1971 "osd_peering_tp_threads"),
1972 osd_op_tp(cct
, "OSD::osd_op_tp", "tp_osd_tp",
1973 get_num_op_threads()),
1974 remove_tp(cct
, "OSD::remove_tp", "tp_osd_remove", cct
->_conf
->osd_remove_threads
, "osd_remove_threads"),
1975 recovery_tp(cct
, "OSD::recovery_tp", "tp_osd_recovery", cct
->_conf
->osd_recovery_threads
, "osd_recovery_threads"),
1976 command_tp(cct
, "OSD::command_tp", "tp_osd_cmd", 1),
1977 session_waiting_lock("OSD::session_waiting_lock"),
1978 osdmap_subscribe_lock("OSD::osdmap_subscribe_lock"),
1979 heartbeat_lock("OSD::heartbeat_lock"),
1980 heartbeat_stop(false),
1981 heartbeat_need_update(true),
1982 hb_front_client_messenger(hb_client_front
),
1983 hb_back_client_messenger(hb_client_back
),
1984 hb_front_server_messenger(hb_front_serverm
),
1985 hb_back_server_messenger(hb_back_serverm
),
1987 heartbeat_thread(this),
1988 heartbeat_dispatcher(this),
1989 op_tracker(cct
, cct
->_conf
->osd_enable_op_tracker
,
1990 cct
->_conf
->osd_num_op_tracker_shard
),
1991 test_ops_hook(NULL
),
1992 op_queue(get_io_queue()),
1993 op_prio_cutoff(get_io_prio_cut()),
1995 get_num_op_shards(),
1997 cct
->_conf
->osd_op_thread_timeout
,
1998 cct
->_conf
->osd_op_thread_suicide_timeout
,
2002 cct
->_conf
->osd_op_thread_timeout
,
2003 cct
->_conf
->osd_op_thread_suicide_timeout
,
2005 map_lock("OSD::map_lock"),
2006 pg_map_lock("OSD::pg_map_lock"),
2007 last_pg_create_epoch(0),
2008 mon_report_lock("OSD::mon_report_lock"),
2009 stats_ack_timeout(cct
->_conf
->osd_mon_ack_timeout
),
2011 requested_full_first(0),
2012 requested_full_last(0),
2013 pg_stat_queue_lock("OSD::pg_stat_queue_lock"),
2014 osd_stat_updated(false),
2015 pg_stat_tid(0), pg_stat_tid_flushed(0),
2018 cct
->_conf
->osd_command_thread_timeout
,
2019 cct
->_conf
->osd_command_thread_suicide_timeout
,
2024 cct
->_conf
->osd_remove_thread_timeout
,
2025 cct
->_conf
->osd_remove_thread_suicide_timeout
,
2029 monc
->set_messenger(client_messenger
);
2030 op_tracker
.set_complaint_and_threshold(cct
->_conf
->osd_op_complaint_time
,
2031 cct
->_conf
->osd_op_log_threshold
);
2032 op_tracker
.set_history_size_and_duration(cct
->_conf
->osd_op_history_size
,
2033 cct
->_conf
->osd_op_history_duration
);
2034 op_tracker
.set_history_slow_op_size_and_threshold(cct
->_conf
->osd_op_history_slow_op_size
,
2035 cct
->_conf
->osd_op_history_slow_op_threshold
);
2037 std::stringstream ss
;
2038 ss
<< "osd." << whoami
;
2039 trace_endpoint
.copy_name(ss
.str());
2045 delete authorize_handler_cluster_registry
;
2046 delete authorize_handler_service_registry
;
2047 delete class_handler
;
2048 cct
->get_perfcounters_collection()->remove(recoverystate_perf
);
2049 cct
->get_perfcounters_collection()->remove(logger
);
2050 delete recoverystate_perf
;
2055 double OSD::get_tick_interval() const
2057 // vary +/- 5% to avoid scrub scheduling livelocks
2058 constexpr auto delta
= 0.05;
2059 std::default_random_engine rng
{static_cast<unsigned>(whoami
)};
2060 return (OSD_TICK_INTERVAL
*
2061 std::uniform_real_distribution
<>{1.0 - delta
, 1.0 + delta
}(rng
));
2064 void cls_initialize(ClassHandler
*ch
);
2066 void OSD::handle_signal(int signum
)
2068 assert(signum
== SIGINT
|| signum
== SIGTERM
);
2069 derr
<< "*** Got signal " << sig_str(signum
) << " ***" << dendl
;
2075 Mutex::Locker
lock(osd_lock
);
2079 if (store
->test_mount_in_use()) {
2080 derr
<< "OSD::pre_init: object store '" << dev_path
<< "' is "
2081 << "currently in use. (Is ceph-osd already running?)" << dendl
;
2085 cct
->_conf
->add_observer(this);
2091 class OSDSocketHook
: public AdminSocketHook
{
2094 explicit OSDSocketHook(OSD
*o
) : osd(o
) {}
2095 bool call(std::string admin_command
, cmdmap_t
& cmdmap
, std::string format
,
2096 bufferlist
& out
) override
{
2098 bool r
= osd
->asok_command(admin_command
, cmdmap
, format
, ss
);
2104 bool OSD::asok_command(string admin_command
, cmdmap_t
& cmdmap
, string format
,
2107 Formatter
*f
= Formatter::create(format
, "json-pretty", "json-pretty");
2108 if (admin_command
== "status") {
2109 f
->open_object_section("status");
2110 f
->dump_stream("cluster_fsid") << superblock
.cluster_fsid
;
2111 f
->dump_stream("osd_fsid") << superblock
.osd_fsid
;
2112 f
->dump_unsigned("whoami", superblock
.whoami
);
2113 f
->dump_string("state", get_state_name(get_state()));
2114 f
->dump_unsigned("oldest_map", superblock
.oldest_map
);
2115 f
->dump_unsigned("newest_map", superblock
.newest_map
);
2117 RWLock::RLocker
l(pg_map_lock
);
2118 f
->dump_unsigned("num_pgs", pg_map
.size());
2121 } else if (admin_command
== "flush_journal") {
2122 store
->flush_journal();
2123 } else if (admin_command
== "dump_ops_in_flight" ||
2124 admin_command
== "ops" ||
2125 admin_command
== "dump_blocked_ops" ||
2126 admin_command
== "dump_historic_ops" ||
2127 admin_command
== "dump_historic_ops_by_duration" ||
2128 admin_command
== "dump_historic_slow_ops") {
2130 const string error_str
= "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2131 even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2132 will start to track new ops received afterwards.";
2134 set
<string
> filters
;
2135 vector
<string
> filter_str
;
2136 if (cmd_getval(cct
, cmdmap
, "filterstr", filter_str
)) {
2137 copy(filter_str
.begin(), filter_str
.end(),
2138 inserter(filters
, filters
.end()));
2141 if (admin_command
== "dump_ops_in_flight" ||
2142 admin_command
== "ops") {
2143 if (!op_tracker
.dump_ops_in_flight(f
, false, filters
)) {
2147 if (admin_command
== "dump_blocked_ops") {
2148 if (!op_tracker
.dump_ops_in_flight(f
, true, filters
)) {
2152 if (admin_command
== "dump_historic_ops") {
2153 if (!op_tracker
.dump_historic_ops(f
, false, filters
)) {
2157 if (admin_command
== "dump_historic_ops_by_duration") {
2158 if (!op_tracker
.dump_historic_ops(f
, true, filters
)) {
2162 if (admin_command
== "dump_historic_slow_ops") {
2163 if (!op_tracker
.dump_historic_slow_ops(f
, filters
)) {
2167 } else if (admin_command
== "dump_op_pq_state") {
2168 f
->open_object_section("pq");
2169 op_shardedwq
.dump(f
);
2171 } else if (admin_command
== "dump_blacklist") {
2172 list
<pair
<entity_addr_t
,utime_t
> > bl
;
2173 OSDMapRef curmap
= service
.get_osdmap();
2175 f
->open_array_section("blacklist");
2176 curmap
->get_blacklist(&bl
);
2177 for (list
<pair
<entity_addr_t
,utime_t
> >::iterator it
= bl
.begin();
2178 it
!= bl
.end(); ++it
) {
2179 f
->open_object_section("entry");
2180 f
->open_object_section("entity_addr_t");
2182 f
->close_section(); //entity_addr_t
2183 it
->second
.localtime(f
->dump_stream("expire_time"));
2184 f
->close_section(); //entry
2186 f
->close_section(); //blacklist
2187 } else if (admin_command
== "dump_watchers") {
2188 list
<obj_watch_item_t
> watchers
;
2191 Mutex::Locker
l(osd_lock
);
2192 RWLock::RLocker
l2(pg_map_lock
);
2193 for (ceph::unordered_map
<spg_t
,PG
*>::iterator it
= pg_map
.begin();
2197 list
<obj_watch_item_t
> pg_watchers
;
2198 PG
*pg
= it
->second
;
2200 pg
->get_watchers(pg_watchers
);
2202 watchers
.splice(watchers
.end(), pg_watchers
);
2206 f
->open_array_section("watchers");
2207 for (list
<obj_watch_item_t
>::iterator it
= watchers
.begin();
2208 it
!= watchers
.end(); ++it
) {
2210 f
->open_object_section("watch");
2212 f
->dump_string("namespace", it
->obj
.nspace
);
2213 f
->dump_string("object", it
->obj
.oid
.name
);
2215 f
->open_object_section("entity_name");
2216 it
->wi
.name
.dump(f
);
2217 f
->close_section(); //entity_name_t
2219 f
->dump_unsigned("cookie", it
->wi
.cookie
);
2220 f
->dump_unsigned("timeout", it
->wi
.timeout_seconds
);
2222 f
->open_object_section("entity_addr_t");
2223 it
->wi
.addr
.dump(f
);
2224 f
->close_section(); //entity_addr_t
2226 f
->close_section(); //watch
2229 f
->close_section(); //watchers
2230 } else if (admin_command
== "dump_reservations") {
2231 f
->open_object_section("reservations");
2232 f
->open_object_section("local_reservations");
2233 service
.local_reserver
.dump(f
);
2235 f
->open_object_section("remote_reservations");
2236 service
.remote_reserver
.dump(f
);
2239 } else if (admin_command
== "get_latest_osdmap") {
2240 get_latest_osdmap();
2241 } else if (admin_command
== "heap") {
2242 auto result
= ceph::osd_cmds::heap(*cct
, cmdmap
, *f
, ss
);
2244 // Note: Failed heap profile commands won't necessarily trigger an error:
2245 f
->open_object_section("result");
2246 f
->dump_string("error", cpp_strerror(result
));
2247 f
->dump_bool("success", result
>= 0);
2249 } else if (admin_command
== "set_heap_property") {
2253 bool success
= false;
2254 if (!cmd_getval(cct
, cmdmap
, "property", property
)) {
2255 error
= "unable to get property";
2257 } else if (!cmd_getval(cct
, cmdmap
, "value", value
)) {
2258 error
= "unable to get value";
2260 } else if (value
< 0) {
2261 error
= "negative value not allowed";
2263 } else if (!ceph_heap_set_numeric_property(property
.c_str(), (size_t)value
)) {
2264 error
= "invalid property";
2269 f
->open_object_section("result");
2270 f
->dump_string("error", error
);
2271 f
->dump_bool("success", success
);
2273 } else if (admin_command
== "get_heap_property") {
2277 bool success
= false;
2278 if (!cmd_getval(cct
, cmdmap
, "property", property
)) {
2279 error
= "unable to get property";
2281 } else if (!ceph_heap_get_numeric_property(property
.c_str(), &value
)) {
2282 error
= "invalid property";
2287 f
->open_object_section("result");
2288 f
->dump_string("error", error
);
2289 f
->dump_bool("success", success
);
2290 f
->dump_int("value", value
);
2292 } else if (admin_command
== "dump_objectstore_kv_stats") {
2293 store
->get_db_statistics(f
);
2294 } else if (admin_command
== "dump_scrubs") {
2295 service
.dumps_scrub(f
);
2296 } else if (admin_command
== "calc_objectstore_db_histogram") {
2297 store
->generate_db_histogram(f
);
2298 } else if (admin_command
== "flush_store_cache") {
2299 store
->flush_cache();
2300 } else if (admin_command
== "dump_pgstate_history") {
2301 f
->open_object_section("pgstate_history");
2302 RWLock::RLocker
l2(pg_map_lock
);
2303 for (ceph::unordered_map
<spg_t
,PG
*>::iterator it
= pg_map
.begin();
2307 PG
*pg
= it
->second
;
2308 f
->dump_stream("pg") << pg
->get_pgid();
2310 pg
->pgstate_history
.dump(f
);
2314 } else if (admin_command
== "compact") {
2315 dout(1) << "triggering manual compaction" << dendl
;
2316 auto start
= ceph::coarse_mono_clock::now();
2318 auto end
= ceph::coarse_mono_clock::now();
2319 auto time_span
= chrono::duration_cast
<chrono::duration
<double>>(end
- start
);
2320 dout(1) << "finished manual compaction in "
2321 << time_span
.count()
2322 << " seconds" << dendl
;
2323 f
->open_object_section("compact_result");
2324 f
->dump_float("elapsed_time", time_span
.count());
2327 assert(0 == "broken asok registration");
2334 class TestOpsSocketHook
: public AdminSocketHook
{
2335 OSDService
*service
;
2338 TestOpsSocketHook(OSDService
*s
, ObjectStore
*st
) : service(s
), store(st
) {}
2339 bool call(std::string command
, cmdmap_t
& cmdmap
, std::string format
,
2340 bufferlist
& out
) override
{
2342 test_ops(service
, store
, command
, cmdmap
, ss
);
2346 void test_ops(OSDService
*service
, ObjectStore
*store
,
2347 const std::string
&command
, cmdmap_t
& cmdmap
, ostream
&ss
);
2351 class OSD::C_Tick
: public Context
{
2354 explicit C_Tick(OSD
*o
) : osd(o
) {}
2355 void finish(int r
) override
{
2360 class OSD::C_Tick_WithoutOSDLock
: public Context
{
2363 explicit C_Tick_WithoutOSDLock(OSD
*o
) : osd(o
) {}
2364 void finish(int r
) override
{
2365 osd
->tick_without_osd_lock();
2369 int OSD::enable_disable_fuse(bool stop
)
2373 string mntpath
= cct
->_conf
->osd_data
+ "/fuse";
2374 if (fuse_store
&& (stop
|| !cct
->_conf
->osd_objectstore_fuse
)) {
2375 dout(1) << __func__
<< " disabling" << dendl
;
2379 r
= ::rmdir(mntpath
.c_str());
2382 derr
<< __func__
<< " failed to rmdir " << mntpath
<< ": "
2383 << cpp_strerror(r
) << dendl
;
2388 if (!fuse_store
&& cct
->_conf
->osd_objectstore_fuse
) {
2389 dout(1) << __func__
<< " enabling" << dendl
;
2390 r
= ::mkdir(mntpath
.c_str(), 0700);
2393 if (r
< 0 && r
!= -EEXIST
) {
2394 derr
<< __func__
<< " unable to create " << mntpath
<< ": "
2395 << cpp_strerror(r
) << dendl
;
2398 fuse_store
= new FuseStore(store
, mntpath
);
2399 r
= fuse_store
->start();
2401 derr
<< __func__
<< " unable to start fuse: " << cpp_strerror(r
) << dendl
;
2407 #endif // HAVE_LIBFUSE
2411 int OSD::get_num_op_shards()
2413 if (cct
->_conf
->osd_op_num_shards
)
2414 return cct
->_conf
->osd_op_num_shards
;
2415 if (store_is_rotational
)
2416 return cct
->_conf
->osd_op_num_shards_hdd
;
2418 return cct
->_conf
->osd_op_num_shards_ssd
;
2421 int OSD::get_num_op_threads()
2423 if (cct
->_conf
->osd_op_num_threads_per_shard
)
2424 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard
;
2425 if (store_is_rotational
)
2426 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard_hdd
;
2428 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard_ssd
;
2431 float OSD::get_osd_recovery_sleep()
2433 if (cct
->_conf
->osd_recovery_sleep
)
2434 return cct
->_conf
->osd_recovery_sleep
;
2435 if (!store_is_rotational
&& !journal_is_rotational
)
2436 return cct
->_conf
->osd_recovery_sleep_ssd
;
2437 else if (store_is_rotational
&& !journal_is_rotational
)
2438 return cct
->_conf
->get_val
<double>("osd_recovery_sleep_hybrid");
2440 return cct
->_conf
->osd_recovery_sleep_hdd
;
2445 CompatSet initial
, diff
;
2446 Mutex::Locker
lock(osd_lock
);
2451 tick_timer_without_osd_lock
.init();
2452 service
.recovery_request_timer
.init();
2453 service
.recovery_sleep_timer
.init();
2456 dout(2) << "init " << dev_path
2457 << " (looks like " << (store_is_rotational
? "hdd" : "ssd") << ")"
2459 dout(2) << "journal " << journal_path
<< dendl
;
2460 assert(store
); // call pre_init() first!
2462 store
->set_cache_shards(get_num_op_shards());
2464 int r
= store
->mount();
2466 derr
<< "OSD:init: unable to mount object store" << dendl
;
2469 journal_is_rotational
= store
->is_journal_rotational();
2470 dout(2) << "journal looks like " << (journal_is_rotational
? "hdd" : "ssd")
2473 enable_disable_fuse(false);
2475 dout(2) << "boot" << dendl
;
2477 // initialize the daily loadavg with current 15min loadavg
2479 if (getloadavg(loadavgs
, 3) == 3) {
2480 daily_loadavg
= loadavgs
[2];
2482 derr
<< "OSD::init() : couldn't read loadavgs\n" << dendl
;
2483 daily_loadavg
= 1.0;
2486 int rotating_auth_attempts
= 0;
2488 // sanity check long object name handling
2491 l
.oid
.name
= string(cct
->_conf
->osd_max_object_name_len
, 'n');
2492 l
.set_key(string(cct
->_conf
->osd_max_object_name_len
, 'k'));
2493 l
.nspace
= string(cct
->_conf
->osd_max_object_namespace_len
, 's');
2494 r
= store
->validate_hobject_key(l
);
2496 derr
<< "backend (" << store
->get_type() << ") is unable to support max "
2497 << "object name[space] len" << dendl
;
2498 derr
<< " osd max object name len = "
2499 << cct
->_conf
->osd_max_object_name_len
<< dendl
;
2500 derr
<< " osd max object namespace len = "
2501 << cct
->_conf
->osd_max_object_namespace_len
<< dendl
;
2502 derr
<< cpp_strerror(r
) << dendl
;
2503 if (cct
->_conf
->osd_check_max_object_name_len_on_startup
) {
2506 derr
<< "osd_check_max_object_name_len_on_startup = false, starting anyway"
2509 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl
;
2514 r
= read_superblock();
2516 derr
<< "OSD::init() : unable to read osd superblock" << dendl
;
2521 if (osd_compat
.compare(superblock
.compat_features
) < 0) {
2522 derr
<< "The disk uses features unsupported by the executable." << dendl
;
2523 derr
<< " ondisk features " << superblock
.compat_features
<< dendl
;
2524 derr
<< " daemon features " << osd_compat
<< dendl
;
2526 if (osd_compat
.writeable(superblock
.compat_features
)) {
2527 CompatSet diff
= osd_compat
.unsupported(superblock
.compat_features
);
2528 derr
<< "it is still writeable, though. Missing features: " << diff
<< dendl
;
2533 CompatSet diff
= osd_compat
.unsupported(superblock
.compat_features
);
2534 derr
<< "Cannot write to disk! Missing features: " << diff
<< dendl
;
2540 assert_warn(whoami
== superblock
.whoami
);
2541 if (whoami
!= superblock
.whoami
) {
2542 derr
<< "OSD::init: superblock says osd"
2543 << superblock
.whoami
<< " but I am osd." << whoami
<< dendl
;
2548 initial
= get_osd_initial_compat_set();
2549 diff
= superblock
.compat_features
.unsupported(initial
);
2550 if (superblock
.compat_features
.merge(initial
)) {
2551 // We need to persist the new compat_set before we
2553 dout(5) << "Upgrading superblock adding: " << diff
<< dendl
;
2554 ObjectStore::Transaction t
;
2555 write_superblock(t
);
2556 r
= store
->apply_transaction(service
.meta_osr
.get(), std::move(t
));
2561 // make sure snap mapper object exists
2562 if (!store
->exists(coll_t::meta(), OSD::make_snapmapper_oid())) {
2563 dout(10) << "init creating/touching snapmapper object" << dendl
;
2564 ObjectStore::Transaction t
;
2565 t
.touch(coll_t::meta(), OSD::make_snapmapper_oid());
2566 r
= store
->apply_transaction(service
.meta_osr
.get(), std::move(t
));
2571 class_handler
= new ClassHandler(cct
);
2572 cls_initialize(class_handler
);
2574 if (cct
->_conf
->osd_open_classes_on_start
) {
2575 int r
= class_handler
->open_all_classes();
2577 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r
) << dendl
;
2580 // load up "current" osdmap
2581 assert_warn(!osdmap
);
2583 derr
<< "OSD::init: unable to read current osdmap" << dendl
;
2587 osdmap
= get_map(superblock
.current_epoch
);
2588 check_osdmap_features(store
);
2590 create_recoverystate_perf();
2593 epoch_t bind_epoch
= osdmap
->get_epoch();
2594 service
.set_epochs(NULL
, NULL
, &bind_epoch
);
2597 clear_temp_objects();
2599 // initialize osdmap references in sharded wq
2600 op_shardedwq
.prune_pg_waiters(osdmap
, whoami
);
2602 // load up pgs (as they previously existed)
2605 dout(2) << "superblock: I am osd." << superblock
.whoami
<< dendl
;
2606 dout(0) << "using " << op_queue
<< " op queue with priority op cut off at " <<
2607 op_prio_cutoff
<< "." << dendl
;
2612 client_messenger
->add_dispatcher_head(this);
2613 cluster_messenger
->add_dispatcher_head(this);
2615 hb_front_client_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
2616 hb_back_client_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
2617 hb_front_server_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
2618 hb_back_server_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
2620 objecter_messenger
->add_dispatcher_head(service
.objecter
);
2622 monc
->set_want_keys(CEPH_ENTITY_TYPE_MON
| CEPH_ENTITY_TYPE_OSD
2623 | CEPH_ENTITY_TYPE_MGR
);
2629 * FIXME: this is a placeholder implementation that unconditionally
2630 * sends every is_primary PG's stats every time we're called, unlike
2631 * the existing mon PGStats mechanism that uses pg_stat_queue and acks.
2632 * This has equivalent cost to the existing worst case where all
2633 * PGs are busy and their stats are always enqueued for sending.
2635 mgrc
.set_pgstats_cb([this](){
2636 RWLock::RLocker
l(map_lock
);
2638 utime_t had_for
= ceph_clock_now() - had_map_since
;
2639 osd_stat_t cur_stat
= service
.get_osd_stat();
2640 cur_stat
.os_perf_stat
= store
->get_cur_stats();
2642 MPGStats
*m
= new MPGStats(monc
->get_fsid(), osdmap
->get_epoch(), had_for
);
2643 m
->osd_stat
= cur_stat
;
2645 Mutex::Locker lec
{min_last_epoch_clean_lock
};
2646 min_last_epoch_clean
= osdmap
->get_epoch();
2647 min_last_epoch_clean_pgs
.clear();
2648 RWLock::RLocker
lpg(pg_map_lock
);
2649 for (const auto &i
: pg_map
) {
2651 if (!pg
->is_primary()) {
2655 pg
->pg_stats_publish_lock
.Lock();
2656 if (pg
->pg_stats_publish_valid
) {
2657 m
->pg_stat
[pg
->info
.pgid
.pgid
] = pg
->pg_stats_publish
;
2658 const auto lec
= pg
->pg_stats_publish
.get_effective_last_epoch_clean();
2659 min_last_epoch_clean
= min(min_last_epoch_clean
, lec
);
2660 min_last_epoch_clean_pgs
.push_back(pg
->info
.pgid
.pgid
);
2662 pg
->pg_stats_publish_lock
.Unlock();
2669 client_messenger
->add_dispatcher_head(&mgrc
);
2671 // tell monc about log_client so it will know about mon session resets
2672 monc
->set_log_client(&log_client
);
2673 update_log_config();
2678 service
.publish_map(osdmap
);
2679 service
.publish_superblock(superblock
);
2680 service
.max_oldest_map
= superblock
.oldest_map
;
2684 recovery_tp
.start();
2687 set_disk_tp_priority();
2689 // start the heartbeat
2690 heartbeat_thread
.create("osd_srv_heartbt");
2693 tick_timer
.add_event_after(get_tick_interval(),
2696 Mutex::Locker
l(tick_timer_lock
);
2697 tick_timer_without_osd_lock
.add_event_after(get_tick_interval(),
2698 new C_Tick_WithoutOSDLock(this));
2703 r
= monc
->authenticate();
2705 derr
<< __func__
<< " authentication failed: " << cpp_strerror(r
)
2707 osd_lock
.Lock(); // locker is going to unlock this on function exit
2713 while (monc
->wait_auth_rotating(30.0) < 0) {
2714 derr
<< "unable to obtain rotating service keys; retrying" << dendl
;
2715 ++rotating_auth_attempts
;
2716 if (rotating_auth_attempts
> g_conf
->max_rotating_auth_attempts
) {
2717 derr
<< __func__
<< " wait_auth_rotating timed out" << dendl
;
2718 osd_lock
.Lock(); // make locker happy
2719 if (!is_stopping()) {
2726 r
= update_crush_device_class();
2728 derr
<< __func__
<< " unable to update_crush_device_class: "
2729 << cpp_strerror(r
) << dendl
;
2734 r
= update_crush_location();
2736 derr
<< __func__
<< " unable to update_crush_location: "
2737 << cpp_strerror(r
) << dendl
;
2746 // start objecter *after* we have authenticated, so that we don't ignore
2747 // the OSDMaps it requests.
2748 service
.final_init();
2752 dout(10) << "ensuring pgs have consumed prior maps" << dendl
;
2756 dout(0) << "done with init, starting boot process" << dendl
;
2758 // subscribe to any pg creations
2759 monc
->sub_want("osd_pg_creates", last_pg_create_epoch
, 0);
2761 // MgrClient needs this (it doesn't have MonClient reference itself)
2762 monc
->sub_want("mgrmap", 0, 0);
2764 // we don't need to ask for an osdmap here; objecter will
2765 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
2776 enable_disable_fuse(true);
2783 void OSD::final_init()
2785 AdminSocket
*admin_socket
= cct
->get_admin_socket();
2786 asok_hook
= new OSDSocketHook(this);
2787 int r
= admin_socket
->register_command("status", "status", asok_hook
,
2788 "high-level status of OSD");
2790 r
= admin_socket
->register_command("flush_journal", "flush_journal",
2792 "flush the journal to permanent store");
2794 r
= admin_socket
->register_command("dump_ops_in_flight",
2795 "dump_ops_in_flight " \
2796 "name=filterstr,type=CephString,n=N,req=false",
2798 "show the ops currently in flight");
2800 r
= admin_socket
->register_command("ops",
2802 "name=filterstr,type=CephString,n=N,req=false",
2804 "show the ops currently in flight");
2806 r
= admin_socket
->register_command("dump_blocked_ops",
2807 "dump_blocked_ops " \
2808 "name=filterstr,type=CephString,n=N,req=false",
2810 "show the blocked ops currently in flight");
2812 r
= admin_socket
->register_command("dump_historic_ops",
2813 "dump_historic_ops " \
2814 "name=filterstr,type=CephString,n=N,req=false",
2818 r
= admin_socket
->register_command("dump_historic_slow_ops",
2819 "dump_historic_slow_ops " \
2820 "name=filterstr,type=CephString,n=N,req=false",
2822 "show slowest recent ops");
2824 r
= admin_socket
->register_command("dump_historic_ops_by_duration",
2825 "dump_historic_ops_by_duration " \
2826 "name=filterstr,type=CephString,n=N,req=false",
2828 "show slowest recent ops, sorted by duration");
2830 r
= admin_socket
->register_command("dump_op_pq_state", "dump_op_pq_state",
2832 "dump op priority queue state");
2834 r
= admin_socket
->register_command("dump_blacklist", "dump_blacklist",
2836 "dump blacklisted clients and times");
2838 r
= admin_socket
->register_command("dump_watchers", "dump_watchers",
2840 "show clients which have active watches,"
2841 " and on which objects");
2843 r
= admin_socket
->register_command("dump_reservations", "dump_reservations",
2845 "show recovery reservations");
2847 r
= admin_socket
->register_command("get_latest_osdmap", "get_latest_osdmap",
2849 "force osd to update the latest map from "
2853 r
= admin_socket
->register_command( "heap",
2855 "name=heapcmd,type=CephString",
2857 "show heap usage info (available only if "
2858 "compiled with tcmalloc)");
2861 r
= admin_socket
->register_command("set_heap_property",
2862 "set_heap_property " \
2863 "name=property,type=CephString " \
2864 "name=value,type=CephInt",
2866 "update malloc extension heap property");
2869 r
= admin_socket
->register_command("get_heap_property",
2870 "get_heap_property " \
2871 "name=property,type=CephString",
2873 "get malloc extension heap property");
2876 r
= admin_socket
->register_command("dump_objectstore_kv_stats",
2877 "dump_objectstore_kv_stats",
2879 "print statistics of kvdb which used by bluestore");
2882 r
= admin_socket
->register_command("dump_scrubs",
2885 "print scheduled scrubs");
2888 r
= admin_socket
->register_command("calc_objectstore_db_histogram",
2889 "calc_objectstore_db_histogram",
2891 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
2894 r
= admin_socket
->register_command("flush_store_cache",
2895 "flush_store_cache",
2897 "Flush bluestore internal cache");
2899 r
= admin_socket
->register_command("dump_pgstate_history", "dump_pgstate_history",
2901 "show recent state history");
2904 r
= admin_socket
->register_command("compact", "compact",
2906 "Commpact object store's omap."
2907 " WARNING: Compaction probably slows your requests");
2910 test_ops_hook
= new TestOpsSocketHook(&(this->service
), this->store
);
2911 // Note: pools are CephString instead of CephPoolname because
2912 // these commands traditionally support both pool names and numbers
2913 r
= admin_socket
->register_command(
2916 "name=pool,type=CephString " \
2917 "name=objname,type=CephObjectname " \
2918 "name=key,type=CephString "\
2919 "name=val,type=CephString",
2923 r
= admin_socket
->register_command(
2926 "name=pool,type=CephString " \
2927 "name=objname,type=CephObjectname " \
2928 "name=key,type=CephString",
2932 r
= admin_socket
->register_command(
2935 "name=pool,type=CephString " \
2936 "name=objname,type=CephObjectname " \
2937 "name=header,type=CephString",
2942 r
= admin_socket
->register_command(
2945 "name=pool,type=CephString " \
2946 "name=objname,type=CephObjectname",
2948 "output entire object map");
2951 r
= admin_socket
->register_command(
2954 "name=pool,type=CephString " \
2955 "name=objname,type=CephObjectname " \
2956 "name=len,type=CephInt",
2958 "truncate object to length");
2961 r
= admin_socket
->register_command(
2964 "name=pool,type=CephString " \
2965 "name=objname,type=CephObjectname " \
2966 "name=shardid,type=CephInt,req=false,range=0|255",
2968 "inject data error to an object");
2971 r
= admin_socket
->register_command(
2974 "name=pool,type=CephString " \
2975 "name=objname,type=CephObjectname " \
2976 "name=shardid,type=CephInt,req=false,range=0|255",
2978 "inject metadata error to an object");
2980 r
= admin_socket
->register_command(
2981 "set_recovery_delay",
2982 "set_recovery_delay " \
2983 "name=utime,type=CephInt,req=false",
2985 "Delay osd recovery by specified seconds");
2987 r
= admin_socket
->register_command(
2990 "name=pgid,type=CephString " \
2991 "name=time,type=CephInt,req=false",
2993 "Trigger a scheduled scrub ");
2995 r
= admin_socket
->register_command(
2996 "trigger_deep_scrub",
2997 "trigger_deep_scrub " \
2998 "name=pgid,type=CephString " \
2999 "name=time,type=CephInt,req=false",
3001 "Trigger a scheduled deep scrub ");
3002 ceph_assert(r
== 0);
3003 r
= admin_socket
->register_command(
3006 "name=type,type=CephString,req=false " \
3007 "name=count,type=CephInt,req=false ",
3009 "Inject a full disk (optional count times)");
3013 void OSD::create_logger()
3015 dout(10) << "create_logger" << dendl
;
3017 PerfCountersBuilder
osd_plb(cct
, "osd", l_osd_first
, l_osd_last
);
3019 // Latency axis configuration for op histograms, values are in nanoseconds
3020 PerfHistogramCommon::axis_config_d op_hist_x_axis_config
{
3022 PerfHistogramCommon::SCALE_LOG2
, ///< Latency in logarithmic scale
3024 100000, ///< Quantization unit is 100usec
3025 32, ///< Enough to cover much longer than slow requests
3028 // Op size axis configuration for op histograms, values are in bytes
3029 PerfHistogramCommon::axis_config_d op_hist_y_axis_config
{
3030 "Request size (bytes)",
3031 PerfHistogramCommon::SCALE_LOG2
, ///< Request size in logarithmic scale
3033 512, ///< Quantization unit is 512 bytes
3034 32, ///< Enough to cover requests larger than GB
3038 // All the basic OSD operation stats are to be considered useful
3039 osd_plb
.set_prio_default(PerfCountersBuilder::PRIO_USEFUL
);
3042 l_osd_op_wip
, "op_wip",
3043 "Replication operations currently being processed (primary)");
3044 osd_plb
.add_u64_counter(
3046 "Client operations",
3047 "ops", PerfCountersBuilder::PRIO_CRITICAL
);
3048 osd_plb
.add_u64_counter(
3049 l_osd_op_inb
, "op_in_bytes",
3050 "Client operations total write size",
3051 "wr", PerfCountersBuilder::PRIO_INTERESTING
, unit_t(BYTES
));
3052 osd_plb
.add_u64_counter(
3053 l_osd_op_outb
, "op_out_bytes",
3054 "Client operations total read size",
3055 "rd", PerfCountersBuilder::PRIO_INTERESTING
, unit_t(BYTES
));
3056 osd_plb
.add_time_avg(
3057 l_osd_op_lat
, "op_latency",
3058 "Latency of client operations (including queue time)",
3060 osd_plb
.add_time_avg(
3061 l_osd_op_process_lat
, "op_process_latency",
3062 "Latency of client operations (excluding queue time)");
3063 osd_plb
.add_time_avg(
3064 l_osd_op_prepare_lat
, "op_prepare_latency",
3065 "Latency of client operations (excluding queue time and wait for finished)");
3067 osd_plb
.add_u64_counter(
3068 l_osd_op_r
, "op_r", "Client read operations");
3069 osd_plb
.add_u64_counter(
3070 l_osd_op_r_outb
, "op_r_out_bytes", "Client data read", NULL
, PerfCountersBuilder::PRIO_USEFUL
, unit_t(BYTES
));
3071 osd_plb
.add_time_avg(
3072 l_osd_op_r_lat
, "op_r_latency",
3073 "Latency of read operation (including queue time)");
3074 osd_plb
.add_u64_counter_histogram(
3075 l_osd_op_r_lat_outb_hist
, "op_r_latency_out_bytes_histogram",
3076 op_hist_x_axis_config
, op_hist_y_axis_config
,
3077 "Histogram of operation latency (including queue time) + data read");
3078 osd_plb
.add_time_avg(
3079 l_osd_op_r_process_lat
, "op_r_process_latency",
3080 "Latency of read operation (excluding queue time)");
3081 osd_plb
.add_time_avg(
3082 l_osd_op_r_prepare_lat
, "op_r_prepare_latency",
3083 "Latency of read operations (excluding queue time and wait for finished)");
3084 osd_plb
.add_u64_counter(
3085 l_osd_op_w
, "op_w", "Client write operations");
3086 osd_plb
.add_u64_counter(
3087 l_osd_op_w_inb
, "op_w_in_bytes", "Client data written");
3088 osd_plb
.add_time_avg(
3089 l_osd_op_w_lat
, "op_w_latency",
3090 "Latency of write operation (including queue time)");
3091 osd_plb
.add_u64_counter_histogram(
3092 l_osd_op_w_lat_inb_hist
, "op_w_latency_in_bytes_histogram",
3093 op_hist_x_axis_config
, op_hist_y_axis_config
,
3094 "Histogram of operation latency (including queue time) + data written");
3095 osd_plb
.add_time_avg(
3096 l_osd_op_w_process_lat
, "op_w_process_latency",
3097 "Latency of write operation (excluding queue time)");
3098 osd_plb
.add_time_avg(
3099 l_osd_op_w_prepare_lat
, "op_w_prepare_latency",
3100 "Latency of write operations (excluding queue time and wait for finished)");
3101 osd_plb
.add_u64_counter(
3102 l_osd_op_rw
, "op_rw",
3103 "Client read-modify-write operations");
3104 osd_plb
.add_u64_counter(
3105 l_osd_op_rw_inb
, "op_rw_in_bytes",
3106 "Client read-modify-write operations write in", NULL
, PerfCountersBuilder::PRIO_USEFUL
, unit_t(BYTES
));
3107 osd_plb
.add_u64_counter(
3108 l_osd_op_rw_outb
,"op_rw_out_bytes",
3109 "Client read-modify-write operations read out ", NULL
, PerfCountersBuilder::PRIO_USEFUL
, unit_t(BYTES
));
3110 osd_plb
.add_time_avg(
3111 l_osd_op_rw_lat
, "op_rw_latency",
3112 "Latency of read-modify-write operation (including queue time)");
3113 osd_plb
.add_u64_counter_histogram(
3114 l_osd_op_rw_lat_inb_hist
, "op_rw_latency_in_bytes_histogram",
3115 op_hist_x_axis_config
, op_hist_y_axis_config
,
3116 "Histogram of rw operation latency (including queue time) + data written");
3117 osd_plb
.add_u64_counter_histogram(
3118 l_osd_op_rw_lat_outb_hist
, "op_rw_latency_out_bytes_histogram",
3119 op_hist_x_axis_config
, op_hist_y_axis_config
,
3120 "Histogram of rw operation latency (including queue time) + data read");
3121 osd_plb
.add_time_avg(
3122 l_osd_op_rw_process_lat
, "op_rw_process_latency",
3123 "Latency of read-modify-write operation (excluding queue time)");
3124 osd_plb
.add_time_avg(
3125 l_osd_op_rw_prepare_lat
, "op_rw_prepare_latency",
3126 "Latency of read-modify-write operations (excluding queue time and wait for finished)");
3128 // Now we move on to some more obscure stats, revert to assuming things
3129 // are low priority unless otherwise specified.
3130 osd_plb
.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY
);
3132 osd_plb
.add_time_avg(l_osd_op_before_queue_op_lat
, "op_before_queue_op_lat",
3133 "Latency of IO before calling queue(before really queue into ShardedOpWq)"); // client io before queue op_wq latency
3134 osd_plb
.add_time_avg(l_osd_op_before_dequeue_op_lat
, "op_before_dequeue_op_lat",
3135 "Latency of IO before calling dequeue_op(already dequeued and get PG lock)"); // client io before dequeue_op latency
3137 osd_plb
.add_u64_counter(
3138 l_osd_sop
, "subop", "Suboperations");
3139 osd_plb
.add_u64_counter(
3140 l_osd_sop_inb
, "subop_in_bytes", "Suboperations total size", NULL
, 0, unit_t(BYTES
));
3141 osd_plb
.add_time_avg(l_osd_sop_lat
, "subop_latency", "Suboperations latency");
3143 osd_plb
.add_u64_counter(l_osd_sop_w
, "subop_w", "Replicated writes");
3144 osd_plb
.add_u64_counter(
3145 l_osd_sop_w_inb
, "subop_w_in_bytes", "Replicated written data size", NULL
, 0, unit_t(BYTES
));
3146 osd_plb
.add_time_avg(
3147 l_osd_sop_w_lat
, "subop_w_latency", "Replicated writes latency");
3148 osd_plb
.add_u64_counter(
3149 l_osd_sop_pull
, "subop_pull", "Suboperations pull requests");
3150 osd_plb
.add_time_avg(
3151 l_osd_sop_pull_lat
, "subop_pull_latency", "Suboperations pull latency");
3152 osd_plb
.add_u64_counter(
3153 l_osd_sop_push
, "subop_push", "Suboperations push messages");
3154 osd_plb
.add_u64_counter(
3155 l_osd_sop_push_inb
, "subop_push_in_bytes", "Suboperations pushed size", NULL
, 0, unit_t(BYTES
));
3156 osd_plb
.add_time_avg(
3157 l_osd_sop_push_lat
, "subop_push_latency", "Suboperations push latency");
3159 osd_plb
.add_u64_counter(l_osd_pull
, "pull", "Pull requests sent");
3160 osd_plb
.add_u64_counter(l_osd_push
, "push", "Push messages sent");
3161 osd_plb
.add_u64_counter(l_osd_push_outb
, "push_out_bytes", "Pushed size", NULL
, 0, unit_t(BYTES
));
3163 osd_plb
.add_u64_counter(
3164 l_osd_rop
, "recovery_ops",
3165 "Started recovery operations",
3166 "rop", PerfCountersBuilder::PRIO_INTERESTING
);
3168 osd_plb
.add_u64(l_osd_loadavg
, "loadavg", "CPU load");
3169 osd_plb
.add_u64(l_osd_buf
, "buffer_bytes", "Total allocated buffer size", NULL
, 0, unit_t(BYTES
));
3170 osd_plb
.add_u64(l_osd_history_alloc_bytes
, "history_alloc_Mbytes", NULL
, 0, unit_t(BYTES
));
3171 osd_plb
.add_u64(l_osd_history_alloc_num
, "history_alloc_num");
3173 l_osd_cached_crc
, "cached_crc", "Total number getting crc from crc_cache");
3175 l_osd_cached_crc_adjusted
, "cached_crc_adjusted",
3176 "Total number getting crc from crc_cache with adjusting");
3177 osd_plb
.add_u64(l_osd_missed_crc
, "missed_crc",
3178 "Total number of crc cache misses");
3180 osd_plb
.add_u64(l_osd_pg
, "numpg", "Placement groups",
3181 "pgs", PerfCountersBuilder::PRIO_USEFUL
);
3183 l_osd_pg_primary
, "numpg_primary",
3184 "Placement groups for which this osd is primary");
3186 l_osd_pg_replica
, "numpg_replica",
3187 "Placement groups for which this osd is replica");
3189 l_osd_pg_stray
, "numpg_stray",
3190 "Placement groups ready to be deleted from this osd");
3192 l_osd_pg_removing
, "numpg_removing",
3193 "Placement groups queued for local deletion", "pgsr",
3194 PerfCountersBuilder::PRIO_USEFUL
);
3196 l_osd_hb_to
, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");
3197 osd_plb
.add_u64_counter(l_osd_map
, "map_messages", "OSD map messages");
3198 osd_plb
.add_u64_counter(l_osd_mape
, "map_message_epochs", "OSD map epochs");
3199 osd_plb
.add_u64_counter(
3200 l_osd_mape_dup
, "map_message_epoch_dups", "OSD map duplicates");
3201 osd_plb
.add_u64_counter(
3202 l_osd_waiting_for_map
, "messages_delayed_for_map",
3203 "Operations waiting for OSD map");
3205 osd_plb
.add_u64_counter(
3206 l_osd_map_cache_hit
, "osd_map_cache_hit", "osdmap cache hit");
3207 osd_plb
.add_u64_counter(
3208 l_osd_map_cache_miss
, "osd_map_cache_miss", "osdmap cache miss");
3209 osd_plb
.add_u64_counter(
3210 l_osd_map_cache_miss_low
, "osd_map_cache_miss_low",
3211 "osdmap cache miss below cache lower bound");
3212 osd_plb
.add_u64_avg(
3213 l_osd_map_cache_miss_low_avg
, "osd_map_cache_miss_low_avg",
3214 "osdmap cache miss, avg distance below cache lower bound");
3215 osd_plb
.add_u64_counter(
3216 l_osd_map_bl_cache_hit
, "osd_map_bl_cache_hit",
3217 "OSDMap buffer cache hits");
3218 osd_plb
.add_u64_counter(
3219 l_osd_map_bl_cache_miss
, "osd_map_bl_cache_miss",
3220 "OSDMap buffer cache misses");
3223 l_osd_stat_bytes
, "stat_bytes", "OSD size", "size",
3224 PerfCountersBuilder::PRIO_USEFUL
, unit_t(BYTES
));
3226 l_osd_stat_bytes_used
, "stat_bytes_used", "Used space", "used",
3227 PerfCountersBuilder::PRIO_USEFUL
, unit_t(BYTES
));
3228 osd_plb
.add_u64(l_osd_stat_bytes_avail
, "stat_bytes_avail", "Available space", NULL
, 0, unit_t(BYTES
));
3230 osd_plb
.add_u64_counter(
3231 l_osd_copyfrom
, "copyfrom", "Rados \"copy-from\" operations");
3233 osd_plb
.add_u64_counter(l_osd_tier_promote
, "tier_promote", "Tier promotions");
3234 osd_plb
.add_u64_counter(l_osd_tier_flush
, "tier_flush", "Tier flushes");
3235 osd_plb
.add_u64_counter(
3236 l_osd_tier_flush_fail
, "tier_flush_fail", "Failed tier flushes");
3237 osd_plb
.add_u64_counter(
3238 l_osd_tier_try_flush
, "tier_try_flush", "Tier flush attempts");
3239 osd_plb
.add_u64_counter(
3240 l_osd_tier_try_flush_fail
, "tier_try_flush_fail",
3241 "Failed tier flush attempts");
3242 osd_plb
.add_u64_counter(
3243 l_osd_tier_evict
, "tier_evict", "Tier evictions");
3244 osd_plb
.add_u64_counter(
3245 l_osd_tier_whiteout
, "tier_whiteout", "Tier whiteouts");
3246 osd_plb
.add_u64_counter(
3247 l_osd_tier_dirty
, "tier_dirty", "Dirty tier flag set");
3248 osd_plb
.add_u64_counter(
3249 l_osd_tier_clean
, "tier_clean", "Dirty tier flag cleaned");
3250 osd_plb
.add_u64_counter(
3251 l_osd_tier_delay
, "tier_delay", "Tier delays (agent waiting)");
3252 osd_plb
.add_u64_counter(
3253 l_osd_tier_proxy_read
, "tier_proxy_read", "Tier proxy reads");
3254 osd_plb
.add_u64_counter(
3255 l_osd_tier_proxy_write
, "tier_proxy_write", "Tier proxy writes");
3257 osd_plb
.add_u64_counter(
3258 l_osd_agent_wake
, "agent_wake", "Tiering agent wake up");
3259 osd_plb
.add_u64_counter(
3260 l_osd_agent_skip
, "agent_skip", "Objects skipped by agent");
3261 osd_plb
.add_u64_counter(
3262 l_osd_agent_flush
, "agent_flush", "Tiering agent flushes");
3263 osd_plb
.add_u64_counter(
3264 l_osd_agent_evict
, "agent_evict", "Tiering agent evictions");
3266 osd_plb
.add_u64_counter(
3267 l_osd_object_ctx_cache_hit
, "object_ctx_cache_hit", "Object context cache hits");
3268 osd_plb
.add_u64_counter(
3269 l_osd_object_ctx_cache_total
, "object_ctx_cache_total", "Object context cache lookups");
3271 osd_plb
.add_u64_counter(l_osd_op_cache_hit
, "op_cache_hit");
3272 osd_plb
.add_time_avg(
3273 l_osd_tier_flush_lat
, "osd_tier_flush_lat", "Object flush latency");
3274 osd_plb
.add_time_avg(
3275 l_osd_tier_promote_lat
, "osd_tier_promote_lat", "Object promote latency");
3276 osd_plb
.add_time_avg(
3277 l_osd_tier_r_lat
, "osd_tier_r_lat", "Object proxy read latency");
3279 osd_plb
.add_u64_counter(
3280 l_osd_pg_info
, "osd_pg_info", "PG updated its info (using any method)");
3281 osd_plb
.add_u64_counter(
3282 l_osd_pg_fastinfo
, "osd_pg_fastinfo",
3283 "PG updated its info using fastinfo attr");
3284 osd_plb
.add_u64_counter(
3285 l_osd_pg_biginfo
, "osd_pg_biginfo", "PG updated its biginfo attr");
3287 logger
= osd_plb
.create_perf_counters();
3288 cct
->get_perfcounters_collection()->add(logger
);
3291 void OSD::create_recoverystate_perf()
3293 dout(10) << "create_recoverystate_perf" << dendl
;
3295 PerfCountersBuilder
rs_perf(cct
, "recoverystate_perf", rs_first
, rs_last
);
3297 rs_perf
.add_time_avg(rs_initial_latency
, "initial_latency", "Initial recovery state latency");
3298 rs_perf
.add_time_avg(rs_started_latency
, "started_latency", "Started recovery state latency");
3299 rs_perf
.add_time_avg(rs_reset_latency
, "reset_latency", "Reset recovery state latency");
3300 rs_perf
.add_time_avg(rs_start_latency
, "start_latency", "Start recovery state latency");
3301 rs_perf
.add_time_avg(rs_primary_latency
, "primary_latency", "Primary recovery state latency");
3302 rs_perf
.add_time_avg(rs_peering_latency
, "peering_latency", "Peering recovery state latency");
3303 rs_perf
.add_time_avg(rs_backfilling_latency
, "backfilling_latency", "Backfilling recovery state latency");
3304 rs_perf
.add_time_avg(rs_waitremotebackfillreserved_latency
, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency");
3305 rs_perf
.add_time_avg(rs_waitlocalbackfillreserved_latency
, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency");
3306 rs_perf
.add_time_avg(rs_notbackfilling_latency
, "notbackfilling_latency", "Notbackfilling recovery state latency");
3307 rs_perf
.add_time_avg(rs_repnotrecovering_latency
, "repnotrecovering_latency", "Repnotrecovering recovery state latency");
3308 rs_perf
.add_time_avg(rs_repwaitrecoveryreserved_latency
, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency");
3309 rs_perf
.add_time_avg(rs_repwaitbackfillreserved_latency
, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency");
3310 rs_perf
.add_time_avg(rs_reprecovering_latency
, "reprecovering_latency", "RepRecovering recovery state latency");
3311 rs_perf
.add_time_avg(rs_activating_latency
, "activating_latency", "Activating recovery state latency");
3312 rs_perf
.add_time_avg(rs_waitlocalrecoveryreserved_latency
, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency");
3313 rs_perf
.add_time_avg(rs_waitremoterecoveryreserved_latency
, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency");
3314 rs_perf
.add_time_avg(rs_recovering_latency
, "recovering_latency", "Recovering recovery state latency");
3315 rs_perf
.add_time_avg(rs_recovered_latency
, "recovered_latency", "Recovered recovery state latency");
3316 rs_perf
.add_time_avg(rs_clean_latency
, "clean_latency", "Clean recovery state latency");
3317 rs_perf
.add_time_avg(rs_active_latency
, "active_latency", "Active recovery state latency");
3318 rs_perf
.add_time_avg(rs_replicaactive_latency
, "replicaactive_latency", "Replicaactive recovery state latency");
3319 rs_perf
.add_time_avg(rs_stray_latency
, "stray_latency", "Stray recovery state latency");
3320 rs_perf
.add_time_avg(rs_getinfo_latency
, "getinfo_latency", "Getinfo recovery state latency");
3321 rs_perf
.add_time_avg(rs_getlog_latency
, "getlog_latency", "Getlog recovery state latency");
3322 rs_perf
.add_time_avg(rs_waitactingchange_latency
, "waitactingchange_latency", "Waitactingchange recovery state latency");
3323 rs_perf
.add_time_avg(rs_incomplete_latency
, "incomplete_latency", "Incomplete recovery state latency");
3324 rs_perf
.add_time_avg(rs_down_latency
, "down_latency", "Down recovery state latency");
3325 rs_perf
.add_time_avg(rs_getmissing_latency
, "getmissing_latency", "Getmissing recovery state latency");
3326 rs_perf
.add_time_avg(rs_waitupthru_latency
, "waitupthru_latency", "Waitupthru recovery state latency");
3327 rs_perf
.add_time_avg(rs_notrecovering_latency
, "notrecovering_latency", "Notrecovering recovery state latency");
3329 recoverystate_perf
= rs_perf
.create_perf_counters();
3330 cct
->get_perfcounters_collection()->add(recoverystate_perf
);
3335 if (!service
.prepare_to_stop())
3336 return 0; // already shutting down
3338 if (is_stopping()) {
3342 derr
<< "shutdown" << dendl
;
3344 set_state(STATE_STOPPING
);
3347 if (cct
->_conf
->get_val
<bool>("osd_debug_shutdown")) {
3348 cct
->_conf
->set_val("debug_osd", "100");
3349 cct
->_conf
->set_val("debug_journal", "100");
3350 cct
->_conf
->set_val("debug_filestore", "100");
3351 cct
->_conf
->set_val("debug_bluestore", "100");
3352 cct
->_conf
->set_val("debug_ms", "100");
3353 cct
->_conf
->apply_changes(NULL
);
3356 // stop MgrClient earlier as it's more like an internal consumer of OSD
3359 service
.start_shutdown();
3361 // stop sending work to pgs. this just prevents any new work in _process
3362 // from racing with on_shutdown and potentially entering the pg after.
3363 op_shardedwq
.drain();
3367 RWLock::RLocker
l(pg_map_lock
);
3368 for (ceph::unordered_map
<spg_t
, PG
*>::iterator p
= pg_map
.begin();
3371 dout(20) << " kicking pg " << p
->first
<< dendl
;
3373 p
->second
->on_shutdown();
3374 p
->second
->unlock();
3375 p
->second
->osr
->flush();
3378 clear_pg_stat_queue();
3380 // drain op queue again (in case PGs requeued something)
3381 op_shardedwq
.drain();
3383 finished
.clear(); // zap waiters (bleh, this is messy)
3386 op_shardedwq
.clear_pg_slots();
3388 // unregister commands
3389 cct
->get_admin_socket()->unregister_command("status");
3390 cct
->get_admin_socket()->unregister_command("flush_journal");
3391 cct
->get_admin_socket()->unregister_command("dump_ops_in_flight");
3392 cct
->get_admin_socket()->unregister_command("ops");
3393 cct
->get_admin_socket()->unregister_command("dump_blocked_ops");
3394 cct
->get_admin_socket()->unregister_command("dump_historic_ops");
3395 cct
->get_admin_socket()->unregister_command("dump_historic_ops_by_duration");
3396 cct
->get_admin_socket()->unregister_command("dump_historic_slow_ops");
3397 cct
->get_admin_socket()->unregister_command("dump_op_pq_state");
3398 cct
->get_admin_socket()->unregister_command("dump_blacklist");
3399 cct
->get_admin_socket()->unregister_command("dump_watchers");
3400 cct
->get_admin_socket()->unregister_command("dump_reservations");
3401 cct
->get_admin_socket()->unregister_command("get_latest_osdmap");
3402 cct
->get_admin_socket()->unregister_command("heap");
3403 cct
->get_admin_socket()->unregister_command("set_heap_property");
3404 cct
->get_admin_socket()->unregister_command("get_heap_property");
3405 cct
->get_admin_socket()->unregister_command("dump_objectstore_kv_stats");
3406 cct
->get_admin_socket()->unregister_command("dump_scrubs");
3407 cct
->get_admin_socket()->unregister_command("calc_objectstore_db_histogram");
3408 cct
->get_admin_socket()->unregister_command("flush_store_cache");
3409 cct
->get_admin_socket()->unregister_command("dump_pgstate_history");
3410 cct
->get_admin_socket()->unregister_command("compact");
3414 cct
->get_admin_socket()->unregister_command("setomapval");
3415 cct
->get_admin_socket()->unregister_command("rmomapkey");
3416 cct
->get_admin_socket()->unregister_command("setomapheader");
3417 cct
->get_admin_socket()->unregister_command("getomap");
3418 cct
->get_admin_socket()->unregister_command("truncobj");
3419 cct
->get_admin_socket()->unregister_command("injectdataerr");
3420 cct
->get_admin_socket()->unregister_command("injectmdataerr");
3421 cct
->get_admin_socket()->unregister_command("set_recovery_delay");
3422 cct
->get_admin_socket()->unregister_command("trigger_scrub");
3423 cct
->get_admin_socket()->unregister_command("injectfull");
3424 delete test_ops_hook
;
3425 test_ops_hook
= NULL
;
3429 heartbeat_lock
.Lock();
3430 heartbeat_stop
= true;
3431 heartbeat_cond
.Signal();
3432 heartbeat_lock
.Unlock();
3433 heartbeat_thread
.join();
3438 dout(10) << "osd tp stopped" << dendl
;
3442 dout(10) << "op sharded tp stopped" << dendl
;
3446 dout(10) << "command tp stopped" << dendl
;
3450 dout(10) << "remove tp paused (new)" << dendl
;
3452 recovery_tp
.drain();
3454 dout(10) << "recovery tp paused (new)" << dendl
;
3456 dout(10) << "stopping agent" << dendl
;
3457 service
.agent_stop();
3461 reset_heartbeat_peers();
3463 tick_timer
.shutdown();
3466 Mutex::Locker
l(tick_timer_lock
);
3467 tick_timer_without_osd_lock
.shutdown();
3470 // note unmount epoch
3471 dout(10) << "noting clean unmount in epoch " << osdmap
->get_epoch() << dendl
;
3472 superblock
.mounted
= service
.get_boot_epoch();
3473 superblock
.clean_thru
= osdmap
->get_epoch();
3474 ObjectStore::Transaction t
;
3475 write_superblock(t
);
3476 int r
= store
->apply_transaction(service
.meta_osr
.get(), std::move(t
));
3478 derr
<< "OSD::shutdown: error writing superblock: "
3479 << cpp_strerror(r
) << dendl
;
3484 Mutex::Locker
l(pg_stat_queue_lock
);
3485 assert(pg_stat_queue
.empty());
3488 service
.shutdown_reserver();
3491 #ifdef PG_DEBUG_REFS
3492 service
.dump_live_pgids();
3495 RWLock::RLocker
l(pg_map_lock
);
3496 for (ceph::unordered_map
<spg_t
, PG
*>::iterator p
= pg_map
.begin();
3499 dout(20) << " kicking pg " << p
->first
<< dendl
;
3501 if (p
->second
->ref
!= 1) {
3502 derr
<< "pgid " << p
->first
<< " has ref count of "
3503 << p
->second
->ref
<< dendl
;
3504 #ifdef PG_DEBUG_REFS
3505 p
->second
->dump_live_ids();
3507 if (cct
->_conf
->osd_shutdown_pgref_assert
) {
3511 p
->second
->unlock();
3512 p
->second
->put("PGMap");
3516 #ifdef PG_DEBUG_REFS
3517 service
.dump_live_pgids();
3521 cct
->_conf
->remove_observer(this);
3524 dout(10) << "syncing store" << dendl
;
3525 enable_disable_fuse(true);
3527 if (cct
->_conf
->osd_journal_flush_on_shutdown
) {
3528 dout(10) << "flushing journal" << dendl
;
3529 store
->flush_journal();
3535 dout(10) << "Store synced" << dendl
;
3540 osdmap
= OSDMapRef();
3542 op_tracker
.on_shutdown();
3544 class_handler
->shutdown();
3545 client_messenger
->shutdown();
3546 cluster_messenger
->shutdown();
3547 hb_front_client_messenger
->shutdown();
3548 hb_back_client_messenger
->shutdown();
3549 objecter_messenger
->shutdown();
3550 hb_front_server_messenger
->shutdown();
3551 hb_back_server_messenger
->shutdown();
3558 int OSD::mon_cmd_maybe_osd_create(string
&cmd
)
3560 bool created
= false;
3562 dout(10) << __func__
<< " cmd: " << cmd
<< dendl
;
3563 vector
<string
> vcmd
{cmd
};
3567 monc
->start_mon_command(vcmd
, inbl
, NULL
, &outs
, &w
);
3570 if (r
== -ENOENT
&& !created
) {
3571 string newcmd
= "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami
)
3572 + ", \"uuid\": \"" + stringify(superblock
.osd_fsid
) + "\"}";
3573 vector
<string
> vnewcmd
{newcmd
};
3577 monc
->start_mon_command(vnewcmd
, inbl
, NULL
, &outs
, &w
);
3580 derr
<< __func__
<< " fail: osd does not exist and created failed: "
3581 << cpp_strerror(r
) << dendl
;
3587 derr
<< __func__
<< " fail: '" << outs
<< "': " << cpp_strerror(r
) << dendl
;
3596 int OSD::update_crush_location()
3598 if (!cct
->_conf
->osd_crush_update_on_start
) {
3599 dout(10) << __func__
<< " osd_crush_update_on_start = false" << dendl
;
3604 if (cct
->_conf
->osd_crush_initial_weight
>= 0) {
3605 snprintf(weight
, sizeof(weight
), "%.4lf", cct
->_conf
->osd_crush_initial_weight
);
3607 struct store_statfs_t st
;
3608 int r
= store
->statfs(&st
);
3610 derr
<< "statfs: " << cpp_strerror(r
) << dendl
;
3613 snprintf(weight
, sizeof(weight
), "%.4lf",
3615 (double)(st
.total
) /
3616 (double)(1ull << 40 /* TB */)));
3619 std::multimap
<string
,string
> loc
= cct
->crush_location
.get_location();
3620 dout(10) << __func__
<< " crush location is " << loc
<< dendl
;
3623 string("{\"prefix\": \"osd crush create-or-move\", ") +
3624 string("\"id\": ") + stringify(whoami
) + string(", ") +
3625 string("\"weight\":") + weight
+ string(", ") +
3626 string("\"args\": [");
3627 for (multimap
<string
,string
>::iterator p
= loc
.begin(); p
!= loc
.end(); ++p
) {
3628 if (p
!= loc
.begin())
3630 cmd
+= "\"" + p
->first
+ "=" + p
->second
+ "\"";
3634 return mon_cmd_maybe_osd_create(cmd
);
3637 int OSD::update_crush_device_class()
3639 if (!cct
->_conf
->osd_class_update_on_start
) {
3640 dout(10) << __func__
<< " osd_class_update_on_start = false" << dendl
;
3644 string device_class
;
3645 int r
= store
->read_meta("crush_device_class", &device_class
);
3646 if (r
< 0 || device_class
.empty()) {
3647 device_class
= store
->get_default_device_class();
3650 if (device_class
.empty()) {
3651 dout(20) << __func__
<< " no device class stored locally" << dendl
;
3656 string("{\"prefix\": \"osd crush set-device-class\", ") +
3657 string("\"class\": \"") + device_class
+ string("\", ") +
3658 string("\"ids\": [\"") + stringify(whoami
) + string("\"]}");
3660 r
= mon_cmd_maybe_osd_create(cmd
);
3661 // the above cmd can fail for various reasons, e.g.:
3662 // (1) we are connecting to a pre-luminous monitor
3663 // (2) user manually specify a class other than
3664 // 'ceph-disk prepare --crush-device-class'
3665 // simply skip result-checking for now
3669 void OSD::write_superblock(ObjectStore::Transaction
& t
)
3671 dout(10) << "write_superblock " << superblock
<< dendl
;
3673 //hack: at minimum it's using the baseline feature set
3674 if (!superblock
.compat_features
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE
))
3675 superblock
.compat_features
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE
);
3678 ::encode(superblock
, bl
);
3679 t
.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT
, 0, bl
.length(), bl
);
3682 int OSD::read_superblock()
3685 int r
= store
->read(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT
, 0, 0, bl
);
3689 bufferlist::iterator p
= bl
.begin();
3690 ::decode(superblock
, p
);
3692 dout(10) << "read_superblock " << superblock
<< dendl
;
3697 void OSD::clear_temp_objects()
3699 dout(10) << __func__
<< dendl
;
3701 store
->list_collections(ls
);
3702 for (vector
<coll_t
>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
3704 if (!p
->is_pg(&pgid
))
3707 // list temp objects
3708 dout(20) << " clearing temps in " << *p
<< " pgid " << pgid
<< dendl
;
3710 vector
<ghobject_t
> temps
;
3713 vector
<ghobject_t
> objects
;
3714 store
->collection_list(*p
, next
, ghobject_t::get_max(),
3715 store
->get_ideal_list_max(),
3717 if (objects
.empty())
3719 vector
<ghobject_t
>::iterator q
;
3720 for (q
= objects
.begin(); q
!= objects
.end(); ++q
) {
3721 // Hammer set pool for temps to -1, so check for clean-up
3722 if (q
->hobj
.is_temp() || (q
->hobj
.pool
== -1)) {
3723 temps
.push_back(*q
);
3728 // If we saw a non-temp object and hit the break above we can
3729 // break out of the while loop too.
3730 if (q
!= objects
.end())
3733 if (!temps
.empty()) {
3734 ObjectStore::Transaction t
;
3736 for (vector
<ghobject_t
>::iterator q
= temps
.begin(); q
!= temps
.end(); ++q
) {
3737 dout(20) << " removing " << *p
<< " object " << *q
<< dendl
;
3739 if (++removed
> cct
->_conf
->osd_target_transaction_size
) {
3740 store
->apply_transaction(service
.meta_osr
.get(), std::move(t
));
3741 t
= ObjectStore::Transaction();
3746 store
->apply_transaction(service
.meta_osr
.get(), std::move(t
));
3752 void OSD::recursive_remove_collection(CephContext
* cct
,
3753 ObjectStore
*store
, spg_t pgid
,
3759 make_snapmapper_oid());
3761 ceph::shared_ptr
<ObjectStore::Sequencer
> osr (std::make_shared
<
3762 ObjectStore::Sequencer
>("rm"));
3763 ObjectStore::Transaction t
;
3764 SnapMapper
mapper(cct
, &driver
, 0, 0, 0, pgid
.shard
);
3766 vector
<ghobject_t
> objects
;
3767 store
->collection_list(tmp
, ghobject_t(), ghobject_t::get_max(),
3768 INT_MAX
, &objects
, 0);
3769 generic_dout(10) << __func__
<< " " << objects
<< dendl
;
3772 for (vector
<ghobject_t
>::iterator p
= objects
.begin();
3775 OSDriver::OSTransaction
_t(driver
.get_transaction(&t
));
3776 int r
= mapper
.remove_oid(p
->hobj
, &_t
);
3777 if (r
!= 0 && r
!= -ENOENT
)
3780 if (removed
> cct
->_conf
->osd_target_transaction_size
) {
3781 int r
= store
->apply_transaction(osr
.get(), std::move(t
));
3783 t
= ObjectStore::Transaction();
3787 t
.remove_collection(tmp
);
3788 int r
= store
->apply_transaction(osr
.get(), std::move(t
));
3792 if (!osr
->flush_commit(&waiter
)) {
3798 // ======================================================
3801 PGPool
OSD::_get_pool(int id
, OSDMapRef createmap
)
3803 if (!createmap
->have_pg_pool(id
)) {
3804 dout(5) << __func__
<< ": the OSDmap does not contain a PG pool with id = "
3809 PGPool p
= PGPool(cct
, createmap
, id
);
3811 dout(10) << "_get_pool " << p
.id
<< dendl
;
3815 PG
*OSD::_open_lock_pg(
3816 OSDMapRef createmap
,
3817 spg_t pgid
, bool no_lockdep_check
)
3819 assert(osd_lock
.is_locked());
3821 PG
* pg
= _make_pg(createmap
, pgid
);
3823 RWLock::WLocker
l(pg_map_lock
);
3824 pg
->lock(no_lockdep_check
);
3826 pg
->get("PGMap"); // because it's in pg_map
3827 service
.pg_add_epoch(pg
->info
.pgid
, createmap
->get_epoch());
3833 OSDMapRef createmap
,
3836 dout(10) << "_open_lock_pg " << pgid
<< dendl
;
3837 PGPool pool
= _get_pool(pgid
.pool(), createmap
);
3841 if (createmap
->get_pg_type(pgid
.pgid
) == pg_pool_t::TYPE_REPLICATED
||
3842 createmap
->get_pg_type(pgid
.pgid
) == pg_pool_t::TYPE_ERASURE
)
3843 pg
= new PrimaryLogPG(&service
, createmap
, pool
, pgid
);
3851 void OSD::add_newly_split_pg(PG
*pg
, PG::RecoveryCtx
*rctx
)
3853 epoch_t
e(service
.get_osdmap()->get_epoch());
3854 pg
->get("PGMap"); // For pg_map
3855 pg_map
[pg
->info
.pgid
] = pg
;
3856 service
.pg_add_epoch(pg
->info
.pgid
, pg
->get_osdmap()->get_epoch());
3858 dout(10) << "Adding newly split pg " << *pg
<< dendl
;
3859 pg
->handle_loaded(rctx
);
3860 pg
->write_if_dirty(*(rctx
->transaction
));
3861 pg
->queue_null(e
, e
);
3862 map
<spg_t
, list
<PG::CephPeeringEvtRef
> >::iterator to_wake
=
3863 peering_wait_for_split
.find(pg
->info
.pgid
);
3864 if (to_wake
!= peering_wait_for_split
.end()) {
3865 for (list
<PG::CephPeeringEvtRef
>::iterator i
=
3866 to_wake
->second
.begin();
3867 i
!= to_wake
->second
.end();
3869 pg
->queue_peering_event(*i
);
3871 peering_wait_for_split
.erase(to_wake
);
3873 if (!service
.get_osdmap()->have_pg_pool(pg
->info
.pgid
.pool()))
3877 OSD::res_result
OSD::_try_resurrect_pg(
3878 OSDMapRef curmap
, spg_t pgid
, spg_t
*resurrected
, PGRef
*old_pg_state
)
3880 assert(resurrected
);
3881 assert(old_pg_state
);
3882 // find nearest ancestor
3883 DeletingStateRef df
;
3886 df
= service
.deleting_pgs
.lookup(cur
);
3891 cur
= cur
.get_parent();
3894 return RES_NONE
; // good to go
3896 df
->old_pg_state
->lock();
3897 OSDMapRef create_map
= df
->old_pg_state
->get_osdmap();
3898 df
->old_pg_state
->unlock();
3900 set
<spg_t
> children
;
3902 if (df
->try_stop_deletion()) {
3903 dout(10) << __func__
<< ": halted deletion on pg " << pgid
<< dendl
;
3905 *old_pg_state
= df
->old_pg_state
;
3906 service
.deleting_pgs
.remove(pgid
); // PG is no longer being removed!
3909 // raced, ensure we don't see DeletingStateRef when we try to
3911 service
.deleting_pgs
.remove(pgid
);
3914 } else if (cur
.is_split(create_map
->get_pg_num(cur
.pool()),
3915 curmap
->get_pg_num(cur
.pool()),
3917 children
.count(pgid
)) {
3918 if (df
->try_stop_deletion()) {
3919 dout(10) << __func__
<< ": halted deletion on ancestor pg " << pgid
3922 *old_pg_state
= df
->old_pg_state
;
3923 service
.deleting_pgs
.remove(cur
); // PG is no longer being removed!
3926 /* this is not a problem, failing to cancel proves that all objects
3927 * have been removed, so no hobject_t overlap is possible
3935 PG
*OSD::_create_lock_pg(
3936 OSDMapRef createmap
,
3941 vector
<int>& up
, int up_primary
,
3942 vector
<int>& acting
, int acting_primary
,
3943 pg_history_t history
,
3944 const PastIntervals
& pi
,
3945 ObjectStore::Transaction
& t
)
3947 assert(osd_lock
.is_locked());
3948 dout(20) << "_create_lock_pg pgid " << pgid
<< dendl
;
3950 PG
*pg
= _open_lock_pg(createmap
, pgid
, true);
3952 service
.init_splits_between(pgid
, pg
->get_osdmap(), service
.get_osdmap());
3965 dout(7) << "_create_lock_pg " << *pg
<< dendl
;
3969 PG
*OSD::_lookup_lock_pg(spg_t pgid
)
3971 RWLock::RLocker
l(pg_map_lock
);
3973 auto pg_map_entry
= pg_map
.find(pgid
);
3974 if (pg_map_entry
== pg_map
.end())
3976 PG
*pg
= pg_map_entry
->second
;
3981 PG
*OSD::lookup_lock_pg(spg_t pgid
)
3983 return _lookup_lock_pg(pgid
);
3986 PG
*OSD::_lookup_lock_pg_with_map_lock_held(spg_t pgid
)
3988 assert(pg_map
.count(pgid
));
3989 PG
*pg
= pg_map
[pgid
];
3994 void OSD::load_pgs()
3996 assert(osd_lock
.is_locked());
3997 dout(0) << "load_pgs" << dendl
;
3999 RWLock::RLocker
l(pg_map_lock
);
4000 assert(pg_map
.empty());
4004 int r
= store
->list_collections(ls
);
4006 derr
<< "failed to list pgs: " << cpp_strerror(-r
) << dendl
;
4009 bool has_upgraded
= false;
4011 for (vector
<coll_t
>::iterator it
= ls
.begin();
4015 if (it
->is_temp(&pgid
) ||
4016 (it
->is_pg(&pgid
) && PG::_has_removal_flag(store
, pgid
))) {
4017 dout(10) << "load_pgs " << *it
<< " clearing temp" << dendl
;
4018 recursive_remove_collection(cct
, store
, pgid
, *it
);
4022 if (!it
->is_pg(&pgid
)) {
4023 dout(10) << "load_pgs ignoring unrecognized " << *it
<< dendl
;
4027 if (pgid
.preferred() >= 0) {
4028 dout(10) << __func__
<< ": skipping localized PG " << pgid
<< dendl
;
4029 // FIXME: delete it too, eventually
4033 dout(10) << "pgid " << pgid
<< " coll " << coll_t(pgid
) << dendl
;
4035 epoch_t map_epoch
= 0;
4036 int r
= PG::peek_map_epoch(store
, pgid
, &map_epoch
, &bl
);
4038 derr
<< __func__
<< " unable to peek at " << pgid
<< " metadata, skipping"
4044 if (map_epoch
> 0) {
4045 OSDMapRef pgosdmap
= service
.try_get_map(map_epoch
);
4047 if (!osdmap
->have_pg_pool(pgid
.pool())) {
4048 derr
<< __func__
<< ": could not find map for epoch " << map_epoch
4049 << " on pg " << pgid
<< ", but the pool is not present in the "
4050 << "current map, so this is probably a result of bug 10617. "
4051 << "Skipping the pg for now, you can use ceph-objectstore-tool "
4052 << "to clean it up later." << dendl
;
4055 derr
<< __func__
<< ": have pgid " << pgid
<< " at epoch "
4056 << map_epoch
<< ", but missing map. Crashing."
4058 assert(0 == "Missing map in load_pgs");
4061 pg
= _open_lock_pg(pgosdmap
, pgid
);
4063 pg
= _open_lock_pg(osdmap
, pgid
);
4065 // there can be no waiters here, so we don't call wake_pg_waiters
4067 pg
->ch
= store
->open_collection(pg
->coll
);
4069 // read pg state, log
4070 pg
->read_state(store
, bl
);
4072 if (pg
->must_upgrade()) {
4073 if (!pg
->can_upgrade()) {
4074 derr
<< "PG needs upgrade, but on-disk data is too old; upgrade to"
4075 << " an older version first." << dendl
;
4076 assert(0 == "PG too old to upgrade");
4078 if (!has_upgraded
) {
4079 derr
<< "PGs are upgrading" << dendl
;
4080 has_upgraded
= true;
4082 dout(10) << "PG " << pg
->info
.pgid
4083 << " must upgrade..." << dendl
;
4088 dout(10) << "load_pgs " << *it
<< " deleting dne" << dendl
;
4090 service
.pg_remove_epoch(pg
->pg_id
);
4094 RWLock::WLocker
l(pg_map_lock
);
4095 auto p
= pg_map
.find(pg
->get_pgid());
4096 assert(p
!= pg_map
.end() && p
->second
== pg
);
4097 dout(20) << __func__
<< " removed pg " << pg
<< " from pg_map" << dendl
;
4101 recursive_remove_collection(cct
, store
, pgid
, *it
);
4105 service
.init_splits_between(pg
->info
.pgid
, pg
->get_osdmap(), osdmap
);
4107 // generate state for PG's current mapping
4108 int primary
, up_primary
;
4109 vector
<int> acting
, up
;
4110 pg
->get_osdmap()->pg_to_up_acting_osds(
4111 pgid
.pgid
, &up
, &up_primary
, &acting
, &primary
);
4112 pg
->init_primary_up_acting(
4117 int role
= OSDMap::calc_pg_role(whoami
, pg
->acting
);
4118 if (pg
->pool
.info
.is_replicated() || role
== pg
->pg_whoami
.shard
)
4123 pg
->reg_next_scrub();
4125 PG::RecoveryCtx
rctx(0, 0, 0, 0, 0, 0);
4126 pg
->handle_loaded(&rctx
);
4128 dout(10) << "load_pgs loaded " << *pg
<< " " << pg
->pg_log
.get_log() << dendl
;
4129 if (pg
->pg_log
.is_dirty()) {
4130 ObjectStore::Transaction t
;
4131 pg
->write_if_dirty(t
);
4132 store
->apply_transaction(pg
->osr
.get(), std::move(t
));
4137 RWLock::RLocker
l(pg_map_lock
);
4138 dout(0) << "load_pgs opened " << pg_map
.size() << " pgs" << dendl
;
4141 // clean up old infos object?
4142 if (has_upgraded
&& store
->exists(coll_t::meta(), OSD::make_infos_oid())) {
4143 dout(1) << __func__
<< " removing legacy infos object" << dendl
;
4144 ObjectStore::Transaction t
;
4145 t
.remove(coll_t::meta(), OSD::make_infos_oid());
4146 int r
= store
->apply_transaction(service
.meta_osr
.get(), std::move(t
));
4148 derr
<< __func__
<< ": apply_transaction returned "
4149 << cpp_strerror(r
) << dendl
;
4154 build_past_intervals_parallel();
4159 * build past_intervals efficiently on old, degraded, and buried
4160 * clusters. this is important for efficiently catching up osds that
4161 * are way behind on maps to the current cluster state.
4163 * this is a parallel version of PG::generate_past_intervals().
4164 * follow the same logic, but do all pgs at the same time so that we
4165 * can make a single pass across the osdmap history.
4167 void OSD::build_past_intervals_parallel()
4171 vector
<int> old_acting
, old_up
;
4172 epoch_t same_interval_since
;
4176 map
<PG
*,pistate
> pis
;
4178 // calculate junction of map range
4179 epoch_t end_epoch
= superblock
.oldest_map
;
4180 epoch_t cur_epoch
= superblock
.newest_map
;
4182 RWLock::RLocker
l(pg_map_lock
);
4183 for (ceph::unordered_map
<spg_t
, PG
*>::iterator i
= pg_map
.begin();
4188 // Ignore PGs only partially created (DNE)
4189 if (pg
->info
.dne()) {
4193 auto rpib
= pg
->get_required_past_interval_bounds(
4195 superblock
.oldest_map
);
4196 if (rpib
.first
>= rpib
.second
&& pg
->past_intervals
.empty()) {
4197 if (pg
->info
.history
.same_interval_since
== 0) {
4198 pg
->info
.history
.same_interval_since
= rpib
.second
;
4202 auto apib
= pg
->past_intervals
.get_bounds();
4203 if (apib
.second
>= rpib
.second
&&
4204 apib
.first
<= rpib
.first
) {
4205 if (pg
->info
.history
.same_interval_since
== 0) {
4206 pg
->info
.history
.same_interval_since
= rpib
.second
;
4212 dout(10) << pg
->info
.pgid
<< " needs " << rpib
.first
<< "-"
4213 << rpib
.second
<< dendl
;
4214 pistate
& p
= pis
[pg
];
4215 p
.start
= rpib
.first
;
4216 p
.end
= rpib
.second
;
4217 p
.same_interval_since
= 0;
4219 if (rpib
.first
< cur_epoch
)
4220 cur_epoch
= rpib
.first
;
4221 if (rpib
.second
> end_epoch
)
4222 end_epoch
= rpib
.second
;
4226 dout(10) << __func__
<< " nothing to build" << dendl
;
4230 dout(1) << __func__
<< " over " << cur_epoch
<< "-" << end_epoch
<< dendl
;
4231 assert(cur_epoch
<= end_epoch
);
4233 OSDMapRef cur_map
, last_map
;
4234 for ( ; cur_epoch
<= end_epoch
; cur_epoch
++) {
4235 dout(10) << __func__
<< " epoch " << cur_epoch
<< dendl
;
4237 cur_map
= get_map(cur_epoch
);
4239 for (map
<PG
*,pistate
>::iterator i
= pis
.begin(); i
!= pis
.end(); ++i
) {
4241 pistate
& p
= i
->second
;
4243 if (cur_epoch
< p
.start
|| cur_epoch
> p
.end
)
4246 vector
<int> acting
, up
;
4249 pg_t pgid
= pg
->info
.pgid
.pgid
;
4250 if (p
.same_interval_since
&& last_map
->get_pools().count(pgid
.pool()))
4251 pgid
= pgid
.get_ancestor(last_map
->get_pg_num(pgid
.pool()));
4252 cur_map
->pg_to_up_acting_osds(
4253 pgid
, &up
, &up_primary
, &acting
, &primary
);
4255 if (p
.same_interval_since
== 0) {
4256 dout(10) << __func__
<< " epoch " << cur_epoch
<< " pg " << pg
->info
.pgid
4257 << " first map, acting " << acting
4258 << " up " << up
<< ", same_interval_since = " << cur_epoch
<< dendl
;
4259 p
.same_interval_since
= cur_epoch
;
4261 p
.old_acting
= acting
;
4262 p
.primary
= primary
;
4263 p
.up_primary
= up_primary
;
4268 boost::scoped_ptr
<IsPGRecoverablePredicate
> recoverable(
4269 pg
->get_is_recoverable_predicate());
4270 std::stringstream debug
;
4271 bool new_interval
= PastIntervals::check_new_interval(
4274 p
.old_acting
, acting
,
4278 p
.same_interval_since
,
4279 pg
->info
.history
.last_epoch_clean
,
4283 &pg
->past_intervals
,
4286 dout(10) << __func__
<< " epoch " << cur_epoch
<< " pg " << pg
->info
.pgid
4287 << " " << debug
.str() << dendl
;
4289 p
.old_acting
= acting
;
4290 p
.primary
= primary
;
4291 p
.up_primary
= up_primary
;
4292 p
.same_interval_since
= cur_epoch
;
4297 // Now that past_intervals have been recomputed let's fix the same_interval_since
4298 // if it was cleared by import.
4299 for (map
<PG
*,pistate
>::iterator i
= pis
.begin(); i
!= pis
.end(); ++i
) {
4301 pistate
& p
= i
->second
;
4303 if (pg
->info
.history
.same_interval_since
== 0) {
4304 assert(p
.same_interval_since
);
4305 dout(10) << __func__
<< " fix same_interval_since " << p
.same_interval_since
<< " pg " << *pg
<< dendl
;
4306 dout(10) << __func__
<< " past_intervals " << pg
->past_intervals
<< dendl
;
4308 pg
->info
.history
.same_interval_since
= p
.same_interval_since
;
4312 // write info only at the end. this is necessary because we check
4313 // whether the past_intervals go far enough back or forward in time,
4314 // but we don't check for holes. we could avoid it by discarding
4315 // the previous past_intervals and rebuilding from scratch, or we
4316 // can just do this and commit all our work at the end.
4317 ObjectStore::Transaction t
;
4319 for (map
<PG
*,pistate
>::iterator i
= pis
.begin(); i
!= pis
.end(); ++i
) {
4322 pg
->dirty_big_info
= true;
4323 pg
->dirty_info
= true;
4324 pg
->write_if_dirty(t
);
4327 // don't let the transaction get too big
4328 if (++num
>= cct
->_conf
->osd_target_transaction_size
) {
4329 store
->apply_transaction(service
.meta_osr
.get(), std::move(t
));
4330 t
= ObjectStore::Transaction();
4335 store
->apply_transaction(service
.meta_osr
.get(), std::move(t
));
4339 * look up a pg. if we have it, great. if not, consider creating it IF the pg mapping
4340 * hasn't changed since the given epoch and we are the primary.
4342 int OSD::handle_pg_peering_evt(
4344 const pg_history_t
& orig_history
,
4345 const PastIntervals
& pi
,
4347 PG::CephPeeringEvtRef evt
)
4349 if (service
.splitting(pgid
)) {
4350 peering_wait_for_split
[pgid
].push_back(evt
);
4354 PG
*pg
= _lookup_lock_pg(pgid
);
4357 if (!osdmap
->have_pg_pool(pgid
.pool()))
4359 int up_primary
, acting_primary
;
4360 vector
<int> up
, acting
;
4361 osdmap
->pg_to_up_acting_osds(
4362 pgid
.pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
4364 pg_history_t history
= orig_history
;
4365 bool valid_history
= project_pg_history(
4366 pgid
, history
, epoch
, up
, up_primary
, acting
, acting_primary
);
4368 if (!valid_history
|| epoch
< history
.same_interval_since
) {
4369 dout(10) << __func__
<< pgid
<< " acting changed in "
4370 << history
.same_interval_since
<< " (msg from " << epoch
<< ")"
4375 if (service
.splitting(pgid
)) {
4379 const bool is_mon_create
=
4380 evt
->get_event().dynamic_type() == PG::NullEvt::static_type();
4381 if (maybe_wait_for_max_pg(pgid
, is_mon_create
)) {
4384 // do we need to resurrect a deleting pg?
4387 res_result result
= _try_resurrect_pg(
4388 service
.get_osdmap(),
4393 PG::RecoveryCtx rctx
= create_context();
4396 const pg_pool_t
* pp
= osdmap
->get_pg_pool(pgid
.pool());
4397 if (pp
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
) &&
4398 store
->get_type() != "bluestore") {
4399 clog
->warn() << "pg " << pgid
4400 << " is at risk of silent data corruption: "
4401 << "the pool allows ec overwrites but is not stored in "
4402 << "bluestore, so deep scrubbing will not detect bitrot";
4404 PG::_create(*rctx
.transaction
, pgid
, pgid
.get_split_bits(pp
->get_pg_num()));
4405 PG::_init(*rctx
.transaction
, pgid
, pp
);
4407 int role
= osdmap
->calc_pg_role(whoami
, acting
, acting
.size());
4408 if (!pp
->is_replicated() && role
!= pgid
.shard
)
4411 pg
= _create_lock_pg(
4416 acting
, acting_primary
,
4419 pg
->handle_create(&rctx
);
4420 pg
->write_if_dirty(*rctx
.transaction
);
4421 dispatch_context(rctx
, pg
, osdmap
);
4423 dout(10) << *pg
<< " is new" << dendl
;
4425 pg
->queue_peering_event(evt
);
4426 wake_pg_waiters(pg
);
4431 old_pg_state
->lock();
4432 OSDMapRef old_osd_map
= old_pg_state
->get_osdmap();
4433 int old_role
= old_pg_state
->role
;
4434 vector
<int> old_up
= old_pg_state
->up
;
4435 int old_up_primary
= old_pg_state
->up_primary
.osd
;
4436 vector
<int> old_acting
= old_pg_state
->acting
;
4437 int old_primary
= old_pg_state
->primary
.osd
;
4438 pg_history_t old_history
= old_pg_state
->info
.history
;
4439 PastIntervals old_past_intervals
= old_pg_state
->past_intervals
;
4440 old_pg_state
->unlock();
4441 pg
= _create_lock_pg(
4454 pg
->handle_create(&rctx
);
4455 pg
->write_if_dirty(*rctx
.transaction
);
4456 dispatch_context(rctx
, pg
, osdmap
);
4458 dout(10) << *pg
<< " is new (resurrected)" << dendl
;
4460 pg
->queue_peering_event(evt
);
4461 wake_pg_waiters(pg
);
4466 assert(old_pg_state
);
4467 old_pg_state
->lock();
4468 OSDMapRef old_osd_map
= old_pg_state
->get_osdmap();
4469 int old_role
= old_pg_state
->role
;
4470 vector
<int> old_up
= old_pg_state
->up
;
4471 int old_up_primary
= old_pg_state
->up_primary
.osd
;
4472 vector
<int> old_acting
= old_pg_state
->acting
;
4473 int old_primary
= old_pg_state
->primary
.osd
;
4474 pg_history_t old_history
= old_pg_state
->info
.history
;
4475 PastIntervals old_past_intervals
= old_pg_state
->past_intervals
;
4476 old_pg_state
->unlock();
4477 PG
*parent
= _create_lock_pg(
4491 parent
->handle_create(&rctx
);
4492 parent
->write_if_dirty(*rctx
.transaction
);
4493 dispatch_context(rctx
, parent
, osdmap
);
4495 dout(10) << *parent
<< " is new" << dendl
;
4497 assert(service
.splitting(pgid
));
4498 peering_wait_for_split
[pgid
].push_back(evt
);
4500 //parent->queue_peering_event(evt);
4501 parent
->queue_null(osdmap
->get_epoch(), osdmap
->get_epoch());
4502 wake_pg_waiters(parent
);
4511 // already had it. did the mapping change?
4512 if (epoch
< pg
->info
.history
.same_interval_since
) {
4513 dout(10) << *pg
<< __func__
<< " acting changed in "
4514 << pg
->info
.history
.same_interval_since
4515 << " (msg from " << epoch
<< ")" << dendl
;
4517 pg
->queue_peering_event(evt
);
4524 bool OSD::maybe_wait_for_max_pg(spg_t pgid
, bool is_mon_create
)
4526 const auto max_pgs_per_osd
=
4527 (cct
->_conf
->get_val
<uint64_t>("mon_max_pg_per_osd") *
4528 cct
->_conf
->get_val
<double>("osd_max_pg_per_osd_hard_ratio"));
4530 RWLock::RLocker pg_map_locker
{pg_map_lock
};
4531 if (pg_map
.size() < max_pgs_per_osd
) {
4534 lock_guard
<mutex
> pending_creates_locker
{pending_creates_lock
};
4535 if (is_mon_create
) {
4536 pending_creates_from_mon
++;
4538 bool is_primary
= osdmap
->get_pg_acting_rank(pgid
.pgid
, whoami
) == 0;
4539 pending_creates_from_osd
.emplace(pgid
.pgid
, is_primary
);
4541 dout(1) << __func__
<< " withhold creation of pg " << pgid
4542 << ": " << pg_map
.size() << " >= "<< max_pgs_per_osd
<< dendl
;
4546 // to re-trigger a peering, we have to twiddle the pg mapping a little bit,
4547 // see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
4548 // to up set if pg_temp is empty. so an empty pg_temp won't work.
4549 static vector
<int32_t> twiddle(const vector
<int>& acting
) {
4550 if (acting
.size() > 1) {
4553 vector
<int32_t> twiddled(acting
.begin(), acting
.end());
4554 twiddled
.push_back(-1);
4559 void OSD::resume_creating_pg()
4561 bool do_sub_pg_creates
= false;
4562 bool have_pending_creates
= false;
4564 const auto max_pgs_per_osd
=
4565 (cct
->_conf
->get_val
<uint64_t>("mon_max_pg_per_osd") *
4566 cct
->_conf
->get_val
<double>("osd_max_pg_per_osd_hard_ratio"));
4567 RWLock::RLocker
l(pg_map_lock
);
4568 if (max_pgs_per_osd
<= pg_map
.size()) {
4569 // this could happen if admin decreases this setting before a PG is removed
4572 unsigned spare_pgs
= max_pgs_per_osd
- pg_map
.size();
4573 lock_guard
<mutex
> pending_creates_locker
{pending_creates_lock
};
4574 if (pending_creates_from_mon
> 0) {
4575 do_sub_pg_creates
= true;
4576 if (pending_creates_from_mon
>= spare_pgs
) {
4577 spare_pgs
= pending_creates_from_mon
= 0;
4579 spare_pgs
-= pending_creates_from_mon
;
4580 pending_creates_from_mon
= 0;
4583 auto pg
= pending_creates_from_osd
.cbegin();
4584 while (spare_pgs
> 0 && pg
!= pending_creates_from_osd
.cend()) {
4585 dout(20) << __func__
<< " pg " << pg
->first
<< dendl
;
4587 osdmap
->pg_to_up_acting_osds(pg
->first
, nullptr, nullptr, &acting
, nullptr);
4588 service
.queue_want_pg_temp(pg
->first
, twiddle(acting
), true);
4589 pg
= pending_creates_from_osd
.erase(pg
);
4590 do_sub_pg_creates
= true;
4593 have_pending_creates
= (pending_creates_from_mon
> 0 ||
4594 !pending_creates_from_osd
.empty());
4597 bool do_renew_subs
= false;
4598 if (do_sub_pg_creates
) {
4599 if (monc
->sub_want("osd_pg_creates", last_pg_create_epoch
, 0)) {
4600 dout(4) << __func__
<< ": resolicit pg creates from mon since "
4601 << last_pg_create_epoch
<< dendl
;
4602 do_renew_subs
= true;
4605 version_t start
= osdmap
->get_epoch() + 1;
4606 if (have_pending_creates
) {
4607 // don't miss any new osdmap deleting PGs
4608 if (monc
->sub_want("osdmap", start
, 0)) {
4609 dout(4) << __func__
<< ": resolicit osdmap from mon since "
4611 do_renew_subs
= true;
4613 } else if (do_sub_pg_creates
) {
4614 // no need to subscribe the osdmap continuously anymore
4615 // once the pgtemp and/or mon_subscribe(pg_creates) is sent
4616 if (monc
->sub_want_increment("osdmap", start
, CEPH_SUBSCRIBE_ONETIME
)) {
4617 dout(4) << __func__
<< ": re-subscribe osdmap(onetime) since"
4619 do_renew_subs
= true;
4623 if (do_renew_subs
) {
4627 service
.send_pg_temp();
4630 void OSD::build_initial_pg_history(
4633 utime_t created_stamp
,
4637 dout(10) << __func__
<< " " << pgid
<< " created " << created
<< dendl
;
4638 h
->epoch_created
= created
;
4639 h
->epoch_pool_created
= created
;
4640 h
->same_interval_since
= created
;
4641 h
->same_up_since
= created
;
4642 h
->same_primary_since
= created
;
4643 h
->last_scrub_stamp
= created_stamp
;
4644 h
->last_deep_scrub_stamp
= created_stamp
;
4645 h
->last_clean_scrub_stamp
= created_stamp
;
4647 OSDMapRef lastmap
= service
.get_map(created
);
4648 int up_primary
, acting_primary
;
4649 vector
<int> up
, acting
;
4650 lastmap
->pg_to_up_acting_osds(
4651 pgid
.pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
4653 ostringstream debug
;
4654 for (epoch_t e
= created
+ 1; e
<= osdmap
->get_epoch(); ++e
) {
4655 OSDMapRef osdmap
= service
.get_map(e
);
4656 int new_up_primary
, new_acting_primary
;
4657 vector
<int> new_up
, new_acting
;
4658 osdmap
->pg_to_up_acting_osds(
4659 pgid
.pgid
, &new_up
, &new_up_primary
, &new_acting
, &new_acting_primary
);
4661 // this is a bit imprecise, but sufficient?
4662 struct min_size_predicate_t
: public IsPGRecoverablePredicate
{
4663 const pg_pool_t
*pi
;
4664 bool operator()(const set
<pg_shard_t
> &have
) const {
4665 return have
.size() >= pi
->min_size
;
4667 min_size_predicate_t(const pg_pool_t
*i
) : pi(i
) {}
4668 } min_size_predicate(osdmap
->get_pg_pool(pgid
.pgid
.pool()));
4670 bool new_interval
= PastIntervals::check_new_interval(
4677 h
->same_interval_since
,
4678 h
->last_epoch_clean
,
4682 &min_size_predicate
,
4686 h
->same_interval_since
= e
;
4688 h
->same_up_since
= e
;
4690 if (acting_primary
!= new_acting_primary
) {
4691 h
->same_primary_since
= e
;
4693 if (pgid
.pgid
.is_split(lastmap
->get_pg_num(pgid
.pgid
.pool()),
4694 osdmap
->get_pg_num(pgid
.pgid
.pool()),
4696 h
->last_epoch_split
= e
;
4699 acting
= new_acting
;
4700 up_primary
= new_up_primary
;
4701 acting_primary
= new_acting_primary
;
4705 dout(20) << __func__
<< " " << debug
.str() << dendl
;
4706 dout(10) << __func__
<< " " << *h
<< " " << *pi
4707 << " [" << (pi
->empty() ? pair
<epoch_t
,epoch_t
>(0,0) :
4708 pi
->get_bounds()) << ")"
4713 * Fill in the passed history so you know same_interval_since, same_up_since,
4714 * and same_primary_since.
4716 bool OSD::project_pg_history(spg_t pgid
, pg_history_t
& h
, epoch_t from
,
4717 const vector
<int>& currentup
,
4718 int currentupprimary
,
4719 const vector
<int>& currentacting
,
4720 int currentactingprimary
)
4722 dout(15) << "project_pg_history " << pgid
4723 << " from " << from
<< " to " << osdmap
->get_epoch()
4728 for (e
= osdmap
->get_epoch();
4731 // verify during intermediate epoch (e-1)
4732 OSDMapRef oldmap
= service
.try_get_map(e
-1);
4734 dout(15) << __func__
<< ": found map gap, returning false" << dendl
;
4737 assert(oldmap
->have_pg_pool(pgid
.pool()));
4739 int upprimary
, actingprimary
;
4740 vector
<int> up
, acting
;
4741 oldmap
->pg_to_up_acting_osds(
4748 // acting set change?
4749 if ((actingprimary
!= currentactingprimary
||
4750 upprimary
!= currentupprimary
||
4751 acting
!= currentacting
||
4752 up
!= currentup
) && e
> h
.same_interval_since
) {
4753 dout(15) << "project_pg_history " << pgid
<< " acting|up changed in " << e
4754 << " from " << acting
<< "/" << up
4755 << " " << actingprimary
<< "/" << upprimary
4756 << " -> " << currentacting
<< "/" << currentup
4757 << " " << currentactingprimary
<< "/" << currentupprimary
4759 h
.same_interval_since
= e
;
4762 if (pgid
.is_split(oldmap
->get_pg_num(pgid
.pool()),
4763 osdmap
->get_pg_num(pgid
.pool()),
4764 0) && e
> h
.same_interval_since
) {
4765 h
.same_interval_since
= e
;
4768 if ((up
!= currentup
|| upprimary
!= currentupprimary
)
4769 && e
> h
.same_up_since
) {
4770 dout(15) << "project_pg_history " << pgid
<< " up changed in " << e
4771 << " from " << up
<< " " << upprimary
4772 << " -> " << currentup
<< " " << currentupprimary
<< dendl
;
4773 h
.same_up_since
= e
;
4777 if (OSDMap::primary_changed(
4780 currentactingprimary
,
4782 e
> h
.same_primary_since
) {
4783 dout(15) << "project_pg_history " << pgid
<< " primary changed in " << e
<< dendl
;
4784 h
.same_primary_since
= e
;
4787 if (h
.same_interval_since
>= e
&& h
.same_up_since
>= e
&& h
.same_primary_since
>= e
)
4791 // base case: these floors should be the pg creation epoch if we didn't
4792 // find any changes.
4793 if (e
== h
.epoch_created
) {
4794 if (!h
.same_interval_since
)
4795 h
.same_interval_since
= e
;
4796 if (!h
.same_up_since
)
4797 h
.same_up_since
= e
;
4798 if (!h
.same_primary_since
)
4799 h
.same_primary_since
= e
;
4802 dout(15) << "project_pg_history end " << h
<< dendl
;
4808 void OSD::_add_heartbeat_peer(int p
)
4814 map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.find(p
);
4815 if (i
== heartbeat_peers
.end()) {
4816 pair
<ConnectionRef
,ConnectionRef
> cons
= service
.get_con_osd_hb(p
, osdmap
->get_epoch());
4819 hi
= &heartbeat_peers
[p
];
4821 HeartbeatSession
*s
= new HeartbeatSession(p
);
4822 hi
->con_back
= cons
.first
.get();
4823 hi
->con_back
->set_priv(s
->get());
4825 hi
->con_front
= cons
.second
.get();
4826 hi
->con_front
->set_priv(s
->get());
4827 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4828 << " " << hi
->con_back
->get_peer_addr()
4829 << " " << hi
->con_front
->get_peer_addr()
4832 hi
->con_front
.reset(NULL
);
4833 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4834 << " " << hi
->con_back
->get_peer_addr()
4841 hi
->epoch
= osdmap
->get_epoch();
4844 void OSD::_remove_heartbeat_peer(int n
)
4846 map
<int,HeartbeatInfo
>::iterator q
= heartbeat_peers
.find(n
);
4847 assert(q
!= heartbeat_peers
.end());
4848 dout(20) << " removing heartbeat peer osd." << n
4849 << " " << q
->second
.con_back
->get_peer_addr()
4850 << " " << (q
->second
.con_front
? q
->second
.con_front
->get_peer_addr() : entity_addr_t())
4852 q
->second
.con_back
->mark_down();
4853 if (q
->second
.con_front
) {
4854 q
->second
.con_front
->mark_down();
4856 heartbeat_peers
.erase(q
);
4859 void OSD::need_heartbeat_peer_update()
4863 dout(20) << "need_heartbeat_peer_update" << dendl
;
4864 heartbeat_set_peers_need_update();
4867 void OSD::maybe_update_heartbeat_peers()
4869 assert(osd_lock
.is_locked());
4871 if (is_waiting_for_healthy()) {
4872 utime_t now
= ceph_clock_now();
4873 if (last_heartbeat_resample
== utime_t()) {
4874 last_heartbeat_resample
= now
;
4875 heartbeat_set_peers_need_update();
4876 } else if (!heartbeat_peers_need_update()) {
4877 utime_t dur
= now
- last_heartbeat_resample
;
4878 if (dur
> cct
->_conf
->osd_heartbeat_grace
) {
4879 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur
<< " seconds" << dendl
;
4880 heartbeat_set_peers_need_update();
4881 last_heartbeat_resample
= now
;
4882 reset_heartbeat_peers(); // we want *new* peers!
4887 if (!heartbeat_peers_need_update())
4889 heartbeat_clear_peers_need_update();
4891 Mutex::Locker
l(heartbeat_lock
);
4893 dout(10) << "maybe_update_heartbeat_peers updating" << dendl
;
4896 // build heartbeat from set
4898 RWLock::RLocker
l(pg_map_lock
);
4899 for (ceph::unordered_map
<spg_t
, PG
*>::iterator i
= pg_map
.begin();
4903 pg
->heartbeat_peer_lock
.Lock();
4904 dout(20) << i
->first
<< " heartbeat_peers " << pg
->heartbeat_peers
<< dendl
;
4905 for (set
<int>::iterator p
= pg
->heartbeat_peers
.begin();
4906 p
!= pg
->heartbeat_peers
.end();
4908 if (osdmap
->is_up(*p
))
4909 _add_heartbeat_peer(*p
);
4910 for (set
<int>::iterator p
= pg
->probe_targets
.begin();
4911 p
!= pg
->probe_targets
.end();
4913 if (osdmap
->is_up(*p
))
4914 _add_heartbeat_peer(*p
);
4915 pg
->heartbeat_peer_lock
.Unlock();
4919 // include next and previous up osds to ensure we have a fully-connected set
4920 set
<int> want
, extras
;
4921 int next
= osdmap
->get_next_up_osd_after(whoami
);
4924 int prev
= osdmap
->get_previous_up_osd_before(whoami
);
4925 if (prev
>= 0 && prev
!= next
)
4928 for (set
<int>::iterator p
= want
.begin(); p
!= want
.end(); ++p
) {
4929 dout(10) << " adding neighbor peer osd." << *p
<< dendl
;
4931 _add_heartbeat_peer(*p
);
4934 // remove down peers; enumerate extras
4935 map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
4936 while (p
!= heartbeat_peers
.end()) {
4937 if (!osdmap
->is_up(p
->first
)) {
4940 _remove_heartbeat_peer(o
);
4943 if (p
->second
.epoch
< osdmap
->get_epoch()) {
4944 extras
.insert(p
->first
);
4950 int start
= osdmap
->get_next_up_osd_after(whoami
);
4951 for (int n
= start
; n
>= 0; ) {
4952 if ((int)heartbeat_peers
.size() >= cct
->_conf
->osd_heartbeat_min_peers
)
4954 if (!extras
.count(n
) && !want
.count(n
) && n
!= whoami
) {
4955 dout(10) << " adding random peer osd." << n
<< dendl
;
4957 _add_heartbeat_peer(n
);
4959 n
= osdmap
->get_next_up_osd_after(n
);
4961 break; // came full circle; stop
4965 for (set
<int>::iterator p
= extras
.begin();
4966 (int)heartbeat_peers
.size() > cct
->_conf
->osd_heartbeat_min_peers
&& p
!= extras
.end();
4970 _remove_heartbeat_peer(*p
);
4973 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers
.size() << " peers, extras " << extras
<< dendl
;
4976 void OSD::reset_heartbeat_peers()
4978 assert(osd_lock
.is_locked());
4979 dout(10) << "reset_heartbeat_peers" << dendl
;
4980 Mutex::Locker
l(heartbeat_lock
);
4981 while (!heartbeat_peers
.empty()) {
4982 HeartbeatInfo
& hi
= heartbeat_peers
.begin()->second
;
4983 hi
.con_back
->mark_down();
4985 hi
.con_front
->mark_down();
4987 heartbeat_peers
.erase(heartbeat_peers
.begin());
4989 failure_queue
.clear();
4992 void OSD::handle_osd_ping(MOSDPing
*m
)
4994 if (superblock
.cluster_fsid
!= m
->fsid
) {
4995 dout(20) << "handle_osd_ping from " << m
->get_source_inst()
4996 << " bad fsid " << m
->fsid
<< " != " << superblock
.cluster_fsid
<< dendl
;
5001 int from
= m
->get_source().num();
5003 heartbeat_lock
.Lock();
5004 if (is_stopping()) {
5005 heartbeat_lock
.Unlock();
5010 OSDMapRef curmap
= service
.get_osdmap();
5012 heartbeat_lock
.Unlock();
5019 case MOSDPing::PING
:
5021 if (cct
->_conf
->osd_debug_drop_ping_probability
> 0) {
5022 auto heartbeat_drop
= debug_heartbeat_drops_remaining
.find(from
);
5023 if (heartbeat_drop
!= debug_heartbeat_drops_remaining
.end()) {
5024 if (heartbeat_drop
->second
== 0) {
5025 debug_heartbeat_drops_remaining
.erase(heartbeat_drop
);
5027 --heartbeat_drop
->second
;
5028 dout(5) << "Dropping heartbeat from " << from
5029 << ", " << heartbeat_drop
->second
5030 << " remaining to drop" << dendl
;
5033 } else if (cct
->_conf
->osd_debug_drop_ping_probability
>
5034 ((((double)(rand()%100))/100.0))) {
5036 debug_heartbeat_drops_remaining
.insert(std::make_pair(from
,
5037 cct
->_conf
->osd_debug_drop_ping_duration
)).first
;
5038 dout(5) << "Dropping heartbeat from " << from
5039 << ", " << heartbeat_drop
->second
5040 << " remaining to drop" << dendl
;
5045 if (!cct
->get_heartbeat_map()->is_healthy()) {
5046 dout(10) << "internal heartbeat not healthy, dropping ping request" << dendl
;
5050 Message
*r
= new MOSDPing(monc
->get_fsid(),
5051 curmap
->get_epoch(),
5052 MOSDPing::PING_REPLY
, m
->stamp
,
5053 cct
->_conf
->osd_heartbeat_min_size
);
5054 m
->get_connection()->send_message(r
);
5056 if (curmap
->is_up(from
)) {
5057 service
.note_peer_epoch(from
, m
->map_epoch
);
5059 ConnectionRef con
= service
.get_con_osd_cluster(from
, curmap
->get_epoch());
5061 service
.share_map_peer(from
, con
.get());
5064 } else if (!curmap
->exists(from
) ||
5065 curmap
->get_down_at(from
) > m
->map_epoch
) {
5066 // tell them they have died
5067 Message
*r
= new MOSDPing(monc
->get_fsid(),
5068 curmap
->get_epoch(),
5071 cct
->_conf
->osd_heartbeat_min_size
);
5072 m
->get_connection()->send_message(r
);
5077 case MOSDPing::PING_REPLY
:
5079 map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.find(from
);
5080 if (i
!= heartbeat_peers
.end()) {
5081 if (m
->get_connection() == i
->second
.con_back
) {
5082 dout(25) << "handle_osd_ping got reply from osd." << from
5083 << " first_tx " << i
->second
.first_tx
5084 << " last_tx " << i
->second
.last_tx
5085 << " last_rx_back " << i
->second
.last_rx_back
<< " -> " << m
->stamp
5086 << " last_rx_front " << i
->second
.last_rx_front
5088 i
->second
.last_rx_back
= m
->stamp
;
5089 // if there is no front con, set both stamps.
5090 if (i
->second
.con_front
== NULL
)
5091 i
->second
.last_rx_front
= m
->stamp
;
5092 } else if (m
->get_connection() == i
->second
.con_front
) {
5093 dout(25) << "handle_osd_ping got reply from osd." << from
5094 << " first_tx " << i
->second
.first_tx
5095 << " last_tx " << i
->second
.last_tx
5096 << " last_rx_back " << i
->second
.last_rx_back
5097 << " last_rx_front " << i
->second
.last_rx_front
<< " -> " << m
->stamp
5099 i
->second
.last_rx_front
= m
->stamp
;
5102 utime_t cutoff
= ceph_clock_now();
5103 cutoff
-= cct
->_conf
->osd_heartbeat_grace
;
5104 if (i
->second
.is_healthy(cutoff
)) {
5105 // Cancel false reports
5106 auto failure_queue_entry
= failure_queue
.find(from
);
5107 if (failure_queue_entry
!= failure_queue
.end()) {
5108 dout(10) << "handle_osd_ping canceling queued "
5109 << "failure report for osd." << from
<< dendl
;
5110 failure_queue
.erase(failure_queue_entry
);
5113 auto failure_pending_entry
= failure_pending
.find(from
);
5114 if (failure_pending_entry
!= failure_pending
.end()) {
5115 dout(10) << "handle_osd_ping canceling in-flight "
5116 << "failure report for osd." << from
<< dendl
;
5117 send_still_alive(curmap
->get_epoch(),
5118 failure_pending_entry
->second
.second
);
5119 failure_pending
.erase(failure_pending_entry
);
5125 curmap
->is_up(from
)) {
5126 service
.note_peer_epoch(from
, m
->map_epoch
);
5128 ConnectionRef con
= service
.get_con_osd_cluster(from
, curmap
->get_epoch());
5130 service
.share_map_peer(from
, con
.get());
5137 case MOSDPing::YOU_DIED
:
5138 dout(10) << "handle_osd_ping " << m
->get_source_inst()
5139 << " says i am down in " << m
->map_epoch
<< dendl
;
5140 osdmap_subscribe(curmap
->get_epoch()+1, false);
5144 heartbeat_lock
.Unlock();
5148 void OSD::heartbeat_entry()
5150 Mutex::Locker
l(heartbeat_lock
);
5153 while (!heartbeat_stop
) {
5156 double wait
= .5 + ((float)(rand() % 10)/10.0) * (float)cct
->_conf
->osd_heartbeat_interval
;
5158 w
.set_from_double(wait
);
5159 dout(30) << "heartbeat_entry sleeping for " << wait
<< dendl
;
5160 heartbeat_cond
.WaitInterval(heartbeat_lock
, w
);
5163 dout(30) << "heartbeat_entry woke up" << dendl
;
5167 void OSD::heartbeat_check()
5169 assert(heartbeat_lock
.is_locked());
5170 utime_t now
= ceph_clock_now();
5172 // check for heartbeat replies (move me elsewhere?)
5173 utime_t cutoff
= now
;
5174 cutoff
-= cct
->_conf
->osd_heartbeat_grace
;
5175 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5176 p
!= heartbeat_peers
.end();
5179 if (p
->second
.first_tx
== utime_t()) {
5180 dout(25) << "heartbeat_check we haven't sent ping to osd." << p
->first
5181 << "yet, skipping" << dendl
;
5185 dout(25) << "heartbeat_check osd." << p
->first
5186 << " first_tx " << p
->second
.first_tx
5187 << " last_tx " << p
->second
.last_tx
5188 << " last_rx_back " << p
->second
.last_rx_back
5189 << " last_rx_front " << p
->second
.last_rx_front
5191 if (p
->second
.is_unhealthy(cutoff
)) {
5192 if (p
->second
.last_rx_back
== utime_t() ||
5193 p
->second
.last_rx_front
== utime_t()) {
5194 derr
<< "heartbeat_check: no reply from " << p
->second
.con_front
->get_peer_addr().get_sockaddr()
5195 << " osd." << p
->first
<< " ever on either front or back, first ping sent "
5196 << p
->second
.first_tx
<< " (cutoff " << cutoff
<< ")" << dendl
;
5198 failure_queue
[p
->first
] = p
->second
.last_tx
;
5200 derr
<< "heartbeat_check: no reply from " << p
->second
.con_front
->get_peer_addr().get_sockaddr()
5201 << " osd." << p
->first
<< " since back " << p
->second
.last_rx_back
5202 << " front " << p
->second
.last_rx_front
5203 << " (cutoff " << cutoff
<< ")" << dendl
;
5205 failure_queue
[p
->first
] = MIN(p
->second
.last_rx_back
, p
->second
.last_rx_front
);
5211 void OSD::heartbeat()
5213 dout(30) << "heartbeat" << dendl
;
5217 int n_samples
= 86400 / cct
->_conf
->osd_heartbeat_interval
;
5218 if (getloadavg(loadavgs
, 1) == 1) {
5219 logger
->set(l_osd_loadavg
, 100 * loadavgs
[0]);
5220 daily_loadavg
= (daily_loadavg
* (n_samples
- 1) + loadavgs
[0]) / n_samples
;
5221 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg
<< dendl
;
5224 dout(30) << "heartbeat checking stats" << dendl
;
5227 vector
<int> hb_peers
;
5228 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5229 p
!= heartbeat_peers
.end();
5231 hb_peers
.push_back(p
->first
);
5232 service
.update_osd_stat(hb_peers
);
5234 dout(5) << "heartbeat: " << service
.get_osd_stat() << dendl
;
5236 utime_t now
= ceph_clock_now();
5239 for (map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.begin();
5240 i
!= heartbeat_peers
.end();
5242 int peer
= i
->first
;
5243 i
->second
.last_tx
= now
;
5244 if (i
->second
.first_tx
== utime_t())
5245 i
->second
.first_tx
= now
;
5246 dout(30) << "heartbeat sending ping to osd." << peer
<< dendl
;
5247 i
->second
.con_back
->send_message(new MOSDPing(monc
->get_fsid(),
5248 service
.get_osdmap()->get_epoch(),
5249 MOSDPing::PING
, now
,
5250 cct
->_conf
->osd_heartbeat_min_size
));
5252 if (i
->second
.con_front
)
5253 i
->second
.con_front
->send_message(new MOSDPing(monc
->get_fsid(),
5254 service
.get_osdmap()->get_epoch(),
5255 MOSDPing::PING
, now
,
5256 cct
->_conf
->osd_heartbeat_min_size
));
5259 logger
->set(l_osd_hb_to
, heartbeat_peers
.size());
5261 // hmm.. am i all alone?
5262 dout(30) << "heartbeat lonely?" << dendl
;
5263 if (heartbeat_peers
.empty()) {
5264 if (now
- last_mon_heartbeat
> cct
->_conf
->osd_mon_heartbeat_interval
&& is_active()) {
5265 last_mon_heartbeat
= now
;
5266 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl
;
5267 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
5271 dout(30) << "heartbeat done" << dendl
;
5274 bool OSD::heartbeat_reset(Connection
*con
)
5276 Mutex::Locker
l(heartbeat_lock
);
5277 HeartbeatSession
*s
= static_cast<HeartbeatSession
*>(con
->get_priv());
5279 if (is_stopping()) {
5283 map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.find(s
->peer
);
5284 if (p
!= heartbeat_peers
.end() &&
5285 (p
->second
.con_back
== con
||
5286 p
->second
.con_front
== con
)) {
5287 dout(10) << "heartbeat_reset failed hb con " << con
<< " for osd." << p
->second
.peer
5288 << ", reopening" << dendl
;
5289 if (con
!= p
->second
.con_back
) {
5290 p
->second
.con_back
->mark_down();
5292 p
->second
.con_back
.reset(NULL
);
5293 if (p
->second
.con_front
&& con
!= p
->second
.con_front
) {
5294 p
->second
.con_front
->mark_down();
5296 p
->second
.con_front
.reset(NULL
);
5297 pair
<ConnectionRef
,ConnectionRef
> newcon
= service
.get_con_osd_hb(p
->second
.peer
, p
->second
.epoch
);
5299 p
->second
.con_back
= newcon
.first
.get();
5300 p
->second
.con_back
->set_priv(s
->get());
5301 if (newcon
.second
) {
5302 p
->second
.con_front
= newcon
.second
.get();
5303 p
->second
.con_front
->set_priv(s
->get());
5306 dout(10) << "heartbeat_reset failed hb con " << con
<< " for osd." << p
->second
.peer
5307 << ", raced with osdmap update, closing out peer" << dendl
;
5308 heartbeat_peers
.erase(p
);
5311 dout(10) << "heartbeat_reset closing (old) failed hb con " << con
<< dendl
;
5320 // =========================================
5324 assert(osd_lock
.is_locked());
5325 dout(10) << "tick" << dendl
;
5327 if (is_active() || is_waiting_for_healthy()) {
5328 maybe_update_heartbeat_peers();
5331 if (is_waiting_for_healthy()) {
5333 } else if (is_preboot() &&
5334 waiting_for_luminous_mons
&&
5335 monc
->monmap
.get_required_features().contains_all(
5336 ceph::features::mon::FEATURE_LUMINOUS
)) {
5337 // mon upgrade finished!
5343 tick_timer
.add_event_after(get_tick_interval(), new C_Tick(this));
5346 void OSD::tick_without_osd_lock()
5348 assert(tick_timer_lock
.is_locked());
5349 dout(10) << "tick_without_osd_lock" << dendl
;
5351 logger
->set(l_osd_buf
, buffer::get_total_alloc());
5352 logger
->set(l_osd_history_alloc_bytes
, SHIFT_ROUND_UP(buffer::get_history_alloc_bytes(), 20));
5353 logger
->set(l_osd_history_alloc_num
, buffer::get_history_alloc_num());
5354 logger
->set(l_osd_cached_crc
, buffer::get_cached_crc());
5355 logger
->set(l_osd_cached_crc_adjusted
, buffer::get_cached_crc_adjusted());
5356 logger
->set(l_osd_missed_crc
, buffer::get_missed_crc());
5357 logger
->set(l_osd_pg_removing
, remove_wq
.get_remove_queue_len());
5359 // osd_lock is not being held, which means the OSD state
5360 // might change when doing the monitor report
5361 if (is_active() || is_waiting_for_healthy()) {
5362 heartbeat_lock
.Lock();
5364 heartbeat_lock
.Unlock();
5366 map_lock
.get_read();
5367 Mutex::Locker
l(mon_report_lock
);
5371 bool report
= false;
5372 utime_t now
= ceph_clock_now();
5373 pg_stat_queue_lock
.Lock();
5374 double backoff
= stats_ack_timeout
/ cct
->_conf
->osd_mon_ack_timeout
;
5375 double adjusted_min
= cct
->_conf
->osd_mon_report_interval_min
* backoff
;
5376 // note: we shouldn't adjust max because it must remain < the
5377 // mon's mon_osd_report_timeout (which defaults to 1.5x our
5379 double max
= cct
->_conf
->osd_mon_report_interval_max
;
5380 if (!outstanding_pg_stats
.empty() &&
5381 (now
- stats_ack_timeout
) > last_pg_stats_ack
) {
5382 dout(1) << __func__
<< " mon hasn't acked PGStats in "
5383 << now
- last_pg_stats_ack
5384 << " seconds, reconnecting elsewhere" << dendl
;
5386 last_pg_stats_ack
= now
; // reset clock
5387 last_pg_stats_sent
= utime_t();
5389 MAX(cct
->_conf
->osd_mon_ack_timeout
,
5390 stats_ack_timeout
* cct
->_conf
->osd_stats_ack_timeout_factor
);
5391 outstanding_pg_stats
.clear();
5393 if (now
- last_pg_stats_sent
> max
) {
5394 osd_stat_updated
= true;
5396 } else if (service
.need_fullness_update()) {
5398 } else if ((int)outstanding_pg_stats
.size() >=
5399 cct
->_conf
->osd_mon_report_max_in_flight
) {
5400 dout(20) << __func__
<< " have max " << outstanding_pg_stats
5401 << " stats updates in flight" << dendl
;
5403 if (now
- last_mon_report
> adjusted_min
) {
5404 dout(20) << __func__
<< " stats backoff " << backoff
5405 << " adjusted_min " << adjusted_min
<< " - sending report"
5407 osd_stat_updated
= true;
5411 pg_stat_queue_lock
.Unlock();
5414 monc
->reopen_session();
5415 } else if (report
) {
5416 last_mon_report
= now
;
5418 // do any pending reports
5421 if (osdmap
->require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
5425 map_lock
.put_read();
5429 if (!scrub_random_backoff()) {
5432 service
.promote_throttle_recalibrate();
5433 resume_creating_pg();
5434 bool need_send_beacon
= false;
5435 const auto now
= ceph::coarse_mono_clock::now();
5437 // borrow lec lock to pretect last_sent_beacon from changing
5438 Mutex::Locker l
{min_last_epoch_clean_lock
};
5439 const auto elapsed
= now
- last_sent_beacon
;
5440 if (chrono::duration_cast
<chrono::seconds
>(elapsed
).count() >
5441 cct
->_conf
->osd_beacon_report_interval
) {
5442 need_send_beacon
= true;
5445 if (need_send_beacon
) {
5450 check_ops_in_flight();
5451 mgrc
.update_osd_health(get_health_metrics());
5452 service
.kick_recovery_queue();
5453 tick_timer_without_osd_lock
.add_event_after(get_tick_interval(),
5454 new C_Tick_WithoutOSDLock(this));
5457 void OSD::check_ops_in_flight()
5459 vector
<string
> warnings
;
5460 if (op_tracker
.check_ops_in_flight(warnings
)) {
5461 for (vector
<string
>::iterator i
= warnings
.begin();
5462 i
!= warnings
.end();
5470 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5471 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
5472 // setomapheader <pool-id> [namespace/]<obj-name> <header>
5473 // getomap <pool> [namespace/]<obj-name>
5474 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
5475 // injectmdataerr [namespace/]<obj-name> [shardid]
5476 // injectdataerr [namespace/]<obj-name> [shardid]
5478 // set_recovery_delay [utime]
5479 void TestOpsSocketHook::test_ops(OSDService
*service
, ObjectStore
*store
,
5480 const std::string
&command
, cmdmap_t
& cmdmap
, ostream
&ss
)
5483 //Support changing the omap on a single osd by using the Admin Socket to
5484 //directly request the osd make a change.
5485 if (command
== "setomapval" || command
== "rmomapkey" ||
5486 command
== "setomapheader" || command
== "getomap" ||
5487 command
== "truncobj" || command
== "injectmdataerr" ||
5488 command
== "injectdataerr"
5492 OSDMapRef curmap
= service
->get_osdmap();
5497 cmd_getval(service
->cct
, cmdmap
, "pool", poolstr
);
5498 pool
= curmap
->lookup_pg_pool_name(poolstr
);
5499 //If we can't find it by name then maybe id specified
5500 if (pool
< 0 && isdigit(poolstr
[0]))
5501 pool
= atoll(poolstr
.c_str());
5503 ss
<< "Invalid pool '" << poolstr
<< "''";
5507 string objname
, nspace
;
5508 cmd_getval(service
->cct
, cmdmap
, "objname", objname
);
5509 std::size_t found
= objname
.find_first_of('/');
5510 if (found
!= string::npos
) {
5511 nspace
= objname
.substr(0, found
);
5512 objname
= objname
.substr(found
+1);
5514 object_locator_t
oloc(pool
, nspace
);
5515 r
= curmap
->object_locator_to_pg(object_t(objname
), oloc
, rawpg
);
5518 ss
<< "Invalid namespace/objname";
5523 cmd_getval(service
->cct
, cmdmap
, "shardid", shardid
, int64_t(shard_id_t::NO_SHARD
));
5524 hobject_t
obj(object_t(objname
), string(""), CEPH_NOSNAP
, rawpg
.ps(), pool
, nspace
);
5525 ghobject_t
gobj(obj
, ghobject_t::NO_GEN
, shard_id_t(uint8_t(shardid
)));
5526 spg_t
pgid(curmap
->raw_pg_to_pg(rawpg
), shard_id_t(shardid
));
5527 if (curmap
->pg_is_ec(rawpg
)) {
5528 if ((command
!= "injectdataerr") && (command
!= "injectmdataerr")) {
5529 ss
<< "Must not call on ec pool, except injectdataerr or injectmdataerr";
5534 ObjectStore::Transaction t
;
5536 if (command
== "setomapval") {
5537 map
<string
, bufferlist
> newattrs
;
5540 cmd_getval(service
->cct
, cmdmap
, "key", key
);
5541 cmd_getval(service
->cct
, cmdmap
, "val", valstr
);
5544 newattrs
[key
] = val
;
5545 t
.omap_setkeys(coll_t(pgid
), ghobject_t(obj
), newattrs
);
5546 r
= store
->apply_transaction(service
->meta_osr
.get(), std::move(t
));
5548 ss
<< "error=" << r
;
5551 } else if (command
== "rmomapkey") {
5554 cmd_getval(service
->cct
, cmdmap
, "key", key
);
5557 t
.omap_rmkeys(coll_t(pgid
), ghobject_t(obj
), keys
);
5558 r
= store
->apply_transaction(service
->meta_osr
.get(), std::move(t
));
5560 ss
<< "error=" << r
;
5563 } else if (command
== "setomapheader") {
5564 bufferlist newheader
;
5567 cmd_getval(service
->cct
, cmdmap
, "header", headerstr
);
5568 newheader
.append(headerstr
);
5569 t
.omap_setheader(coll_t(pgid
), ghobject_t(obj
), newheader
);
5570 r
= store
->apply_transaction(service
->meta_osr
.get(), std::move(t
));
5572 ss
<< "error=" << r
;
5575 } else if (command
== "getomap") {
5576 //Debug: Output entire omap
5578 map
<string
, bufferlist
> keyvals
;
5579 r
= store
->omap_get(coll_t(pgid
), ghobject_t(obj
), &hdrbl
, &keyvals
);
5581 ss
<< "header=" << string(hdrbl
.c_str(), hdrbl
.length());
5582 for (map
<string
, bufferlist
>::iterator it
= keyvals
.begin();
5583 it
!= keyvals
.end(); ++it
)
5584 ss
<< " key=" << (*it
).first
<< " val="
5585 << string((*it
).second
.c_str(), (*it
).second
.length());
5587 ss
<< "error=" << r
;
5589 } else if (command
== "truncobj") {
5591 cmd_getval(service
->cct
, cmdmap
, "len", trunclen
);
5592 t
.truncate(coll_t(pgid
), ghobject_t(obj
), trunclen
);
5593 r
= store
->apply_transaction(service
->meta_osr
.get(), std::move(t
));
5595 ss
<< "error=" << r
;
5598 } else if (command
== "injectdataerr") {
5599 store
->inject_data_error(gobj
);
5601 } else if (command
== "injectmdataerr") {
5602 store
->inject_mdata_error(gobj
);
5607 if (command
== "set_recovery_delay") {
5609 cmd_getval(service
->cct
, cmdmap
, "utime", delay
, (int64_t)0);
5612 int r
= service
->cct
->_conf
->set_val("osd_recovery_delay_start",
5615 ss
<< "set_recovery_delay: error setting "
5616 << "osd_recovery_delay_start to '" << delay
<< "': error "
5620 service
->cct
->_conf
->apply_changes(NULL
);
5621 ss
<< "set_recovery_delay: set osd_recovery_delay_start "
5622 << "to " << service
->cct
->_conf
->osd_recovery_delay_start
;
5625 if (command
== "trigger_scrub" || command
== "trigger_deep_scrub") {
5627 bool deep
= (command
== "trigger_deep_scrub");
5628 OSDMapRef curmap
= service
->get_osdmap();
5632 cmd_getval(service
->cct
, cmdmap
, "pgid", pgidstr
);
5633 if (!pgid
.parse(pgidstr
.c_str())) {
5634 ss
<< "Invalid pgid specified";
5639 cmd_getval(service
->cct
, cmdmap
, "time", time
, (int64_t)0);
5641 PG
*pg
= service
->osd
->_lookup_lock_pg(pgid
);
5642 if (pg
== nullptr) {
5643 ss
<< "Can't find pg " << pgid
;
5647 if (pg
->is_primary()) {
5648 pg
->unreg_next_scrub();
5649 const pg_pool_t
*p
= curmap
->get_pg_pool(pgid
.pool());
5650 double pool_scrub_max_interval
= 0;
5651 double scrub_max_interval
;
5653 p
->opts
.get(pool_opts_t::DEEP_SCRUB_INTERVAL
, &pool_scrub_max_interval
);
5654 scrub_max_interval
= pool_scrub_max_interval
> 0 ?
5655 pool_scrub_max_interval
: g_conf
->osd_deep_scrub_interval
;
5657 p
->opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &pool_scrub_max_interval
);
5658 scrub_max_interval
= pool_scrub_max_interval
> 0 ?
5659 pool_scrub_max_interval
: g_conf
->osd_scrub_max_interval
;
5661 // Instead of marking must_scrub force a schedule scrub
5662 utime_t stamp
= ceph_clock_now();
5664 stamp
-= scrub_max_interval
;
5666 stamp
-= (float)time
;
5667 stamp
-= 100.0; // push back last scrub more for good measure
5669 pg
->set_last_deep_scrub_stamp(stamp
);
5671 pg
->set_last_scrub_stamp(stamp
);
5673 pg
->reg_next_scrub();
5674 pg
->publish_stats_to_osd();
5675 ss
<< "ok - set" << (deep
? " deep" : "" ) << " stamp " << stamp
;
5677 ss
<< "Not primary";
5682 if (command
== "injectfull") {
5685 OSDService::s_names state
;
5686 cmd_getval(service
->cct
, cmdmap
, "type", type
, string("full"));
5687 cmd_getval(service
->cct
, cmdmap
, "count", count
, (int64_t)-1);
5688 if (type
== "none" || count
== 0) {
5692 state
= service
->get_full_state(type
);
5693 if (state
== OSDService::s_names::INVALID
) {
5694 ss
<< "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
5697 service
->set_injectfull(state
, count
);
5700 ss
<< "Internal error - command=" << command
;
5703 // =========================================
5706 ObjectStore
*store
, SnapMapper
*mapper
,
5708 ObjectStore::Sequencer
*osr
,
5709 coll_t coll
, DeletingStateRef dstate
,
5711 ThreadPool::TPHandle
&handle
)
5713 vector
<ghobject_t
> olist
;
5715 ObjectStore::Transaction t
;
5717 handle
.reset_tp_timeout();
5718 store
->collection_list(
5721 ghobject_t::get_max(),
5722 store
->get_ideal_list_max(),
5725 generic_dout(10) << __func__
<< " " << olist
<< dendl
;
5726 // default cont to true, this is safe because caller(OSD::RemoveWQ::_process())
5727 // will recheck the answer before it really goes on.
5729 for (vector
<ghobject_t
>::iterator i
= olist
.begin();
5734 OSDriver::OSTransaction
_t(osdriver
->get_transaction(&t
));
5735 int r
= mapper
->remove_oid(i
->hobj
, &_t
);
5736 if (r
!= 0 && r
!= -ENOENT
) {
5740 if (++num
>= cct
->_conf
->osd_target_transaction_size
) {
5742 store
->queue_transaction(osr
, std::move(t
), &waiter
);
5743 cont
= dstate
->pause_clearing();
5744 handle
.suspend_tp_timeout();
5746 if (cct
->_conf
->osd_delete_sleep
) {
5748 t
.set_from_double(cct
->_conf
->osd_delete_sleep
);
5749 lgeneric_subdout(cct
, osd
, 10) << __func__
<< " inject delay of " << t
<< dendl
;
5752 handle
.reset_tp_timeout();
5754 cont
= dstate
->resume_clearing();
5757 t
= ObjectStore::Transaction();
5763 store
->queue_transaction(osr
, std::move(t
), &waiter
);
5764 cont
= dstate
->pause_clearing();
5765 handle
.suspend_tp_timeout();
5767 handle
.reset_tp_timeout();
5769 cont
= dstate
->resume_clearing();
5771 // whether there are more objects to remove in the collection
5772 *finished
= next
.is_max();
5776 void OSD::RemoveWQ::_process(
5777 pair
<PGRef
, DeletingStateRef
> item
,
5778 ThreadPool::TPHandle
&handle
)
5781 PGRef
pg(item
.first
);
5782 SnapMapper
&mapper
= pg
->snap_mapper
;
5783 OSDriver
&driver
= pg
->osdriver
;
5784 coll_t coll
= coll_t(pg
->info
.pgid
);
5786 bool finished
= false;
5788 if (!item
.second
->start_or_resume_clearing())
5791 bool cont
= remove_dir(
5792 pg
->cct
, store
, &mapper
, &driver
, pg
->osr
.get(), coll
, item
.second
,
5797 if (item
.second
->pause_clearing())
5802 if (!item
.second
->start_deleting())
5805 ObjectStore::Transaction t
;
5806 PGLog::clear_info_log(pg
->info
.pgid
, &t
);
5808 if (cct
->_conf
->osd_inject_failure_on_pg_removal
) {
5809 generic_derr
<< "osd_inject_failure_on_pg_removal" << dendl
;
5812 t
.remove_collection(coll
);
5814 // We need the sequencer to stick around until the op is complete
5815 store
->queue_transaction(
5820 0, // onreadable sync
5821 new ContainerContext
<PGRef
>(pg
),
5824 item
.second
->finish_deleting();
5826 // =========================================
5828 void OSD::ms_handle_connect(Connection
*con
)
5830 dout(10) << __func__
<< " con " << con
<< dendl
;
5831 if (con
->get_peer_type() == CEPH_ENTITY_TYPE_MON
) {
5832 Mutex::Locker
l(osd_lock
);
5835 dout(10) << __func__
<< " on mon" << dendl
;
5839 } else if (is_booting()) {
5840 _send_boot(); // resend boot message
5842 map_lock
.get_read();
5843 Mutex::Locker
l2(mon_report_lock
);
5845 utime_t now
= ceph_clock_now();
5846 last_mon_report
= now
;
5848 // resend everything, it's a new session
5851 service
.requeue_pg_temp();
5852 service
.send_pg_temp();
5855 if (osdmap
->require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
5859 map_lock
.put_read();
5861 send_beacon(ceph::coarse_mono_clock::now());
5865 // full map requests may happen while active or pre-boot
5866 if (requested_full_first
) {
5867 rerequest_full_maps();
5872 void OSD::ms_handle_fast_connect(Connection
*con
)
5874 if (con
->get_peer_type() != CEPH_ENTITY_TYPE_MON
&&
5875 con
->get_peer_type() != CEPH_ENTITY_TYPE_MGR
) {
5876 Session
*s
= static_cast<Session
*>(con
->get_priv());
5878 s
= new Session(cct
);
5879 con
->set_priv(s
->get());
5881 dout(10) << " new session (outgoing) " << s
<< " con=" << s
->con
5882 << " addr=" << s
->con
->get_peer_addr() << dendl
;
5883 // we don't connect to clients
5884 assert(con
->get_peer_type() == CEPH_ENTITY_TYPE_OSD
);
5885 s
->entity_name
.set_type(CEPH_ENTITY_TYPE_OSD
);
5891 void OSD::ms_handle_fast_accept(Connection
*con
)
5893 if (con
->get_peer_type() != CEPH_ENTITY_TYPE_MON
&&
5894 con
->get_peer_type() != CEPH_ENTITY_TYPE_MGR
) {
5895 Session
*s
= static_cast<Session
*>(con
->get_priv());
5897 s
= new Session(cct
);
5898 con
->set_priv(s
->get());
5900 dout(10) << "new session (incoming)" << s
<< " con=" << con
5901 << " addr=" << con
->get_peer_addr()
5902 << " must have raced with connect" << dendl
;
5903 assert(con
->get_peer_type() == CEPH_ENTITY_TYPE_OSD
);
5904 s
->entity_name
.set_type(CEPH_ENTITY_TYPE_OSD
);
5910 bool OSD::ms_handle_reset(Connection
*con
)
5912 Session
*session
= static_cast<Session
*>(con
->get_priv());
5913 dout(2) << "ms_handle_reset con " << con
<< " session " << session
<< dendl
;
5916 session
->wstate
.reset(con
);
5917 session
->con
.reset(NULL
); // break con <-> session ref cycle
5918 // note that we break session->con *before* the session_handle_reset
5919 // cleanup below. this avoids a race between us and
5920 // PG::add_backoff, Session::check_backoff, etc.
5921 session_handle_reset(session
);
5926 bool OSD::ms_handle_refused(Connection
*con
)
5928 if (!cct
->_conf
->osd_fast_fail_on_connection_refused
)
5931 Session
*session
= static_cast<Session
*>(con
->get_priv());
5932 dout(2) << "ms_handle_refused con " << con
<< " session " << session
<< dendl
;
5935 int type
= con
->get_peer_type();
5936 // handle only OSD failures here
5937 if (monc
&& (type
== CEPH_ENTITY_TYPE_OSD
)) {
5938 OSDMapRef osdmap
= get_osdmap();
5940 int id
= osdmap
->identify_osd_on_all_channels(con
->get_peer_addr());
5941 if (id
>= 0 && osdmap
->is_up(id
)) {
5942 // I'm cheating mon heartbeat grace logic, because we know it's not going
5943 // to respawn alone. +1 so we won't hit any boundary case.
5944 monc
->send_mon_message(new MOSDFailure(monc
->get_fsid(),
5945 osdmap
->get_inst(id
),
5946 cct
->_conf
->osd_heartbeat_grace
+ 1,
5947 osdmap
->get_epoch(),
5948 MOSDFailure::FLAG_IMMEDIATE
| MOSDFailure::FLAG_FAILED
5957 struct C_OSD_GetVersion
: public Context
{
5959 uint64_t oldest
, newest
;
5960 explicit C_OSD_GetVersion(OSD
*o
) : osd(o
), oldest(0), newest(0) {}
5961 void finish(int r
) override
{
5963 osd
->_got_mon_epochs(oldest
, newest
);
5967 void OSD::start_boot()
5969 if (!_is_healthy()) {
5970 // if we are not healthy, do not mark ourselves up (yet)
5971 dout(1) << "not healthy; waiting to boot" << dendl
;
5972 if (!is_waiting_for_healthy())
5973 start_waiting_for_healthy();
5974 // send pings sooner rather than later
5978 dout(1) << __func__
<< dendl
;
5979 set_state(STATE_PREBOOT
);
5980 waiting_for_luminous_mons
= false;
5981 dout(10) << "start_boot - have maps " << superblock
.oldest_map
5982 << ".." << superblock
.newest_map
<< dendl
;
5983 C_OSD_GetVersion
*c
= new C_OSD_GetVersion(this);
5984 monc
->get_version("osdmap", &c
->newest
, &c
->oldest
, c
);
5987 void OSD::_got_mon_epochs(epoch_t oldest
, epoch_t newest
)
5989 Mutex::Locker
l(osd_lock
);
5991 _preboot(oldest
, newest
);
5995 void OSD::_preboot(epoch_t oldest
, epoch_t newest
)
5997 assert(is_preboot());
5998 dout(10) << __func__
<< " _preboot mon has osdmaps "
5999 << oldest
<< ".." << newest
<< dendl
;
6001 // ensure our local fullness awareness is accurate
6004 // if our map within recent history, try to add ourselves to the osdmap.
6005 if (osdmap
->get_epoch() == 0) {
6006 derr
<< "waiting for initial osdmap" << dendl
;
6007 } else if (osdmap
->is_destroyed(whoami
)) {
6008 derr
<< "osdmap says I am destroyed" << dendl
;
6009 // provide a small margin so we don't livelock seeing if we
6010 // un-destroyed ourselves.
6011 if (osdmap
->get_epoch() > newest
- 1) {
6014 } else if (osdmap
->test_flag(CEPH_OSDMAP_NOUP
) || osdmap
->is_noup(whoami
)) {
6015 derr
<< "osdmap NOUP flag is set, waiting for it to clear" << dendl
;
6016 } else if (!osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
6017 derr
<< "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
6019 } else if (osdmap
->require_osd_release
< CEPH_RELEASE_JEWEL
) {
6020 derr
<< "osdmap REQUIRE_JEWEL OSDMap flag is NOT set; please set it"
6022 } else if (!monc
->monmap
.get_required_features().contains_all(
6023 ceph::features::mon::FEATURE_LUMINOUS
)) {
6024 derr
<< "monmap REQUIRE_LUMINOUS is NOT set; must upgrade all monitors to "
6025 << "Luminous or later before Luminous OSDs will boot" << dendl
;
6026 waiting_for_luminous_mons
= true;
6027 } else if (service
.need_fullness_update()) {
6028 derr
<< "osdmap fullness state needs update" << dendl
;
6030 } else if (osdmap
->get_epoch() >= oldest
- 1 &&
6031 osdmap
->get_epoch() + cct
->_conf
->osd_map_message_max
> newest
) {
6036 // get all the latest maps
6037 if (osdmap
->get_epoch() + 1 >= oldest
)
6038 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
6040 osdmap_subscribe(oldest
- 1, true);
6043 void OSD::send_full_update()
6045 if (!service
.need_fullness_update())
6048 if (service
.is_full()) {
6049 state
= CEPH_OSD_FULL
;
6050 } else if (service
.is_backfillfull()) {
6051 state
= CEPH_OSD_BACKFILLFULL
;
6052 } else if (service
.is_nearfull()) {
6053 state
= CEPH_OSD_NEARFULL
;
6056 OSDMap::calc_state_set(state
, s
);
6057 dout(10) << __func__
<< " want state " << s
<< dendl
;
6058 monc
->send_mon_message(new MOSDFull(osdmap
->get_epoch(), state
));
6061 void OSD::start_waiting_for_healthy()
6063 dout(1) << "start_waiting_for_healthy" << dendl
;
6064 set_state(STATE_WAITING_FOR_HEALTHY
);
6065 last_heartbeat_resample
= utime_t();
6067 // subscribe to osdmap updates, in case our peers really are known to be dead
6068 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
6071 bool OSD::_is_healthy()
6073 if (!cct
->get_heartbeat_map()->is_healthy()) {
6074 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl
;
6078 if (is_waiting_for_healthy()) {
6079 Mutex::Locker
l(heartbeat_lock
);
6080 utime_t cutoff
= ceph_clock_now();
6081 cutoff
-= cct
->_conf
->osd_heartbeat_grace
;
6082 int num
= 0, up
= 0;
6083 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
6084 p
!= heartbeat_peers
.end();
6086 if (p
->second
.is_healthy(cutoff
))
6090 if ((float)up
< (float)num
* cct
->_conf
->osd_heartbeat_min_healthy_ratio
) {
6091 dout(1) << "is_healthy false -- only " << up
<< "/" << num
<< " up peers (less than "
6092 << int(cct
->_conf
->osd_heartbeat_min_healthy_ratio
* 100.0) << "%)" << dendl
;
6100 void OSD::_send_boot()
6102 dout(10) << "_send_boot" << dendl
;
6103 entity_addr_t cluster_addr
= cluster_messenger
->get_myaddr();
6104 Connection
*local_connection
= cluster_messenger
->get_loopback_connection().get();
6105 if (cluster_addr
.is_blank_ip()) {
6106 int port
= cluster_addr
.get_port();
6107 cluster_addr
= client_messenger
->get_myaddr();
6108 cluster_addr
.set_port(port
);
6109 cluster_messenger
->set_addr_unknowns(cluster_addr
);
6110 dout(10) << " assuming cluster_addr ip matches client_addr" << dendl
;
6112 Session
*s
= static_cast<Session
*>(local_connection
->get_priv());
6116 cluster_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6119 entity_addr_t hb_back_addr
= hb_back_server_messenger
->get_myaddr();
6120 local_connection
= hb_back_server_messenger
->get_loopback_connection().get();
6121 if (hb_back_addr
.is_blank_ip()) {
6122 int port
= hb_back_addr
.get_port();
6123 hb_back_addr
= cluster_addr
;
6124 hb_back_addr
.set_port(port
);
6125 hb_back_server_messenger
->set_addr_unknowns(hb_back_addr
);
6126 dout(10) << " assuming hb_back_addr ip matches cluster_addr" << dendl
;
6128 Session
*s
= static_cast<Session
*>(local_connection
->get_priv());
6132 hb_back_server_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6135 entity_addr_t hb_front_addr
= hb_front_server_messenger
->get_myaddr();
6136 local_connection
= hb_front_server_messenger
->get_loopback_connection().get();
6137 if (hb_front_addr
.is_blank_ip()) {
6138 int port
= hb_front_addr
.get_port();
6139 hb_front_addr
= client_messenger
->get_myaddr();
6140 hb_front_addr
.set_port(port
);
6141 hb_front_server_messenger
->set_addr_unknowns(hb_front_addr
);
6142 dout(10) << " assuming hb_front_addr ip matches client_addr" << dendl
;
6144 Session
*s
= static_cast<Session
*>(local_connection
->get_priv());
6148 hb_front_server_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6151 MOSDBoot
*mboot
= new MOSDBoot(superblock
, get_osdmap_epoch(), service
.get_boot_epoch(),
6152 hb_back_addr
, hb_front_addr
, cluster_addr
,
6154 dout(10) << " client_addr " << client_messenger
->get_myaddr()
6155 << ", cluster_addr " << cluster_addr
6156 << ", hb_back_addr " << hb_back_addr
6157 << ", hb_front_addr " << hb_front_addr
6159 _collect_metadata(&mboot
->metadata
);
6160 monc
->send_mon_message(mboot
);
6161 set_state(STATE_BOOTING
);
6164 void OSD::_collect_metadata(map
<string
,string
> *pm
)
6167 (*pm
)["osd_data"] = dev_path
;
6168 if (store
->get_type() == "filestore") {
6169 // not applicable for bluestore
6170 (*pm
)["osd_journal"] = journal_path
;
6172 (*pm
)["front_addr"] = stringify(client_messenger
->get_myaddr());
6173 (*pm
)["back_addr"] = stringify(cluster_messenger
->get_myaddr());
6174 (*pm
)["hb_front_addr"] = stringify(hb_front_server_messenger
->get_myaddr());
6175 (*pm
)["hb_back_addr"] = stringify(hb_back_server_messenger
->get_myaddr());
6178 (*pm
)["osd_objectstore"] = store
->get_type();
6179 (*pm
)["rotational"] = store_is_rotational
? "1" : "0";
6180 (*pm
)["journal_rotational"] = journal_is_rotational
? "1" : "0";
6181 (*pm
)["default_device_class"] = store
->get_default_device_class();
6182 store
->collect_metadata(pm
);
6184 collect_sys_info(pm
, cct
);
6186 std::string front_iface
, back_iface
;
6189 CEPH_PICK_ADDRESS_PUBLIC | CEPH_PICK_ADDRESS_CLUSTER,
6190 &front_iface, &back_iface);
6192 (*pm
)["front_iface"] = pick_iface(cct
,
6193 client_messenger
->get_myaddr().get_sockaddr_storage());
6194 (*pm
)["back_iface"] = pick_iface(cct
,
6195 cluster_messenger
->get_myaddr().get_sockaddr_storage());
6197 dout(10) << __func__
<< " " << *pm
<< dendl
;
6200 void OSD::queue_want_up_thru(epoch_t want
)
6202 map_lock
.get_read();
6203 epoch_t cur
= osdmap
->get_up_thru(whoami
);
6204 Mutex::Locker
l(mon_report_lock
);
6205 if (want
> up_thru_wanted
) {
6206 dout(10) << "queue_want_up_thru now " << want
<< " (was " << up_thru_wanted
<< ")"
6207 << ", currently " << cur
6209 up_thru_wanted
= want
;
6212 dout(10) << "queue_want_up_thru want " << want
<< " <= queued " << up_thru_wanted
6213 << ", currently " << cur
6216 map_lock
.put_read();
6219 void OSD::send_alive()
6221 assert(mon_report_lock
.is_locked());
6222 if (!osdmap
->exists(whoami
))
6224 epoch_t up_thru
= osdmap
->get_up_thru(whoami
);
6225 dout(10) << "send_alive up_thru currently " << up_thru
<< " want " << up_thru_wanted
<< dendl
;
6226 if (up_thru_wanted
> up_thru
) {
6227 dout(10) << "send_alive want " << up_thru_wanted
<< dendl
;
6228 monc
->send_mon_message(new MOSDAlive(osdmap
->get_epoch(), up_thru_wanted
));
6232 void OSD::request_full_map(epoch_t first
, epoch_t last
)
6234 dout(10) << __func__
<< " " << first
<< ".." << last
6235 << ", previously requested "
6236 << requested_full_first
<< ".." << requested_full_last
<< dendl
;
6237 assert(osd_lock
.is_locked());
6238 assert(first
> 0 && last
> 0);
6239 assert(first
<= last
);
6240 assert(first
>= requested_full_first
); // we shouldn't ever ask for older maps
6241 if (requested_full_first
== 0) {
6243 requested_full_first
= first
;
6244 requested_full_last
= last
;
6245 } else if (last
<= requested_full_last
) {
6249 // additional request
6250 first
= requested_full_last
+ 1;
6251 requested_full_last
= last
;
6253 MMonGetOSDMap
*req
= new MMonGetOSDMap
;
6254 req
->request_full(first
, last
);
6255 monc
->send_mon_message(req
);
6258 void OSD::got_full_map(epoch_t e
)
6260 assert(requested_full_first
<= requested_full_last
);
6261 assert(osd_lock
.is_locked());
6262 if (requested_full_first
== 0) {
6263 dout(20) << __func__
<< " " << e
<< ", nothing requested" << dendl
;
6266 if (e
< requested_full_first
) {
6267 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6268 << ".." << requested_full_last
6269 << ", ignoring" << dendl
;
6272 if (e
>= requested_full_last
) {
6273 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6274 << ".." << requested_full_last
<< ", resetting" << dendl
;
6275 requested_full_first
= requested_full_last
= 0;
6279 requested_full_first
= e
+ 1;
6281 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6282 << ".." << requested_full_last
6283 << ", still need more" << dendl
;
6286 void OSD::requeue_failures()
6288 Mutex::Locker
l(heartbeat_lock
);
6289 unsigned old_queue
= failure_queue
.size();
6290 unsigned old_pending
= failure_pending
.size();
6291 for (map
<int,pair
<utime_t
,entity_inst_t
> >::iterator p
=
6292 failure_pending
.begin();
6293 p
!= failure_pending
.end(); ) {
6294 failure_queue
[p
->first
] = p
->second
.first
;
6295 failure_pending
.erase(p
++);
6297 dout(10) << __func__
<< " " << old_queue
<< " + " << old_pending
<< " -> "
6298 << failure_queue
.size() << dendl
;
6301 void OSD::send_failures()
6303 assert(map_lock
.is_locked());
6304 assert(mon_report_lock
.is_locked());
6305 Mutex::Locker
l(heartbeat_lock
);
6306 utime_t now
= ceph_clock_now();
6307 while (!failure_queue
.empty()) {
6308 int osd
= failure_queue
.begin()->first
;
6309 if (!failure_pending
.count(osd
)) {
6310 entity_inst_t i
= osdmap
->get_inst(osd
);
6311 int failed_for
= (int)(double)(now
- failure_queue
.begin()->second
);
6312 monc
->send_mon_message(new MOSDFailure(monc
->get_fsid(), i
, failed_for
,
6313 osdmap
->get_epoch()));
6314 failure_pending
[osd
] = make_pair(failure_queue
.begin()->second
, i
);
6316 failure_queue
.erase(osd
);
6320 void OSD::send_still_alive(epoch_t epoch
, const entity_inst_t
&i
)
6322 MOSDFailure
*m
= new MOSDFailure(monc
->get_fsid(), i
, 0, epoch
, MOSDFailure::FLAG_ALIVE
);
6323 monc
->send_mon_message(m
);
6326 void OSD::send_pg_stats(const utime_t
&now
)
6328 assert(map_lock
.is_locked());
6329 assert(osdmap
->require_osd_release
< CEPH_RELEASE_LUMINOUS
);
6330 dout(20) << "send_pg_stats" << dendl
;
6332 osd_stat_t cur_stat
= service
.get_osd_stat();
6334 cur_stat
.os_perf_stat
= store
->get_cur_stats();
6336 pg_stat_queue_lock
.Lock();
6338 if (osd_stat_updated
|| !pg_stat_queue
.empty()) {
6339 last_pg_stats_sent
= now
;
6340 osd_stat_updated
= false;
6342 dout(10) << "send_pg_stats - " << pg_stat_queue
.size() << " pgs updated" << dendl
;
6344 utime_t
had_for(now
);
6345 had_for
-= had_map_since
;
6347 MPGStats
*m
= new MPGStats(monc
->get_fsid(), osdmap
->get_epoch(), had_for
);
6349 uint64_t tid
= ++pg_stat_tid
;
6351 m
->osd_stat
= cur_stat
;
6353 xlist
<PG
*>::iterator p
= pg_stat_queue
.begin();
6357 if (!pg
->is_primary()) { // we hold map_lock; role is stable.
6358 pg
->stat_queue_item
.remove_myself();
6359 pg
->put("pg_stat_queue");
6362 pg
->pg_stats_publish_lock
.Lock();
6363 if (pg
->pg_stats_publish_valid
) {
6364 m
->pg_stat
[pg
->info
.pgid
.pgid
] = pg
->pg_stats_publish
;
6365 dout(25) << " sending " << pg
->info
.pgid
<< " " << pg
->pg_stats_publish
.reported_epoch
<< ":"
6366 << pg
->pg_stats_publish
.reported_seq
<< dendl
;
6368 dout(25) << " NOT sending " << pg
->info
.pgid
<< " " << pg
->pg_stats_publish
.reported_epoch
<< ":"
6369 << pg
->pg_stats_publish
.reported_seq
<< ", not valid" << dendl
;
6371 pg
->pg_stats_publish_lock
.Unlock();
6374 if (last_pg_stats_ack
== utime_t() || !outstanding_pg_stats
.empty()) {
6375 last_pg_stats_ack
= ceph_clock_now();
6377 outstanding_pg_stats
.insert(tid
);
6378 dout(20) << __func__
<< " updates pending: " << outstanding_pg_stats
<< dendl
;
6380 monc
->send_mon_message(m
);
6383 pg_stat_queue_lock
.Unlock();
6386 void OSD::handle_pg_stats_ack(MPGStatsAck
*ack
)
6388 dout(10) << "handle_pg_stats_ack " << dendl
;
6390 if (!require_mon_peer(ack
)) {
6395 // NOTE: we may get replies from a previous mon even while
6396 // outstanding_pg_stats is empty if reconnecting races with replies
6399 pg_stat_queue_lock
.Lock();
6401 last_pg_stats_ack
= ceph_clock_now();
6403 // decay timeout slowly (analogous to TCP)
6405 MAX(cct
->_conf
->osd_mon_ack_timeout
,
6406 stats_ack_timeout
* cct
->_conf
->osd_stats_ack_timeout_decay
);
6407 dout(20) << __func__
<< " timeout now " << stats_ack_timeout
<< dendl
;
6409 if (ack
->get_tid() > pg_stat_tid_flushed
) {
6410 pg_stat_tid_flushed
= ack
->get_tid();
6411 pg_stat_queue_cond
.Signal();
6414 xlist
<PG
*>::iterator p
= pg_stat_queue
.begin();
6420 auto acked
= ack
->pg_stat
.find(pg
->info
.pgid
.pgid
);
6421 if (acked
!= ack
->pg_stat
.end()) {
6422 pg
->pg_stats_publish_lock
.Lock();
6423 if (acked
->second
.first
== pg
->pg_stats_publish
.reported_seq
&&
6424 acked
->second
.second
== pg
->pg_stats_publish
.reported_epoch
) {
6425 dout(25) << " ack on " << pg
->info
.pgid
<< " " << pg
->pg_stats_publish
.reported_epoch
6426 << ":" << pg
->pg_stats_publish
.reported_seq
<< dendl
;
6427 pg
->stat_queue_item
.remove_myself();
6428 pg
->put("pg_stat_queue");
6430 dout(25) << " still pending " << pg
->info
.pgid
<< " " << pg
->pg_stats_publish
.reported_epoch
6431 << ":" << pg
->pg_stats_publish
.reported_seq
<< " > acked "
6432 << acked
->second
<< dendl
;
6434 pg
->pg_stats_publish_lock
.Unlock();
6436 dout(30) << " still pending " << pg
->info
.pgid
<< " " << pg
->pg_stats_publish
.reported_epoch
6437 << ":" << pg
->pg_stats_publish
.reported_seq
<< dendl
;
6441 outstanding_pg_stats
.erase(ack
->get_tid());
6442 dout(20) << __func__
<< " still pending: " << outstanding_pg_stats
<< dendl
;
6444 pg_stat_queue_lock
.Unlock();
6449 void OSD::flush_pg_stats()
6451 dout(10) << "flush_pg_stats" << dendl
;
6453 utime_t now
= ceph_clock_now();
6454 map_lock
.get_read();
6455 mon_report_lock
.Lock();
6457 mon_report_lock
.Unlock();
6458 map_lock
.put_read();
6461 pg_stat_queue_lock
.Lock();
6462 uint64_t tid
= pg_stat_tid
;
6463 dout(10) << "flush_pg_stats waiting for stats tid " << tid
<< " to flush" << dendl
;
6464 while (tid
> pg_stat_tid_flushed
)
6465 pg_stat_queue_cond
.Wait(pg_stat_queue_lock
);
6466 dout(10) << "flush_pg_stats finished waiting for stats tid " << tid
<< " to flush" << dendl
;
6467 pg_stat_queue_lock
.Unlock();
6472 void OSD::send_beacon(const ceph::coarse_mono_clock::time_point
& now
)
6474 const auto& monmap
= monc
->monmap
;
6475 // send beacon to mon even if we are just connected, and the monmap is not
6476 // initialized yet by then.
6477 if (monmap
.epoch
> 0 &&
6478 monmap
.get_required_features().contains_all(
6479 ceph::features::mon::FEATURE_LUMINOUS
)) {
6480 dout(20) << __func__
<< " sending" << dendl
;
6481 MOSDBeacon
* beacon
= nullptr;
6483 Mutex::Locker l
{min_last_epoch_clean_lock
};
6484 beacon
= new MOSDBeacon(osdmap
->get_epoch(), min_last_epoch_clean
);
6485 std::swap(beacon
->pgs
, min_last_epoch_clean_pgs
);
6486 last_sent_beacon
= now
;
6488 monc
->send_mon_message(beacon
);
6490 dout(20) << __func__
<< " not sending" << dendl
;
6494 void OSD::handle_command(MMonCommand
*m
)
6496 if (!require_mon_peer(m
)) {
6501 Command
*c
= new Command(m
->cmd
, m
->get_tid(), m
->get_data(), NULL
);
6502 command_wq
.queue(c
);
6506 void OSD::handle_command(MCommand
*m
)
6508 ConnectionRef con
= m
->get_connection();
6509 Session
*session
= static_cast<Session
*>(con
->get_priv());
6511 con
->send_message(new MCommandReply(m
, -EPERM
));
6516 OSDCap
& caps
= session
->caps
;
6519 if (!caps
.allow_all() || m
->get_source().is_mon()) {
6520 con
->send_message(new MCommandReply(m
, -EPERM
));
6525 Command
*c
= new Command(m
->cmd
, m
->get_tid(), m
->get_data(), con
.get());
6526 command_wq
.queue(c
);
6536 string availability
;
6537 } osd_commands
[] = {
6539 #define COMMAND(parsesig, helptext, module, perm, availability) \
6540 {parsesig, helptext, module, perm, availability},
6542 // yes, these are really pg commands, but there's a limit to how
6543 // much work it's worth. The OSD returns all of them. Make this
6544 // form (pg <pgid> <cmd>) valid only for the cli.
6545 // Rest uses "tell <pgid> <cmd>"
6548 "name=pgid,type=CephPgid " \
6549 "name=cmd,type=CephChoices,strings=query", \
6550 "show details of a specific pg", "osd", "r", "cli")
6552 "name=pgid,type=CephPgid " \
6553 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
6554 "name=mulcmd,type=CephChoices,strings=revert|delete", \
6555 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
6558 "name=pgid,type=CephPgid " \
6559 "name=cmd,type=CephChoices,strings=list_missing " \
6560 "name=offset,type=CephString,req=false",
6561 "list missing objects on this pg, perhaps starting at an offset given in JSON",
6564 // new form: tell <pgid> <cmd> for both cli and rest
6567 "show details of a specific pg", "osd", "r", "cli,rest")
6568 COMMAND("mark_unfound_lost " \
6569 "name=mulcmd,type=CephChoices,strings=revert|delete", \
6570 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
6571 "osd", "rw", "cli,rest")
6572 COMMAND("list_missing " \
6573 "name=offset,type=CephString,req=false",
6574 "list missing objects on this pg, perhaps starting at an offset given in JSON",
6575 "osd", "r", "cli,rest")
6576 COMMAND("perf histogram dump "
6577 "name=logger,type=CephString,req=false "
6578 "name=counter,type=CephString,req=false",
6579 "Get histogram data",
6580 "osd", "r", "cli,rest")
6582 // tell <osd.n> commands. Validation of osd.n must be special-cased in client
6583 COMMAND("version", "report version of OSD", "osd", "r", "cli,rest")
6584 COMMAND("get_command_descriptions", "list commands descriptions", "osd", "r", "cli,rest")
6585 COMMAND("injectargs " \
6586 "name=injected_args,type=CephString,n=N",
6587 "inject configuration arguments into running OSD",
6588 "osd", "rw", "cli,rest")
6589 COMMAND("config set " \
6590 "name=key,type=CephString name=value,type=CephString",
6591 "Set a configuration option at runtime (not persistent)",
6592 "osd", "rw", "cli,rest")
6593 COMMAND("cluster_log " \
6594 "name=level,type=CephChoices,strings=error,warning,info,debug " \
6595 "name=message,type=CephString,n=N",
6596 "log a message to the cluster log",
6597 "osd", "rw", "cli,rest")
6599 "name=count,type=CephInt,req=false " \
6600 "name=size,type=CephInt,req=false " \
6601 "name=object_size,type=CephInt,req=false " \
6602 "name=object_num,type=CephInt,req=false ", \
6603 "OSD benchmark: write <count> <size>-byte objects, " \
6604 "(default 1G size 4MB). Results in log.",
6605 "osd", "rw", "cli,rest")
6606 COMMAND("flush_pg_stats", "flush pg stats", "osd", "rw", "cli,rest")
6608 "name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats", \
6609 "show heap usage info (available only if compiled with tcmalloc)", \
6610 "osd", "rw", "cli,rest")
6611 COMMAND("debug dump_missing " \
6612 "name=filename,type=CephFilepath",
6613 "dump missing objects to a named file", "osd", "r", "cli,rest")
6614 COMMAND("debug kick_recovery_wq " \
6615 "name=delay,type=CephInt,range=0",
6616 "set osd_recovery_delay_start to <val>", "osd", "rw", "cli,rest")
6617 COMMAND("cpu_profiler " \
6618 "name=arg,type=CephChoices,strings=status|flush",
6619 "run cpu profiling on daemon", "osd", "rw", "cli,rest")
6620 COMMAND("dump_pg_recovery_stats", "dump pg recovery statistics",
6621 "osd", "r", "cli,rest")
6622 COMMAND("reset_pg_recovery_stats", "reset pg recovery statistics",
6623 "osd", "rw", "cli,rest")
6625 "compact object store's omap. "
6626 "WARNING: Compaction probably slows your requests",
6627 "osd", "rw", "cli,rest")
6631 class unlock_guard
{
6634 explicit unlock_guard(Mutex
& mutex
)
6639 unlock_guard(unlock_guard
&) = delete;
6646 void OSD::do_command(Connection
*con
, ceph_tid_t tid
, vector
<string
>& cmd
, bufferlist
& data
)
6649 stringstream ss
, ds
;
6653 dout(20) << "do_command tid " << tid
<< " " << cmd
<< dendl
;
6655 map
<string
, cmd_vartype
> cmdmap
;
6659 boost::scoped_ptr
<Formatter
> f
;
6662 ss
<< "no command given";
6666 if (!cmdmap_from_json(cmd
, &cmdmap
, ss
)) {
6671 cmd_getval(cct
, cmdmap
, "prefix", prefix
);
6673 if (prefix
== "get_command_descriptions") {
6675 JSONFormatter
*f
= new JSONFormatter();
6676 f
->open_object_section("command_descriptions");
6677 for (OSDCommand
*cp
= osd_commands
;
6678 cp
< &osd_commands
[ARRAY_SIZE(osd_commands
)]; cp
++) {
6680 ostringstream secname
;
6681 secname
<< "cmd" << setfill('0') << std::setw(3) << cmdnum
;
6682 dump_cmddesc_to_json(f
, secname
.str(), cp
->cmdstring
, cp
->helpstring
,
6683 cp
->module
, cp
->perm
, cp
->availability
, 0);
6686 f
->close_section(); // command_descriptions
6693 cmd_getval(cct
, cmdmap
, "format", format
);
6694 f
.reset(Formatter::create(format
));
6696 if (prefix
== "version") {
6698 f
->open_object_section("version");
6699 f
->dump_string("version", pretty_version_to_str());
6703 ds
<< pretty_version_to_str();
6707 else if (prefix
== "injectargs") {
6708 vector
<string
> argsvec
;
6709 cmd_getval(cct
, cmdmap
, "injected_args", argsvec
);
6711 if (argsvec
.empty()) {
6713 ss
<< "ignoring empty injectargs";
6716 string args
= argsvec
.front();
6717 for (vector
<string
>::iterator a
= ++argsvec
.begin(); a
!= argsvec
.end(); ++a
)
6719 unlock_guard unlock
{osd_lock
};
6720 r
= cct
->_conf
->injectargs(args
, &ss
);
6722 else if (prefix
== "config set") {
6725 cmd_getval(cct
, cmdmap
, "key", key
);
6726 cmd_getval(cct
, cmdmap
, "value", val
);
6727 unlock_guard unlock
{osd_lock
};
6728 r
= cct
->_conf
->set_val(key
, val
, true, &ss
);
6730 cct
->_conf
->apply_changes(nullptr);
6733 else if (prefix
== "cluster_log") {
6735 cmd_getval(cct
, cmdmap
, "message", msg
);
6738 ss
<< "ignoring empty log message";
6741 string message
= msg
.front();
6742 for (vector
<string
>::iterator a
= ++msg
.begin(); a
!= msg
.end(); ++a
)
6743 message
+= " " + *a
;
6745 cmd_getval(cct
, cmdmap
, "level", lvl
);
6746 clog_type level
= string_to_clog_type(lvl
);
6749 ss
<< "unknown level '" << lvl
<< "'";
6752 clog
->do_log(level
, message
);
6755 // either 'pg <pgid> <command>' or
6756 // 'tell <pgid>' (which comes in without any of that prefix)?
6758 else if (prefix
== "pg" ||
6759 prefix
== "query" ||
6760 prefix
== "mark_unfound_lost" ||
6761 prefix
== "list_missing"
6765 if (!cmd_getval(cct
, cmdmap
, "pgid", pgidstr
)) {
6766 ss
<< "no pgid specified";
6768 } else if (!pgid
.parse(pgidstr
.c_str())) {
6769 ss
<< "couldn't parse pgid '" << pgidstr
<< "'";
6774 if (osdmap
->get_primary_shard(pgid
, &pcand
) &&
6775 (pg
= _lookup_lock_pg(pcand
))) {
6776 if (pg
->is_primary()) {
6777 // simulate pg <pgid> cmd= for pg->do-command
6779 cmd_putval(cct
, cmdmap
, "cmd", prefix
);
6780 r
= pg
->do_command(cmdmap
, ss
, data
, odata
, con
, tid
);
6783 // don't reply, pg will do so async
6787 ss
<< "not primary for pgid " << pgid
;
6789 // send them the latest diff to ensure they realize the mapping
6791 service
.send_incremental_map(osdmap
->get_epoch() - 1, con
, osdmap
);
6793 // do not reply; they will get newer maps and realize they
6800 ss
<< "i don't have pgid " << pgid
;
6806 else if (prefix
== "bench") {
6809 int64_t osize
, onum
;
6810 // default count 1G, size 4MB
6811 cmd_getval(cct
, cmdmap
, "count", count
, (int64_t)1 << 30);
6812 cmd_getval(cct
, cmdmap
, "size", bsize
, (int64_t)4 << 20);
6813 cmd_getval(cct
, cmdmap
, "object_size", osize
, (int64_t)0);
6814 cmd_getval(cct
, cmdmap
, "object_num", onum
, (int64_t)0);
6816 ceph::shared_ptr
<ObjectStore::Sequencer
> osr (std::make_shared
<
6817 ObjectStore::Sequencer
>("bench"));
6819 uint32_t duration
= cct
->_conf
->osd_bench_duration
;
6821 if (bsize
> (int64_t) cct
->_conf
->osd_bench_max_block_size
) {
6822 // let us limit the block size because the next checks rely on it
6823 // having a sane value. If we allow any block size to be set things
6824 // can still go sideways.
6825 ss
<< "block 'size' values are capped at "
6826 << byte_u_t(cct
->_conf
->osd_bench_max_block_size
) << ". If you wish to use"
6827 << " a higher value, please adjust 'osd_bench_max_block_size'";
6830 } else if (bsize
< (int64_t) (1 << 20)) {
6831 // entering the realm of small block sizes.
6832 // limit the count to a sane value, assuming a configurable amount of
6833 // IOPS and duration, so that the OSD doesn't get hung up on this,
6834 // preventing timeouts from going off
6836 bsize
* duration
* cct
->_conf
->osd_bench_small_size_max_iops
;
6837 if (count
> max_count
) {
6838 ss
<< "'count' values greater than " << max_count
6839 << " for a block size of " << byte_u_t(bsize
) << ", assuming "
6840 << cct
->_conf
->osd_bench_small_size_max_iops
<< " IOPS,"
6841 << " for " << duration
<< " seconds,"
6842 << " can cause ill effects on osd. "
6843 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
6844 << " value if you wish to use a higher 'count'.";
6849 // 1MB block sizes are big enough so that we get more stuff done.
6850 // However, to avoid the osd from getting hung on this and having
6851 // timers being triggered, we are going to limit the count assuming
6852 // a configurable throughput and duration.
6853 // NOTE: max_count is the total amount of bytes that we believe we
6854 // will be able to write during 'duration' for the given
6855 // throughput. The block size hardly impacts this unless it's
6856 // way too big. Given we already check how big the block size
6857 // is, it's safe to assume everything will check out.
6859 cct
->_conf
->osd_bench_large_size_max_throughput
* duration
;
6860 if (count
> max_count
) {
6861 ss
<< "'count' values greater than " << max_count
6862 << " for a block size of " << byte_u_t(bsize
) << ", assuming "
6863 << byte_u_t(cct
->_conf
->osd_bench_large_size_max_throughput
) << "/s,"
6864 << " for " << duration
<< " seconds,"
6865 << " can cause ill effects on osd. "
6866 << " Please adjust 'osd_bench_large_size_max_throughput'"
6867 << " with a higher value if you wish to use a higher 'count'.";
6873 if (osize
&& bsize
> osize
)
6876 dout(1) << " bench count " << count
6877 << " bsize " << byte_u_t(bsize
) << dendl
;
6879 ObjectStore::Transaction cleanupt
;
6881 if (osize
&& onum
) {
6883 bufferptr
bp(osize
);
6885 bl
.push_back(std::move(bp
));
6886 bl
.rebuild_page_aligned();
6887 for (int i
=0; i
<onum
; ++i
) {
6889 snprintf(nm
, sizeof(nm
), "disk_bw_test_%d", i
);
6891 hobject_t
soid(sobject_t(oid
, 0));
6892 ObjectStore::Transaction t
;
6893 t
.write(coll_t(), ghobject_t(soid
), 0, osize
, bl
);
6894 store
->queue_transaction(osr
.get(), std::move(t
), NULL
);
6895 cleanupt
.remove(coll_t(), ghobject_t(soid
));
6900 bufferptr
bp(bsize
);
6902 bl
.push_back(std::move(bp
));
6903 bl
.rebuild_page_aligned();
6907 if (!osr
->flush_commit(&waiter
)) {
6912 utime_t start
= ceph_clock_now();
6913 for (int64_t pos
= 0; pos
< count
; pos
+= bsize
) {
6915 unsigned offset
= 0;
6916 if (onum
&& osize
) {
6917 snprintf(nm
, sizeof(nm
), "disk_bw_test_%d", (int)(rand() % onum
));
6918 offset
= rand() % (osize
/ bsize
) * bsize
;
6920 snprintf(nm
, sizeof(nm
), "disk_bw_test_%lld", (long long)pos
);
6923 hobject_t
soid(sobject_t(oid
, 0));
6924 ObjectStore::Transaction t
;
6925 t
.write(coll_t::meta(), ghobject_t(soid
), offset
, bsize
, bl
);
6926 store
->queue_transaction(osr
.get(), std::move(t
), NULL
);
6927 if (!onum
|| !osize
)
6928 cleanupt
.remove(coll_t::meta(), ghobject_t(soid
));
6933 if (!osr
->flush_commit(&waiter
)) {
6937 utime_t end
= ceph_clock_now();
6940 store
->queue_transaction(osr
.get(), std::move(cleanupt
), NULL
);
6943 if (!osr
->flush_commit(&waiter
)) {
6948 double elapsed
= end
- start
;
6949 double rate
= count
/ elapsed
;
6950 double iops
= rate
/ bsize
;
6952 f
->open_object_section("osd_bench_results");
6953 f
->dump_int("bytes_written", count
);
6954 f
->dump_int("blocksize", bsize
);
6955 f
->dump_float("elapsed_sec", elapsed
);
6956 f
->dump_float("bytes_per_sec", rate
);
6957 f
->dump_float("iops", iops
);
6961 ds
<< "bench: wrote " << byte_u_t(count
)
6962 << " in blocks of " << byte_u_t(bsize
) << " in "
6963 << elapsed
<< " sec at " << byte_u_t(rate
) << "/sec "
6964 << si_u_t(iops
) << " IOPS";
6968 else if (prefix
== "flush_pg_stats") {
6969 if (osdmap
->require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
6970 mgrc
.send_pgstats();
6971 ds
<< service
.get_osd_stat_seq() << "\n";
6977 else if (prefix
== "heap") {
6978 r
= ceph::osd_cmds::heap(*cct
, cmdmap
, *f
, ds
);
6981 else if (prefix
== "debug dump_missing") {
6983 cmd_getval(cct
, cmdmap
, "filename", file_name
);
6984 std::ofstream
fout(file_name
.c_str());
6985 if (!fout
.is_open()) {
6986 ss
<< "failed to open file '" << file_name
<< "'";
6991 fout
<< "*** osd " << whoami
<< ": dump_missing ***" << std::endl
;
6992 RWLock::RLocker
l(pg_map_lock
);
6993 for (ceph::unordered_map
<spg_t
, PG
*>::const_iterator pg_map_e
= pg_map
.begin();
6994 pg_map_e
!= pg_map
.end(); ++pg_map_e
) {
6995 PG
*pg
= pg_map_e
->second
;
6998 fout
<< *pg
<< std::endl
;
6999 std::map
<hobject_t
, pg_missing_item
>::const_iterator mend
=
7000 pg
->pg_log
.get_missing().get_items().end();
7001 std::map
<hobject_t
, pg_missing_item
>::const_iterator mi
=
7002 pg
->pg_log
.get_missing().get_items().begin();
7003 for (; mi
!= mend
; ++mi
) {
7004 fout
<< mi
->first
<< " -> " << mi
->second
<< std::endl
;
7005 if (!pg
->missing_loc
.needs_recovery(mi
->first
))
7007 if (pg
->missing_loc
.is_unfound(mi
->first
))
7008 fout
<< " unfound ";
7009 const set
<pg_shard_t
> &mls(pg
->missing_loc
.get_locations(mi
->first
));
7012 fout
<< "missing_loc: " << mls
<< std::endl
;
7020 else if (prefix
== "debug kick_recovery_wq") {
7022 cmd_getval(cct
, cmdmap
, "delay", delay
);
7025 unlock_guard unlock
{osd_lock
};
7026 r
= cct
->_conf
->set_val("osd_recovery_delay_start", oss
.str().c_str());
7028 ss
<< "kick_recovery_wq: error setting "
7029 << "osd_recovery_delay_start to '" << delay
<< "': error "
7033 cct
->_conf
->apply_changes(NULL
);
7034 ss
<< "kicking recovery queue. set osd_recovery_delay_start "
7035 << "to " << cct
->_conf
->osd_recovery_delay_start
;
7038 else if (prefix
== "cpu_profiler") {
7040 cmd_getval(cct
, cmdmap
, "arg", arg
);
7041 vector
<string
> argvec
;
7042 get_str_vec(arg
, argvec
);
7043 cpu_profiler_handle_command(argvec
, ds
);
7046 else if (prefix
== "dump_pg_recovery_stats") {
7049 pg_recovery_stats
.dump_formatted(f
.get());
7052 pg_recovery_stats
.dump(s
);
7053 ds
<< "dump pg recovery stats: " << s
.str();
7057 else if (prefix
== "reset_pg_recovery_stats") {
7058 ss
<< "reset pg recovery stats";
7059 pg_recovery_stats
.reset();
7062 else if (prefix
== "perf histogram dump") {
7064 std::string counter
;
7065 cmd_getval(cct
, cmdmap
, "logger", logger
);
7066 cmd_getval(cct
, cmdmap
, "counter", counter
);
7068 cct
->get_perfcounters_collection()->dump_formatted_histograms(
7069 f
.get(), false, logger
, counter
);
7074 else if (prefix
== "compact") {
7075 dout(1) << "triggering manual compaction" << dendl
;
7076 auto start
= ceph::coarse_mono_clock::now();
7078 auto end
= ceph::coarse_mono_clock::now();
7079 auto time_span
= chrono::duration_cast
<chrono::duration
<double>>(end
- start
);
7080 dout(1) << "finished manual compaction in "
7081 << time_span
.count()
7082 << " seconds" << dendl
;
7083 ss
<< "compacted omap in " << time_span
.count() << " seconds";
7087 ss
<< "unrecognized command! " << cmd
;
7094 dout(0) << "do_command r=" << r
<< " " << rs
<< dendl
;
7097 MCommandReply
*reply
= new MCommandReply(r
, rs
);
7098 reply
->set_tid(tid
);
7099 reply
->set_data(odata
);
7100 con
->send_message(reply
);
7104 bool OSD::heartbeat_dispatch(Message
*m
)
7106 dout(30) << "heartbeat_dispatch " << m
<< dendl
;
7107 switch (m
->get_type()) {
7110 dout(10) << "ping from " << m
->get_source_inst() << dendl
;
7115 handle_osd_ping(static_cast<MOSDPing
*>(m
));
7119 dout(0) << "dropping unexpected message " << *m
<< " from " << m
->get_source_inst() << dendl
;
7126 bool OSD::ms_dispatch(Message
*m
)
7128 dout(20) << "OSD::ms_dispatch: " << *m
<< dendl
;
7129 if (m
->get_type() == MSG_OSD_MARK_ME_DOWN
) {
7130 service
.got_stop_ack();
7138 if (is_stopping()) {
7152 void OSD::maybe_share_map(
7157 if (!op
->check_send_map
) {
7160 epoch_t last_sent_epoch
= 0;
7162 session
->sent_epoch_lock
.lock();
7163 last_sent_epoch
= session
->last_sent_epoch
;
7164 session
->sent_epoch_lock
.unlock();
7166 const Message
*m
= op
->get_req();
7169 m
->get_connection().get(),
7172 session
? &last_sent_epoch
: NULL
);
7174 session
->sent_epoch_lock
.lock();
7175 if (session
->last_sent_epoch
< last_sent_epoch
) {
7176 session
->last_sent_epoch
= last_sent_epoch
;
7178 session
->sent_epoch_lock
.unlock();
7180 op
->check_send_map
= false;
7183 void OSD::dispatch_session_waiting(Session
*session
, OSDMapRef osdmap
)
7185 assert(session
->session_dispatch_lock
.is_locked());
7187 auto i
= session
->waiting_on_map
.begin();
7188 while (i
!= session
->waiting_on_map
.end()) {
7189 OpRequestRef op
= &(*i
);
7190 assert(ms_can_fast_dispatch(op
->get_req()));
7191 const MOSDFastDispatchOp
*m
= static_cast<const MOSDFastDispatchOp
*>(
7193 if (m
->get_min_epoch() > osdmap
->get_epoch()) {
7196 session
->waiting_on_map
.erase(i
++);
7200 if (m
->get_type() == CEPH_MSG_OSD_OP
) {
7201 pg_t actual_pgid
= osdmap
->raw_pg_to_pg(
7202 static_cast<const MOSDOp
*>(m
)->get_pg());
7203 if (!osdmap
->get_primary_shard(actual_pgid
, &pgid
)) {
7207 pgid
= m
->get_spg();
7209 enqueue_op(pgid
, op
, m
->get_map_epoch());
7212 if (session
->waiting_on_map
.empty()) {
7213 clear_session_waiting_on_map(session
);
7215 register_session_waiting_on_map(session
);
7219 void OSD::ms_fast_dispatch(Message
*m
)
7222 if (service
.is_stopping()) {
7226 OpRequestRef op
= op_tracker
.create_request
<OpRequest
, Message
*>(m
);
7229 osd_reqid_t reqid
= op
->get_reqid();
7231 tracepoint(osd
, ms_fast_dispatch
, reqid
.name
._type
,
7232 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
7236 op
->osd_trace
.init("osd op", &trace_endpoint
, &m
->trace
);
7238 // note sender epoch, min req'd epoch
7239 op
->sent_epoch
= static_cast<MOSDFastDispatchOp
*>(m
)->get_map_epoch();
7240 op
->min_epoch
= static_cast<MOSDFastDispatchOp
*>(m
)->get_min_epoch();
7241 assert(op
->min_epoch
<= op
->sent_epoch
); // sanity check!
7243 service
.maybe_inject_dispatch_delay();
7245 if (m
->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT
) ||
7246 m
->get_type() != CEPH_MSG_OSD_OP
) {
7247 // queue it directly
7249 static_cast<MOSDFastDispatchOp
*>(m
)->get_spg(),
7251 static_cast<MOSDFastDispatchOp
*>(m
)->get_map_epoch());
7253 // legacy client, and this is an MOSDOp (the *only* fast dispatch
7254 // message that didn't have an explicit spg_t); we need to map
7255 // them to an spg_t while preserving delivery order.
7256 Session
*session
= static_cast<Session
*>(m
->get_connection()->get_priv());
7259 Mutex::Locker
l(session
->session_dispatch_lock
);
7261 session
->waiting_on_map
.push_back(*op
);
7262 OSDMapRef nextmap
= service
.get_nextmap_reserved();
7263 dispatch_session_waiting(session
, nextmap
);
7264 service
.release_map(nextmap
);
7269 OID_EVENT_TRACE_WITH_MSG(m
, "MS_FAST_DISPATCH_END", false);
7272 void OSD::ms_fast_preprocess(Message
*m
)
7274 if (m
->get_connection()->get_peer_type() == CEPH_ENTITY_TYPE_OSD
) {
7275 if (m
->get_type() == CEPH_MSG_OSD_MAP
) {
7276 MOSDMap
*mm
= static_cast<MOSDMap
*>(m
);
7277 Session
*s
= static_cast<Session
*>(m
->get_connection()->get_priv());
7279 s
->received_map_lock
.lock();
7280 s
->received_map_epoch
= mm
->get_last();
7281 s
->received_map_lock
.unlock();
7288 bool OSD::ms_get_authorizer(int dest_type
, AuthAuthorizer
**authorizer
, bool force_new
)
7290 dout(10) << "OSD::ms_get_authorizer type=" << ceph_entity_type_name(dest_type
) << dendl
;
7292 if (is_stopping()) {
7293 dout(10) << __func__
<< " bailing, we are shutting down" << dendl
;
7297 if (dest_type
== CEPH_ENTITY_TYPE_MON
)
7301 /* the MonClient checks keys every tick(), so we should just wait for that cycle
7303 if (monc
->wait_auth_rotating(10) < 0) {
7304 derr
<< "OSD::ms_get_authorizer wait_auth_rotating failed" << dendl
;
7309 *authorizer
= monc
->build_authorizer(dest_type
);
7310 return *authorizer
!= NULL
;
7314 bool OSD::ms_verify_authorizer(
7315 Connection
*con
, int peer_type
,
7316 int protocol
, bufferlist
& authorizer_data
, bufferlist
& authorizer_reply
,
7317 bool& isvalid
, CryptoKey
& session_key
,
7318 std::unique_ptr
<AuthAuthorizerChallenge
> *challenge
)
7320 AuthAuthorizeHandler
*authorize_handler
= 0;
7321 switch (peer_type
) {
7322 case CEPH_ENTITY_TYPE_MDS
:
7324 * note: mds is technically a client from our perspective, but
7325 * this makes the 'cluster' consistent w/ monitor's usage.
7327 case CEPH_ENTITY_TYPE_OSD
:
7328 case CEPH_ENTITY_TYPE_MGR
:
7329 authorize_handler
= authorize_handler_cluster_registry
->get_handler(protocol
);
7332 authorize_handler
= authorize_handler_service_registry
->get_handler(protocol
);
7334 if (!authorize_handler
) {
7335 dout(0) << "No AuthAuthorizeHandler found for protocol " << protocol
<< dendl
;
7340 AuthCapsInfo caps_info
;
7343 uint64_t auid
= CEPH_AUTH_UID_DEFAULT
;
7345 RotatingKeyRing
*keys
= monc
->rotating_secrets
.get();
7347 isvalid
= authorize_handler
->verify_authorizer(
7349 authorizer_data
, authorizer_reply
, name
, global_id
, caps_info
, session_key
,
7352 dout(10) << __func__
<< " no rotating_keys (yet), denied" << dendl
;
7357 Session
*s
= static_cast<Session
*>(con
->get_priv());
7359 s
= new Session(cct
);
7360 con
->set_priv(s
->get());
7362 dout(10) << " new session " << s
<< " con=" << s
->con
<< " addr=" << s
->con
->get_peer_addr() << dendl
;
7365 s
->entity_name
= name
;
7366 if (caps_info
.allow_all
)
7367 s
->caps
.set_allow_all();
7370 if (caps_info
.caps
.length() > 0) {
7371 bufferlist::iterator p
= caps_info
.caps
.begin();
7376 catch (buffer::error
& e
) {
7378 bool success
= s
->caps
.parse(str
);
7380 dout(10) << " session " << s
<< " " << s
->entity_name
<< " has caps " << s
->caps
<< " '" << str
<< "'" << dendl
;
7382 dout(10) << " session " << s
<< " " << s
->entity_name
<< " failed to parse caps '" << str
<< "'" << dendl
;
7390 void OSD::do_waiters()
7392 assert(osd_lock
.is_locked());
7394 dout(10) << "do_waiters -- start" << dendl
;
7395 while (!finished
.empty()) {
7396 OpRequestRef next
= finished
.front();
7397 finished
.pop_front();
7400 dout(10) << "do_waiters -- finish" << dendl
;
7403 void OSD::dispatch_op(OpRequestRef op
)
7405 switch (op
->get_req()->get_type()) {
7407 case MSG_OSD_PG_CREATE
:
7408 handle_pg_create(op
);
7410 case MSG_OSD_PG_NOTIFY
:
7411 handle_pg_notify(op
);
7413 case MSG_OSD_PG_QUERY
:
7414 handle_pg_query(op
);
7416 case MSG_OSD_PG_LOG
:
7419 case MSG_OSD_PG_REMOVE
:
7420 handle_pg_remove(op
);
7422 case MSG_OSD_PG_INFO
:
7425 case MSG_OSD_PG_TRIM
:
7428 case MSG_OSD_BACKFILL_RESERVE
:
7429 handle_pg_backfill_reserve(op
);
7431 case MSG_OSD_RECOVERY_RESERVE
:
7432 handle_pg_recovery_reserve(op
);
7437 void OSD::_dispatch(Message
*m
)
7439 assert(osd_lock
.is_locked());
7440 dout(20) << "_dispatch " << m
<< " " << *m
<< dendl
;
7442 switch (m
->get_type()) {
7444 // -- don't need lock --
7446 dout(10) << "ping from " << m
->get_source() << dendl
;
7450 // -- don't need OSDMap --
7452 // map and replication
7453 case CEPH_MSG_OSD_MAP
:
7454 handle_osd_map(static_cast<MOSDMap
*>(m
));
7458 case MSG_PGSTATSACK
:
7459 handle_pg_stats_ack(static_cast<MPGStatsAck
*>(m
));
7462 case MSG_MON_COMMAND
:
7463 handle_command(static_cast<MMonCommand
*>(m
));
7466 handle_command(static_cast<MCommand
*>(m
));
7470 handle_scrub(static_cast<MOSDScrub
*>(m
));
7473 case MSG_OSD_FORCE_RECOVERY
:
7474 handle_force_recovery(m
);
7477 // -- need OSDMap --
7479 case MSG_OSD_PG_CREATE
:
7480 case MSG_OSD_PG_NOTIFY
:
7481 case MSG_OSD_PG_QUERY
:
7482 case MSG_OSD_PG_LOG
:
7483 case MSG_OSD_PG_REMOVE
:
7484 case MSG_OSD_PG_INFO
:
7485 case MSG_OSD_PG_TRIM
:
7486 case MSG_OSD_BACKFILL_RESERVE
:
7487 case MSG_OSD_RECOVERY_RESERVE
:
7489 OpRequestRef op
= op_tracker
.create_request
<OpRequest
, Message
*>(m
);
7491 op
->osd_trace
.init("osd op", &trace_endpoint
, &m
->trace
);
7492 // no map? starting up?
7494 dout(7) << "no OSDMap, not booted" << dendl
;
7495 logger
->inc(l_osd_waiting_for_map
);
7496 waiting_for_osdmap
.push_back(op
);
7497 op
->mark_delayed("no osdmap");
7507 void OSD::handle_pg_scrub(MOSDScrub
*m
, PG
*pg
)
7510 if (pg
->is_primary()) {
7511 pg
->unreg_next_scrub();
7512 pg
->scrubber
.must_scrub
= true;
7513 pg
->scrubber
.must_deep_scrub
= m
->deep
|| m
->repair
;
7514 pg
->scrubber
.must_repair
= m
->repair
;
7515 pg
->reg_next_scrub();
7516 dout(10) << "marking " << *pg
<< " for scrub" << dendl
;
7521 void OSD::handle_scrub(MOSDScrub
*m
)
7523 dout(10) << "handle_scrub " << *m
<< dendl
;
7524 if (!require_mon_or_mgr_peer(m
)) {
7528 if (m
->fsid
!= monc
->get_fsid()) {
7529 dout(0) << "handle_scrub fsid " << m
->fsid
<< " != " << monc
->get_fsid() << dendl
;
7534 RWLock::RLocker
l(pg_map_lock
);
7535 if (m
->scrub_pgs
.empty()) {
7536 for (ceph::unordered_map
<spg_t
, PG
*>::iterator p
= pg_map
.begin();
7539 handle_pg_scrub(m
, p
->second
);
7541 for (vector
<pg_t
>::iterator p
= m
->scrub_pgs
.begin();
7542 p
!= m
->scrub_pgs
.end();
7545 if (osdmap
->get_primary_shard(*p
, &pcand
)) {
7546 auto pg_map_entry
= pg_map
.find(pcand
);
7547 if (pg_map_entry
!= pg_map
.end()) {
7548 handle_pg_scrub(m
, pg_map_entry
->second
);
7557 bool OSD::scrub_random_backoff()
7559 bool coin_flip
= (rand() / (double)RAND_MAX
>=
7560 cct
->_conf
->osd_scrub_backoff_ratio
);
7562 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl
;
7568 OSDService::ScrubJob::ScrubJob(CephContext
* cct
,
7569 const spg_t
& pg
, const utime_t
& timestamp
,
7570 double pool_scrub_min_interval
,
7571 double pool_scrub_max_interval
, bool must
)
7574 sched_time(timestamp
),
7577 // if not explicitly requested, postpone the scrub with a random delay
7579 double scrub_min_interval
= pool_scrub_min_interval
> 0 ?
7580 pool_scrub_min_interval
: cct
->_conf
->osd_scrub_min_interval
;
7581 double scrub_max_interval
= pool_scrub_max_interval
> 0 ?
7582 pool_scrub_max_interval
: cct
->_conf
->osd_scrub_max_interval
;
7584 sched_time
+= scrub_min_interval
;
7585 double r
= rand() / (double)RAND_MAX
;
7587 scrub_min_interval
* cct
->_conf
->osd_scrub_interval_randomize_ratio
* r
;
7588 deadline
+= scrub_max_interval
;
7592 bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob
& rhs
) const {
7593 if (sched_time
< rhs
.sched_time
)
7595 if (sched_time
> rhs
.sched_time
)
7597 return pgid
< rhs
.pgid
;
7600 bool OSD::scrub_time_permit(utime_t now
)
7603 time_t tt
= now
.sec();
7604 localtime_r(&tt
, &bdt
);
7606 bool day_permit
= false;
7607 if (cct
->_conf
->osd_scrub_begin_week_day
< cct
->_conf
->osd_scrub_end_week_day
) {
7608 if (bdt
.tm_wday
>= cct
->_conf
->osd_scrub_begin_week_day
&& bdt
.tm_wday
< cct
->_conf
->osd_scrub_end_week_day
) {
7612 if (bdt
.tm_wday
>= cct
->_conf
->osd_scrub_begin_week_day
|| bdt
.tm_wday
< cct
->_conf
->osd_scrub_end_week_day
) {
7618 dout(20) << __func__
<< " should run between week day " << cct
->_conf
->osd_scrub_begin_week_day
7619 << " - " << cct
->_conf
->osd_scrub_end_week_day
7620 << " now " << bdt
.tm_wday
<< " = no" << dendl
;
7624 bool time_permit
= false;
7625 if (cct
->_conf
->osd_scrub_begin_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7626 if (bdt
.tm_hour
>= cct
->_conf
->osd_scrub_begin_hour
&& bdt
.tm_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7630 if (bdt
.tm_hour
>= cct
->_conf
->osd_scrub_begin_hour
|| bdt
.tm_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7635 dout(20) << __func__
<< " should run between " << cct
->_conf
->osd_scrub_begin_hour
7636 << " - " << cct
->_conf
->osd_scrub_end_hour
7637 << " now " << bdt
.tm_hour
<< " = no" << dendl
;
7639 dout(20) << __func__
<< " should run between " << cct
->_conf
->osd_scrub_begin_hour
7640 << " - " << cct
->_conf
->osd_scrub_end_hour
7641 << " now " << bdt
.tm_hour
<< " = yes" << dendl
;
7646 bool OSD::scrub_load_below_threshold()
7649 if (getloadavg(loadavgs
, 3) != 3) {
7650 dout(10) << __func__
<< " couldn't read loadavgs\n" << dendl
;
7654 // allow scrub if below configured threshold
7655 long cpus
= sysconf(_SC_NPROCESSORS_ONLN
);
7656 double loadavg_per_cpu
= cpus
> 0 ? loadavgs
[0] / cpus
: loadavgs
[0];
7657 if (loadavg_per_cpu
< cct
->_conf
->osd_scrub_load_threshold
) {
7658 dout(20) << __func__
<< " loadavg per cpu " << loadavg_per_cpu
7659 << " < max " << cct
->_conf
->osd_scrub_load_threshold
7660 << " = yes" << dendl
;
7664 // allow scrub if below daily avg and currently decreasing
7665 if (loadavgs
[0] < daily_loadavg
&& loadavgs
[0] < loadavgs
[2]) {
7666 dout(20) << __func__
<< " loadavg " << loadavgs
[0]
7667 << " < daily_loadavg " << daily_loadavg
7668 << " and < 15m avg " << loadavgs
[2]
7669 << " = yes" << dendl
;
7673 dout(20) << __func__
<< " loadavg " << loadavgs
[0]
7674 << " >= max " << cct
->_conf
->osd_scrub_load_threshold
7675 << " and ( >= daily_loadavg " << daily_loadavg
7676 << " or >= 15m avg " << loadavgs
[2]
7677 << ") = no" << dendl
;
7681 void OSD::sched_scrub()
7683 // if not permitted, fail fast
7684 if (!service
.can_inc_scrubs_pending()) {
7687 if (!cct
->_conf
->osd_scrub_during_recovery
&& service
.is_recovery_active()) {
7688 dout(20) << __func__
<< " not scheduling scrubs due to active recovery" << dendl
;
7693 utime_t now
= ceph_clock_now();
7694 bool time_permit
= scrub_time_permit(now
);
7695 bool load_is_low
= scrub_load_below_threshold();
7696 dout(20) << "sched_scrub load_is_low=" << (int)load_is_low
<< dendl
;
7698 OSDService::ScrubJob scrub
;
7699 if (service
.first_scrub_stamp(&scrub
)) {
7701 dout(30) << "sched_scrub examine " << scrub
.pgid
<< " at " << scrub
.sched_time
<< dendl
;
7703 if (scrub
.sched_time
> now
) {
7704 // save ourselves some effort
7705 dout(10) << "sched_scrub " << scrub
.pgid
<< " scheduled at " << scrub
.sched_time
7706 << " > " << now
<< dendl
;
7710 if ((scrub
.deadline
>= now
) && !(time_permit
&& load_is_low
)) {
7711 dout(10) << __func__
<< " not scheduling scrub for " << scrub
.pgid
<< " due to "
7712 << (!time_permit
? "time not permit" : "high load") << dendl
;
7716 PG
*pg
= _lookup_lock_pg(scrub
.pgid
);
7719 if (pg
->get_pgbackend()->scrub_supported() && pg
->is_active()) {
7720 dout(10) << "sched_scrub scrubbing " << scrub
.pgid
<< " at " << scrub
.sched_time
7721 << (pg
->scrubber
.must_scrub
? ", explicitly requested" :
7722 (load_is_low
? ", load_is_low" : " deadline < now"))
7724 if (pg
->sched_scrub()) {
7730 } while (service
.next_scrub_stamp(scrub
, &scrub
));
7732 dout(20) << "sched_scrub done" << dendl
;
7737 vector
<OSDHealthMetric
> OSD::get_health_metrics()
7739 vector
<OSDHealthMetric
> metrics
;
7740 lock_guard
<mutex
> pending_creates_locker
{pending_creates_lock
};
7741 auto n_primaries
= pending_creates_from_mon
;
7742 for (const auto& create
: pending_creates_from_osd
) {
7743 if (create
.second
) {
7747 metrics
.emplace_back(osd_metric::PENDING_CREATING_PGS
, n_primaries
);
7751 // =====================================================
7754 void OSD::wait_for_new_map(OpRequestRef op
)
7757 if (waiting_for_osdmap
.empty()) {
7758 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
7761 logger
->inc(l_osd_waiting_for_map
);
7762 waiting_for_osdmap
.push_back(op
);
7763 op
->mark_delayed("wait for new map");
7768 * assimilate new OSDMap(s). scan pgs, etc.
7771 void OSD::note_down_osd(int peer
)
7773 assert(osd_lock
.is_locked());
7774 cluster_messenger
->mark_down(osdmap
->get_cluster_addr(peer
));
7776 heartbeat_lock
.Lock();
7777 failure_queue
.erase(peer
);
7778 failure_pending
.erase(peer
);
7779 map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.find(peer
);
7780 if (p
!= heartbeat_peers
.end()) {
7781 p
->second
.con_back
->mark_down();
7782 if (p
->second
.con_front
) {
7783 p
->second
.con_front
->mark_down();
7785 heartbeat_peers
.erase(p
);
7787 heartbeat_lock
.Unlock();
7790 void OSD::note_up_osd(int peer
)
7792 service
.forget_peer_epoch(peer
, osdmap
->get_epoch() - 1);
7793 heartbeat_set_peers_need_update();
7796 struct C_OnMapCommit
: public Context
{
7798 epoch_t first
, last
;
7800 C_OnMapCommit(OSD
*o
, epoch_t f
, epoch_t l
, MOSDMap
*m
)
7801 : osd(o
), first(f
), last(l
), msg(m
) {}
7802 void finish(int r
) override
{
7803 osd
->_committed_osd_maps(first
, last
, msg
);
7808 struct C_OnMapApply
: public Context
{
7809 OSDService
*service
;
7810 list
<OSDMapRef
> pinned_maps
;
7812 C_OnMapApply(OSDService
*service
,
7813 const list
<OSDMapRef
> &pinned_maps
,
7815 : service(service
), pinned_maps(pinned_maps
), e(e
) {}
7816 void finish(int r
) override
{
7817 service
->clear_map_bl_cache_pins(e
);
7821 void OSD::osdmap_subscribe(version_t epoch
, bool force_request
)
7823 Mutex::Locker
l(osdmap_subscribe_lock
);
7824 if (latest_subscribed_epoch
>= epoch
&& !force_request
)
7827 latest_subscribed_epoch
= MAX(epoch
, latest_subscribed_epoch
);
7829 if (monc
->sub_want_increment("osdmap", epoch
, CEPH_SUBSCRIBE_ONETIME
) ||
7835 void OSD::trim_maps(epoch_t oldest
, int nreceived
, bool skip_maps
)
7837 epoch_t min
= std::min(oldest
, service
.map_cache
.cached_key_lower_bound());
7838 if (min
<= superblock
.oldest_map
)
7842 ObjectStore::Transaction t
;
7843 for (epoch_t e
= superblock
.oldest_map
; e
< min
; ++e
) {
7844 dout(20) << " removing old osdmap epoch " << e
<< dendl
;
7845 t
.remove(coll_t::meta(), get_osdmap_pobject_name(e
));
7846 t
.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e
));
7847 superblock
.oldest_map
= e
+ 1;
7849 if (num
>= cct
->_conf
->osd_target_transaction_size
&& num
>= nreceived
) {
7850 service
.publish_superblock(superblock
);
7851 write_superblock(t
);
7852 int tr
= store
->queue_transaction(service
.meta_osr
.get(), std::move(t
), nullptr);
7856 // skip_maps leaves us with a range of old maps if we fail to remove all
7857 // of them before moving superblock.oldest_map forward to the first map
7858 // in the incoming MOSDMap msg. so we should continue removing them in
7859 // this case, even we could do huge series of delete transactions all at
7866 service
.publish_superblock(superblock
);
7867 write_superblock(t
);
7868 int tr
= store
->queue_transaction(service
.meta_osr
.get(), std::move(t
), nullptr);
7871 // we should not remove the cached maps
7872 assert(min
<= service
.map_cache
.cached_key_lower_bound());
7875 void OSD::handle_osd_map(MOSDMap
*m
)
7877 assert(osd_lock
.is_locked());
7878 // Keep a ref in the list until we get the newly received map written
7879 // onto disk. This is important because as long as the refs are alive,
7880 // the OSDMaps will be pinned in the cache and we won't try to read it
7881 // off of disk. Otherwise these maps will probably not stay in the cache,
7882 // and reading those OSDMaps before they are actually written can result
7884 list
<OSDMapRef
> pinned_maps
;
7885 if (m
->fsid
!= monc
->get_fsid()) {
7886 dout(0) << "handle_osd_map fsid " << m
->fsid
<< " != "
7887 << monc
->get_fsid() << dendl
;
7891 if (is_initializing()) {
7892 dout(0) << "ignoring osdmap until we have initialized" << dendl
;
7897 Session
*session
= static_cast<Session
*>(m
->get_connection()->get_priv());
7898 if (session
&& !(session
->entity_name
.is_mon() ||
7899 session
->entity_name
.is_osd())) {
7901 dout(10) << "got osd map from Session " << session
7902 << " which we can't take maps from (not a mon or osd)" << dendl
;
7910 // share with the objecter
7912 service
.objecter
->handle_osd_map(m
);
7914 epoch_t first
= m
->get_first();
7915 epoch_t last
= m
->get_last();
7916 dout(3) << "handle_osd_map epochs [" << first
<< "," << last
<< "], i have "
7917 << superblock
.newest_map
7918 << ", src has [" << m
->oldest_map
<< "," << m
->newest_map
<< "]"
7921 logger
->inc(l_osd_map
);
7922 logger
->inc(l_osd_mape
, last
- first
+ 1);
7923 if (first
<= superblock
.newest_map
)
7924 logger
->inc(l_osd_mape_dup
, superblock
.newest_map
- first
+ 1);
7925 if (service
.max_oldest_map
< m
->oldest_map
) {
7926 service
.max_oldest_map
= m
->oldest_map
;
7927 assert(service
.max_oldest_map
>= superblock
.oldest_map
);
7930 // make sure there is something new, here, before we bother flushing
7931 // the queues and such
7932 if (last
<= superblock
.newest_map
) {
7933 dout(10) << " no new maps here, dropping" << dendl
;
7939 bool skip_maps
= false;
7940 if (first
> superblock
.newest_map
+ 1) {
7941 dout(10) << "handle_osd_map message skips epochs "
7942 << superblock
.newest_map
+ 1 << ".." << (first
-1) << dendl
;
7943 if (m
->oldest_map
<= superblock
.newest_map
+ 1) {
7944 osdmap_subscribe(superblock
.newest_map
+ 1, false);
7948 // always try to get the full range of maps--as many as we can. this
7949 // 1- is good to have
7950 // 2- is at present the only way to ensure that we get a *full* map as
7952 if (m
->oldest_map
< first
) {
7953 osdmap_subscribe(m
->oldest_map
- 1, true);
7960 ObjectStore::Transaction t
;
7961 uint64_t txn_size
= 0;
7963 // store new maps: queue for disk and put in the osdmap cache
7964 epoch_t start
= MAX(superblock
.newest_map
+ 1, first
);
7965 for (epoch_t e
= start
; e
<= last
; e
++) {
7966 if (txn_size
>= t
.get_num_bytes()) {
7967 derr
<< __func__
<< " transaction size overflowed" << dendl
;
7968 assert(txn_size
< t
.get_num_bytes());
7970 txn_size
= t
.get_num_bytes();
7971 map
<epoch_t
,bufferlist
>::iterator p
;
7972 p
= m
->maps
.find(e
);
7973 if (p
!= m
->maps
.end()) {
7974 dout(10) << "handle_osd_map got full map for epoch " << e
<< dendl
;
7975 OSDMap
*o
= new OSDMap
;
7976 bufferlist
& bl
= p
->second
;
7980 ghobject_t fulloid
= get_osdmap_pobject_name(e
);
7981 t
.write(coll_t::meta(), fulloid
, 0, bl
.length(), bl
);
7983 pinned_maps
.push_back(add_map(o
));
7989 p
= m
->incremental_maps
.find(e
);
7990 if (p
!= m
->incremental_maps
.end()) {
7991 dout(10) << "handle_osd_map got inc map for epoch " << e
<< dendl
;
7992 bufferlist
& bl
= p
->second
;
7993 ghobject_t oid
= get_inc_osdmap_pobject_name(e
);
7994 t
.write(coll_t::meta(), oid
, 0, bl
.length(), bl
);
7995 pin_map_inc_bl(e
, bl
);
7997 OSDMap
*o
= new OSDMap
;
8000 bool got
= get_map_bl(e
- 1, obl
);
8005 OSDMap::Incremental inc
;
8006 bufferlist::iterator p
= bl
.begin();
8008 if (o
->apply_incremental(inc
) < 0) {
8009 derr
<< "ERROR: bad fsid? i have " << osdmap
->get_fsid() << " and inc has " << inc
.fsid
<< dendl
;
8010 assert(0 == "bad fsid");
8014 o
->encode(fbl
, inc
.encode_features
| CEPH_FEATURE_RESERVED
);
8016 bool injected_failure
= false;
8017 if (cct
->_conf
->osd_inject_bad_map_crc_probability
> 0 &&
8018 (rand() % 10000) < cct
->_conf
->osd_inject_bad_map_crc_probability
*10000.0) {
8019 derr
<< __func__
<< " injecting map crc failure" << dendl
;
8020 injected_failure
= true;
8023 if ((inc
.have_crc
&& o
->get_crc() != inc
.full_crc
) || injected_failure
) {
8024 dout(2) << "got incremental " << e
8025 << " but failed to encode full with correct crc; requesting"
8027 clog
->warn() << "failed to encode map e" << e
<< " with expected crc";
8028 dout(20) << "my encoded map was:\n";
8029 fbl
.hexdump(*_dout
);
8032 request_full_map(e
, last
);
8038 ghobject_t fulloid
= get_osdmap_pobject_name(e
);
8039 t
.write(coll_t::meta(), fulloid
, 0, fbl
.length(), fbl
);
8041 pinned_maps
.push_back(add_map(o
));
8045 assert(0 == "MOSDMap lied about what maps it had?");
8048 // even if this map isn't from a mon, we may have satisfied our subscription
8049 monc
->sub_got("osdmap", last
);
8051 if (!m
->maps
.empty() && requested_full_first
) {
8052 dout(10) << __func__
<< " still missing full maps " << requested_full_first
8053 << ".." << requested_full_last
<< dendl
;
8054 rerequest_full_maps();
8057 if (superblock
.oldest_map
) {
8058 // make sure we at least keep pace with incoming maps
8059 trim_maps(m
->oldest_map
, last
- first
+ 1, skip_maps
);
8062 if (!superblock
.oldest_map
|| skip_maps
)
8063 superblock
.oldest_map
= first
;
8064 superblock
.newest_map
= last
;
8065 superblock
.current_epoch
= last
;
8067 // note in the superblock that we were clean thru the prior epoch
8068 epoch_t boot_epoch
= service
.get_boot_epoch();
8069 if (boot_epoch
&& boot_epoch
>= superblock
.mounted
) {
8070 superblock
.mounted
= boot_epoch
;
8071 superblock
.clean_thru
= last
;
8074 // superblock and commit
8075 write_superblock(t
);
8076 store
->queue_transaction(
8077 service
.meta_osr
.get(),
8079 new C_OnMapApply(&service
, pinned_maps
, last
),
8080 new C_OnMapCommit(this, start
, last
, m
), 0);
8081 service
.publish_superblock(superblock
);
8084 void OSD::_committed_osd_maps(epoch_t first
, epoch_t last
, MOSDMap
*m
)
8086 dout(10) << __func__
<< " " << first
<< ".." << last
<< dendl
;
8087 if (is_stopping()) {
8088 dout(10) << __func__
<< " bailing, we are shutting down" << dendl
;
8091 Mutex::Locker
l(osd_lock
);
8092 if (is_stopping()) {
8093 dout(10) << __func__
<< " bailing, we are shutting down" << dendl
;
8096 map_lock
.get_write();
8098 bool do_shutdown
= false;
8099 bool do_restart
= false;
8100 bool network_error
= false;
8102 // advance through the new maps
8103 for (epoch_t cur
= first
; cur
<= last
; cur
++) {
8104 dout(10) << " advance to epoch " << cur
8105 << " (<= last " << last
8106 << " <= newest_map " << superblock
.newest_map
8109 OSDMapRef newmap
= get_map(cur
);
8110 assert(newmap
); // we just cached it above!
8112 // start blacklisting messages sent to peers that go down.
8113 service
.pre_publish_map(newmap
);
8115 // kill connections to newly down osds
8116 bool waited_for_reservations
= false;
8118 osdmap
->get_all_osds(old
);
8119 for (set
<int>::iterator p
= old
.begin(); p
!= old
.end(); ++p
) {
8121 osdmap
->is_up(*p
) && // in old map
8122 newmap
->is_down(*p
)) { // but not the new one
8123 if (!waited_for_reservations
) {
8124 service
.await_reserved_maps();
8125 waited_for_reservations
= true;
8128 } else if (*p
!= whoami
&&
8129 osdmap
->is_down(*p
) &&
8130 newmap
->is_up(*p
)) {
8135 if ((osdmap
->test_flag(CEPH_OSDMAP_NOUP
) !=
8136 newmap
->test_flag(CEPH_OSDMAP_NOUP
)) ||
8137 (osdmap
->is_noup(whoami
) != newmap
->is_noup(whoami
))) {
8138 dout(10) << __func__
<< " NOUP flag changed in " << newmap
->get_epoch()
8141 // this captures the case where we sent the boot message while
8142 // NOUP was being set on the mon and our boot request was
8143 // dropped, and then later it is cleared. it imperfectly
8144 // handles the case where our original boot message was not
8145 // dropped and we restart even though we might have booted, but
8146 // that is harmless (boot will just take slightly longer).
8150 if (osdmap
->require_osd_release
< CEPH_RELEASE_LUMINOUS
&&
8151 newmap
->require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
8152 dout(10) << __func__
<< " require_osd_release reached luminous in "
8153 << newmap
->get_epoch() << dendl
;
8154 clear_pg_stat_queue();
8155 clear_outstanding_pg_stats();
8161 service
.retrieve_epochs(&boot_epoch
, &up_epoch
, NULL
);
8163 osdmap
->is_up(whoami
) &&
8164 osdmap
->get_inst(whoami
) == client_messenger
->get_myinst()) {
8165 up_epoch
= osdmap
->get_epoch();
8166 dout(10) << "up_epoch is " << up_epoch
<< dendl
;
8168 boot_epoch
= osdmap
->get_epoch();
8169 dout(10) << "boot_epoch is " << boot_epoch
<< dendl
;
8171 service
.set_epochs(&boot_epoch
, &up_epoch
, NULL
);
8175 had_map_since
= ceph_clock_now();
8177 epoch_t _bind_epoch
= service
.get_bind_epoch();
8178 if (osdmap
->is_up(whoami
) &&
8179 osdmap
->get_addr(whoami
) == client_messenger
->get_myaddr() &&
8180 _bind_epoch
< osdmap
->get_up_from(whoami
)) {
8183 dout(1) << "state: booting -> active" << dendl
;
8184 set_state(STATE_ACTIVE
);
8186 // set incarnation so that osd_reqid_t's we generate for our
8187 // objecter requests are unique across restarts.
8188 service
.objecter
->set_client_incarnation(osdmap
->get_epoch());
8192 if (osdmap
->get_epoch() > 0 &&
8194 if (!osdmap
->exists(whoami
)) {
8195 dout(0) << "map says i do not exist. shutting down." << dendl
;
8196 do_shutdown
= true; // don't call shutdown() while we have
8197 // everything paused
8198 } else if (!osdmap
->is_up(whoami
) ||
8199 !osdmap
->get_addr(whoami
).probably_equals(
8200 client_messenger
->get_myaddr()) ||
8201 !osdmap
->get_cluster_addr(whoami
).probably_equals(
8202 cluster_messenger
->get_myaddr()) ||
8203 !osdmap
->get_hb_back_addr(whoami
).probably_equals(
8204 hb_back_server_messenger
->get_myaddr()) ||
8205 (osdmap
->get_hb_front_addr(whoami
) != entity_addr_t() &&
8206 !osdmap
->get_hb_front_addr(whoami
).probably_equals(
8207 hb_front_server_messenger
->get_myaddr()))) {
8208 if (!osdmap
->is_up(whoami
)) {
8209 if (service
.is_preparing_to_stop() || service
.is_stopping()) {
8210 service
.got_stop_ack();
8212 clog
->warn() << "Monitor daemon marked osd." << whoami
<< " down, "
8213 "but it is still running";
8214 clog
->debug() << "map e" << osdmap
->get_epoch()
8215 << " wrongly marked me down at e"
8216 << osdmap
->get_down_at(whoami
);
8218 } else if (!osdmap
->get_addr(whoami
).probably_equals(
8219 client_messenger
->get_myaddr())) {
8220 clog
->error() << "map e" << osdmap
->get_epoch()
8221 << " had wrong client addr (" << osdmap
->get_addr(whoami
)
8222 << " != my " << client_messenger
->get_myaddr() << ")";
8223 } else if (!osdmap
->get_cluster_addr(whoami
).probably_equals(
8224 cluster_messenger
->get_myaddr())) {
8225 clog
->error() << "map e" << osdmap
->get_epoch()
8226 << " had wrong cluster addr ("
8227 << osdmap
->get_cluster_addr(whoami
)
8228 << " != my " << cluster_messenger
->get_myaddr() << ")";
8229 } else if (!osdmap
->get_hb_back_addr(whoami
).probably_equals(
8230 hb_back_server_messenger
->get_myaddr())) {
8231 clog
->error() << "map e" << osdmap
->get_epoch()
8232 << " had wrong heartbeat back addr ("
8233 << osdmap
->get_hb_back_addr(whoami
)
8234 << " != my " << hb_back_server_messenger
->get_myaddr()
8236 } else if (osdmap
->get_hb_front_addr(whoami
) != entity_addr_t() &&
8237 !osdmap
->get_hb_front_addr(whoami
).probably_equals(
8238 hb_front_server_messenger
->get_myaddr())) {
8239 clog
->error() << "map e" << osdmap
->get_epoch()
8240 << " had wrong heartbeat front addr ("
8241 << osdmap
->get_hb_front_addr(whoami
)
8242 << " != my " << hb_front_server_messenger
->get_myaddr()
8246 if (!service
.is_stopping()) {
8247 epoch_t up_epoch
= 0;
8248 epoch_t bind_epoch
= osdmap
->get_epoch();
8249 service
.set_epochs(NULL
,&up_epoch
, &bind_epoch
);
8253 utime_t now
= ceph_clock_now();
8254 utime_t grace
= utime_t(cct
->_conf
->osd_max_markdown_period
, 0);
8255 osd_markdown_log
.push_back(now
);
8256 //clear all out-of-date log
8257 while (!osd_markdown_log
.empty() &&
8258 osd_markdown_log
.front() + grace
< now
)
8259 osd_markdown_log
.pop_front();
8260 if ((int)osd_markdown_log
.size() > cct
->_conf
->osd_max_markdown_count
) {
8261 dout(0) << __func__
<< " marked down "
8262 << osd_markdown_log
.size()
8263 << " > osd_max_markdown_count "
8264 << cct
->_conf
->osd_max_markdown_count
8265 << " in last " << grace
<< " seconds, shutting down"
8271 start_waiting_for_healthy();
8273 set
<int> avoid_ports
;
8274 #if defined(__FreeBSD__)
8275 // prevent FreeBSD from grabbing the client_messenger port during
8276 // rebinding. In which case a cluster_meesneger will connect also
8278 avoid_ports
.insert(client_messenger
->get_myaddr().get_port());
8280 avoid_ports
.insert(cluster_messenger
->get_myaddr().get_port());
8281 avoid_ports
.insert(hb_back_server_messenger
->get_myaddr().get_port());
8282 avoid_ports
.insert(hb_front_server_messenger
->get_myaddr().get_port());
8284 int r
= cluster_messenger
->rebind(avoid_ports
);
8286 do_shutdown
= true; // FIXME: do_restart?
8287 network_error
= true;
8288 dout(0) << __func__
<< " marked down:"
8289 << " rebind cluster_messenger failed" << dendl
;
8292 r
= hb_back_server_messenger
->rebind(avoid_ports
);
8294 do_shutdown
= true; // FIXME: do_restart?
8295 network_error
= true;
8296 dout(0) << __func__
<< " marked down:"
8297 << " rebind hb_back_server_messenger failed" << dendl
;
8300 r
= hb_front_server_messenger
->rebind(avoid_ports
);
8302 do_shutdown
= true; // FIXME: do_restart?
8303 network_error
= true;
8304 dout(0) << __func__
<< " marked down:"
8305 << " rebind hb_front_server_messenger failed" << dendl
;
8308 hb_front_client_messenger
->mark_down_all();
8309 hb_back_client_messenger
->mark_down_all();
8311 reset_heartbeat_peers();
8316 map_lock
.put_write();
8318 check_osdmap_features(store
);
8323 if (is_active() || is_waiting_for_healthy())
8324 maybe_update_heartbeat_peers();
8327 dout(10) << " not yet active; waiting for peering wq to drain" << dendl
;
8334 if (network_error
) {
8335 Mutex::Locker
l(heartbeat_lock
);
8336 map
<int,pair
<utime_t
,entity_inst_t
>>::iterator it
=
8337 failure_pending
.begin();
8338 while (it
!= failure_pending
.end()) {
8339 dout(10) << "handle_osd_ping canceling in-flight failure report for osd."
8340 << it
->first
<< dendl
;
8341 send_still_alive(osdmap
->get_epoch(), it
->second
.second
);
8342 failure_pending
.erase(it
++);
8345 // trigger shutdown in a different thread
8346 dout(0) << __func__
<< " shutdown OSD via async signal" << dendl
;
8347 queue_async_signal(SIGINT
);
8349 else if (m
->newest_map
&& m
->newest_map
> last
) {
8350 dout(10) << " msg say newest map is " << m
->newest_map
8351 << ", requesting more" << dendl
;
8352 osdmap_subscribe(osdmap
->get_epoch()+1, false);
8354 else if (is_preboot()) {
8355 if (m
->get_source().is_mon())
8356 _preboot(m
->oldest_map
, m
->newest_map
);
8360 else if (do_restart
)
8365 void OSD::check_osdmap_features(ObjectStore
*fs
)
8367 // adjust required feature bits?
8369 // we have to be a bit careful here, because we are accessing the
8370 // Policy structures without taking any lock. in particular, only
8371 // modify integer values that can safely be read by a racing CPU.
8372 // since we are only accessing existing Policy structures a their
8373 // current memory location, and setting or clearing bits in integer
8374 // fields, and we are the only writer, this is not a problem.
8377 Messenger::Policy p
= client_messenger
->get_default_policy();
8379 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_CLIENT
, &mask
);
8380 if ((p
.features_required
& mask
) != features
) {
8381 dout(0) << "crush map has features " << features
8382 << ", adjusting msgr requires for clients" << dendl
;
8383 p
.features_required
= (p
.features_required
& ~mask
) | features
;
8384 client_messenger
->set_default_policy(p
);
8388 Messenger::Policy p
= client_messenger
->get_policy(entity_name_t::TYPE_MON
);
8390 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_MON
, &mask
);
8391 if ((p
.features_required
& mask
) != features
) {
8392 dout(0) << "crush map has features " << features
8393 << " was " << p
.features_required
8394 << ", adjusting msgr requires for mons" << dendl
;
8395 p
.features_required
= (p
.features_required
& ~mask
) | features
;
8396 client_messenger
->set_policy(entity_name_t::TYPE_MON
, p
);
8400 Messenger::Policy p
= cluster_messenger
->get_policy(entity_name_t::TYPE_OSD
);
8402 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_OSD
, &mask
);
8404 if ((p
.features_required
& mask
) != features
) {
8405 dout(0) << "crush map has features " << features
8406 << ", adjusting msgr requires for osds" << dendl
;
8407 p
.features_required
= (p
.features_required
& ~mask
) | features
;
8408 cluster_messenger
->set_policy(entity_name_t::TYPE_OSD
, p
);
8411 if ((features
& CEPH_FEATURE_OSD_ERASURE_CODES
) &&
8412 !superblock
.compat_features
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
)) {
8413 dout(0) << __func__
<< " enabling on-disk ERASURE CODES compat feature" << dendl
;
8414 superblock
.compat_features
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
);
8415 ObjectStore::Transaction t
;
8416 write_superblock(t
);
8417 int err
= store
->queue_transaction(service
.meta_osr
.get(), std::move(t
), NULL
);
8423 bool OSD::advance_pg(
8424 epoch_t osd_epoch
, PG
*pg
,
8425 ThreadPool::TPHandle
&handle
,
8426 PG::RecoveryCtx
*rctx
,
8427 set
<PGRef
> *new_pgs
)
8429 assert(pg
->is_locked());
8430 epoch_t next_epoch
= pg
->get_osdmap()->get_epoch() + 1;
8431 OSDMapRef lastmap
= pg
->get_osdmap();
8433 if (lastmap
->get_epoch() == osd_epoch
)
8435 assert(lastmap
->get_epoch() < osd_epoch
);
8437 epoch_t min_epoch
= service
.get_min_pg_epoch();
8440 max
= min_epoch
+ cct
->_conf
->osd_map_max_advance
;
8442 max
= next_epoch
+ cct
->_conf
->osd_map_max_advance
;
8446 next_epoch
<= osd_epoch
&& next_epoch
<= max
;
8448 OSDMapRef nextmap
= service
.try_get_map(next_epoch
);
8450 dout(20) << __func__
<< " missing map " << next_epoch
<< dendl
;
8451 // make sure max is bumped up so that we can get past any
8453 max
= MAX(max
, next_epoch
+ cct
->_conf
->osd_map_max_advance
);
8457 vector
<int> newup
, newacting
;
8458 int up_primary
, acting_primary
;
8459 nextmap
->pg_to_up_acting_osds(
8461 &newup
, &up_primary
,
8462 &newacting
, &acting_primary
);
8463 pg
->handle_advance_map(
8464 nextmap
, lastmap
, newup
, up_primary
,
8465 newacting
, acting_primary
, rctx
);
8468 set
<spg_t
> children
;
8469 spg_t
parent(pg
->info
.pgid
);
8470 if (parent
.is_split(
8471 lastmap
->get_pg_num(pg
->pool
.id
),
8472 nextmap
->get_pg_num(pg
->pool
.id
),
8474 service
.mark_split_in_progress(pg
->info
.pgid
, children
);
8476 pg
, children
, new_pgs
, lastmap
, nextmap
,
8481 handle
.reset_tp_timeout();
8483 service
.pg_update_epoch(pg
->info
.pgid
, lastmap
->get_epoch());
8484 pg
->handle_activate_map(rctx
);
8485 if (next_epoch
<= osd_epoch
) {
8486 dout(10) << __func__
<< " advanced to max " << max
8487 << " past min epoch " << min_epoch
8488 << " ... will requeue " << *pg
<< dendl
;
8494 void OSD::consume_map()
8496 assert(osd_lock
.is_locked());
8497 dout(7) << "consume_map version " << osdmap
->get_epoch() << dendl
;
8499 /** make sure the cluster is speaking in SORTBITWISE, because we don't
8500 * speak the older sorting version any more. Be careful not to force
8501 * a shutdown if we are merely processing old maps, though.
8503 if (!osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
) && is_active()) {
8504 derr
<< __func__
<< " SORTBITWISE flag is not set" << dendl
;
8508 int num_pg_primary
= 0, num_pg_replica
= 0, num_pg_stray
= 0;
8509 list
<PGRef
> to_remove
;
8513 RWLock::RLocker
l(pg_map_lock
);
8514 for (ceph::unordered_map
<spg_t
,PG
*>::iterator it
= pg_map
.begin();
8517 PG
*pg
= it
->second
;
8519 if (pg
->is_primary())
8521 else if (pg
->is_replica())
8526 if (!osdmap
->have_pg_pool(pg
->info
.pgid
.pool())) {
8528 to_remove
.push_back(PGRef(pg
));
8530 service
.init_splits_between(it
->first
, service
.get_osdmap(), osdmap
);
8536 lock_guard
<mutex
> pending_creates_locker
{pending_creates_lock
};
8537 for (auto pg
= pending_creates_from_osd
.cbegin();
8538 pg
!= pending_creates_from_osd
.cend();) {
8539 if (osdmap
->get_pg_acting_rank(pg
->first
, whoami
) < 0) {
8540 pg
= pending_creates_from_osd
.erase(pg
);
8547 for (list
<PGRef
>::iterator i
= to_remove
.begin();
8548 i
!= to_remove
.end();
8549 to_remove
.erase(i
++)) {
8550 RWLock::WLocker
locker(pg_map_lock
);
8556 service
.expand_pg_num(service
.get_osdmap(), osdmap
);
8558 service
.pre_publish_map(osdmap
);
8559 service
.await_reserved_maps();
8560 service
.publish_map(osdmap
);
8562 service
.maybe_inject_dispatch_delay();
8564 dispatch_sessions_waiting_on_map();
8566 service
.maybe_inject_dispatch_delay();
8568 // remove any PGs which we no longer host from the session waiting_for_pg lists
8569 dout(20) << __func__
<< " checking waiting_for_pg" << dendl
;
8570 op_shardedwq
.prune_pg_waiters(osdmap
, whoami
);
8572 service
.maybe_inject_dispatch_delay();
8576 RWLock::RLocker
l(pg_map_lock
);
8577 for (ceph::unordered_map
<spg_t
,PG
*>::iterator it
= pg_map
.begin();
8580 PG
*pg
= it
->second
;
8582 pg
->queue_null(osdmap
->get_epoch(), osdmap
->get_epoch());
8586 logger
->set(l_osd_pg
, pg_map
.size());
8588 logger
->set(l_osd_pg_primary
, num_pg_primary
);
8589 logger
->set(l_osd_pg_replica
, num_pg_replica
);
8590 logger
->set(l_osd_pg_stray
, num_pg_stray
);
8591 logger
->set(l_osd_pg_removing
, remove_wq
.get_remove_queue_len());
8594 void OSD::activate_map()
8596 assert(osd_lock
.is_locked());
8598 dout(7) << "activate_map version " << osdmap
->get_epoch() << dendl
;
8600 if (osdmap
->test_flag(CEPH_OSDMAP_FULL
)) {
8601 dout(10) << " osdmap flagged full, doing onetime osdmap subscribe" << dendl
;
8602 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
8606 if (osdmap
->test_flag(CEPH_OSDMAP_NORECOVER
)) {
8607 if (!service
.recovery_is_paused()) {
8608 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl
;
8609 service
.pause_recovery();
8612 if (service
.recovery_is_paused()) {
8613 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl
;
8614 service
.unpause_recovery();
8618 service
.activate_map();
8621 take_waiters(waiting_for_osdmap
);
8624 bool OSD::require_mon_peer(const Message
*m
)
8626 if (!m
->get_connection()->peer_is_mon()) {
8627 dout(0) << "require_mon_peer received from non-mon "
8628 << m
->get_connection()->get_peer_addr()
8629 << " " << *m
<< dendl
;
8635 bool OSD::require_mon_or_mgr_peer(const Message
*m
)
8637 if (!m
->get_connection()->peer_is_mon() &&
8638 !m
->get_connection()->peer_is_mgr()) {
8639 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8640 << m
->get_connection()->get_peer_addr()
8641 << " " << *m
<< dendl
;
8647 bool OSD::require_osd_peer(const Message
*m
)
8649 if (!m
->get_connection()->peer_is_osd()) {
8650 dout(0) << "require_osd_peer received from non-osd "
8651 << m
->get_connection()->get_peer_addr()
8652 << " " << *m
<< dendl
;
8658 bool OSD::require_self_aliveness(const Message
*m
, epoch_t epoch
)
8660 epoch_t up_epoch
= service
.get_up_epoch();
8661 if (epoch
< up_epoch
) {
8662 dout(7) << "from pre-up epoch " << epoch
<< " < " << up_epoch
<< dendl
;
8667 dout(7) << "still in boot state, dropping message " << *m
<< dendl
;
8674 bool OSD::require_same_peer_instance(const Message
*m
, OSDMapRef
& map
,
8675 bool is_fast_dispatch
)
8677 int from
= m
->get_source().num();
8679 if (map
->is_down(from
) ||
8680 (map
->get_cluster_addr(from
) != m
->get_source_inst().addr
)) {
8681 dout(5) << "from dead osd." << from
<< ", marking down, "
8682 << " msg was " << m
->get_source_inst().addr
8683 << " expected " << (map
->is_up(from
) ?
8684 map
->get_cluster_addr(from
) : entity_addr_t())
8686 ConnectionRef con
= m
->get_connection();
8688 Session
*s
= static_cast<Session
*>(con
->get_priv());
8690 if (!is_fast_dispatch
)
8691 s
->session_dispatch_lock
.Lock();
8692 clear_session_waiting_on_map(s
);
8693 con
->set_priv(NULL
); // break ref <-> session cycle, if any
8694 if (!is_fast_dispatch
)
8695 s
->session_dispatch_lock
.Unlock();
8705 * require that we have same (or newer) map, and that
8706 * the source is the pg primary.
8708 bool OSD::require_same_or_newer_map(OpRequestRef
& op
, epoch_t epoch
,
8709 bool is_fast_dispatch
)
8711 const Message
*m
= op
->get_req();
8712 dout(15) << "require_same_or_newer_map " << epoch
8713 << " (i am " << osdmap
->get_epoch() << ") " << m
<< dendl
;
8715 assert(osd_lock
.is_locked());
8717 // do they have a newer map?
8718 if (epoch
> osdmap
->get_epoch()) {
8719 dout(7) << "waiting for newer map epoch " << epoch
8720 << " > my " << osdmap
->get_epoch() << " with " << m
<< dendl
;
8721 wait_for_new_map(op
);
8725 if (!require_self_aliveness(op
->get_req(), epoch
)) {
8729 // ok, our map is same or newer.. do they still exist?
8730 if (m
->get_connection()->get_messenger() == cluster_messenger
&&
8731 !require_same_peer_instance(op
->get_req(), osdmap
, is_fast_dispatch
)) {
8742 // ----------------------------------------
8745 void OSD::split_pgs(
8747 const set
<spg_t
> &childpgids
, set
<PGRef
> *out_pgs
,
8750 PG::RecoveryCtx
*rctx
)
8752 unsigned pg_num
= nextmap
->get_pg_num(
8754 parent
->update_snap_mapper_bits(
8755 parent
->info
.pgid
.get_split_bits(pg_num
)
8758 vector
<object_stat_sum_t
> updated_stats(childpgids
.size() + 1);
8759 parent
->info
.stats
.stats
.sum
.split(updated_stats
);
8761 vector
<object_stat_sum_t
>::iterator stat_iter
= updated_stats
.begin();
8762 for (set
<spg_t
>::const_iterator i
= childpgids
.begin();
8763 i
!= childpgids
.end();
8765 assert(stat_iter
!= updated_stats
.end());
8766 dout(10) << "Splitting " << *parent
<< " into " << *i
<< dendl
;
8767 assert(service
.splitting(*i
));
8768 PG
* child
= _make_pg(nextmap
, *i
);
8770 out_pgs
->insert(child
);
8771 rctx
->created_pgs
.insert(child
);
8773 unsigned split_bits
= i
->get_split_bits(pg_num
);
8774 dout(10) << "pg_num is " << pg_num
<< dendl
;
8775 dout(10) << "m_seed " << i
->ps() << dendl
;
8776 dout(10) << "split_bits is " << split_bits
<< dendl
;
8778 parent
->split_colls(
8788 child
->info
.stats
.stats
.sum
= *stat_iter
;
8790 child
->write_if_dirty(*(rctx
->transaction
));
8793 assert(stat_iter
!= updated_stats
.end());
8794 parent
->info
.stats
.stats
.sum
= *stat_iter
;
8795 parent
->write_if_dirty(*(rctx
->transaction
));
8801 void OSD::handle_pg_create(OpRequestRef op
)
8803 const MOSDPGCreate
*m
= static_cast<const MOSDPGCreate
*>(op
->get_req());
8804 assert(m
->get_type() == MSG_OSD_PG_CREATE
);
8806 dout(10) << "handle_pg_create " << *m
<< dendl
;
8808 if (!require_mon_peer(op
->get_req())) {
8812 if (!require_same_or_newer_map(op
, m
->epoch
, false))
8817 map
<pg_t
,utime_t
>::const_iterator ci
= m
->ctimes
.begin();
8818 for (map
<pg_t
,pg_create_t
>::const_iterator p
= m
->mkpg
.begin();
8821 assert(ci
!= m
->ctimes
.end() && ci
->first
== p
->first
);
8822 epoch_t created
= p
->second
.created
;
8823 if (p
->second
.split_bits
) // Skip split pgs
8827 if (on
.preferred() >= 0) {
8828 dout(20) << "ignoring localized pg " << on
<< dendl
;
8832 if (!osdmap
->have_pg_pool(on
.pool())) {
8833 dout(20) << "ignoring pg on deleted pool " << on
<< dendl
;
8837 dout(20) << "mkpg " << on
<< " e" << created
<< "@" << ci
->second
<< dendl
;
8839 // is it still ours?
8840 vector
<int> up
, acting
;
8841 int up_primary
= -1;
8842 int acting_primary
= -1;
8843 osdmap
->pg_to_up_acting_osds(on
, &up
, &up_primary
, &acting
, &acting_primary
);
8844 int role
= osdmap
->calc_pg_role(whoami
, acting
, acting
.size());
8846 if (acting_primary
!= whoami
) {
8847 dout(10) << "mkpg " << on
<< " not acting_primary (" << acting_primary
8848 << "), my role=" << role
<< ", skipping" << dendl
;
8853 bool mapped
= osdmap
->get_primary_shard(on
, &pgid
);
8857 osdmap
->get_pools().at(pgid
.pool()).ec_pool(),
8859 pg_history_t history
;
8860 build_initial_pg_history(pgid
, created
, ci
->second
, &history
, &pi
);
8862 // The mon won't resend unless the primary changed, so
8863 // we ignore same_interval_since. We'll pass this history
8864 // to handle_pg_peering_evt with the current epoch as the
8865 // event -- the project_pg_history check in
8866 // handle_pg_peering_evt will be a noop.
8867 if (history
.same_primary_since
> m
->epoch
) {
8868 dout(10) << __func__
<< ": got obsolete pg create on pgid "
8869 << pgid
<< " from epoch " << m
->epoch
8870 << ", primary changed in " << history
.same_primary_since
8874 if (handle_pg_peering_evt(
8878 osdmap
->get_epoch(),
8879 PG::CephPeeringEvtRef(
8880 new PG::CephPeeringEvt(
8881 osdmap
->get_epoch(),
8882 osdmap
->get_epoch(),
8885 service
.send_pg_created(pgid
.pgid
);
8890 lock_guard
<mutex
> pending_creates_locker
{pending_creates_lock
};
8891 if (pending_creates_from_mon
== 0) {
8892 last_pg_create_epoch
= m
->epoch
;
8895 maybe_update_heartbeat_peers();
8899 // ----------------------------------------
8900 // peering and recovery
8902 PG::RecoveryCtx
OSD::create_context()
8904 ObjectStore::Transaction
*t
= new ObjectStore::Transaction
;
8905 C_Contexts
*on_applied
= new C_Contexts(cct
);
8906 C_Contexts
*on_safe
= new C_Contexts(cct
);
8907 map
<int, map
<spg_t
,pg_query_t
> > *query_map
=
8908 new map
<int, map
<spg_t
, pg_query_t
> >;
8909 map
<int,vector
<pair
<pg_notify_t
, PastIntervals
> > > *notify_list
=
8910 new map
<int, vector
<pair
<pg_notify_t
, PastIntervals
> > >;
8911 map
<int,vector
<pair
<pg_notify_t
, PastIntervals
> > > *info_map
=
8912 new map
<int,vector
<pair
<pg_notify_t
, PastIntervals
> > >;
8913 PG::RecoveryCtx
rctx(query_map
, info_map
, notify_list
,
8914 on_applied
, on_safe
, t
);
8918 struct C_OpenPGs
: public Context
{
8922 C_OpenPGs(set
<PGRef
>& p
, ObjectStore
*s
, OSD
* o
) : store(s
), osd(o
) {
8925 void finish(int r
) override
{
8926 RWLock::RLocker
l(osd
->pg_map_lock
);
8927 for (auto p
: pgs
) {
8928 if (osd
->pg_map
.count(p
->info
.pgid
)) {
8929 p
->ch
= store
->open_collection(p
->coll
);
8936 void OSD::dispatch_context_transaction(PG::RecoveryCtx
&ctx
, PG
*pg
,
8937 ThreadPool::TPHandle
*handle
)
8939 if (!ctx
.transaction
->empty()) {
8940 if (!ctx
.created_pgs
.empty()) {
8941 ctx
.on_applied
->add(new C_OpenPGs(ctx
.created_pgs
, store
, this));
8943 int tr
= store
->queue_transaction(
8945 std::move(*ctx
.transaction
), ctx
.on_applied
, ctx
.on_safe
, NULL
,
8946 TrackedOpRef(), handle
);
8947 delete (ctx
.transaction
);
8949 ctx
.transaction
= new ObjectStore::Transaction
;
8950 ctx
.on_applied
= new C_Contexts(cct
);
8951 ctx
.on_safe
= new C_Contexts(cct
);
8955 void OSD::dispatch_context(PG::RecoveryCtx
&ctx
, PG
*pg
, OSDMapRef curmap
,
8956 ThreadPool::TPHandle
*handle
)
8958 if (service
.get_osdmap()->is_up(whoami
) &&
8960 do_notifies(*ctx
.notify_list
, curmap
);
8961 do_queries(*ctx
.query_map
, curmap
);
8962 do_infos(*ctx
.info_map
, curmap
);
8964 delete ctx
.notify_list
;
8965 delete ctx
.query_map
;
8966 delete ctx
.info_map
;
8967 if ((ctx
.on_applied
->empty() &&
8968 ctx
.on_safe
->empty() &&
8969 ctx
.transaction
->empty() &&
8970 ctx
.created_pgs
.empty()) || !pg
) {
8971 delete ctx
.transaction
;
8972 delete ctx
.on_applied
;
8974 assert(ctx
.created_pgs
.empty());
8976 if (!ctx
.created_pgs
.empty()) {
8977 ctx
.on_applied
->add(new C_OpenPGs(ctx
.created_pgs
, store
, this));
8979 int tr
= store
->queue_transaction(
8981 std::move(*ctx
.transaction
), ctx
.on_applied
, ctx
.on_safe
, NULL
, TrackedOpRef(),
8983 delete (ctx
.transaction
);
8989 * Send an MOSDPGNotify to a primary, with a list of PGs that I have
8990 * content for, and they are primary for.
8993 void OSD::do_notifies(
8994 map
<int,vector
<pair
<pg_notify_t
,PastIntervals
> > >& notify_list
,
8998 vector
<pair
<pg_notify_t
,PastIntervals
> > >::iterator it
=
8999 notify_list
.begin();
9000 it
!= notify_list
.end();
9002 if (!curmap
->is_up(it
->first
)) {
9003 dout(20) << __func__
<< " skipping down osd." << it
->first
<< dendl
;
9006 ConnectionRef con
= service
.get_con_osd_cluster(
9007 it
->first
, curmap
->get_epoch());
9009 dout(20) << __func__
<< " skipping osd." << it
->first
9010 << " (NULL con)" << dendl
;
9013 service
.share_map_peer(it
->first
, con
.get(), curmap
);
9014 dout(7) << __func__
<< " osd." << it
->first
9015 << " on " << it
->second
.size() << " PGs" << dendl
;
9016 MOSDPGNotify
*m
= new MOSDPGNotify(curmap
->get_epoch(),
9018 con
->send_message(m
);
9024 * send out pending queries for info | summaries
9026 void OSD::do_queries(map
<int, map
<spg_t
,pg_query_t
> >& query_map
,
9029 for (map
<int, map
<spg_t
,pg_query_t
> >::iterator pit
= query_map
.begin();
9030 pit
!= query_map
.end();
9032 if (!curmap
->is_up(pit
->first
)) {
9033 dout(20) << __func__
<< " skipping down osd." << pit
->first
<< dendl
;
9036 int who
= pit
->first
;
9037 ConnectionRef con
= service
.get_con_osd_cluster(who
, curmap
->get_epoch());
9039 dout(20) << __func__
<< " skipping osd." << who
9040 << " (NULL con)" << dendl
;
9043 service
.share_map_peer(who
, con
.get(), curmap
);
9044 dout(7) << __func__
<< " querying osd." << who
9045 << " on " << pit
->second
.size() << " PGs" << dendl
;
9046 MOSDPGQuery
*m
= new MOSDPGQuery(curmap
->get_epoch(), pit
->second
);
9047 con
->send_message(m
);
9052 void OSD::do_infos(map
<int,
9053 vector
<pair
<pg_notify_t
, PastIntervals
> > >& info_map
,
9057 vector
<pair
<pg_notify_t
, PastIntervals
> > >::iterator p
=
9059 p
!= info_map
.end();
9061 if (!curmap
->is_up(p
->first
)) {
9062 dout(20) << __func__
<< " skipping down osd." << p
->first
<< dendl
;
9065 for (vector
<pair
<pg_notify_t
,PastIntervals
> >::iterator i
= p
->second
.begin();
9066 i
!= p
->second
.end();
9068 dout(20) << __func__
<< " sending info " << i
->first
.info
9069 << " to shard " << p
->first
<< dendl
;
9071 ConnectionRef con
= service
.get_con_osd_cluster(
9072 p
->first
, curmap
->get_epoch());
9074 dout(20) << __func__
<< " skipping osd." << p
->first
9075 << " (NULL con)" << dendl
;
9078 service
.share_map_peer(p
->first
, con
.get(), curmap
);
9079 MOSDPGInfo
*m
= new MOSDPGInfo(curmap
->get_epoch());
9080 m
->pg_list
= p
->second
;
9081 con
->send_message(m
);
9088 * from non-primary to primary
9089 * includes pg_info_t.
9090 * NOTE: called with opqueue active.
9092 void OSD::handle_pg_notify(OpRequestRef op
)
9094 const MOSDPGNotify
*m
= static_cast<const MOSDPGNotify
*>(op
->get_req());
9095 assert(m
->get_type() == MSG_OSD_PG_NOTIFY
);
9097 dout(7) << "handle_pg_notify from " << m
->get_source() << dendl
;
9098 int from
= m
->get_source().num();
9100 if (!require_osd_peer(op
->get_req()))
9103 if (!require_same_or_newer_map(op
, m
->get_epoch(), false))
9108 for (auto it
= m
->get_pg_list().begin();
9109 it
!= m
->get_pg_list().end();
9111 if (it
->first
.info
.pgid
.preferred() >= 0) {
9112 dout(20) << "ignoring localized pg " << it
->first
.info
.pgid
<< dendl
;
9116 handle_pg_peering_evt(
9117 spg_t(it
->first
.info
.pgid
.pgid
, it
->first
.to
),
9118 it
->first
.info
.history
, it
->second
,
9119 it
->first
.query_epoch
,
9120 PG::CephPeeringEvtRef(
9121 new PG::CephPeeringEvt(
9122 it
->first
.epoch_sent
, it
->first
.query_epoch
,
9123 PG::MNotifyRec(pg_shard_t(from
, it
->first
.from
), it
->first
,
9124 op
->get_req()->get_connection()->get_features())))
9129 void OSD::handle_pg_log(OpRequestRef op
)
9131 MOSDPGLog
*m
= static_cast<MOSDPGLog
*>(op
->get_nonconst_req());
9132 assert(m
->get_type() == MSG_OSD_PG_LOG
);
9133 dout(7) << "handle_pg_log " << *m
<< " from " << m
->get_source() << dendl
;
9135 if (!require_osd_peer(op
->get_req()))
9138 int from
= m
->get_source().num();
9139 if (!require_same_or_newer_map(op
, m
->get_epoch(), false))
9142 if (m
->info
.pgid
.preferred() >= 0) {
9143 dout(10) << "ignoring localized pg " << m
->info
.pgid
<< dendl
;
9148 handle_pg_peering_evt(
9149 spg_t(m
->info
.pgid
.pgid
, m
->to
),
9150 m
->info
.history
, m
->past_intervals
, m
->get_epoch(),
9151 PG::CephPeeringEvtRef(
9152 new PG::CephPeeringEvt(
9153 m
->get_epoch(), m
->get_query_epoch(),
9154 PG::MLogRec(pg_shard_t(from
, m
->from
), m
)))
9158 void OSD::handle_pg_info(OpRequestRef op
)
9160 const MOSDPGInfo
*m
= static_cast<const MOSDPGInfo
*>(op
->get_req());
9161 assert(m
->get_type() == MSG_OSD_PG_INFO
);
9162 dout(7) << "handle_pg_info " << *m
<< " from " << m
->get_source() << dendl
;
9164 if (!require_osd_peer(op
->get_req()))
9167 int from
= m
->get_source().num();
9168 if (!require_same_or_newer_map(op
, m
->get_epoch(), false))
9173 for (auto p
= m
->pg_list
.begin();
9174 p
!= m
->pg_list
.end();
9176 if (p
->first
.info
.pgid
.preferred() >= 0) {
9177 dout(10) << "ignoring localized pg " << p
->first
.info
.pgid
<< dendl
;
9181 handle_pg_peering_evt(
9182 spg_t(p
->first
.info
.pgid
.pgid
, p
->first
.to
),
9183 p
->first
.info
.history
, p
->second
, p
->first
.epoch_sent
,
9184 PG::CephPeeringEvtRef(
9185 new PG::CephPeeringEvt(
9186 p
->first
.epoch_sent
, p
->first
.query_epoch
,
9189 from
, p
->first
.from
), p
->first
.info
, p
->first
.epoch_sent
)))
9194 void OSD::handle_pg_trim(OpRequestRef op
)
9196 const MOSDPGTrim
*m
= static_cast<const MOSDPGTrim
*>(op
->get_req());
9197 assert(m
->get_type() == MSG_OSD_PG_TRIM
);
9199 dout(7) << "handle_pg_trim " << *m
<< " from " << m
->get_source() << dendl
;
9201 if (!require_osd_peer(op
->get_req()))
9204 int from
= m
->get_source().num();
9205 if (!require_same_or_newer_map(op
, m
->epoch
, false))
9208 if (m
->pgid
.preferred() >= 0) {
9209 dout(10) << "ignoring localized pg " << m
->pgid
<< dendl
;
9215 PG
*pg
= _lookup_lock_pg(m
->pgid
);
9217 dout(10) << " don't have pg " << m
->pgid
<< dendl
;
9221 if (m
->epoch
< pg
->info
.history
.same_interval_since
) {
9222 dout(10) << *pg
<< " got old trim to " << m
->trim_to
<< ", ignoring" << dendl
;
9227 if (pg
->is_primary()) {
9228 // peer is informing us of their last_complete_ondisk
9229 dout(10) << *pg
<< " replica osd." << from
<< " lcod " << m
->trim_to
<< dendl
;
9230 pg
->peer_last_complete_ondisk
[pg_shard_t(from
, m
->pgid
.shard
)] =
9232 // trim log when the pg is recovered
9233 pg
->calc_min_last_complete_ondisk();
9235 // primary is instructing us to trim
9236 ObjectStore::Transaction t
;
9237 pg
->pg_log
.trim(m
->trim_to
, pg
->info
);
9238 pg
->dirty_info
= true;
9239 pg
->write_if_dirty(t
);
9240 int tr
= store
->queue_transaction(pg
->osr
.get(), std::move(t
), NULL
);
9246 void OSD::handle_pg_backfill_reserve(OpRequestRef op
)
9248 const MBackfillReserve
*m
= static_cast<const MBackfillReserve
*>(op
->get_req());
9249 assert(m
->get_type() == MSG_OSD_BACKFILL_RESERVE
);
9251 if (!require_osd_peer(op
->get_req()))
9253 if (!require_same_or_newer_map(op
, m
->query_epoch
, false))
9256 PG::CephPeeringEvtRef evt
;
9257 if (m
->type
== MBackfillReserve::REQUEST
) {
9258 evt
= PG::CephPeeringEvtRef(
9259 new PG::CephPeeringEvt(
9262 PG::RequestBackfillPrio(m
->priority
)));
9263 } else if (m
->type
== MBackfillReserve::GRANT
) {
9264 evt
= PG::CephPeeringEvtRef(
9265 new PG::CephPeeringEvt(
9268 PG::RemoteBackfillReserved()));
9269 } else if (m
->type
== MBackfillReserve::REJECT
) {
9270 // NOTE: this is replica -> primary "i reject your request"
9271 // and also primary -> replica "cancel my previously-granted request"
9272 evt
= PG::CephPeeringEvtRef(
9273 new PG::CephPeeringEvt(
9276 PG::RemoteReservationRejected()));
9281 if (service
.splitting(m
->pgid
)) {
9282 peering_wait_for_split
[m
->pgid
].push_back(evt
);
9286 PG
*pg
= _lookup_lock_pg(m
->pgid
);
9288 dout(10) << " don't have pg " << m
->pgid
<< dendl
;
9292 pg
->queue_peering_event(evt
);
9296 void OSD::handle_pg_recovery_reserve(OpRequestRef op
)
9298 const MRecoveryReserve
*m
= static_cast<const MRecoveryReserve
*>(op
->get_req());
9299 assert(m
->get_type() == MSG_OSD_RECOVERY_RESERVE
);
9301 if (!require_osd_peer(op
->get_req()))
9303 if (!require_same_or_newer_map(op
, m
->query_epoch
, false))
9306 PG::CephPeeringEvtRef evt
;
9307 if (m
->type
== MRecoveryReserve::REQUEST
) {
9308 evt
= PG::CephPeeringEvtRef(
9309 new PG::CephPeeringEvt(
9312 PG::RequestRecovery()));
9313 } else if (m
->type
== MRecoveryReserve::GRANT
) {
9314 evt
= PG::CephPeeringEvtRef(
9315 new PG::CephPeeringEvt(
9318 PG::RemoteRecoveryReserved()));
9319 } else if (m
->type
== MRecoveryReserve::RELEASE
) {
9320 evt
= PG::CephPeeringEvtRef(
9321 new PG::CephPeeringEvt(
9324 PG::RecoveryDone()));
9329 if (service
.splitting(m
->pgid
)) {
9330 peering_wait_for_split
[m
->pgid
].push_back(evt
);
9334 PG
*pg
= _lookup_lock_pg(m
->pgid
);
9336 dout(10) << " don't have pg " << m
->pgid
<< dendl
;
9340 pg
->queue_peering_event(evt
);
9344 void OSD::handle_force_recovery(Message
*m
)
9346 MOSDForceRecovery
*msg
= static_cast<MOSDForceRecovery
*>(m
);
9347 assert(msg
->get_type() == MSG_OSD_FORCE_RECOVERY
);
9349 vector
<PGRef
> local_pgs
;
9350 local_pgs
.reserve(msg
->forced_pgs
.size());
9353 RWLock::RLocker
l(pg_map_lock
);
9354 for (auto& i
: msg
->forced_pgs
) {
9356 if (osdmap
->get_primary_shard(i
, &locpg
)) {
9357 auto pg_map_entry
= pg_map
.find(locpg
);
9358 if (pg_map_entry
!= pg_map
.end()) {
9359 local_pgs
.push_back(pg_map_entry
->second
);
9365 if (local_pgs
.size()) {
9366 service
.adjust_pg_priorities(local_pgs
, msg
->options
);
9373 * from primary to replica | stray
9374 * NOTE: called with opqueue active.
9376 void OSD::handle_pg_query(OpRequestRef op
)
9378 assert(osd_lock
.is_locked());
9380 const MOSDPGQuery
*m
= static_cast<const MOSDPGQuery
*>(op
->get_req());
9381 assert(m
->get_type() == MSG_OSD_PG_QUERY
);
9383 if (!require_osd_peer(op
->get_req()))
9386 dout(7) << "handle_pg_query from " << m
->get_source() << " epoch " << m
->get_epoch() << dendl
;
9387 int from
= m
->get_source().num();
9389 if (!require_same_or_newer_map(op
, m
->get_epoch(), false))
9394 map
< int, vector
<pair
<pg_notify_t
, PastIntervals
> > > notify_list
;
9396 for (auto it
= m
->pg_list
.begin();
9397 it
!= m
->pg_list
.end();
9399 spg_t pgid
= it
->first
;
9401 if (pgid
.preferred() >= 0) {
9402 dout(10) << "ignoring localized pg " << pgid
<< dendl
;
9406 if (service
.splitting(pgid
)) {
9407 peering_wait_for_split
[pgid
].push_back(
9408 PG::CephPeeringEvtRef(
9409 new PG::CephPeeringEvt(
9410 it
->second
.epoch_sent
, it
->second
.epoch_sent
,
9411 PG::MQuery(pg_shard_t(from
, it
->second
.from
),
9412 it
->second
, it
->second
.epoch_sent
))));
9417 RWLock::RLocker
l(pg_map_lock
);
9418 if (pg_map
.count(pgid
)) {
9420 pg
= _lookup_lock_pg_with_map_lock_held(pgid
);
9422 it
->second
.epoch_sent
, it
->second
.epoch_sent
,
9423 pg_shard_t(from
, it
->second
.from
), it
->second
);
9429 if (!osdmap
->have_pg_pool(pgid
.pool()))
9432 // get active crush mapping
9433 int up_primary
, acting_primary
;
9434 vector
<int> up
, acting
;
9435 osdmap
->pg_to_up_acting_osds(
9436 pgid
.pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
9439 pg_history_t history
= it
->second
.history
;
9440 bool valid_history
= project_pg_history(
9441 pgid
, history
, it
->second
.epoch_sent
,
9442 up
, up_primary
, acting
, acting_primary
);
9444 if (!valid_history
||
9445 it
->second
.epoch_sent
< history
.same_interval_since
) {
9446 dout(10) << " pg " << pgid
<< " dne, and pg has changed in "
9447 << history
.same_interval_since
9448 << " (msg from " << it
->second
.epoch_sent
<< ")" << dendl
;
9452 dout(10) << " pg " << pgid
<< " dne" << dendl
;
9453 pg_info_t
empty(spg_t(pgid
.pgid
, it
->second
.to
));
9454 /* This is racy, but that should be ok: if we complete the deletion
9455 * before the pg is recreated, we'll just start it off backfilling
9456 * instead of just empty */
9457 if (service
.deleting_pgs
.lookup(pgid
))
9458 empty
.set_last_backfill(hobject_t());
9459 if (it
->second
.type
== pg_query_t::LOG
||
9460 it
->second
.type
== pg_query_t::FULLLOG
) {
9461 ConnectionRef con
= service
.get_con_osd_cluster(from
, osdmap
->get_epoch());
9463 MOSDPGLog
*mlog
= new MOSDPGLog(
9464 it
->second
.from
, it
->second
.to
,
9465 osdmap
->get_epoch(), empty
,
9466 it
->second
.epoch_sent
);
9467 service
.share_map_peer(from
, con
.get(), osdmap
);
9468 con
->send_message(mlog
);
9471 notify_list
[from
].push_back(
9474 it
->second
.from
, it
->second
.to
,
9475 it
->second
.epoch_sent
,
9476 osdmap
->get_epoch(),
9479 osdmap
->get_pools().at(pgid
.pool()).ec_pool(),
9483 do_notifies(notify_list
, osdmap
);
9487 void OSD::handle_pg_remove(OpRequestRef op
)
9489 const MOSDPGRemove
*m
= static_cast<const MOSDPGRemove
*>(op
->get_req());
9490 assert(m
->get_type() == MSG_OSD_PG_REMOVE
);
9491 assert(osd_lock
.is_locked());
9493 if (!require_osd_peer(op
->get_req()))
9496 dout(7) << "handle_pg_remove from " << m
->get_source() << " on "
9497 << m
->pg_list
.size() << " pgs" << dendl
;
9499 if (!require_same_or_newer_map(op
, m
->get_epoch(), false))
9504 for (auto it
= m
->pg_list
.begin();
9505 it
!= m
->pg_list
.end();
9508 if (pgid
.preferred() >= 0) {
9509 dout(10) << "ignoring localized pg " << pgid
<< dendl
;
9513 RWLock::WLocker
l(pg_map_lock
);
9514 if (pg_map
.count(pgid
) == 0) {
9515 dout(10) << " don't have pg " << pgid
<< dendl
;
9518 dout(5) << "queue_pg_for_deletion: " << pgid
<< dendl
;
9519 PG
*pg
= _lookup_lock_pg_with_map_lock_held(pgid
);
9520 pg_history_t history
= pg
->info
.history
;
9521 int up_primary
, acting_primary
;
9522 vector
<int> up
, acting
;
9523 osdmap
->pg_to_up_acting_osds(
9524 pgid
.pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
9525 bool valid_history
= project_pg_history(
9526 pg
->info
.pgid
, history
, pg
->get_osdmap()->get_epoch(),
9527 up
, up_primary
, acting
, acting_primary
);
9528 if (valid_history
&&
9529 history
.same_interval_since
<= m
->get_epoch()) {
9530 assert(pg
->get_primary().osd
== m
->get_source().num());
9535 dout(10) << *pg
<< " ignoring remove request, pg changed in epoch "
9536 << history
.same_interval_since
9537 << " > " << m
->get_epoch() << dendl
;
9543 void OSD::_remove_pg(PG
*pg
)
9545 ObjectStore::Transaction rmt
;
9547 // on_removal, which calls remove_watchers_and_notifies, and the erasure from
9548 // the pg_map must be done together without unlocking the pg lock,
9549 // to avoid racing with watcher cleanup in ms_handle_reset
9550 // and handle_notify_timeout
9551 pg
->on_removal(&rmt
);
9553 service
.cancel_pending_splits_for_parent(pg
->info
.pgid
);
9554 int tr
= store
->queue_transaction(
9555 pg
->osr
.get(), std::move(rmt
), NULL
,
9556 new ContainerContext
<
9557 SequencerRef
>(pg
->osr
));
9560 DeletingStateRef deleting
= service
.deleting_pgs
.lookup_or_create(
9566 remove_wq
.queue(make_pair(PGRef(pg
), deleting
));
9568 service
.pg_remove_epoch(pg
->info
.pgid
);
9570 // dereference from op_wq
9571 op_shardedwq
.clear_pg_pointer(pg
->info
.pgid
);
9574 pg_map
.erase(pg
->info
.pgid
);
9575 pg
->put("PGMap"); // since we've taken it out of map
9578 // =========================================================
9581 void OSDService::_maybe_queue_recovery() {
9582 assert(recovery_lock
.is_locked_by_me());
9583 uint64_t available_pushes
;
9584 while (!awaiting_throttle
.empty() &&
9585 _recover_now(&available_pushes
)) {
9586 uint64_t to_start
= MIN(
9588 cct
->_conf
->osd_recovery_max_single_start
);
9589 _queue_for_recovery(awaiting_throttle
.front(), to_start
);
9590 awaiting_throttle
.pop_front();
9591 recovery_ops_reserved
+= to_start
;
9595 bool OSDService::_recover_now(uint64_t *available_pushes
)
9597 if (available_pushes
)
9598 *available_pushes
= 0;
9600 if (ceph_clock_now() < defer_recovery_until
) {
9601 dout(15) << __func__
<< " defer until " << defer_recovery_until
<< dendl
;
9605 if (recovery_paused
) {
9606 dout(15) << __func__
<< " paused" << dendl
;
9610 uint64_t max
= cct
->_conf
->osd_recovery_max_active
;
9611 if (max
<= recovery_ops_active
+ recovery_ops_reserved
) {
9612 dout(15) << __func__
<< " active " << recovery_ops_active
9613 << " + reserved " << recovery_ops_reserved
9614 << " >= max " << max
<< dendl
;
9618 if (available_pushes
)
9619 *available_pushes
= max
- recovery_ops_active
- recovery_ops_reserved
;
9625 void OSDService::adjust_pg_priorities(const vector
<PGRef
>& pgs
, int newflags
)
9627 if (!pgs
.size() || !(newflags
& (OFR_BACKFILL
| OFR_RECOVERY
))) {
9631 if (newflags
& OFR_BACKFILL
) {
9632 for (auto& pg
: pgs
) {
9633 if (pg
->set_force_backfill(!(newflags
& OFR_CANCEL
))) {
9634 did
.insert(pg
->pg_id
);
9637 } else if (newflags
& OFR_RECOVERY
) {
9638 for (auto& pg
: pgs
) {
9639 if (pg
->set_force_recovery(!(newflags
& OFR_CANCEL
))) {
9640 did
.insert(pg
->pg_id
);
9645 dout(10) << __func__
<< " " << ((newflags
& OFR_CANCEL
) ? "cleared" : "set")
9646 << " force_" << ((newflags
& OFR_BACKFILL
) ? "backfill" : "recovery")
9647 << " on no pgs" << dendl
;
9649 dout(10) << __func__
<< " " << ((newflags
& OFR_CANCEL
) ? "cleared" : "set")
9650 << " force_" << ((newflags
& OFR_BACKFILL
) ? "backfill" : "recovery")
9651 << " on " << did
<< dendl
;
9655 void OSD::do_recovery(
9656 PG
*pg
, epoch_t queued
, uint64_t reserved_pushes
,
9657 ThreadPool::TPHandle
&handle
)
9659 uint64_t started
= 0;
9662 * When the value of osd_recovery_sleep is set greater than zero, recovery
9663 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9664 * recovery event's schedule time. This is done by adding a
9665 * recovery_requeue_callback event, which re-queues the recovery op using
9666 * queue_recovery_after_sleep.
9668 float recovery_sleep
= get_osd_recovery_sleep();
9670 Mutex::Locker
l(service
.recovery_sleep_lock
);
9671 if (recovery_sleep
> 0 && service
.recovery_needs_sleep
) {
9673 auto recovery_requeue_callback
= new FunctionContext([this, pgref
, queued
, reserved_pushes
](int r
) {
9674 dout(20) << "do_recovery wake up at "
9676 << ", re-queuing recovery" << dendl
;
9677 Mutex::Locker
l(service
.recovery_sleep_lock
);
9678 service
.recovery_needs_sleep
= false;
9679 service
.queue_recovery_after_sleep(pgref
.get(), queued
, reserved_pushes
);
9682 // This is true for the first recovery op and when the previous recovery op
9683 // has been scheduled in the past. The next recovery op is scheduled after
9684 // completing the sleep from now.
9685 if (service
.recovery_schedule_time
< ceph_clock_now()) {
9686 service
.recovery_schedule_time
= ceph_clock_now();
9688 service
.recovery_schedule_time
+= recovery_sleep
;
9689 service
.recovery_sleep_timer
.add_event_at(service
.recovery_schedule_time
,
9690 recovery_requeue_callback
);
9691 dout(20) << "Recovery event scheduled at "
9692 << service
.recovery_schedule_time
<< dendl
;
9699 Mutex::Locker
l(service
.recovery_sleep_lock
);
9700 service
.recovery_needs_sleep
= true;
9703 if (pg
->pg_has_reset_since(queued
)) {
9707 assert(!pg
->deleting
);
9708 assert(pg
->is_peered() && pg
->is_primary());
9710 assert(pg
->recovery_queued
);
9711 pg
->recovery_queued
= false;
9713 dout(10) << "do_recovery starting " << reserved_pushes
<< " " << *pg
<< dendl
;
9714 #ifdef DEBUG_RECOVERY_OIDS
9715 dout(20) << " active was " << service
.recovery_oids
[pg
->info
.pgid
] << dendl
;
9718 bool more
= pg
->start_recovery_ops(reserved_pushes
, handle
, &started
);
9719 dout(10) << "do_recovery started " << started
<< "/" << reserved_pushes
9720 << " on " << *pg
<< dendl
;
9722 // If no recovery op is started, don't bother to manipulate the RecoveryCtx
9723 if (!started
&& (more
|| !pg
->have_unfound())) {
9727 PG::RecoveryCtx rctx
= create_context();
9728 rctx
.handle
= &handle
;
9731 * if we couldn't start any recovery ops and things are still
9732 * unfound, see if we can discover more missing object locations.
9733 * It may be that our initial locations were bad and we errored
9734 * out while trying to pull.
9736 if (!more
&& pg
->have_unfound()) {
9737 pg
->discover_all_missing(*rctx
.query_map
);
9738 if (rctx
.query_map
->empty()) {
9740 if (pg
->state_test(PG_STATE_BACKFILLING
)) {
9741 auto evt
= PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
9744 PG::DeferBackfill(cct
->_conf
->osd_recovery_retry_interval
)));
9745 pg
->queue_peering_event(evt
);
9746 action
= "in backfill";
9747 } else if (pg
->state_test(PG_STATE_RECOVERING
)) {
9748 auto evt
= PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
9751 PG::DeferRecovery(cct
->_conf
->osd_recovery_retry_interval
)));
9752 pg
->queue_peering_event(evt
);
9753 action
= "in recovery";
9755 action
= "already out of recovery/backfill";
9757 dout(10) << __func__
<< ": no luck, giving up on this pg for now (" << action
<< ")" << dendl
;
9759 dout(10) << __func__
<< ": no luck, giving up on this pg for now (queue_recovery)" << dendl
;
9760 pg
->queue_recovery();
9764 pg
->write_if_dirty(*rctx
.transaction
);
9765 OSDMapRef curmap
= pg
->get_osdmap();
9766 dispatch_context(rctx
, pg
, curmap
);
9770 assert(started
<= reserved_pushes
);
9771 service
.release_reserved_pushes(reserved_pushes
);
9774 void OSDService::start_recovery_op(PG
*pg
, const hobject_t
& soid
)
9776 Mutex::Locker
l(recovery_lock
);
9777 dout(10) << "start_recovery_op " << *pg
<< " " << soid
9778 << " (" << recovery_ops_active
<< "/"
9779 << cct
->_conf
->osd_recovery_max_active
<< " rops)"
9781 recovery_ops_active
++;
9783 #ifdef DEBUG_RECOVERY_OIDS
9784 dout(20) << " active was " << recovery_oids
[pg
->info
.pgid
] << dendl
;
9785 assert(recovery_oids
[pg
->info
.pgid
].count(soid
) == 0);
9786 recovery_oids
[pg
->info
.pgid
].insert(soid
);
9790 void OSDService::finish_recovery_op(PG
*pg
, const hobject_t
& soid
, bool dequeue
)
9792 Mutex::Locker
l(recovery_lock
);
9793 dout(10) << "finish_recovery_op " << *pg
<< " " << soid
9794 << " dequeue=" << dequeue
9795 << " (" << recovery_ops_active
<< "/" << cct
->_conf
->osd_recovery_max_active
<< " rops)"
9799 assert(recovery_ops_active
> 0);
9800 recovery_ops_active
--;
9802 #ifdef DEBUG_RECOVERY_OIDS
9803 dout(20) << " active oids was " << recovery_oids
[pg
->info
.pgid
] << dendl
;
9804 assert(recovery_oids
[pg
->info
.pgid
].count(soid
));
9805 recovery_oids
[pg
->info
.pgid
].erase(soid
);
9808 _maybe_queue_recovery();
9811 bool OSDService::is_recovery_active()
9813 return local_reserver
.has_reservation() || remote_reserver
.has_reservation();
9816 // =========================================================
9819 bool OSD::op_is_discardable(const MOSDOp
*op
)
9821 // drop client request if they are not connected and can't get the
9823 if (!op
->get_connection()->is_connected()) {
9829 void OSD::enqueue_op(spg_t pg
, OpRequestRef
& op
, epoch_t epoch
)
9831 utime_t latency
= ceph_clock_now() - op
->get_req()->get_recv_stamp();
9832 dout(15) << "enqueue_op " << op
<< " prio " << op
->get_req()->get_priority()
9833 << " cost " << op
->get_req()->get_cost()
9834 << " latency " << latency
9835 << " epoch " << epoch
9836 << " " << *(op
->get_req()) << dendl
;
9837 op
->osd_trace
.event("enqueue op");
9838 op
->osd_trace
.keyval("priority", op
->get_req()->get_priority());
9839 op
->osd_trace
.keyval("cost", op
->get_req()->get_cost());
9840 op
->mark_queued_for_pg();
9841 logger
->tinc(l_osd_op_before_queue_op_lat
, latency
);
9842 op_shardedwq
.queue(make_pair(pg
, PGQueueable(op
, epoch
)));
9848 * NOTE: dequeue called in worker thread, with pg lock
9850 void OSD::dequeue_op(
9851 PGRef pg
, OpRequestRef op
,
9852 ThreadPool::TPHandle
&handle
)
9855 OID_EVENT_TRACE_WITH_MSG(op
->get_req(), "DEQUEUE_OP_BEGIN", false);
9857 utime_t now
= ceph_clock_now();
9858 op
->set_dequeued_time(now
);
9859 utime_t latency
= now
- op
->get_req()->get_recv_stamp();
9860 dout(10) << "dequeue_op " << op
<< " prio " << op
->get_req()->get_priority()
9861 << " cost " << op
->get_req()->get_cost()
9862 << " latency " << latency
9863 << " " << *(op
->get_req())
9864 << " pg " << *pg
<< dendl
;
9866 logger
->tinc(l_osd_op_before_dequeue_op_lat
, latency
);
9868 Session
*session
= static_cast<Session
*>(
9869 op
->get_req()->get_connection()->get_priv());
9871 maybe_share_map(session
, op
, pg
->get_osdmap());
9878 op
->mark_reached_pg();
9879 op
->osd_trace
.event("dequeue_op");
9881 pg
->do_request(op
, handle
);
9884 dout(10) << "dequeue_op " << op
<< " finish" << dendl
;
9885 OID_EVENT_TRACE_WITH_MSG(op
->get_req(), "DEQUEUE_OP_END", false);
9889 struct C_CompleteSplits
: public Context
{
9892 C_CompleteSplits(OSD
*osd
, const set
<PGRef
> &in
)
9893 : osd(osd
), pgs(in
) {}
9894 void finish(int r
) override
{
9895 Mutex::Locker
l(osd
->osd_lock
);
9896 if (osd
->is_stopping())
9898 PG::RecoveryCtx rctx
= osd
->create_context();
9899 for (set
<PGRef
>::iterator i
= pgs
.begin();
9902 osd
->pg_map_lock
.get_write();
9905 osd
->add_newly_split_pg(pg
, &rctx
);
9906 if (!((*i
)->deleting
)) {
9907 set
<spg_t
> to_complete
;
9908 to_complete
.insert((*i
)->info
.pgid
);
9909 osd
->service
.complete_split(to_complete
);
9911 osd
->pg_map_lock
.put_write();
9912 osd
->dispatch_context_transaction(rctx
, pg
);
9913 osd
->wake_pg_waiters(*i
);
9917 osd
->dispatch_context(rctx
, 0, osd
->service
.get_osdmap());
9921 void OSD::process_peering_events(
9922 const list
<PG
*> &pgs
,
9923 ThreadPool::TPHandle
&handle
9926 bool need_up_thru
= false;
9927 epoch_t same_interval_since
= 0;
9929 PG::RecoveryCtx rctx
= create_context();
9930 rctx
.handle
= &handle
;
9931 for (list
<PG
*>::const_iterator i
= pgs
.begin();
9934 set
<PGRef
> split_pgs
;
9936 pg
->lock_suspend_timeout(handle
);
9937 curmap
= service
.get_osdmap();
9942 if (!advance_pg(curmap
->get_epoch(), pg
, handle
, &rctx
, &split_pgs
)) {
9943 // we need to requeue the PG explicitly since we didn't actually
9945 peering_wq
.queue(pg
);
9947 assert(!pg
->peering_queue
.empty());
9948 PG::CephPeeringEvtRef evt
= pg
->peering_queue
.front();
9949 pg
->peering_queue
.pop_front();
9950 pg
->handle_peering_event(evt
, &rctx
);
9952 need_up_thru
= pg
->need_up_thru
|| need_up_thru
;
9953 same_interval_since
= MAX(pg
->info
.history
.same_interval_since
,
9954 same_interval_since
);
9955 pg
->write_if_dirty(*rctx
.transaction
);
9956 if (!split_pgs
.empty()) {
9957 rctx
.on_applied
->add(new C_CompleteSplits(this, split_pgs
));
9960 dispatch_context_transaction(rctx
, pg
, &handle
);
9964 queue_want_up_thru(same_interval_since
);
9965 dispatch_context(rctx
, 0, curmap
, &handle
);
9967 service
.send_pg_temp();
9970 // --------------------------------
9972 const char** OSD::get_tracked_conf_keys() const
9974 static const char* KEYS
[] = {
9975 "osd_max_backfills",
9976 "osd_min_recovery_priority",
9977 "osd_max_trimming_pgs",
9978 "osd_op_complaint_time",
9979 "osd_op_log_threshold",
9980 "osd_op_history_size",
9981 "osd_op_history_duration",
9982 "osd_op_history_slow_op_size",
9983 "osd_op_history_slow_op_threshold",
9984 "osd_enable_op_tracker",
9985 "osd_map_cache_size",
9986 "osd_map_max_advance",
9987 "osd_pg_epoch_persisted_max_stale",
9988 "osd_disk_thread_ioprio_class",
9989 "osd_disk_thread_ioprio_priority",
9990 // clog & admin clog
9993 "clog_to_syslog_facility",
9994 "clog_to_syslog_level",
9995 "osd_objectstore_fuse",
9997 "clog_to_graylog_host",
9998 "clog_to_graylog_port",
10001 "osd_recovery_delay_start",
10002 "osd_client_message_size_cap",
10003 "osd_client_message_cap",
10004 "osd_heartbeat_min_size",
10005 "osd_heartbeat_interval",
10011 void OSD::handle_conf_change(const struct md_config_t
*conf
,
10012 const std::set
<std::string
> &changed
)
10014 Mutex::Locker
l(osd_lock
);
10015 if (changed
.count("osd_max_backfills")) {
10016 service
.local_reserver
.set_max(cct
->_conf
->osd_max_backfills
);
10017 service
.remote_reserver
.set_max(cct
->_conf
->osd_max_backfills
);
10019 if (changed
.count("osd_min_recovery_priority")) {
10020 service
.local_reserver
.set_min_priority(cct
->_conf
->osd_min_recovery_priority
);
10021 service
.remote_reserver
.set_min_priority(cct
->_conf
->osd_min_recovery_priority
);
10023 if (changed
.count("osd_max_trimming_pgs")) {
10024 service
.snap_reserver
.set_max(cct
->_conf
->osd_max_trimming_pgs
);
10026 if (changed
.count("osd_op_complaint_time") ||
10027 changed
.count("osd_op_log_threshold")) {
10028 op_tracker
.set_complaint_and_threshold(cct
->_conf
->osd_op_complaint_time
,
10029 cct
->_conf
->osd_op_log_threshold
);
10031 if (changed
.count("osd_op_history_size") ||
10032 changed
.count("osd_op_history_duration")) {
10033 op_tracker
.set_history_size_and_duration(cct
->_conf
->osd_op_history_size
,
10034 cct
->_conf
->osd_op_history_duration
);
10036 if (changed
.count("osd_op_history_slow_op_size") ||
10037 changed
.count("osd_op_history_slow_op_threshold")) {
10038 op_tracker
.set_history_slow_op_size_and_threshold(cct
->_conf
->osd_op_history_slow_op_size
,
10039 cct
->_conf
->osd_op_history_slow_op_threshold
);
10041 if (changed
.count("osd_enable_op_tracker")) {
10042 op_tracker
.set_tracking(cct
->_conf
->osd_enable_op_tracker
);
10044 if (changed
.count("osd_disk_thread_ioprio_class") ||
10045 changed
.count("osd_disk_thread_ioprio_priority")) {
10046 set_disk_tp_priority();
10048 if (changed
.count("osd_map_cache_size")) {
10049 service
.map_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
10050 service
.map_bl_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
10051 service
.map_bl_inc_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
10053 if (changed
.count("clog_to_monitors") ||
10054 changed
.count("clog_to_syslog") ||
10055 changed
.count("clog_to_syslog_level") ||
10056 changed
.count("clog_to_syslog_facility") ||
10057 changed
.count("clog_to_graylog") ||
10058 changed
.count("clog_to_graylog_host") ||
10059 changed
.count("clog_to_graylog_port") ||
10060 changed
.count("host") ||
10061 changed
.count("fsid")) {
10062 update_log_config();
10065 #ifdef HAVE_LIBFUSE
10066 if (changed
.count("osd_objectstore_fuse")) {
10068 enable_disable_fuse(false);
10073 if (changed
.count("osd_recovery_delay_start")) {
10074 service
.defer_recovery(cct
->_conf
->osd_recovery_delay_start
);
10075 service
.kick_recovery_queue();
10078 if (changed
.count("osd_client_message_cap")) {
10079 uint64_t newval
= cct
->_conf
->osd_client_message_cap
;
10080 Messenger::Policy pol
= client_messenger
->get_policy(entity_name_t::TYPE_CLIENT
);
10081 if (pol
.throttler_messages
&& newval
> 0) {
10082 pol
.throttler_messages
->reset_max(newval
);
10085 if (changed
.count("osd_client_message_size_cap")) {
10086 uint64_t newval
= cct
->_conf
->osd_client_message_size_cap
;
10087 Messenger::Policy pol
= client_messenger
->get_policy(entity_name_t::TYPE_CLIENT
);
10088 if (pol
.throttler_bytes
&& newval
> 0) {
10089 pol
.throttler_bytes
->reset_max(newval
);
10096 void OSD::update_log_config()
10098 map
<string
,string
> log_to_monitors
;
10099 map
<string
,string
> log_to_syslog
;
10100 map
<string
,string
> log_channel
;
10101 map
<string
,string
> log_prio
;
10102 map
<string
,string
> log_to_graylog
;
10103 map
<string
,string
> log_to_graylog_host
;
10104 map
<string
,string
> log_to_graylog_port
;
10108 if (parse_log_client_options(cct
, log_to_monitors
, log_to_syslog
,
10109 log_channel
, log_prio
, log_to_graylog
,
10110 log_to_graylog_host
, log_to_graylog_port
,
10112 clog
->update_config(log_to_monitors
, log_to_syslog
,
10113 log_channel
, log_prio
, log_to_graylog
,
10114 log_to_graylog_host
, log_to_graylog_port
,
10116 derr
<< "log_to_monitors " << log_to_monitors
<< dendl
;
10119 void OSD::check_config()
10121 // some sanity checks
10122 if (cct
->_conf
->osd_map_cache_size
<= cct
->_conf
->osd_map_max_advance
+ 2) {
10123 clog
->warn() << "osd_map_cache_size (" << cct
->_conf
->osd_map_cache_size
<< ")"
10124 << " is not > osd_map_max_advance ("
10125 << cct
->_conf
->osd_map_max_advance
<< ")";
10127 if (cct
->_conf
->osd_map_cache_size
<= (int)cct
->_conf
->osd_pg_epoch_persisted_max_stale
+ 2) {
10128 clog
->warn() << "osd_map_cache_size (" << cct
->_conf
->osd_map_cache_size
<< ")"
10129 << " is not > osd_pg_epoch_persisted_max_stale ("
10130 << cct
->_conf
->osd_pg_epoch_persisted_max_stale
<< ")";
10134 void OSD::set_disk_tp_priority()
10136 dout(10) << __func__
10137 << " class " << cct
->_conf
->osd_disk_thread_ioprio_class
10138 << " priority " << cct
->_conf
->osd_disk_thread_ioprio_priority
10140 if (cct
->_conf
->osd_disk_thread_ioprio_class
.empty() ||
10141 cct
->_conf
->osd_disk_thread_ioprio_priority
< 0)
10144 ceph_ioprio_string_to_class(cct
->_conf
->osd_disk_thread_ioprio_class
);
10146 derr
<< __func__
<< cpp_strerror(cls
) << ": "
10147 << "osd_disk_thread_ioprio_class is " << cct
->_conf
->osd_disk_thread_ioprio_class
10148 << " but only the following values are allowed: idle, be or rt" << dendl
;
10150 remove_tp
.set_ioprio(cls
, cct
->_conf
->osd_disk_thread_ioprio_priority
);
10151 recovery_tp
.set_ioprio(cls
, cct
->_conf
->osd_disk_thread_ioprio_priority
);
10155 // --------------------------------
10157 void OSD::get_latest_osdmap()
10159 dout(10) << __func__
<< " -- start" << dendl
;
10162 service
.objecter
->wait_for_latest_osdmap(&cond
);
10165 dout(10) << __func__
<< " -- finish" << dendl
;
10168 // --------------------------------
10170 int OSD::init_op_flags(OpRequestRef
& op
)
10172 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
10173 vector
<OSDOp
>::const_iterator iter
;
10175 // client flags have no bearing on whether an op is a read, write, etc.
10178 if (m
->has_flag(CEPH_OSD_FLAG_RWORDERED
)) {
10179 op
->set_force_rwordered();
10182 // set bits based on op codes, called methods.
10183 for (iter
= m
->ops
.begin(); iter
!= m
->ops
.end(); ++iter
) {
10184 if ((iter
->op
.op
== CEPH_OSD_OP_WATCH
&&
10185 iter
->op
.watch
.op
== CEPH_OSD_WATCH_OP_PING
)) {
10186 /* This a bit odd. PING isn't actually a write. It can't
10187 * result in an update to the object_info. PINGs also aren'ty
10188 * resent, so there's no reason to write out a log entry
10190 * However, we pipeline them behind writes, so let's force
10191 * the write_ordered flag.
10193 op
->set_force_rwordered();
10195 if (ceph_osd_op_mode_modify(iter
->op
.op
))
10198 if (ceph_osd_op_mode_read(iter
->op
.op
))
10201 // set READ flag if there are src_oids
10202 if (iter
->soid
.oid
.name
.length())
10205 // set PGOP flag if there are PG ops
10206 if (ceph_osd_op_type_pg(iter
->op
.op
))
10209 if (ceph_osd_op_mode_cache(iter
->op
.op
))
10212 // check for ec base pool
10213 int64_t poolid
= m
->get_pg().pool();
10214 const pg_pool_t
*pool
= osdmap
->get_pg_pool(poolid
);
10215 if (pool
&& pool
->is_tier()) {
10216 const pg_pool_t
*base_pool
= osdmap
->get_pg_pool(pool
->tier_of
);
10217 if (base_pool
&& base_pool
->require_rollback()) {
10218 if ((iter
->op
.op
!= CEPH_OSD_OP_READ
) &&
10219 (iter
->op
.op
!= CEPH_OSD_OP_CHECKSUM
) &&
10220 (iter
->op
.op
!= CEPH_OSD_OP_CMPEXT
) &&
10221 (iter
->op
.op
!= CEPH_OSD_OP_STAT
) &&
10222 (iter
->op
.op
!= CEPH_OSD_OP_ISDIRTY
) &&
10223 (iter
->op
.op
!= CEPH_OSD_OP_UNDIRTY
) &&
10224 (iter
->op
.op
!= CEPH_OSD_OP_GETXATTR
) &&
10225 (iter
->op
.op
!= CEPH_OSD_OP_GETXATTRS
) &&
10226 (iter
->op
.op
!= CEPH_OSD_OP_CMPXATTR
) &&
10227 (iter
->op
.op
!= CEPH_OSD_OP_ASSERT_VER
) &&
10228 (iter
->op
.op
!= CEPH_OSD_OP_LIST_WATCHERS
) &&
10229 (iter
->op
.op
!= CEPH_OSD_OP_LIST_SNAPS
) &&
10230 (iter
->op
.op
!= CEPH_OSD_OP_SETALLOCHINT
) &&
10231 (iter
->op
.op
!= CEPH_OSD_OP_WRITEFULL
) &&
10232 (iter
->op
.op
!= CEPH_OSD_OP_ROLLBACK
) &&
10233 (iter
->op
.op
!= CEPH_OSD_OP_CREATE
) &&
10234 (iter
->op
.op
!= CEPH_OSD_OP_DELETE
) &&
10235 (iter
->op
.op
!= CEPH_OSD_OP_SETXATTR
) &&
10236 (iter
->op
.op
!= CEPH_OSD_OP_RMXATTR
) &&
10237 (iter
->op
.op
!= CEPH_OSD_OP_STARTSYNC
) &&
10238 (iter
->op
.op
!= CEPH_OSD_OP_COPY_GET
) &&
10239 (iter
->op
.op
!= CEPH_OSD_OP_COPY_FROM
)) {
10245 switch (iter
->op
.op
) {
10246 case CEPH_OSD_OP_CALL
:
10248 bufferlist::iterator bp
= const_cast<bufferlist
&>(iter
->indata
).begin();
10249 int is_write
, is_read
;
10250 string cname
, mname
;
10251 bp
.copy(iter
->op
.cls
.class_len
, cname
);
10252 bp
.copy(iter
->op
.cls
.method_len
, mname
);
10254 ClassHandler::ClassData
*cls
;
10255 int r
= class_handler
->open_class(cname
, &cls
);
10257 derr
<< "class " << cname
<< " open got " << cpp_strerror(r
) << dendl
;
10260 else if (r
!= -EPERM
) // propagate permission errors
10264 int flags
= cls
->get_method_flags(mname
.c_str());
10266 if (flags
== -ENOENT
)
10272 is_read
= flags
& CLS_METHOD_RD
;
10273 is_write
= flags
& CLS_METHOD_WR
;
10274 bool is_promote
= flags
& CLS_METHOD_PROMOTE
;
10276 dout(10) << "class " << cname
<< " method " << mname
<< " "
10277 << "flags=" << (is_read
? "r" : "")
10278 << (is_write
? "w" : "")
10279 << (is_promote
? "p" : "")
10282 op
->set_class_read();
10284 op
->set_class_write();
10287 op
->add_class(cname
, is_read
, is_write
, cls
->whitelisted
);
10291 case CEPH_OSD_OP_WATCH
:
10292 // force the read bit for watch since it is depends on previous
10293 // watch state (and may return early if the watch exists) or, in
10294 // the case of ping, is simply a read op.
10297 case CEPH_OSD_OP_NOTIFY
:
10298 case CEPH_OSD_OP_NOTIFY_ACK
:
10304 case CEPH_OSD_OP_DELETE
:
10305 // if we get a delete with FAILOK we can skip handle cache. without
10306 // FAILOK we still need to promote (or do something smarter) to
10307 // determine whether to return ENOENT or 0.
10308 if (iter
== m
->ops
.begin() &&
10309 iter
->op
.flags
== CEPH_OSD_OP_FLAG_FAILOK
) {
10310 op
->set_skip_handle_cache();
10312 // skip promotion when proxying a delete op
10313 if (m
->ops
.size() == 1) {
10314 op
->set_skip_promote();
10318 case CEPH_OSD_OP_CACHE_TRY_FLUSH
:
10319 case CEPH_OSD_OP_CACHE_FLUSH
:
10320 case CEPH_OSD_OP_CACHE_EVICT
:
10321 // If try_flush/flush/evict is the only op, can skip handle cache.
10322 if (m
->ops
.size() == 1) {
10323 op
->set_skip_handle_cache();
10327 case CEPH_OSD_OP_READ
:
10328 case CEPH_OSD_OP_SYNC_READ
:
10329 case CEPH_OSD_OP_SPARSE_READ
:
10330 case CEPH_OSD_OP_CHECKSUM
:
10331 case CEPH_OSD_OP_WRITEFULL
:
10332 if (m
->ops
.size() == 1 &&
10333 (iter
->op
.flags
& CEPH_OSD_OP_FLAG_FADVISE_NOCACHE
||
10334 iter
->op
.flags
& CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
)) {
10335 op
->set_skip_promote();
10339 // force promotion when pin an object in cache tier
10340 case CEPH_OSD_OP_CACHE_PIN
:
10349 if (op
->rmw_flags
== 0)
10355 void OSD::PeeringWQ::_dequeue(list
<PG
*> *out
) {
10356 for (list
<PG
*>::iterator i
= peering_queue
.begin();
10357 i
!= peering_queue
.end() &&
10358 out
->size() < osd
->cct
->_conf
->osd_peering_wq_batch_size
;
10360 if (in_use
.count(*i
)) {
10363 out
->push_back(*i
);
10364 peering_queue
.erase(i
++);
10367 in_use
.insert(out
->begin(), out
->end());
10371 // =============================================================
10373 #undef dout_context
10374 #define dout_context osd->cct
10376 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10378 void OSD::ShardedOpWQ::wake_pg_waiters(spg_t pgid
)
10380 uint32_t shard_index
= pgid
.hash_to_shard(shard_list
.size());
10381 auto sdata
= shard_list
[shard_index
];
10382 bool queued
= false;
10384 Mutex::Locker
l(sdata
->sdata_op_ordering_lock
);
10385 auto p
= sdata
->pg_slots
.find(pgid
);
10386 if (p
!= sdata
->pg_slots
.end()) {
10387 dout(20) << __func__
<< " " << pgid
10388 << " to_process " << p
->second
.to_process
10389 << " waiting_for_pg=" << (int)p
->second
.waiting_for_pg
<< dendl
;
10390 for (auto i
= p
->second
.to_process
.rbegin();
10391 i
!= p
->second
.to_process
.rend();
10393 sdata
->_enqueue_front(make_pair(pgid
, *i
), osd
->op_prio_cutoff
);
10395 p
->second
.to_process
.clear();
10396 p
->second
.waiting_for_pg
= false;
10397 ++p
->second
.requeue_seq
;
10402 sdata
->sdata_lock
.Lock();
10403 sdata
->sdata_cond
.SignalOne();
10404 sdata
->sdata_lock
.Unlock();
10408 void OSD::ShardedOpWQ::prune_pg_waiters(OSDMapRef osdmap
, int whoami
)
10410 unsigned pushes_to_free
= 0;
10411 for (auto sdata
: shard_list
) {
10412 Mutex::Locker
l(sdata
->sdata_op_ordering_lock
);
10413 sdata
->waiting_for_pg_osdmap
= osdmap
;
10414 auto p
= sdata
->pg_slots
.begin();
10415 while (p
!= sdata
->pg_slots
.end()) {
10416 ShardData::pg_slot
& slot
= p
->second
;
10417 if (!slot
.to_process
.empty() && slot
.num_running
== 0) {
10418 if (osdmap
->is_up_acting_osd_shard(p
->first
, whoami
)) {
10419 dout(20) << __func__
<< " " << p
->first
<< " maps to us, keeping"
10424 while (!slot
.to_process
.empty() &&
10425 slot
.to_process
.front().get_map_epoch() <= osdmap
->get_epoch()) {
10426 auto& qi
= slot
.to_process
.front();
10427 dout(20) << __func__
<< " " << p
->first
10429 << " epoch " << qi
.get_map_epoch()
10430 << " <= " << osdmap
->get_epoch()
10431 << ", stale, dropping" << dendl
;
10432 pushes_to_free
+= qi
.get_reserved_pushes();
10433 slot
.to_process
.pop_front();
10436 if (slot
.to_process
.empty() &&
10437 slot
.num_running
== 0 &&
10439 dout(20) << __func__
<< " " << p
->first
<< " empty, pruning" << dendl
;
10440 p
= sdata
->pg_slots
.erase(p
);
10446 if (pushes_to_free
> 0) {
10447 osd
->service
.release_reserved_pushes(pushes_to_free
);
10451 void OSD::ShardedOpWQ::clear_pg_pointer(spg_t pgid
)
10453 uint32_t shard_index
= pgid
.hash_to_shard(shard_list
.size());
10454 auto sdata
= shard_list
[shard_index
];
10455 Mutex::Locker
l(sdata
->sdata_op_ordering_lock
);
10456 auto p
= sdata
->pg_slots
.find(pgid
);
10457 if (p
!= sdata
->pg_slots
.end()) {
10458 auto& slot
= p
->second
;
10459 dout(20) << __func__
<< " " << pgid
<< " pg " << slot
.pg
<< dendl
;
10460 assert(!slot
.pg
|| slot
.pg
->deleting
);
10465 void OSD::ShardedOpWQ::clear_pg_slots()
10467 for (auto sdata
: shard_list
) {
10468 Mutex::Locker
l(sdata
->sdata_op_ordering_lock
);
10469 sdata
->pg_slots
.clear();
10470 sdata
->waiting_for_pg_osdmap
.reset();
10471 // don't bother with reserved pushes; we are shutting down
10476 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10478 void OSD::ShardedOpWQ::_process(uint32_t thread_index
, heartbeat_handle_d
*hb
)
10480 uint32_t shard_index
= thread_index
% num_shards
;
10481 ShardData
*sdata
= shard_list
[shard_index
];
10482 assert(NULL
!= sdata
);
10485 sdata
->sdata_op_ordering_lock
.Lock();
10486 if (sdata
->pqueue
->empty()) {
10487 dout(20) << __func__
<< " empty q, waiting" << dendl
;
10488 // optimistically sleep a moment; maybe another work item will come along.
10489 osd
->cct
->get_heartbeat_map()->reset_timeout(hb
,
10490 osd
->cct
->_conf
->threadpool_default_timeout
, 0);
10491 sdata
->sdata_lock
.Lock();
10492 sdata
->sdata_op_ordering_lock
.Unlock();
10493 sdata
->sdata_cond
.WaitInterval(sdata
->sdata_lock
,
10494 utime_t(osd
->cct
->_conf
->threadpool_empty_queue_max_wait
, 0));
10495 sdata
->sdata_lock
.Unlock();
10496 sdata
->sdata_op_ordering_lock
.Lock();
10497 if (sdata
->pqueue
->empty()) {
10498 sdata
->sdata_op_ordering_lock
.Unlock();
10502 pair
<spg_t
, PGQueueable
> item
= sdata
->pqueue
->dequeue();
10503 if (osd
->is_stopping()) {
10504 sdata
->sdata_op_ordering_lock
.Unlock();
10505 return; // OSD shutdown, discard.
10508 uint64_t requeue_seq
;
10510 auto& slot
= sdata
->pg_slots
[item
.first
];
10511 dout(30) << __func__
<< " " << item
.first
10512 << " to_process " << slot
.to_process
10513 << " waiting_for_pg=" << (int)slot
.waiting_for_pg
<< dendl
;
10514 slot
.to_process
.push_back(item
.second
);
10515 // note the requeue seq now...
10516 requeue_seq
= slot
.requeue_seq
;
10517 if (slot
.waiting_for_pg
) {
10518 // save ourselves a bit of effort
10519 dout(20) << __func__
<< " " << item
.first
<< " item " << item
.second
10520 << " queued, waiting_for_pg" << dendl
;
10521 sdata
->sdata_op_ordering_lock
.Unlock();
10525 dout(20) << __func__
<< " " << item
.first
<< " item " << item
.second
10526 << " queued" << dendl
;
10527 ++slot
.num_running
;
10529 sdata
->sdata_op_ordering_lock
.Unlock();
10531 osd
->service
.maybe_inject_dispatch_delay();
10533 // [lookup +] lock pg (if we have it)
10535 pg
= osd
->_lookup_lock_pg(item
.first
);
10540 osd
->service
.maybe_inject_dispatch_delay();
10542 boost::optional
<PGQueueable
> qi
;
10544 // we don't use a Mutex::Locker here because of the
10545 // osd->service.release_reserved_pushes() call below
10546 sdata
->sdata_op_ordering_lock
.Lock();
10548 auto q
= sdata
->pg_slots
.find(item
.first
);
10549 assert(q
!= sdata
->pg_slots
.end());
10550 auto& slot
= q
->second
;
10551 --slot
.num_running
;
10553 if (slot
.to_process
.empty()) {
10554 // raced with wake_pg_waiters or prune_pg_waiters
10555 dout(20) << __func__
<< " " << item
.first
<< " nothing queued" << dendl
;
10559 sdata
->sdata_op_ordering_lock
.Unlock();
10562 if (requeue_seq
!= slot
.requeue_seq
) {
10563 dout(20) << __func__
<< " " << item
.first
10564 << " requeue_seq " << slot
.requeue_seq
<< " > our "
10565 << requeue_seq
<< ", we raced with wake_pg_waiters"
10570 sdata
->sdata_op_ordering_lock
.Unlock();
10573 if (pg
&& !slot
.pg
&& !pg
->deleting
) {
10574 dout(20) << __func__
<< " " << item
.first
<< " set pg to " << pg
<< dendl
;
10577 dout(30) << __func__
<< " " << item
.first
<< " to_process " << slot
.to_process
10578 << " waiting_for_pg=" << (int)slot
.waiting_for_pg
<< dendl
;
10580 // make sure we're not already waiting for this pg
10581 if (slot
.waiting_for_pg
) {
10582 dout(20) << __func__
<< " " << item
.first
<< " item " << item
.second
10583 << " slot is waiting_for_pg" << dendl
;
10587 sdata
->sdata_op_ordering_lock
.Unlock();
10592 qi
= slot
.to_process
.front();
10593 slot
.to_process
.pop_front();
10594 dout(20) << __func__
<< " " << item
.first
<< " item " << *qi
10595 << " pg " << pg
<< dendl
;
10598 // should this pg shard exist on this osd in this (or a later) epoch?
10599 OSDMapRef osdmap
= sdata
->waiting_for_pg_osdmap
;
10600 if (osdmap
->is_up_acting_osd_shard(item
.first
, osd
->whoami
)) {
10601 dout(20) << __func__
<< " " << item
.first
10602 << " no pg, should exist, will wait" << " on " << *qi
<< dendl
;
10603 slot
.to_process
.push_front(*qi
);
10604 slot
.waiting_for_pg
= true;
10605 } else if (qi
->get_map_epoch() > osdmap
->get_epoch()) {
10606 dout(20) << __func__
<< " " << item
.first
<< " no pg, item epoch is "
10607 << qi
->get_map_epoch() << " > " << osdmap
->get_epoch()
10608 << ", will wait on " << *qi
<< dendl
;
10609 slot
.to_process
.push_front(*qi
);
10610 slot
.waiting_for_pg
= true;
10612 dout(20) << __func__
<< " " << item
.first
<< " no pg, shouldn't exist,"
10613 << " dropping " << *qi
<< dendl
;
10614 // share map with client?
10615 if (boost::optional
<OpRequestRef
> _op
= qi
->maybe_get_op()) {
10616 Session
*session
= static_cast<Session
*>(
10617 (*_op
)->get_req()->get_connection()->get_priv());
10619 osd
->maybe_share_map(session
, *_op
, sdata
->waiting_for_pg_osdmap
);
10623 unsigned pushes_to_free
= qi
->get_reserved_pushes();
10624 if (pushes_to_free
> 0) {
10625 sdata
->sdata_op_ordering_lock
.Unlock();
10626 osd
->service
.release_reserved_pushes(pushes_to_free
);
10630 sdata
->sdata_op_ordering_lock
.Unlock();
10633 sdata
->sdata_op_ordering_lock
.Unlock();
10636 // osd_opwq_process marks the point at which an operation has been dequeued
10637 // and will begin to be handled by a worker thread.
10641 if (boost::optional
<OpRequestRef
> _op
= qi
->maybe_get_op()) {
10642 reqid
= (*_op
)->get_reqid();
10645 tracepoint(osd
, opwq_process_start
, reqid
.name
._type
,
10646 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
10649 lgeneric_subdout(osd
->cct
, osd
, 30) << "dequeue status: ";
10650 Formatter
*f
= Formatter::create("json");
10651 f
->open_object_section("q");
10653 f
->close_section();
10658 ThreadPool::TPHandle
tp_handle(osd
->cct
, hb
, timeout_interval
,
10660 qi
->run(osd
, pg
, tp_handle
);
10665 if (boost::optional
<OpRequestRef
> _op
= qi
->maybe_get_op()) {
10666 reqid
= (*_op
)->get_reqid();
10669 tracepoint(osd
, opwq_process_finish
, reqid
.name
._type
,
10670 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
10676 void OSD::ShardedOpWQ::_enqueue(pair
<spg_t
, PGQueueable
> item
) {
10677 uint32_t shard_index
=
10678 item
.first
.hash_to_shard(shard_list
.size());
10680 ShardData
* sdata
= shard_list
[shard_index
];
10681 assert (NULL
!= sdata
);
10682 unsigned priority
= item
.second
.get_priority();
10683 unsigned cost
= item
.second
.get_cost();
10684 sdata
->sdata_op_ordering_lock
.Lock();
10686 dout(20) << __func__
<< " " << item
.first
<< " " << item
.second
<< dendl
;
10687 if (priority
>= osd
->op_prio_cutoff
)
10688 sdata
->pqueue
->enqueue_strict(
10689 item
.second
.get_owner(), priority
, item
);
10691 sdata
->pqueue
->enqueue(
10692 item
.second
.get_owner(),
10693 priority
, cost
, item
);
10694 sdata
->sdata_op_ordering_lock
.Unlock();
10696 sdata
->sdata_lock
.Lock();
10697 sdata
->sdata_cond
.SignalOne();
10698 sdata
->sdata_lock
.Unlock();
10702 void OSD::ShardedOpWQ::_enqueue_front(pair
<spg_t
, PGQueueable
> item
)
10704 uint32_t shard_index
= item
.first
.hash_to_shard(shard_list
.size());
10705 ShardData
* sdata
= shard_list
[shard_index
];
10706 assert (NULL
!= sdata
);
10707 sdata
->sdata_op_ordering_lock
.Lock();
10708 auto p
= sdata
->pg_slots
.find(item
.first
);
10709 if (p
!= sdata
->pg_slots
.end() && !p
->second
.to_process
.empty()) {
10710 // we may be racing with _process, which has dequeued a new item
10711 // from pqueue, put it on to_process, and is now busy taking the
10712 // pg lock. ensure this old requeued item is ordered before any
10713 // such newer item in to_process.
10714 p
->second
.to_process
.push_front(item
.second
);
10715 item
.second
= p
->second
.to_process
.back();
10716 p
->second
.to_process
.pop_back();
10717 dout(20) << __func__
<< " " << item
.first
10718 << " " << p
->second
.to_process
.front()
10719 << " shuffled w/ " << item
.second
<< dendl
;
10721 dout(20) << __func__
<< " " << item
.first
<< " " << item
.second
<< dendl
;
10723 sdata
->_enqueue_front(item
, osd
->op_prio_cutoff
);
10724 sdata
->sdata_op_ordering_lock
.Unlock();
10725 sdata
->sdata_lock
.Lock();
10726 sdata
->sdata_cond
.SignalOne();
10727 sdata
->sdata_lock
.Unlock();
10731 namespace osd_cmds
{
10733 int heap(CephContext
& cct
, cmdmap_t
& cmdmap
, Formatter
& f
, std::ostream
& os
)
10735 if (!ceph_using_tcmalloc()) {
10736 os
<< "could not issue heap profiler command -- not using tcmalloc!";
10737 return -EOPNOTSUPP
;
10741 if (!cmd_getval(&cct
, cmdmap
, "heapcmd", cmd
)) {
10742 os
<< "unable to get value for command \"" << cmd
<< "\"";
10746 std::vector
<std::string
> cmd_vec
;
10747 get_str_vec(cmd
, cmd_vec
);
10749 ceph_heap_profiler_handle_command(cmd_vec
, os
);
10754 }} // namespace ceph::osd_cmds
10757 std::ostream
& operator<<(std::ostream
& out
, const OSD::io_queue
& q
) {
10759 case OSD::io_queue::prioritized
:
10760 out
<< "prioritized";
10762 case OSD::io_queue::weightedpriority
:
10763 out
<< "weightedpriority";
10765 case OSD::io_queue::mclock_opclass
:
10766 out
<< "mclock_opclass";
10768 case OSD::io_queue::mclock_client
:
10769 out
<< "mclock_client";