1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2012 Red Hat
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
16 #include "common/dout.h"
17 #include "common/HeartbeatMap.h"
18 #include "include/stringify.h"
19 #include "include/util.h"
21 #include "messages/MMDSBeacon.h"
22 #include "mon/MonClient.h"
23 #include "mds/MDLog.h"
24 #include "mds/MDSRank.h"
25 #include "mds/MDSMap.h"
26 #include "mds/Locker.h"
30 #define dout_context g_ceph_context
31 #define dout_subsys ceph_subsys_mds
33 #define dout_prefix *_dout << "mds.beacon." << name << ' '
36 class Beacon::C_MDS_BeaconSender
: public Context
{
38 explicit C_MDS_BeaconSender(Beacon
*beacon_
) : beacon(beacon_
) {}
39 void finish(int r
) override
{
40 assert(beacon
->lock
.is_locked_by_me());
41 beacon
->sender
= NULL
;
48 Beacon::Beacon(CephContext
*cct_
, MonClient
*monc_
, std::string name_
) :
49 Dispatcher(cct_
), lock("Beacon"), monc(monc_
), timer(g_ceph_context
, lock
),
50 name(name_
), standby_for_rank(MDS_RANK_NONE
),
51 standby_for_fscid(FS_CLUSTER_ID_NONE
), want_state(MDSMap::STATE_BOOT
),
67 void Beacon::init(MDSMap
const *mdsmap
)
69 Mutex::Locker
l(lock
);
70 assert(mdsmap
!= NULL
);
72 _notify_mdsmap(mdsmap
);
73 standby_for_rank
= mds_rank_t(g_conf
->mds_standby_for_rank
);
74 standby_for_name
= g_conf
->mds_standby_for_name
;
75 standby_for_fscid
= fs_cluster_id_t(g_conf
->mds_standby_for_fscid
);
76 standby_replay
= g_conf
->mds_standby_replay
;
78 // Spawn threads and start messaging
84 void Beacon::shutdown()
86 Mutex::Locker
l(lock
);
88 timer
.cancel_event(sender
);
95 bool Beacon::ms_dispatch(Message
*m
)
97 if (m
->get_type() == MSG_MDS_BEACON
) {
98 if (m
->get_connection()->get_peer_type() == CEPH_ENTITY_TYPE_MON
) {
99 handle_mds_beacon(static_cast<MMDSBeacon
*>(m
));
109 * Update lagginess state based on response from remote MDSMonitor
111 * This function puts the passed message before returning
113 void Beacon::handle_mds_beacon(MMDSBeacon
*m
)
115 Mutex::Locker
l(lock
);
118 version_t seq
= m
->get_seq();
121 if (seq_stamp
.count(seq
)) {
122 utime_t now
= ceph_clock_now();
123 if (seq_stamp
[seq
] > last_acked_stamp
) {
124 last_acked_stamp
= seq_stamp
[seq
];
125 utime_t rtt
= now
- last_acked_stamp
;
127 dout(10) << "handle_mds_beacon " << ceph_mds_state_name(m
->get_state())
128 << " seq " << m
->get_seq() << " rtt " << rtt
<< dendl
;
130 if (was_laggy
&& rtt
< g_conf
->mds_beacon_grace
) {
131 dout(0) << "handle_mds_beacon no longer laggy" << dendl
;
136 // Mark myself laggy if system clock goes backwards. Hopefully
137 // later beacons will clear it.
138 dout(1) << "handle_mds_beacon system clock goes backwards, "
139 << "mark myself laggy" << dendl
;
140 last_acked_stamp
= now
- utime_t(g_conf
->mds_beacon_grace
+ 1, 0);
144 // clean up seq_stamp map
145 while (!seq_stamp
.empty() &&
146 seq_stamp
.begin()->first
<= seq
)
147 seq_stamp
.erase(seq_stamp
.begin());
149 // Wake a waiter up if present
150 if (awaiting_seq
== seq
) {
151 waiting_cond
.Signal();
154 dout(10) << "handle_mds_beacon " << ceph_mds_state_name(m
->get_state())
155 << " seq " << m
->get_seq() << " dne" << dendl
;
162 Mutex::Locker
l(lock
);
167 void Beacon::send_and_wait(const double duration
)
169 Mutex::Locker
l(lock
);
171 awaiting_seq
= last_seq
;
172 dout(20) << __func__
<< ": awaiting " << awaiting_seq
173 << " for up to " << duration
<< "s" << dendl
;
176 timeout
.set_from_double(ceph_clock_now() + duration
);
177 while ((!seq_stamp
.empty() && seq_stamp
.begin()->first
<= awaiting_seq
)
178 && ceph_clock_now() < timeout
) {
179 waiting_cond
.WaitUntil(lock
, timeout
);
187 * Call periodically, or when you have updated the desired state
192 timer
.cancel_event(sender
);
194 sender
= new C_MDS_BeaconSender(this);
195 timer
.add_event_after(g_conf
->mds_beacon_interval
, sender
);
197 if (!cct
->get_heartbeat_map()->is_healthy()) {
198 /* If anything isn't progressing, let's avoid sending a beacon so that
199 * the MDS will consider us laggy */
200 dout(1) << __func__
<< " skipping beacon, heartbeat map not healthy" << dendl
;
205 dout(10) << __func__
<< " " << ceph_mds_state_name(want_state
)
206 << " seq " << last_seq
209 seq_stamp
[last_seq
] = ceph_clock_now();
211 assert(want_state
!= MDSMap::STATE_NULL
);
213 MMDSBeacon
*beacon
= new MMDSBeacon(
214 monc
->get_fsid(), mds_gid_t(monc
->get_global_id()),
219 CEPH_FEATURES_SUPPORTED_DEFAULT
);
221 beacon
->set_standby_for_rank(standby_for_rank
);
222 beacon
->set_standby_for_name(standby_for_name
);
223 beacon
->set_standby_for_fscid(standby_for_fscid
);
224 beacon
->set_standby_replay(standby_replay
);
225 beacon
->set_health(health
);
226 beacon
->set_compat(compat
);
227 // piggyback the sys info on beacon msg
228 if (want_state
== MDSMap::STATE_BOOT
) {
229 map
<string
, string
> sys_info
;
230 collect_sys_info(&sys_info
, cct
);
231 sys_info
["addr"] = stringify(monc
->get_myaddr());
232 beacon
->set_sys_info(sys_info
);
234 monc
->send_mon_message(beacon
);
238 * Call this when there is a new MDSMap available
240 void Beacon::notify_mdsmap(MDSMap
const *mdsmap
)
242 Mutex::Locker
l(lock
);
243 assert(mdsmap
!= NULL
);
245 _notify_mdsmap(mdsmap
);
248 void Beacon::_notify_mdsmap(MDSMap
const *mdsmap
)
250 assert(mdsmap
!= NULL
);
251 assert(mdsmap
->get_epoch() >= epoch
);
253 if (mdsmap
->get_epoch() != epoch
) {
254 epoch
= mdsmap
->get_epoch();
255 compat
= get_mdsmap_compat_set_default();
256 compat
.merge(mdsmap
->compat
);
261 bool Beacon::is_laggy()
263 Mutex::Locker
l(lock
);
265 if (last_acked_stamp
== utime_t())
268 utime_t now
= ceph_clock_now();
269 utime_t since
= now
- last_acked_stamp
;
270 if (since
> g_conf
->mds_beacon_grace
) {
271 dout(5) << "is_laggy " << since
<< " > " << g_conf
->mds_beacon_grace
272 << " since last acked beacon" << dendl
;
274 if (since
> (g_conf
->mds_beacon_grace
*2) &&
275 now
> last_mon_reconnect
+ g_conf
->mds_beacon_interval
) {
276 // maybe it's not us?
277 dout(5) << "initiating monitor reconnect; maybe we're not the slow one"
279 last_mon_reconnect
= now
;
280 monc
->reopen_session();
287 utime_t
Beacon::get_laggy_until() const
289 Mutex::Locker
l(lock
);
294 void Beacon::set_want_state(MDSMap
const *mdsmap
, MDSMap::DaemonState
const newstate
)
296 Mutex::Locker
l(lock
);
298 // Update mdsmap epoch atomically with updating want_state, so that when
299 // we send a beacon with the new want state it has the latest epoch, and
300 // once we have updated to the latest epoch, we are not sending out
301 // a stale want_state (i.e. one from before making it through MDSMap
303 _notify_mdsmap(mdsmap
);
305 if (want_state
!= newstate
) {
306 dout(10) << __func__
<< ": "
307 << ceph_mds_state_name(want_state
) << " -> "
308 << ceph_mds_state_name(newstate
) << dendl
;
309 want_state
= newstate
;
315 * We are 'shown' an MDS briefly in order to update
316 * some health metrics that we will send in the next beacon.
319 void Beacon::notify_health(MDSRank
const *mds
)
321 Mutex::Locker
l(lock
);
327 // I'm going to touch this MDS, so it must be locked
328 assert(mds
->mds_lock
.is_locked_by_me());
330 health
.metrics
.clear();
332 // Detect presence of entries in DamageTable
333 if (!mds
->damage_table
.empty()) {
334 MDSHealthMetric
m(MDS_HEALTH_DAMAGE
, HEALTH_ERR
, std::string(
335 "Metadata damage detected"));
336 health
.metrics
.push_back(m
);
339 // Detect MDS_HEALTH_TRIM condition
340 // Arbitrary factor of 2, indicates MDS is not trimming promptly
342 if (mds
->mdlog
->get_num_segments() > (size_t)(g_conf
->mds_log_max_segments
* 2)) {
343 std::ostringstream oss
;
344 oss
<< "Behind on trimming (" << mds
->mdlog
->get_num_segments()
345 << "/" << g_conf
->mds_log_max_segments
<< ")";
347 MDSHealthMetric
m(MDS_HEALTH_TRIM
, HEALTH_WARN
, oss
.str());
348 m
.metadata
["num_segments"] = stringify(mds
->mdlog
->get_num_segments());
349 m
.metadata
["max_segments"] = stringify(g_conf
->mds_log_max_segments
);
350 health
.metrics
.push_back(m
);
354 // Detect clients failing to respond to modifications to capabilities in
355 // CLIENT_CAPS messages.
357 std::list
<client_t
> late_clients
;
358 mds
->locker
->get_late_revoking_clients(&late_clients
);
359 std::list
<MDSHealthMetric
> late_cap_metrics
;
361 for (std::list
<client_t
>::iterator i
= late_clients
.begin(); i
!= late_clients
.end(); ++i
) {
363 // client_t is equivalent to session.info.inst.name.num
364 // Construct an entity_name_t to lookup into SessionMap
365 entity_name_t
ename(CEPH_ENTITY_TYPE_CLIENT
, i
->v
);
366 Session
const *s
= mds
->sessionmap
.get_session(ename
);
368 // Shouldn't happen, but not worth crashing if it does as this is
369 // just health-reporting code.
370 derr
<< "Client ID without session: " << i
->v
<< dendl
;
374 std::ostringstream oss
;
375 oss
<< "Client " << s
->get_human_name() << " failing to respond to capability release";
376 MDSHealthMetric
m(MDS_HEALTH_CLIENT_LATE_RELEASE
, HEALTH_WARN
, oss
.str());
377 m
.metadata
["client_id"] = stringify(i
->v
);
378 late_cap_metrics
.push_back(m
);
381 if (late_cap_metrics
.size() <= (size_t)g_conf
->mds_health_summarize_threshold
) {
382 health
.metrics
.splice(health
.metrics
.end(), late_cap_metrics
);
384 std::ostringstream oss
;
385 oss
<< "Many clients (" << late_cap_metrics
.size()
386 << ") failing to respond to capability release";
387 MDSHealthMetric
m(MDS_HEALTH_CLIENT_LATE_RELEASE_MANY
, HEALTH_WARN
, oss
.str());
388 m
.metadata
["client_count"] = stringify(late_cap_metrics
.size());
389 health
.metrics
.push_back(m
);
390 late_cap_metrics
.clear();
394 // Detect clients failing to generate cap releases from CEPH_SESSION_RECALL_STATE
395 // messages. May be due to buggy client or resource-hogging application.
397 // Detect clients failing to advance their old_client_tid
399 set
<Session
*> sessions
;
400 mds
->sessionmap
.get_client_session_set(sessions
);
402 utime_t cutoff
= ceph_clock_now();
403 cutoff
-= g_conf
->mds_recall_state_timeout
;
404 utime_t last_recall
= mds
->mdcache
->last_recall_state
;
406 std::list
<MDSHealthMetric
> late_recall_metrics
;
407 std::list
<MDSHealthMetric
> large_completed_requests_metrics
;
408 for (set
<Session
*>::iterator i
= sessions
.begin(); i
!= sessions
.end(); ++i
) {
409 Session
*session
= *i
;
410 if (!session
->recalled_at
.is_zero()) {
411 dout(20) << "Session servicing RECALL " << session
->info
.inst
412 << ": " << session
->recalled_at
<< " " << session
->recall_release_count
413 << "/" << session
->recall_count
<< dendl
;
414 if (last_recall
< cutoff
|| session
->last_recall_sent
< last_recall
) {
415 dout(20) << " no longer recall" << dendl
;
416 session
->clear_recalled_at();
417 } else if (session
->recalled_at
< cutoff
) {
418 dout(20) << " exceeded timeout " << session
->recalled_at
<< " vs. " << cutoff
<< dendl
;
419 std::ostringstream oss
;
420 oss
<< "Client " << session
->get_human_name() << " failing to respond to cache pressure";
421 MDSHealthMetric
m(MDS_HEALTH_CLIENT_RECALL
, HEALTH_WARN
, oss
.str());
422 m
.metadata
["client_id"] = stringify(session
->info
.inst
.name
.num());
423 late_recall_metrics
.push_back(m
);
425 dout(20) << " within timeout " << session
->recalled_at
<< " vs. " << cutoff
<< dendl
;
428 if ((session
->get_num_trim_requests_warnings() > 0 &&
429 session
->get_num_completed_requests() >= g_conf
->mds_max_completed_requests
) ||
430 (session
->get_num_trim_flushes_warnings() > 0 &&
431 session
->get_num_completed_flushes() >= g_conf
->mds_max_completed_flushes
)) {
432 std::ostringstream oss
;
433 oss
<< "Client " << session
->get_human_name() << " failing to advance its oldest client/flush tid";
434 MDSHealthMetric
m(MDS_HEALTH_CLIENT_OLDEST_TID
, HEALTH_WARN
, oss
.str());
435 m
.metadata
["client_id"] = stringify(session
->info
.inst
.name
.num());
436 large_completed_requests_metrics
.push_back(m
);
440 if (late_recall_metrics
.size() <= (size_t)g_conf
->mds_health_summarize_threshold
) {
441 health
.metrics
.splice(health
.metrics
.end(), late_recall_metrics
);
443 std::ostringstream oss
;
444 oss
<< "Many clients (" << late_recall_metrics
.size()
445 << ") failing to respond to cache pressure";
446 MDSHealthMetric
m(MDS_HEALTH_CLIENT_RECALL_MANY
, HEALTH_WARN
, oss
.str());
447 m
.metadata
["client_count"] = stringify(late_recall_metrics
.size());
448 health
.metrics
.push_back(m
);
449 late_recall_metrics
.clear();
452 if (large_completed_requests_metrics
.size() <= (size_t)g_conf
->mds_health_summarize_threshold
) {
453 health
.metrics
.splice(health
.metrics
.end(), large_completed_requests_metrics
);
455 std::ostringstream oss
;
456 oss
<< "Many clients (" << large_completed_requests_metrics
.size()
457 << ") failing to advance their oldest client/flush tid";
458 MDSHealthMetric
m(MDS_HEALTH_CLIENT_OLDEST_TID_MANY
, HEALTH_WARN
, oss
.str());
459 m
.metadata
["client_count"] = stringify(large_completed_requests_metrics
.size());
460 health
.metrics
.push_back(m
);
461 large_completed_requests_metrics
.clear();
465 // Detect MDS_HEALTH_SLOW_REQUEST condition
467 int slow
= mds
->get_mds_slow_req_count();
468 dout(20) << slow
<< " slow request found" << dendl
;
470 std::ostringstream oss
;
471 oss
<< slow
<< " slow requests are blocked > " << g_conf
->mds_op_complaint_time
<< " sec";
473 MDSHealthMetric
m(MDS_HEALTH_SLOW_REQUEST
, HEALTH_WARN
, oss
.str());
474 health
.metrics
.push_back(m
);
478 // Report a health warning if we are readonly
479 if (mds
->mdcache
->is_readonly()) {
480 MDSHealthMetric
m(MDS_HEALTH_READ_ONLY
, HEALTH_WARN
,
481 "MDS in read-only mode");
482 health
.metrics
.push_back(m
);
485 // Report if we have significantly exceeded our cache size limit
486 if (mds
->mdcache
->get_cache_size() >
487 g_conf
->mds_cache_size
* g_conf
->mds_health_cache_threshold
) {
488 std::ostringstream oss
;
489 oss
<< "Too many inodes in cache (" << mds
->mdcache
->get_cache_size()
490 << "/" << g_conf
->mds_cache_size
<< "), "
491 << mds
->mdcache
->num_inodes_with_caps
<< " inodes in use by clients, "
492 << mds
->mdcache
->get_num_strays() << " stray files";
494 MDSHealthMetric
m(MDS_HEALTH_CACHE_OVERSIZED
, HEALTH_WARN
, oss
.str());
495 health
.metrics
.push_back(m
);
499 MDSMap::DaemonState
Beacon::get_want_state() const
501 Mutex::Locker
l(lock
);