]> git.proxmox.com Git - ceph.git/blob - ceph/src/mds/Beacon.cc
602240bfe12c41db52128f5e56c70b9d47a21125
[ceph.git] / ceph / src / mds / Beacon.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2012 Red Hat
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16 #include "common/dout.h"
17 #include "common/HeartbeatMap.h"
18 #include "include/stringify.h"
19 #include "include/util.h"
20
21 #include "messages/MMDSBeacon.h"
22 #include "mon/MonClient.h"
23 #include "mds/MDLog.h"
24 #include "mds/MDSRank.h"
25 #include "mds/MDSMap.h"
26 #include "mds/Locker.h"
27
28 #include "Beacon.h"
29
30 #define dout_context g_ceph_context
31 #define dout_subsys ceph_subsys_mds
32 #undef dout_prefix
33 #define dout_prefix *_dout << "mds.beacon." << name << ' '
34
35
36 class Beacon::C_MDS_BeaconSender : public Context {
37 public:
38 explicit C_MDS_BeaconSender(Beacon *beacon_) : beacon(beacon_) {}
39 void finish(int r) override {
40 assert(beacon->lock.is_locked_by_me());
41 beacon->sender = NULL;
42 beacon->_send();
43 }
44 private:
45 Beacon *beacon;
46 };
47
48 Beacon::Beacon(CephContext *cct_, MonClient *monc_, std::string name_) :
49 Dispatcher(cct_), lock("Beacon"), monc(monc_), timer(g_ceph_context, lock),
50 name(name_), standby_for_rank(MDS_RANK_NONE),
51 standby_for_fscid(FS_CLUSTER_ID_NONE), want_state(MDSMap::STATE_BOOT),
52 awaiting_seq(-1)
53 {
54 last_seq = 0;
55 sender = NULL;
56 was_laggy = false;
57
58 epoch = 0;
59 }
60
61
62 Beacon::~Beacon()
63 {
64 }
65
66
67 void Beacon::init(MDSMap const *mdsmap)
68 {
69 Mutex::Locker l(lock);
70 assert(mdsmap != NULL);
71
72 _notify_mdsmap(mdsmap);
73 standby_for_rank = mds_rank_t(g_conf->mds_standby_for_rank);
74 standby_for_name = g_conf->mds_standby_for_name;
75 standby_for_fscid = fs_cluster_id_t(g_conf->mds_standby_for_fscid);
76 standby_replay = g_conf->mds_standby_replay;
77
78 // Spawn threads and start messaging
79 timer.init();
80 _send();
81 }
82
83
84 void Beacon::shutdown()
85 {
86 Mutex::Locker l(lock);
87 if (sender) {
88 timer.cancel_event(sender);
89 sender = NULL;
90 }
91 timer.shutdown();
92 }
93
94
95 bool Beacon::ms_dispatch(Message *m)
96 {
97 if (m->get_type() == MSG_MDS_BEACON) {
98 if (m->get_connection()->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
99 handle_mds_beacon(static_cast<MMDSBeacon*>(m));
100 }
101 return true;
102 }
103
104 return false;
105 }
106
107
108 /**
109 * Update lagginess state based on response from remote MDSMonitor
110 *
111 * This function puts the passed message before returning
112 */
113 void Beacon::handle_mds_beacon(MMDSBeacon *m)
114 {
115 Mutex::Locker l(lock);
116 assert(m != NULL);
117
118 version_t seq = m->get_seq();
119
120 // update lab
121 if (seq_stamp.count(seq)) {
122 utime_t now = ceph_clock_now();
123 if (seq_stamp[seq] > last_acked_stamp) {
124 last_acked_stamp = seq_stamp[seq];
125 utime_t rtt = now - last_acked_stamp;
126
127 dout(10) << "handle_mds_beacon " << ceph_mds_state_name(m->get_state())
128 << " seq " << m->get_seq() << " rtt " << rtt << dendl;
129
130 if (was_laggy && rtt < g_conf->mds_beacon_grace) {
131 dout(0) << "handle_mds_beacon no longer laggy" << dendl;
132 was_laggy = false;
133 laggy_until = now;
134 }
135 } else {
136 // Mark myself laggy if system clock goes backwards. Hopping
137 // later beacons will clear it.
138 dout(1) << "handle_mds_beacon system clock goes backwards, "
139 << "mark myself laggy" << dendl;
140 last_acked_stamp = now - utime_t(g_conf->mds_beacon_grace + 1, 0);
141 was_laggy = true;
142 }
143
144 // clean up seq_stamp map
145 while (!seq_stamp.empty() &&
146 seq_stamp.begin()->first <= seq)
147 seq_stamp.erase(seq_stamp.begin());
148
149 // Wake a waiter up if present
150 if (awaiting_seq == seq) {
151 waiting_cond.Signal();
152 }
153 } else {
154 dout(10) << "handle_mds_beacon " << ceph_mds_state_name(m->get_state())
155 << " seq " << m->get_seq() << " dne" << dendl;
156 }
157 }
158
159
160 void Beacon::send()
161 {
162 Mutex::Locker l(lock);
163 _send();
164 }
165
166
167 void Beacon::send_and_wait(const double duration)
168 {
169 Mutex::Locker l(lock);
170 _send();
171 awaiting_seq = last_seq;
172 dout(20) << __func__ << ": awaiting " << awaiting_seq
173 << " for up to " << duration << "s" << dendl;
174
175 utime_t timeout;
176 timeout.set_from_double(ceph_clock_now() + duration);
177 while ((!seq_stamp.empty() && seq_stamp.begin()->first <= awaiting_seq)
178 && ceph_clock_now() < timeout) {
179 waiting_cond.WaitUntil(lock, timeout);
180 }
181
182 awaiting_seq = -1;
183 }
184
185
186 /**
187 * Call periodically, or when you have updated the desired state
188 */
189 void Beacon::_send()
190 {
191 if (sender) {
192 timer.cancel_event(sender);
193 }
194 sender = new C_MDS_BeaconSender(this);
195 timer.add_event_after(g_conf->mds_beacon_interval, sender);
196
197 if (!cct->get_heartbeat_map()->is_healthy()) {
198 /* If anything isn't progressing, let avoid sending a beacon so that
199 * the MDS will consider us laggy */
200 dout(1) << __func__ << " skipping beacon, heartbeat map not healthy" << dendl;
201 return;
202 }
203
204 ++last_seq;
205 dout(10) << __func__ << " " << ceph_mds_state_name(want_state)
206 << " seq " << last_seq
207 << dendl;
208
209 seq_stamp[last_seq] = ceph_clock_now();
210
211 assert(want_state != MDSMap::STATE_NULL);
212
213 MMDSBeacon *beacon = new MMDSBeacon(
214 monc->get_fsid(), mds_gid_t(monc->get_global_id()),
215 name,
216 epoch,
217 want_state,
218 last_seq,
219 CEPH_FEATURES_SUPPORTED_DEFAULT);
220
221 beacon->set_standby_for_rank(standby_for_rank);
222 beacon->set_standby_for_name(standby_for_name);
223 beacon->set_standby_for_fscid(standby_for_fscid);
224 beacon->set_standby_replay(standby_replay);
225 beacon->set_health(health);
226 beacon->set_compat(compat);
227 // piggyback the sys info on beacon msg
228 if (want_state == MDSMap::STATE_BOOT) {
229 map<string, string> sys_info;
230 collect_sys_info(&sys_info, cct);
231 sys_info["addr"] = stringify(monc->get_myaddr());
232 beacon->set_sys_info(sys_info);
233 }
234 monc->send_mon_message(beacon);
235 }
236
237 /**
238 * Call this when there is a new MDSMap available
239 */
240 void Beacon::notify_mdsmap(MDSMap const *mdsmap)
241 {
242 Mutex::Locker l(lock);
243 assert(mdsmap != NULL);
244
245 _notify_mdsmap(mdsmap);
246 }
247
248 void Beacon::_notify_mdsmap(MDSMap const *mdsmap)
249 {
250 assert(mdsmap != NULL);
251 assert(mdsmap->get_epoch() >= epoch);
252
253 if (mdsmap->get_epoch() != epoch) {
254 epoch = mdsmap->get_epoch();
255 compat = get_mdsmap_compat_set_default();
256 compat.merge(mdsmap->compat);
257 }
258 }
259
260
261 bool Beacon::is_laggy()
262 {
263 Mutex::Locker l(lock);
264
265 if (last_acked_stamp == utime_t())
266 return false;
267
268 utime_t now = ceph_clock_now();
269 utime_t since = now - last_acked_stamp;
270 if (since > g_conf->mds_beacon_grace) {
271 dout(5) << "is_laggy " << since << " > " << g_conf->mds_beacon_grace
272 << " since last acked beacon" << dendl;
273 was_laggy = true;
274 if (since > (g_conf->mds_beacon_grace*2) &&
275 now > last_mon_reconnect + g_conf->mds_beacon_interval) {
276 // maybe it's not us?
277 dout(5) << "initiating monitor reconnect; maybe we're not the slow one"
278 << dendl;
279 last_mon_reconnect = now;
280 monc->reopen_session();
281 }
282 return true;
283 }
284 return false;
285 }
286
287 utime_t Beacon::get_laggy_until() const
288 {
289 Mutex::Locker l(lock);
290
291 return laggy_until;
292 }
293
294 void Beacon::set_want_state(MDSMap const *mdsmap, MDSMap::DaemonState const newstate)
295 {
296 Mutex::Locker l(lock);
297
298 // Update mdsmap epoch atomically with updating want_state, so that when
299 // we send a beacon with the new want state it has the latest epoch, and
300 // once we have updated to the latest epoch, we are not sending out
301 // a stale want_state (i.e. one from before making it through MDSMap
302 // handling)
303 _notify_mdsmap(mdsmap);
304
305 if (want_state != newstate) {
306 dout(10) << __func__ << ": "
307 << ceph_mds_state_name(want_state) << " -> "
308 << ceph_mds_state_name(newstate) << dendl;
309 want_state = newstate;
310 }
311 }
312
313
314 /**
315 * We are 'shown' an MDS briefly in order to update
316 * some health metrics that we will send in the next
317 * beacon.
318 */
319 void Beacon::notify_health(MDSRank const *mds)
320 {
321 Mutex::Locker l(lock);
322 if (!mds) {
323 // No MDS rank held
324 return;
325 }
326
327 // I'm going to touch this MDS, so it must be locked
328 assert(mds->mds_lock.is_locked_by_me());
329
330 health.metrics.clear();
331
332 // Detect presence of entries in DamageTable
333 if (!mds->damage_table.empty()) {
334 MDSHealthMetric m(MDS_HEALTH_DAMAGE, HEALTH_ERR, std::string(
335 "Metadata damage detected"));
336 health.metrics.push_back(m);
337 }
338
339 // Detect MDS_HEALTH_TRIM condition
340 // Arbitrary factor of 2, indicates MDS is not trimming promptly
341 {
342 if (mds->mdlog->get_num_segments() > (size_t)(g_conf->mds_log_max_segments * 2)) {
343 std::ostringstream oss;
344 oss << "Behind on trimming (" << mds->mdlog->get_num_segments()
345 << "/" << g_conf->mds_log_max_segments << ")";
346
347 MDSHealthMetric m(MDS_HEALTH_TRIM, HEALTH_WARN, oss.str());
348 m.metadata["num_segments"] = stringify(mds->mdlog->get_num_segments());
349 m.metadata["max_segments"] = stringify(g_conf->mds_log_max_segments);
350 health.metrics.push_back(m);
351 }
352 }
353
354 // Detect clients failing to respond to modifications to capabilities in
355 // CLIENT_CAPS messages.
356 {
357 std::list<client_t> late_clients;
358 mds->locker->get_late_revoking_clients(&late_clients);
359 std::list<MDSHealthMetric> late_cap_metrics;
360
361 for (std::list<client_t>::iterator i = late_clients.begin(); i != late_clients.end(); ++i) {
362
363 // client_t is equivalent to session.info.inst.name.num
364 // Construct an entity_name_t to lookup into SessionMap
365 entity_name_t ename(CEPH_ENTITY_TYPE_CLIENT, i->v);
366 Session const *s = mds->sessionmap.get_session(ename);
367 if (s == NULL) {
368 // Shouldn't happen, but not worth crashing if it does as this is
369 // just health-reporting code.
370 derr << "Client ID without session: " << i->v << dendl;
371 continue;
372 }
373
374 std::ostringstream oss;
375 oss << "Client " << s->get_human_name() << " failing to respond to capability release";
376 MDSHealthMetric m(MDS_HEALTH_CLIENT_LATE_RELEASE, HEALTH_WARN, oss.str());
377 m.metadata["client_id"] = stringify(i->v);
378 late_cap_metrics.push_back(m);
379 }
380
381 if (late_cap_metrics.size() <= (size_t)g_conf->mds_health_summarize_threshold) {
382 health.metrics.splice(health.metrics.end(), late_cap_metrics);
383 } else {
384 std::ostringstream oss;
385 oss << "Many clients (" << late_cap_metrics.size()
386 << ") failing to respond to capability release";
387 MDSHealthMetric m(MDS_HEALTH_CLIENT_LATE_RELEASE_MANY, HEALTH_WARN, oss.str());
388 m.metadata["client_count"] = stringify(late_cap_metrics.size());
389 health.metrics.push_back(m);
390 late_cap_metrics.clear();
391 }
392 }
393
394 // Detect clients failing to generate cap releases from CEPH_SESSION_RECALL_STATE
395 // messages. May be due to buggy client or resource-hogging application.
396 //
397 // Detect clients failing to advance their old_client_tid
398 {
399 set<Session*> sessions;
400 mds->sessionmap.get_client_session_set(sessions);
401
402 utime_t cutoff = ceph_clock_now();
403 cutoff -= g_conf->mds_recall_state_timeout;
404 utime_t last_recall = mds->mdcache->last_recall_state;
405
406 std::list<MDSHealthMetric> late_recall_metrics;
407 std::list<MDSHealthMetric> large_completed_requests_metrics;
408 for (set<Session*>::iterator i = sessions.begin(); i != sessions.end(); ++i) {
409 Session *session = *i;
410 if (!session->recalled_at.is_zero()) {
411 dout(20) << "Session servicing RECALL " << session->info.inst
412 << ": " << session->recalled_at << " " << session->recall_release_count
413 << "/" << session->recall_count << dendl;
414 if (last_recall < cutoff || session->last_recall_sent < last_recall) {
415 dout(20) << " no longer recall" << dendl;
416 session->clear_recalled_at();
417 } else if (session->recalled_at < cutoff) {
418 dout(20) << " exceeded timeout " << session->recalled_at << " vs. " << cutoff << dendl;
419 std::ostringstream oss;
420 oss << "Client " << session->get_human_name() << " failing to respond to cache pressure";
421 MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, oss.str());
422 m.metadata["client_id"] = stringify(session->info.inst.name.num());
423 late_recall_metrics.push_back(m);
424 } else {
425 dout(20) << " within timeout " << session->recalled_at << " vs. " << cutoff << dendl;
426 }
427 }
428 if ((session->get_num_trim_requests_warnings() > 0 &&
429 session->get_num_completed_requests() >= g_conf->mds_max_completed_requests) ||
430 (session->get_num_trim_flushes_warnings() > 0 &&
431 session->get_num_completed_flushes() >= g_conf->mds_max_completed_flushes)) {
432 std::ostringstream oss;
433 oss << "Client " << session->get_human_name() << " failing to advance its oldest client/flush tid";
434 MDSHealthMetric m(MDS_HEALTH_CLIENT_OLDEST_TID, HEALTH_WARN, oss.str());
435 m.metadata["client_id"] = stringify(session->info.inst.name.num());
436 large_completed_requests_metrics.push_back(m);
437 }
438 }
439
440 if (late_recall_metrics.size() <= (size_t)g_conf->mds_health_summarize_threshold) {
441 health.metrics.splice(health.metrics.end(), late_recall_metrics);
442 } else {
443 std::ostringstream oss;
444 oss << "Many clients (" << late_recall_metrics.size()
445 << ") failing to respond to cache pressure";
446 MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL_MANY, HEALTH_WARN, oss.str());
447 m.metadata["client_count"] = stringify(late_recall_metrics.size());
448 health.metrics.push_back(m);
449 late_recall_metrics.clear();
450 }
451
452 if (large_completed_requests_metrics.size() <= (size_t)g_conf->mds_health_summarize_threshold) {
453 health.metrics.splice(health.metrics.end(), large_completed_requests_metrics);
454 } else {
455 std::ostringstream oss;
456 oss << "Many clients (" << large_completed_requests_metrics.size()
457 << ") failing to advance their oldest client/flush tid";
458 MDSHealthMetric m(MDS_HEALTH_CLIENT_OLDEST_TID_MANY, HEALTH_WARN, oss.str());
459 m.metadata["client_count"] = stringify(large_completed_requests_metrics.size());
460 health.metrics.push_back(m);
461 large_completed_requests_metrics.clear();
462 }
463 }
464
465 // Detect MDS_HEALTH_SLOW_REQUEST condition
466 {
467 int slow = mds->get_mds_slow_req_count();
468 dout(20) << slow << " slow request found" << dendl;
469 if (slow) {
470 std::ostringstream oss;
471 oss << slow << " slow requests are blocked > " << g_conf->mds_op_complaint_time << " sec";
472
473 MDSHealthMetric m(MDS_HEALTH_SLOW_REQUEST, HEALTH_WARN, oss.str());
474 health.metrics.push_back(m);
475 }
476 }
477
478 // Report a health warning if we are readonly
479 if (mds->mdcache->is_readonly()) {
480 MDSHealthMetric m(MDS_HEALTH_READ_ONLY, HEALTH_WARN,
481 "MDS in read-only mode");
482 health.metrics.push_back(m);
483 }
484
485 // Report if we have significantly exceeded our cache size limit
486 if (mds->mdcache->get_cache_size() >
487 g_conf->mds_cache_size * g_conf->mds_health_cache_threshold) {
488 std::ostringstream oss;
489 oss << "Too many inodes in cache (" << mds->mdcache->get_cache_size()
490 << "/" << g_conf->mds_cache_size << "), "
491 << mds->mdcache->num_inodes_with_caps << " inodes in use by clients, "
492 << mds->mdcache->get_num_strays() << " stray files";
493
494 MDSHealthMetric m(MDS_HEALTH_CACHE_OVERSIZED, HEALTH_WARN, oss.str());
495 health.metrics.push_back(m);
496 }
497 }
498
499 MDSMap::DaemonState Beacon::get_want_state() const
500 {
501 Mutex::Locker l(lock);
502 return want_state;
503 }
504