1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2013 Inktank, Inc
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
18 #include <boost/regex.hpp>
20 #include "include/assert.h"
21 #include "include/stringify.h"
23 #include "mon/Monitor.h"
24 #include "mon/HealthService.h"
25 #include "mon/HealthMonitor.h"
26 #include "mon/DataHealthService.h"
28 #include "messages/MMonHealth.h"
29 #include "messages/MMonHealthChecks.h"
31 #include "common/Formatter.h"
// Logging setup for this file.
//  - dout_subsys is defined as ceph_subsys_mon.
//  - dout_prefix is defined as a call to _prefix(_dout, mon, this).
//
// _prefix() streams "mon." + mon->name + "@" + mon->rank + "(" +
// mon->get_state_name() + ").health " into *_dout and returns that stream.
// The hmon argument is not referenced in the visible body.
// NOTE(review): the function's closing brace is not present in this view.
33 #define dout_subsys ceph_subsys_mon
35 #define dout_prefix _prefix(_dout, mon, this)
36 static ostream
& _prefix(std::ostream
*_dout
, const Monitor
*mon
,
37 const HealthMonitor
*hmon
) {
38 return *_dout
<< "mon." << mon
->name
<< "@" << mon
->rank
39 << "(" << mon
->get_state_name() << ").health ";
// Constructor: forwards the monitor pointer, the Paxos pointer and the
// service name unchanged to the PaxosService base-class constructor.
// NOTE(review): the constructor body is not present in this view, so any
// further initialisation done there cannot be documented from here.
42 HealthMonitor::HealthMonitor(Monitor
*m
, Paxos
*p
, const string
& service_name
)
43 : PaxosService(m
, p
, service_name
) {
// init(): the only statement visible here emits the function name through
// dout(10).  NOTE(review): the enclosing braces are not present in this view.
46 void HealthMonitor::init()
48 dout(10) << __func__
<< dendl
;
// create_initial(): the only statement visible here emits the function name
// through dout(10).  NOTE(review): the enclosing braces are not present in
// this view.
51 void HealthMonitor::create_initial()
53 dout(10) << __func__
<< dendl
;
// update_from_paxos(): refreshes the in-memory health state from the store.
//
// Visible steps:
//  1. version is set from get_last_committed().
//  2. The "quorum" key under service_name is read into qbl; quorum_checks is
//     either decoded from an iterator p or cleared.
//  3. The "leader" key under service_name is read into lbl; leader_checks is
//     either decoded from an iterator p or cleared.
//  4. A JSONFormatter dump is built: a "health" object containing a
//     "quorum_health" section with one "mon.<key>" entry per quorum_checks
//     element, followed by a "leader_health" entry for leader_checks.
//
// NOTE(review): the declarations of qbl/lbl/p, the conditions that choose
// between decode and clear, the section-closing calls and the destination of
// the formatter output are not present in this view.  need_bootstrap is not
// referenced in the visible lines.
56 void HealthMonitor::update_from_paxos(bool *need_bootstrap
)
58 version
= get_last_committed();
59 dout(10) << __func__
<< dendl
;
// Per-quorum-member checks, stored under the "quorum" key.
63 mon
->store
->get(service_name
, "quorum", qbl
);
66 ::decode(quorum_checks
, p
);
68 quorum_checks
.clear();
// Leader-only checks, stored under the "leader" key.
72 mon
->store
->get(service_name
, "leader", lbl
);
75 ::decode(leader_checks
, p
);
77 leader_checks
.clear();
// Formatter dump of both maps; JSONFormatter is constructed with `true`.
81 JSONFormatter
jf(true);
82 jf
.open_object_section("health");
83 jf
.open_object_section("quorum_health");
84 for (auto& p
: quorum_checks
) {
85 string s
= string("mon.") + stringify(p
.first
);
86 jf
.dump_object(s
.c_str(), p
.second
);
89 jf
.dump_object("leader_health", leader_checks
);
// create_pending(): the only statement visible here logs a single space
// followed by the current `version` through dout(10).
// NOTE(review): the enclosing braces are not present in this view.
95 void HealthMonitor::create_pending()
97 dout(10) << " " << version
<< dendl
;
// encode_pending(): writes the current health state into transaction `t`.
//
// Visible steps:
//  1. Log `version` and record it with put_last_committed(t, version).
//  2. Encode quorum_checks into qbl and store it under key "quorum";
//     encode leader_checks into lbl and store it under key "leader".
//  3. Build the combined map `pending_health`:
//       - `names` maps each check code to the set of monitor names (looked
//         up via mon->monmap->get_name) whose quorum_checks entry has it;
//       - every quorum member's checks are merged into pending_health;
//       - in each merged summary the placeholders %hasorhave%, %names%,
//         %plurals% and %isorare% are replaced, choosing the plural wording
//         when more than one monitor name was recorded for that code;
//       - leader_checks is merged last.
//  4. Hand the result to encode_health(pending_health, t).
//
// NOTE(review): several original lines are not present in this view: the
// qbl/lbl declarations, whatever updates `version` before it is logged (if
// anything), the loop-closing braces, and the first argument of each
// boost::regex_replace call (presumably the summary being rewritten; please
// confirm against the full file).
100 void HealthMonitor::encode_pending(MonitorDBStore::TransactionRef t
)
103 dout(10) << " " << version
<< dendl
;
104 put_last_committed(t
, version
);
107 ::encode(quorum_checks
, qbl
);
108 t
->put(service_name
, "quorum", qbl
);
110 ::encode(leader_checks
, lbl
);
111 t
->put(service_name
, "leader", lbl
);
113 health_check_map_t pending_health
;
115 // combine per-mon details carefully...
116 map
<string
,set
<string
>> names
; // code -> <mon names>
// NOTE(review): both loops below iterate by value (`auto p`, `auto q`), so
// each element is copied per iteration; `const auto&` would avoid that.
117 for (auto p
: quorum_checks
) {
118 for (auto q
: p
.second
.checks
) {
119 names
[q
.first
].insert(mon
->monmap
->get_name(p
.first
));
121 pending_health
.merge(p
.second
);
// Placeholder substitution.  names[p.first].size() > 1 selects the plural
// wording ("have", "s", "are"); otherwise the singular ("has", "", "is").
// NOTE(review): a boost::regex is constructed for every placeholder on
// every iteration although all four patterns are fixed strings.
123 for (auto &p
: pending_health
.checks
) {
124 p
.second
.summary
= boost::regex_replace(
126 boost::regex("%hasorhave%"),
127 names
[p
.first
].size() > 1 ? "have" : "has");
128 p
.second
.summary
= boost::regex_replace(
130 boost::regex("%names%"), stringify(names
[p
.first
]));
131 p
.second
.summary
= boost::regex_replace(
133 boost::regex("%plurals%"),
134 names
[p
.first
].size() > 1 ? "s" : "");
135 p
.second
.summary
= boost::regex_replace(
137 boost::regex("%isorare%"),
138 names
[p
.first
].size() > 1 ? "are" : "is");
// Leader checks are merged after the substitution loop above.
141 pending_health
.merge(leader_checks
);
142 encode_health(pending_health
, t
);
// get_trim_to(): returns a version_t.
// NOTE(review): apart from the remark below, the body is not present in this
// view, so the value actually returned cannot be documented from here.
145 version_t
HealthMonitor::get_trim_to()
147 // we don't actually need *any* old states, but keep a few.
// preprocess_query(): takes a MonOpRequestRef and returns bool.
// NOTE(review): the body is not present in this view; confirm its behaviour
// against the full file before relying on it.
154 bool HealthMonitor::preprocess_query(MonOpRequestRef op
)
// prepare_update(): dispatches an incoming request by message type.
//
// Visible behaviour:
//  - The request message is fetched from `op` and logged together with its
//    original source instance through dout(7).
//  - In one switch branch the message is cast to MMonHealth and its service
//    type is looked up in `services`; when no service is registered for that
//    type a "not registered -- drop message!" line is logged through
//    dout(1), otherwise the matching service's service_dispatch(op) result
//    is returned.
//  - For MSG_MON_HEALTH_CHECKS the result of prepare_health_checks(op) is
//    returned.
//
// NOTE(review): the case label of the first branch, the value returned after
// the "drop message" log, and any default branch are not present in this
// view.
159 bool HealthMonitor::prepare_update(MonOpRequestRef op
)
161 Message
*m
= op
->get_req();
162 dout(7) << "prepare_update " << *m
163 << " from " << m
->get_orig_source_inst() << dendl
;
164 switch (m
->get_type()) {
167 MMonHealth
*hm
= static_cast<MMonHealth
*>(op
->get_req());
168 int service_type
= hm
->get_service_type();
169 if (services
.count(service_type
) == 0) {
170 dout(1) << __func__
<< " service type " << service_type
171 << " not registered -- drop message!" << dendl
;
174 return services
[service_type
]->service_dispatch(op
);
176 case MSG_MON_HEALTH_CHECKS
:
177 return prepare_health_checks(op
);
// prepare_health_checks(): stores the health checks carried by an
// MMonHealthChecks message.  The message's health_checks member is moved
// (std::move) into quorum_checks under the key m->get_source().num(),
// replacing any existing entry for that key; the message's copy is left in
// a moved-from state.
// NOTE(review): the return statement is not present in this view.
183 bool HealthMonitor::prepare_health_checks(MonOpRequestRef op
)
185 MMonHealthChecks
*m
= static_cast<MMonHealthChecks
*>(op
->get_req());
186 // no need to check if it's changed, the peon has done so
187 quorum_checks
[m
->get_source().num()] = std::move(m
->health_checks
);
// tick(): health re-evaluation entry point (presumably called periodically;
// confirm against the caller).
//
// Visible behaviour: logs the function name through dout(10), starts with
// `changed` = false, evaluates check_member_health(), tests whether this
// monitor is the leader, and evaluates check_leader_health().
//
// NOTE(review): the bodies of all three `if` statements, any statements
// before the first log line, and whatever is done with `changed` afterwards
// are not present in this view.
191 void HealthMonitor::tick()
196 dout(10) << __func__
<< dendl
;
197 bool changed
= false;
198 if (check_member_health()) {
201 if (!mon
->is_leader()) {
204 if (check_leader_health()) {
// check_member_health(): computes this monitor's own health checks.
//
// Visible steps:
//  1. Collect statistics into `stats`: filesystem stats for the path
//     g_conf->mon_data via get_fs_stats(), and the store's estimated size
//     with its "sst" / "log" / "misc" breakdown via
//     mon->store->get_estimated_size().  The estimated size is asserted to
//     be greater than zero.
//  2. Build a fresh health_check_map_t `next`:
//       MON_DISK_CRIT (HEALTH_ERR)  avail_percent <= mon_data_avail_crit
//       MON_DISK_LOW  (HEALTH_WARN) else avail_percent <= mon_data_avail_warn
//       MON_DISK_BIG  (HEALTH_WARN) bytes_total >= mon_data_size_warn
//       OSD_NO_DOWN_OUT_INTERVAL (HEALTH_WARN)
//                                   mon_warn_on_osd_down_out_interval_zero is
//                                   set and mon_osd_down_out_interval == 0
//     Each summary keeps %plurals% / %names% / %isorare% / %hasorhave%
//     placeholders; a per-monitor detail string is attached to each check.
//  3. Look up this monitor's entry (key mon->rank) in quorum_checks and
//     compare it with `next`.
//  4. On the leader, store `next` in quorum_checks[mon->rank].  In the other
//     visible path, send an MMonHealthChecks(next) message to the leader,
//     but only when the quorum's monitor features contain FEATURE_LUMINOUS.
//
// NOTE(review): the declaration of `stats`, the tails of the two
// "mon.<name> has <avail_percent>" detail strings, the bodies of the
// comparison branches in step 3, the else/closing braces and every return
// statement are not present in this view.
212 bool HealthMonitor::check_member_health()
214 dout(20) << __func__
<< dendl
;
215 bool changed
= false;
// Step 1: gather filesystem and store statistics.
219 get_fs_stats(stats
.fs_stats
, g_conf
->mon_data
.c_str());
220 map
<string
,uint64_t> extra
;
221 uint64_t store_size
= mon
->store
->get_estimated_size(extra
);
222 assert(store_size
> 0);
223 stats
.store_stats
.bytes_total
= store_size
;
224 stats
.store_stats
.bytes_sst
= extra
["sst"];
225 stats
.store_stats
.bytes_log
= extra
["log"];
226 stats
.store_stats
.bytes_misc
= extra
["misc"];
227 stats
.last_update
= ceph_clock_now();
228 dout(10) << __func__
<< " avail " << stats
.fs_stats
.avail_percent
<< "%"
229 << " total " << byte_u_t(stats
.fs_stats
.byte_total
)
230 << ", used " << byte_u_t(stats
.fs_stats
.byte_used
)
231 << ", avail " << byte_u_t(stats
.fs_stats
.byte_avail
) << dendl
;
// Step 2: build the new set of checks.  CRIT and LOW are mutually exclusive
// (if / else if); BIG is evaluated independently of both.
233 // MON_DISK_{LOW,CRIT,BIG}
234 health_check_map_t next
;
235 if (stats
.fs_stats
.avail_percent
<= g_conf
->mon_data_avail_crit
) {
236 stringstream ss
, ss2
;
237 ss
<< "mon%plurals% %names% %isorare% very low on available space";
238 auto& d
= next
.add("MON_DISK_CRIT", HEALTH_ERR
, ss
.str());
239 ss2
<< "mon." << mon
->name
<< " has " << stats
.fs_stats
.avail_percent
241 d
.detail
.push_back(ss2
.str());
242 } else if (stats
.fs_stats
.avail_percent
<= g_conf
->mon_data_avail_warn
) {
243 stringstream ss
, ss2
;
244 ss
<< "mon%plurals% %names% %isorare% low on available space";
245 auto& d
= next
.add("MON_DISK_LOW", HEALTH_WARN
, ss
.str());
246 ss2
<< "mon." << mon
->name
<< " has " << stats
.fs_stats
.avail_percent
248 d
.detail
.push_back(ss2
.str());
250 if (stats
.store_stats
.bytes_total
>= g_conf
->mon_data_size_warn
) {
251 stringstream ss
, ss2
;
252 ss
<< "mon%plurals% %names% %isorare% using a lot of disk space";
253 auto& d
= next
.add("MON_DISK_BIG", HEALTH_WARN
, ss
.str());
254 ss2
<< "mon." << mon
->name
<< " is "
255 << byte_u_t(stats
.store_stats
.bytes_total
)
256 << " >= mon_data_size_warn ("
257 << byte_u_t(g_conf
->mon_data_size_warn
) << ")";
258 d
.detail
.push_back(ss2
.str());
261 // OSD_NO_DOWN_OUT_INTERVAL
263 // Warn if 'mon_osd_down_out_interval' is set to zero.
264 // Having this option set to zero on the leader acts much like the
265 // 'noout' flag. It's hard to figure out what's going wrong with clusters
266 // without the 'noout' flag set but acting like that just the same, so
267 // we report a HEALTH_WARN in case this option is set to zero.
268 // This is an ugly hack to get the warning out, but until we find a way
269 // to spread global options throughout the mon cluster and have all mons
270 // using a base set of the same options, we need to work around this sort
272 // There's also the obvious drawback that if this is set on a single
273 // monitor on a 3-monitor cluster, this warning will only be shown every
274 // third monitor connection.
275 if (g_conf
->mon_warn_on_osd_down_out_interval_zero
&&
276 g_conf
->mon_osd_down_out_interval
== 0) {
277 ostringstream ss
, ds
;
278 ss
<< "mon%plurals% %names% %hasorhave% mon_osd_down_out_interval set to 0";
279 auto& d
= next
.add("OSD_NO_DOWN_OUT_INTERVAL", HEALTH_WARN
, ss
.str());
280 ds
<< "mon." << mon
->name
<< " has mon_osd_down_out_interval set to 0";
281 d
.detail
.push_back(ds
.str());
// Step 3: compare against the entry currently recorded for this monitor.
// NOTE(review): the branch bodies are not present in this view.
285 auto p
= quorum_checks
.find(mon
->rank
);
286 if (p
== quorum_checks
.end()) {
291 if (p
->second
== next
) {
// Step 4: publish the new checks, locally on the leader or by message.
296 if (mon
->is_leader()) {
297 // prepare to propose
298 quorum_checks
[mon
->rank
] = next
;
301 // tell the leader, but only if the quorum is luminous
302 if (mon
->quorum_mon_features
.contains_all(
303 ceph::features::mon::FEATURE_LUMINOUS
)) {
304 mon
->messenger
->send_message(new MMonHealthChecks(next
),
305 mon
->monmap
->get_inst(mon
->get_leader()));
312 bool HealthMonitor::check_leader_health()
314 dout(20) << __func__
<< dendl
;
315 bool changed
= false;
317 // prune quorum_health
319 auto& qset
= mon
->get_quorum();
320 auto p
= quorum_checks
.begin();
321 while (p
!= quorum_checks
.end()) {
322 if (qset
.count(p
->first
) == 0) {
323 p
= quorum_checks
.erase(p
);
331 health_check_map_t next
;
335 int max
= mon
->monmap
->size();
336 int actual
= mon
->get_quorum().size();
339 ss
<< (max
-actual
) << "/" << max
<< " mons down, quorum "
340 << mon
->get_quorum_names();
341 auto& d
= next
.add("MON_DOWN", HEALTH_WARN
, ss
.str());
342 set
<int> q
= mon
->get_quorum();
343 for (int i
=0; i
<max
; i
++) {
344 if (q
.count(i
) == 0) {
346 ss
<< "mon." << mon
->monmap
->get_name(i
) << " (rank " << i
347 << ") addr " << mon
->monmap
->get_addr(i
)
348 << " is down (out of quorum)";
349 d
.detail
.push_back(ss
.str());
356 if (!mon
->timecheck_skews
.empty()) {
358 list
<string
> details
;
359 for (map
<entity_inst_t
,double>::iterator i
= mon
->timecheck_skews
.begin();
360 i
!= mon
->timecheck_skews
.end(); ++i
) {
361 entity_inst_t inst
= i
->first
;
362 double skew
= i
->second
;
363 double latency
= mon
->timecheck_latencies
[inst
];
364 string name
= mon
->monmap
->get_name(inst
.addr
);
366 health_status_t tcstatus
= mon
->timecheck_status(tcss
, skew
, latency
);
367 if (tcstatus
!= HEALTH_OK
) {
368 warns
.push_back(name
);
369 ostringstream tmp_ss
;
370 tmp_ss
<< "mon." << name
371 << " addr " << inst
.addr
<< " " << tcss
.str()
372 << " (latency " << latency
<< "s)";
373 details
.push_back(tmp_ss
.str());
376 if (!warns
.empty()) {
378 ss
<< "clock skew detected on";
379 while (!warns
.empty()) {
380 ss
<< " mon." << warns
.front();
385 auto& d
= next
.add("MON_CLOCK_SKEW", HEALTH_WARN
, ss
.str());
386 d
.detail
.swap(details
);
390 if (next
!= leader_checks
) {
392 leader_checks
= next
;