1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2013 Inktank, Inc
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
14 #include "include/memory.h"
27 #ifdef HAVE_SYS_MOUNT_H
28 #include <sys/mount.h>
31 #ifdef HAVE_SYS_PARAM_H
32 #include <sys/param.h>
35 #include "messages/MMonHealth.h"
36 #include "include/assert.h"
37 #include "common/Formatter.h"
38 #include "common/errno.h"
40 #include "mon/Monitor.h"
41 #include "mon/DataHealthService.h"
43 #define dout_subsys ceph_subsys_mon
45 #define dout_prefix _prefix(_dout, mon, this)
// Builds the prefix that the dout_prefix macro (defined just above) puts in
// front of every log line emitted from this file:
//   "mon.<name>@<rank>(<state name>).<service name>(<service epoch>) "
// Writes it to *_dout and returns that same stream so the caller can keep
// chaining operator<<.
// NOTE(review): mon and svc are dereferenced unconditionally in the visible
// code; confirm that callers never pass null.
46 static ostream
& _prefix(std::ostream
*_dout
, const Monitor
*mon
,
47 const DataHealthService
*svc
) {
50 return *_dout
<< "mon." << mon
->name
<< "@" << mon
->rank
51 << "(" << mon
->get_state_name() << ")." << svc
->get_name()
52 << "(" << svc
->get_epoch() << ") ";
// Hook run when this service begins a new epoch.  Logs the epoch at debug
// level 10 and resets last_warned_percent to 0.
// NOTE(review): the comment below talks about clearing the stats, but the only
// state reset visible here is last_warned_percent -- confirm that the stats
// container is cleared as well.
55 void DataHealthService::start_epoch()
57 dout(10) << __func__
<< " epoch " << get_epoch() << dendl
;
58 // we are not bound by election epochs, but we should clear the stats
59 // every time an election is triggered. As far as we know, a monitor might
60 // have been running out of disk space and someone fixed it. We don't want
61 // to hold the cluster back, even confusing the user, due to some possibly
64 last_warned_percent
= 0;
// Reports the health of every monitor data store we currently hold stats for.
//
// For each entry in stats this:
//   - grades free space: HEALTH_ERR when avail_percent is at or below
//     mon_data_avail_crit, HEALTH_WARN when at or below mon_data_avail_warn;
//   - flags a store whose bytes_total has reached mon_data_size_warn;
//   - when the result is not HEALTH_OK, appends "mon.<name> <detail>" to
//     summary and the same text plus " -- <N>% avail" to *detail;
//   - dumps the monitor's name, health and (if unhealthy) health_detail
//     through f, inside a "data_health" object holding a "mons" array.
//
// summary: receives one (status, message) pair per unhealthy monitor.
// detail:  receives the longer per-monitor message.
// NOTE(review): f and detail are dereferenced without a visible null check,
// and the declarations of f, ss and health_detail are not visible here --
// confirm how they are declared and guarded.
67 void DataHealthService::get_health(
69 list
<pair
<health_status_t
,string
> >& summary
,
70 list
<pair
<health_status_t
,string
> > *detail
)
72 dout(10) << __func__
<< dendl
;
74 f
->open_object_section("data_health");
75 f
->open_array_section("mons");
// One pass per monitor we hold stats for; entries are keyed by entity_inst_t.
78 for (map
<entity_inst_t
,DataStats
>::iterator it
= stats
.begin();
79 it
!= stats
.end(); ++it
) {
80 string mon_name
= mon
->monmap
->get_name(it
->first
.addr
);
// NOTE: this local reference shadows the stats container iterated by the
// loop header; from here on "stats" means this monitor's DataStats.
81 DataStats
& stats
= it
->second
;
83 health_status_t health_status
= HEALTH_OK
;
// Free-space grading: the critical threshold is tested first so it takes
// precedence over the warning threshold.
85 if (stats
.fs_stats
.avail_percent
<= g_conf
->mon_data_avail_crit
) {
86 health_status
= HEALTH_ERR
;
87 health_detail
= "low disk space, shutdown imminent";
88 } else if (stats
.fs_stats
.avail_percent
<= g_conf
->mon_data_avail_warn
) {
89 health_status
= HEALTH_WARN
;
90 health_detail
= "low disk space";
// Store-size check, independent of the free-space grading above.
93 if (stats
.store_stats
.bytes_total
>= g_conf
->mon_data_size_warn
) {
// presumably HEALTH_OK compares greater than HEALTH_WARN, so this lowers an
// OK status to WARN without overriding ERR -- confirm against the
// health_status_t definition.
94 if (health_status
> HEALTH_WARN
)
95 health_status
= HEALTH_WARN
;
// Separate this message from an earlier low-disk-space message, if any.
96 if (!health_detail
.empty())
97 health_detail
.append("; ");
99 ss
<< "store is getting too big! "
100 << prettybyte_t(stats
.store_stats
.bytes_total
)
101 << " >= " << prettybyte_t(g_conf
->mon_data_size_warn
);
102 health_detail
.append(ss
.str());
105 if (health_status
!= HEALTH_OK
) {
107 ss
<< "mon." << mon_name
<< " " << health_detail
;
108 summary
.push_back(make_pair(health_status
, ss
.str()));
// ss keeps the summary text; the detail entry is that text plus the
// free-space percentage.
109 ss
<< " -- " << stats
.fs_stats
.avail_percent
<< "% avail";
111 detail
->push_back(make_pair(health_status
, ss
.str()));
// Structured output for this monitor.
115 f
->open_object_section("mon");
116 f
->dump_string("name", mon_name
.c_str());
117 // leave this unenclosed by an object section to avoid breaking backward-compatibility
119 f
->dump_stream("health") << health_status
;
120 if (health_status
!= HEALTH_OK
)
121 f
->dump_string("health_detail", health_detail
);
127 f
->close_section(); // mons
128 f
->close_section(); // data_health
// Fills ours.store_stats from the monitor store's size estimate and stamps
// ours.last_update with the current time.
//
// ours: the DataStats record to update in place.
// Returns an int; the return statement is not visible here -- TODO confirm
// which value is returned.
132 int DataHealthService::update_store_stats(DataStats
&ours
)
// get_estimated_size() returns the total and fills "extra" with a per-category
// breakdown keyed by name.
134 map
<string
,uint64_t> extra
;
135 uint64_t store_size
= mon
->store
->get_estimated_size(extra
);
// A zero estimate is treated as a programming error rather than reported.
136 assert(store_size
> 0);
138 ours
.store_stats
.bytes_total
= store_size
;
// map::operator[] yields 0 for a category the store did not report.
139 ours
.store_stats
.bytes_sst
= extra
["sst"];
140 ours
.store_stats
.bytes_log
= extra
["log"];
141 ours
.store_stats
.bytes_misc
= extra
["misc"];
142 ours
.last_update
= ceph_clock_now();
// Refreshes this monitor's own entry in stats: first the filesystem usage of
// the mon_data directory (via get_fs_stats), then the store size (via
// update_store_stats).
//
// Returns the result of update_store_stats().
// NOTE(review): the condition guarding the get_fs_stats error log, and any
// early return on that path, are not visible here -- confirm.
148 int DataHealthService::update_stats()
// stats[our_inst] creates our entry if it does not exist yet.
150 entity_inst_t our_inst
= mon
->messenger
->get_myinst();
151 DataStats
& ours
= stats
[our_inst
];
153 int err
= get_fs_stats(ours
.fs_stats
, g_conf
->mon_data
.c_str());
155 derr
<< __func__
<< " get_fs_stats error: " << cpp_strerror(err
) << dendl
;
// Logged at level 0 on every call.
158 dout(0) << __func__
<< " avail " << ours
.fs_stats
.avail_percent
<< "%"
159 << " total " << prettybyte_t(ours
.fs_stats
.byte_total
)
160 << ", used " << prettybyte_t(ours
.fs_stats
.byte_used
)
161 << ", avail " << prettybyte_t(ours
.fs_stats
.byte_avail
) << dendl
;
162 ours
.last_update
= ceph_clock_now();
164 return update_store_stats(ours
);
// Sends this monitor's DataStats to the quorum members as an MMonHealth
// message with service op OP_TELL.
//
// Precondition (asserted): stats is non-empty and already holds an entry for
// our own entity_inst_t.
167 void DataHealthService::share_stats()
169 dout(10) << __func__
<< dendl
;
173 assert(!stats
.empty());
174 entity_inst_t our_inst
= mon
->messenger
->get_myinst();
175 assert(stats
.count(our_inst
) > 0);
176 DataStats
&ours
= stats
[our_inst
];
// Iterate over the quorum; each element is passed to the monmap's get_name()
// and get_inst() lookups.
177 const set
<int>& quorum
= mon
->get_quorum();
178 for (set
<int>::const_iterator it
= quorum
.begin();
179 it
!= quorum
.end(); ++it
) {
// NOTE(review): the statement controlled by this check is not visible here;
// presumably it skips ourselves so we do not message our own rank -- confirm.
180 if (mon
->monmap
->get_name(*it
) == mon
->name
)
182 entity_inst_t inst
= mon
->monmap
->get_inst(*it
);
// NOTE(review): m is allocated with new and handed to send_message();
// presumably the messenger takes ownership -- confirm.
183 MMonHealth
*m
= new MMonHealth(HealthService::SERVICE_HEALTH_DATA
,
184 MMonHealth::OP_TELL
);
185 m
->data_stats
= ours
;
186 dout(20) << __func__
<< " send " << *m
<< " to " << inst
<< dendl
;
187 mon
->messenger
->send_message(m
, inst
);
// Periodic work for this service: refreshes our own stats via update_stats()
// and then checks local free space against the critical and warning
// thresholds (mon_data_avail_crit, mon_data_avail_warn).
// NOTE(review): several conditions and actions of this function are not
// visible here (the guard around the first error log, what follows the
// "shutdown!" message, the stream that receives the warning text, and the
// branch structure around the final reset) -- confirm before relying on the
// notes below.
191 void DataHealthService::service_tick()
193 dout(10) << __func__
<< dendl
;
195 int err
= update_stats();
197 derr
<< "something went wrong obtaining our disk stats: "
198 << cpp_strerror(err
) << dendl
;
// Look at our own entry only; other monitors' stats are not examined here.
205 DataStats
&ours
= stats
[mon
->messenger
->get_myinst()];
207 if (ours
.fs_stats
.avail_percent
<= g_conf
->mon_data_avail_crit
) {
208 derr
<< "reached critical levels of available space on local monitor storage"
209 << " -- shutdown!" << dendl
;
214 // we must backoff these warnings, and track how much data is being
215 // consumed in-between reports to assess if it's worth to log this info,
216 // otherwise we may very well contribute to the consumption of the
217 // already low available disk space.
218 if (ours
.fs_stats
.avail_percent
<= g_conf
->mon_data_avail_warn
) {
// Only warn again when the percentage differs from the one last warned about.
219 if (ours
.fs_stats
.avail_percent
!= last_warned_percent
)
221 << "reached concerning levels of available space on local monitor storage"
222 << " (" << ours
.fs_stats
.avail_percent
<< "% free)";
223 last_warned_percent
= ours
.fs_stats
.avail_percent
;
// presumably the branch taken when free space is above the warning threshold,
// so a later drop warns again -- confirm.
225 last_warned_percent
= 0;
// Handles an OP_TELL message: records the sender's DataStats in stats, keyed
// by the sender's entity_inst_t (replacing any previous entry for it).
//
// op: the request; its payload is treated as an MMonHealth and its service op
//     is asserted to be OP_TELL.
229 void DataHealthService::handle_tell(MonOpRequestRef op
)
231 op
->mark_event("datahealth:handle_tell");
// static_cast: the request is assumed to already be an MMonHealth; no runtime
// type check is made here.
232 MMonHealth
*m
= static_cast<MMonHealth
*>(op
->get_req());
233 dout(10) << __func__
<< " " << *m
<< dendl
;
234 assert(m
->get_service_op() == MMonHealth::OP_TELL
);
236 stats
[m
->get_source_inst()] = m
->data_stats
;
239 bool DataHealthService::service_dispatch_op(MonOpRequestRef op
)
241 op
->mark_event("datahealth:service_dispatch_op");
242 MMonHealth
*m
= static_cast<MMonHealth
*>(op
->get_req());
243 dout(10) << __func__
<< " " << *m
<< dendl
;
244 assert(m
->get_service_type() == get_type());
246 dout(1) << __func__
<< " not in quorum -- drop message" << dendl
;
250 switch (m
->service_op
) {
251 case MMonHealth::OP_TELL
:
252 // someone is telling us their stats
256 dout(0) << __func__
<< " unknown op " << m
->service_op
<< dendl
;
257 assert(0 == "Unknown service op");