]> git.proxmox.com Git - ceph.git/blob - ceph/src/exporter/DaemonMetricCollector.cc
7f88113b9905690e6abc426041b64fef70654190
[ceph.git] / ceph / src / exporter / DaemonMetricCollector.cc
#include "DaemonMetricCollector.h"
#include "common/admin_socket_client.h"
#include "common/debug.h"
#include "common/hostname.h"
#include "common/perf_counters.h"
#include "global/global_init.h"
#include "global/global_context.h"
#include "common/split.h"
#include "include/common_fwd.h"
#include "util.h"

#include <boost/json/src.hpp>
#include <chrono>
#include <filesystem>
#include <iostream>
#include <map>
#include <memory>
#include <regex>
#include <string>
#include <system_error>
#include <utility>
21
22 #define dout_context g_ceph_context
23 #define dout_subsys ceph_subsys_ceph_exporter
24
25 using json_object = boost::json::object;
26 using json_value = boost::json::value;
27 using json_array = boost::json::array;
28
29 void DaemonMetricCollector::request_loop(boost::asio::steady_timer &timer) {
30 timer.async_wait([&](const boost::system::error_code &e) {
31 std::cerr << e << std::endl;
32 update_sockets();
33 dump_asok_metrics();
34 auto stats_period = g_conf().get_val<int64_t>("exporter_stats_period");
35 // time to wait before sending requests again
36 timer.expires_from_now(std::chrono::seconds(stats_period));
37 request_loop(timer);
38 });
39 }
40
41 void DaemonMetricCollector::main() {
42 // time to wait before sending requests again
43
44 boost::asio::io_service io;
45 boost::asio::steady_timer timer{io, std::chrono::seconds(0)};
46 request_loop(timer);
47 io.run();
48 }
49
50 std::string DaemonMetricCollector::get_metrics() {
51 const std::lock_guard<std::mutex> lock(metrics_mutex);
52 return metrics;
53 }
54
55 template <class T>
56 void add_metric(std::unique_ptr<MetricsBuilder> &builder, T value,
57 std::string name, std::string description, std::string mtype,
58 labels_t labels) {
59 builder->add(std::to_string(value), name, description, mtype, labels);
60 }
61
62 void add_double_or_int_metric(std::unique_ptr<MetricsBuilder> &builder,
63 json_value value, std::string name,
64 std::string description, std::string mtype,
65 labels_t labels) {
66 if (value.is_int64()) {
67 int64_t v = value.as_int64();
68 add_metric(builder, v, name, description, mtype, labels);
69 } else if (value.is_double()) {
70 double v = value.as_double();
71 add_metric(builder, v, name, description, mtype, labels);
72 }
73 }
74
75 std::string boost_string_to_std(boost::json::string js) {
76 std::string res(js.data());
77 return res;
78 }
79
/**
 * Wraps `value` in double quotes so it can be used as a label value.
 *
 * Backslash, double quote and line feed are escaped as \\, \" and \n so a
 * value containing them cannot break out of the quoted string. Values
 * without those characters are rendered exactly as before.
 */
std::string quote(std::string value) {
  std::string quoted;
  quoted.reserve(value.size() + 2);
  quoted.push_back('"');
  for (const char ch : value) {
    switch (ch) {
    case '\\':
      quoted += "\\\\";
      break;
    case '"':
      quoted += "\\\"";
      break;
    case '\n':
      quoted += "\\n";
      break;
    default:
      quoted.push_back(ch);
      break;
    }
  }
  quoted.push_back('"');
  return quoted;
}
81
/// Predicate that is true only for the '-' character.
bool is_hyphen(char ch) {
  const bool matches = ('-' == ch);
  return matches;
}
83
84 void DaemonMetricCollector::dump_asok_metrics() {
85 BlockTimer timer(__FILE__, __FUNCTION__);
86
87 std::vector<std::pair<std::string, int>> daemon_pids;
88
89 bool sort = g_conf().get_val<bool>("exporter_sort_metrics");
90 if (sort) {
91 builder = std::unique_ptr<OrderedMetricsBuilder>(new OrderedMetricsBuilder());
92 } else {
93 builder = std::unique_ptr<UnorderedMetricsBuilder>(new UnorderedMetricsBuilder());
94 }
95 for (auto &[daemon_name, sock_client] : clients) {
96 bool ok;
97 sock_client.ping(&ok);
98 if (!ok) {
99 continue;
100 }
101 std::string perf_dump_response = asok_request(sock_client, "perf dump", daemon_name);
102 if (perf_dump_response.size() == 0) {
103 continue;
104 }
105 std::string perf_schema_response = asok_request(sock_client, "perf schema", daemon_name);
106 if (perf_schema_response.size() == 0) {
107 continue;
108 }
109 std::string config_show = asok_request(sock_client, "config show", daemon_name);
110 json_object pid_file_json = boost::json::parse(config_show).as_object();
111 std::string pid_path =
112 boost_string_to_std(pid_file_json["pid_file"].as_string());
113 std::string pid_str = read_file_to_string(pid_path);
114 if (!pid_path.size()) {
115 continue;
116 }
117 daemon_pids.push_back({daemon_name, std::stoi(pid_str)});
118 json_object dump = boost::json::parse(perf_dump_response).as_object();
119 json_object schema = boost::json::parse(perf_schema_response).as_object();
120 for (auto &perf : schema) {
121 auto sv = perf.key();
122 std::string perf_group = {sv.begin(), sv.end()};
123 json_object perf_group_object = perf.value().as_object();
124 for (auto &perf_counter : perf_group_object) {
125 auto sv1 = perf_counter.key();
126 std::string perf_name = {sv1.begin(), sv1.end()};
127 json_object perf_info = perf_counter.value().as_object();
128 auto prio_limit = g_conf().get_val<int64_t>("exporter_prio_limit");
129 if (perf_info["priority"].as_int64() <
130 prio_limit) {
131 continue;
132 }
133 std::string name = "ceph_" + perf_group + "_" + perf_name;
134 std::replace_if(name.begin(), name.end(), is_hyphen, '_');
135
136 // FIXME: test this, based on mgr_module perfpath_to_path_labels
137 auto labels_and_name = get_labels_and_metric_name(daemon_name, name);
138 labels_t labels = labels_and_name.first;
139 name = labels_and_name.second;
140
141 json_value perf_values = dump[perf_group].as_object()[perf_name];
142 dump_asok_metric(perf_info, perf_values, name, labels);
143 }
144 }
145 }
146 dout(10) << "Perf counters retrieved for " << clients.size() << " daemons." << dendl;
147 // get time spent on this function
148 timer.stop();
149 std::string scrap_desc("Time spent scraping and transforming perfcounters to metrics");
150 labels_t scrap_labels;
151 scrap_labels["host"] = quote(ceph_get_hostname());
152 scrap_labels["function"] = quote(__FUNCTION__);
153 add_metric(builder, timer.get_ms(), "ceph_exporter_scrape_time", scrap_desc,
154 "gauge", scrap_labels);
155
156 const std::lock_guard<std::mutex> lock(metrics_mutex);
157 get_process_metrics(daemon_pids);
158 metrics = builder->dump();
159 }
160
161 std::vector<std::string> read_proc_stat_file(std::string path) {
162 std::string stat = read_file_to_string(path);
163 std::vector<std::string> strings;
164 auto parts = ceph::split(stat);
165 strings.assign(parts.begin(), parts.end());
166 return strings;
167 }
168
169 struct pstat read_pid_stat(int pid) {
170 std::string stat_path("/proc/" + std::to_string(pid) + "/stat");
171 std::vector<std::string> stats = read_proc_stat_file(stat_path);
172 struct pstat stat;
173 stat.minflt = std::stoul(stats[9]);
174 stat.majflt = std::stoul(stats[11]);
175 stat.utime = std::stoul(stats[13]);
176 stat.stime = std::stoul(stats[14]);
177 stat.num_threads = std::stoul(stats[19]);
178 stat.start_time = std::stoul(stats[21]);
179 stat.vm_size = std::stoul(stats[22]);
180 stat.resident_size = std::stoi(stats[23]);
181 return stat;
182 }
183
184 void DaemonMetricCollector::get_process_metrics(std::vector<std::pair<std::string, int>> daemon_pids) {
185 std::string path("/proc");
186 std::stringstream ss;
187 for (auto &[daemon_name, pid] : daemon_pids) {
188 std::vector<std::string> uptimes = read_proc_stat_file("/proc/uptime");
189 struct pstat stat = read_pid_stat(pid);
190 int clk_tck = sysconf(_SC_CLK_TCK);
191 double start_time_seconds = stat.start_time / (double)clk_tck;
192 double user_time = stat.utime / (double)clk_tck;
193 double kernel_time = stat.stime / (double)clk_tck;
194 double total_time_seconds = user_time + kernel_time;
195 double uptime = std::stod(uptimes[0]);
196 double elapsed_time = uptime - start_time_seconds;
197 double idle_time = elapsed_time - total_time_seconds;
198 double usage = total_time_seconds * 100 / elapsed_time;
199
200 labels_t labels;
201 labels["ceph_daemon"] = quote(daemon_name);
202 add_metric(builder, stat.minflt, "ceph_exporter_minflt_total",
203 "Number of minor page faults of daemon", "counter", labels);
204 add_metric(builder, stat.majflt, "ceph_exporter_majflt_total",
205 "Number of major page faults of daemon", "counter", labels);
206 add_metric(builder, stat.num_threads, "ceph_exporter_num_threads",
207 "Number of threads used by daemon", "gauge", labels);
208 add_metric(builder, usage, "ceph_exporter_cpu_usage", "CPU usage of a daemon",
209 "gauge", labels);
210
211 std::string cpu_time_desc = "Process time in kernel/user/idle mode";
212 labels_t cpu_total_labels;
213 cpu_total_labels["ceph_daemon"] = quote(daemon_name);
214 cpu_total_labels["mode"] = quote("kernel");
215 add_metric(builder, kernel_time, "ceph_exporter_cpu_total", cpu_time_desc,
216 "counter", cpu_total_labels);
217 cpu_total_labels["mode"] = quote("user");
218 add_metric(builder, user_time, "ceph_exporter_cpu_total", cpu_time_desc,
219 "counter", cpu_total_labels);
220 cpu_total_labels["mode"] = quote("idle");
221 add_metric(builder, idle_time, "ceph_exporter_cpu_total", cpu_time_desc,
222 "counter", cpu_total_labels);
223 add_metric(builder, stat.vm_size, "ceph_exporter_vm_size", "Virtual memory used in a daemon",
224 "gauge", labels);
225 add_metric(builder, stat.resident_size, "ceph_exporter_resident_size",
226 "Resident memory in a daemon", "gauge", labels);
227 }
228 }
229
230 std::string DaemonMetricCollector::asok_request(AdminSocketClient &asok,
231 std::string command, std::string daemon_name) {
232 std::string request("{\"prefix\": \"" + command + "\"}");
233 std::string response;
234 std::string err = asok.do_request(request, &response);
235 if (err.length() > 0 || response.substr(0, 5) == "ERROR") {
236 dout(1) << "command " << command << "failed for daemon " << daemon_name
237 << "with error: " << err << dendl;
238 return "";
239 }
240 return response;
241 }
242
243 std::pair<labels_t, std::string>
244 DaemonMetricCollector::get_labels_and_metric_name(std::string daemon_name,
245 std::string metric_name) {
246 std::string new_metric_name;
247 labels_t labels;
248 new_metric_name = metric_name;
249 if (daemon_name.find("rgw") != std::string::npos) {
250 std::string tmp = daemon_name.substr(16, std::string::npos);
251 std::string::size_type pos = tmp.find('.');
252 labels["instance_id"] = quote("rgw." + tmp.substr(0, pos));
253 } else {
254 labels["ceph_daemon"] = quote(daemon_name);
255 if (daemon_name.find("rbd-mirror") != std::string::npos) {
256 std::regex re("^rbd_mirror_image_([^/]+)/(?:(?:([^/]+)/"
257 ")?)(.*)\\.(replay(?:_bytes|_latency)?)$");
258 std::smatch match;
259 if (std::regex_search(daemon_name, match, re) == true) {
260 new_metric_name = "ceph_rbd_mirror_image_" + match.str(4);
261 labels["pool"] = quote(match.str(1));
262 labels["namespace"] = quote(match.str(2));
263 labels["image"] = quote(match.str(3));
264 }
265 }
266 }
267 return {labels, new_metric_name};
268 }
269
270 /*
271 perf_values can be either a int/double or a json_object. Since
272 json_value is a wrapper of both we use that class.
273 */
274 void DaemonMetricCollector::dump_asok_metric(json_object perf_info,
275 json_value perf_values,
276 std::string name,
277 labels_t labels) {
278 int64_t type = perf_info["type"].as_int64();
279 std::string metric_type =
280 boost_string_to_std(perf_info["metric_type"].as_string());
281 std::string description =
282 boost_string_to_std(perf_info["description"].as_string());
283
284 if (type & PERFCOUNTER_LONGRUNAVG) {
285 int64_t count = perf_values.as_object()["avgcount"].as_int64();
286 add_metric(builder, count, name + "_count", description, metric_type,
287 labels);
288 json_value sum_value = perf_values.as_object()["sum"];
289 add_double_or_int_metric(builder, sum_value, name + "_sum", description,
290 metric_type, labels);
291 } else if (type & PERFCOUNTER_TIME) {
292 if (perf_values.is_int64()) {
293 double value = perf_values.as_int64() / 1000000000.0f;
294 add_metric(builder, value, name, description, metric_type, labels);
295 } else if (perf_values.is_double()) {
296 double value = perf_values.as_double() / 1000000000.0f;
297 add_metric(builder, value, name, description, metric_type, labels);
298 }
299 } else {
300 add_double_or_int_metric(builder, perf_values, name, description,
301 metric_type, labels);
302 }
303 }
304
305 void DaemonMetricCollector::update_sockets() {
306 std::string sock_dir = g_conf().get_val<std::string>("exporter_sock_dir");
307 clients.clear();
308 std::filesystem::path sock_path = sock_dir;
309 if(!std::filesystem::is_directory(sock_path.parent_path())) {
310 dout(1) << "ERROR: No such directory exist" << sock_dir << dendl;
311 return;
312 }
313 for (const auto &entry :
314 std::filesystem::directory_iterator(sock_dir)) {
315 if (entry.path().extension() == ".asok") {
316 std::string daemon_socket_name = entry.path().filename().string();
317 std::string daemon_name =
318 daemon_socket_name.substr(0, daemon_socket_name.size() - 5);
319 if (clients.find(daemon_name) == clients.end() &&
320 !(daemon_name.find("mgr") != std::string::npos) &&
321 !(daemon_name.find("ceph-exporter") != std::string::npos)) {
322 AdminSocketClient sock(entry.path().string());
323 clients.insert({daemon_name, std::move(sock)});
324 }
325 }
326 }
327 }
328
329 void OrderedMetricsBuilder::add(std::string value, std::string name,
330 std::string description, std::string mtype,
331 labels_t labels) {
332
333 if (metrics.find(name) == metrics.end()) {
334 Metric metric(name, mtype, description);
335 metrics[name] = std::move(metric);
336 }
337 Metric &metric = metrics[name];
338 metric.add(labels, value);
339 }
340
341 std::string OrderedMetricsBuilder::dump() {
342 for (auto &[name, metric] : metrics) {
343 out += metric.dump() + "\n";
344 }
345 return out;
346 }
347
348 void UnorderedMetricsBuilder::add(std::string value, std::string name,
349 std::string description, std::string mtype,
350 labels_t labels) {
351
352 Metric metric(name, mtype, description);
353 metric.add(labels, value);
354 out += metric.dump() + "\n\n";
355 }
356
357 std::string UnorderedMetricsBuilder::dump() { return out; }
358
359 void Metric::add(labels_t labels, std::string value) {
360 metric_entry entry;
361 entry.labels = labels;
362 entry.value = value;
363 entries.push_back(entry);
364 }
365
366 std::string Metric::dump() {
367 std::stringstream metric_ss;
368 metric_ss << "# HELP " << name << " " << description << "\n";
369 metric_ss << "# TYPE " << name << " " << mtype << "\n";
370 for (auto &entry : entries) {
371 std::stringstream labels_ss;
372 size_t i = 0;
373 for (auto &[label_name, label_value] : entry.labels) {
374 labels_ss << label_name << "=" << label_value;
375 if (i < entry.labels.size() - 1) {
376 labels_ss << ",";
377 }
378 i++;
379 }
380 metric_ss << name << "{" << labels_ss.str() << "} " << entry.value;
381 if (&entry != &entries.back()) {
382 metric_ss << "\n";
383 }
384 }
385 return metric_ss.str();
386 }
387
388 DaemonMetricCollector &collector_instance() {
389 static DaemonMetricCollector instance;
390 return instance;
391 }