]> git.proxmox.com Git - ceph.git/blame - ceph/src/exporter/DaemonMetricCollector.cc
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / exporter / DaemonMetricCollector.cc
CommitLineData
#include "DaemonMetricCollector.h"

#include <boost/json/src.hpp>
#include <chrono>
#include <exception>
#include <filesystem>
#include <iostream>
#include <map>
#include <memory>
#include <regex>
#include <string>
#include <system_error>
#include <utility>

#include "common/admin_socket_client.h"
#include "common/debug.h"
#include "common/hostname.h"
#include "common/perf_counters.h"
#include "common/split.h"
#include "global/global_context.h"
#include "global/global_init.h"
#include "include/common_fwd.h"
#include "util.h"
22
2a845540
TL
23#define dout_context g_ceph_context
24#define dout_subsys ceph_subsys_ceph_exporter
25
26using json_object = boost::json::object;
27using json_value = boost::json::value;
28using json_array = boost::json::array;
29
30void DaemonMetricCollector::request_loop(boost::asio::steady_timer &timer) {
31 timer.async_wait([&](const boost::system::error_code &e) {
32 std::cerr << e << std::endl;
33 update_sockets();
34 dump_asok_metrics();
35 auto stats_period = g_conf().get_val<int64_t>("exporter_stats_period");
36 // time to wait before sending requests again
37 timer.expires_from_now(std::chrono::seconds(stats_period));
38 request_loop(timer);
39 });
40}
41
42void DaemonMetricCollector::main() {
43 // time to wait before sending requests again
44
45 boost::asio::io_service io;
46 boost::asio::steady_timer timer{io, std::chrono::seconds(0)};
47 request_loop(timer);
48 io.run();
49}
50
51std::string DaemonMetricCollector::get_metrics() {
52 const std::lock_guard<std::mutex> lock(metrics_mutex);
53 return metrics;
54}
55
56template <class T>
57void add_metric(std::unique_ptr<MetricsBuilder> &builder, T value,
58 std::string name, std::string description, std::string mtype,
59 labels_t labels) {
60 builder->add(std::to_string(value), name, description, mtype, labels);
61}
62
63void add_double_or_int_metric(std::unique_ptr<MetricsBuilder> &builder,
64 json_value value, std::string name,
65 std::string description, std::string mtype,
66 labels_t labels) {
67 if (value.is_int64()) {
68 int64_t v = value.as_int64();
69 add_metric(builder, v, name, description, mtype, labels);
70 } else if (value.is_double()) {
71 double v = value.as_double();
72 add_metric(builder, v, name, description, mtype, labels);
73 }
74}
75
76std::string boost_string_to_std(boost::json::string js) {
77 std::string res(js.data());
78 return res;
79}
80
// Wraps `value` in double quotes so it can be used as a Prometheus label
// value.
//
// Backslash, double quote and line feed are escaped as `\\`, `\"` and `\n`:
// the Prometheus text exposition format requires this, and an unescaped
// quote would otherwise terminate the label value early and corrupt the line.
// Values without those characters are returned exactly as before.
std::string quote(std::string value) {
  std::string quoted;
  quoted.reserve(value.size() + 2);
  quoted.push_back('"');
  for (const char ch : value) {
    switch (ch) {
    case '\\':
      quoted += "\\\\";
      break;
    case '"':
      quoted += "\\\"";
      break;
    case '\n':
      quoted += "\\n";
      break;
    default:
      quoted.push_back(ch);
      break;
    }
  }
  quoted.push_back('"');
  return quoted;
}
82
2a845540
TL
83void DaemonMetricCollector::dump_asok_metrics() {
84 BlockTimer timer(__FILE__, __FUNCTION__);
85
86 std::vector<std::pair<std::string, int>> daemon_pids;
87
39ae355f 88 int failures = 0;
2a845540
TL
89 bool sort = g_conf().get_val<bool>("exporter_sort_metrics");
90 if (sort) {
39ae355f
TL
91 builder =
92 std::unique_ptr<OrderedMetricsBuilder>(new OrderedMetricsBuilder());
2a845540 93 } else {
39ae355f
TL
94 builder =
95 std::unique_ptr<UnorderedMetricsBuilder>(new UnorderedMetricsBuilder());
2a845540 96 }
1e59de90 97 auto prio_limit = g_conf().get_val<int64_t>("exporter_prio_limit");
2a845540
TL
98 for (auto &[daemon_name, sock_client] : clients) {
99 bool ok;
100 sock_client.ping(&ok);
101 if (!ok) {
39ae355f 102 failures++;
2a845540
TL
103 continue;
104 }
1e59de90
TL
105 std::string counter_dump_response =
106 asok_request(sock_client, "counter dump", daemon_name);
107 if (counter_dump_response.size() == 0) {
108 failures++;
109 continue;
2a845540 110 }
1e59de90
TL
111 std::string counter_schema_response =
112 asok_request(sock_client, "counter schema", daemon_name);
113 if (counter_schema_response.size() == 0) {
39ae355f
TL
114 failures++;
115 continue;
116 }
1e59de90
TL
117
118 json_object counter_dump = boost::json::parse(counter_dump_response).as_object();
119 json_object counter_schema = boost::json::parse(counter_schema_response).as_object();
120
121 for (auto &perf_group_item : counter_schema) {
122 std::string perf_group = {perf_group_item.key().begin(),
123 perf_group_item.key().end()};
124 json_object perf_group_object = perf_group_item.value().as_object();
125 auto counters = perf_group_object["counters"].as_object();
126 auto counters_labels = perf_group_object["labels"].as_object();
127 auto counters_values =
128 counter_dump[perf_group].as_object()["counters"].as_object();
129 labels_t labels;
130
131 for(auto &label: counters_labels) {
132 std::string label_key = {label.key().begin(), label.key().end()};
133 labels[label_key] = quote(label.value().as_string().c_str());
134 }
135 for (auto &counter : counters) {
136 json_object counter_group = counter.value().as_object();
137 if (counter_group["priority"].as_int64() < prio_limit) {
138 continue;
139 }
140 std::string counter_name_init = {counter.key().begin(), counter.key().end()};
141 std::string counter_name = perf_group + "_" + counter_name_init;
142 promethize(counter_name);
143
144 if (counters_labels.empty()) {
145 auto labels_and_name = get_labels_and_metric_name(daemon_name, counter_name);
146 labels = labels_and_name.first;
147 counter_name = labels_and_name.second;
148 }
149 // For now this is only required for rgw multi-site metrics
150 auto multisite_labels_and_name = add_fixed_name_metrics(counter_name);
151 if (!multisite_labels_and_name.first.empty()) {
152 labels.insert(multisite_labels_and_name.first.begin(), multisite_labels_and_name.first.end());
153 counter_name = multisite_labels_and_name.second;
154 }
155 labels.insert({"ceph_daemon", quote(daemon_name)});
156 auto perf_values = counters_values.at(counter_name_init);
157 dump_asok_metric(counter_group, perf_values, counter_name, labels);
158 }
159 }
39ae355f
TL
160 std::string config_show =
161 asok_request(sock_client, "config show", daemon_name);
162 if (config_show.size() == 0) {
163 failures++;
2a845540
TL
164 continue;
165 }
2a845540
TL
166 json_object pid_file_json = boost::json::parse(config_show).as_object();
167 std::string pid_path =
39ae355f 168 boost_string_to_std(pid_file_json["pid_file"].as_string());
2a845540
TL
169 std::string pid_str = read_file_to_string(pid_path);
170 if (!pid_path.size()) {
39ae355f
TL
171 dout(1) << "pid path is empty; process metrics won't be fetched for: "
172 << daemon_name << dendl;
173 }
174 if (!pid_str.empty()) {
175 daemon_pids.push_back({daemon_name, std::stoi(pid_str)});
2a845540 176 }
2a845540 177 }
39ae355f
TL
178 dout(10) << "Perf counters retrieved for " << clients.size() - failures << "/"
179 << clients.size() << " daemons." << dendl;
2a845540
TL
180 // get time spent on this function
181 timer.stop();
39ae355f
TL
182 std::string scrap_desc(
183 "Time spent scraping and transforming perf counters to metrics");
2a845540
TL
184 labels_t scrap_labels;
185 scrap_labels["host"] = quote(ceph_get_hostname());
186 scrap_labels["function"] = quote(__FUNCTION__);
187 add_metric(builder, timer.get_ms(), "ceph_exporter_scrape_time", scrap_desc,
188 "gauge", scrap_labels);
189
190 const std::lock_guard<std::mutex> lock(metrics_mutex);
39ae355f
TL
191 // only get metrics if there's pid path for some or all daemons isn't empty
192 if (daemon_pids.size() != 0) {
193 get_process_metrics(daemon_pids);
194 }
2a845540
TL
195 metrics = builder->dump();
196}
197
198std::vector<std::string> read_proc_stat_file(std::string path) {
199 std::string stat = read_file_to_string(path);
200 std::vector<std::string> strings;
201 auto parts = ceph::split(stat);
202 strings.assign(parts.begin(), parts.end());
203 return strings;
204}
205
206struct pstat read_pid_stat(int pid) {
207 std::string stat_path("/proc/" + std::to_string(pid) + "/stat");
208 std::vector<std::string> stats = read_proc_stat_file(stat_path);
209 struct pstat stat;
210 stat.minflt = std::stoul(stats[9]);
211 stat.majflt = std::stoul(stats[11]);
212 stat.utime = std::stoul(stats[13]);
213 stat.stime = std::stoul(stats[14]);
214 stat.num_threads = std::stoul(stats[19]);
215 stat.start_time = std::stoul(stats[21]);
216 stat.vm_size = std::stoul(stats[22]);
217 stat.resident_size = std::stoi(stats[23]);
218 return stat;
219}
220
39ae355f
TL
221void DaemonMetricCollector::get_process_metrics(
222 std::vector<std::pair<std::string, int>> daemon_pids) {
2a845540
TL
223 std::string path("/proc");
224 std::stringstream ss;
225 for (auto &[daemon_name, pid] : daemon_pids) {
226 std::vector<std::string> uptimes = read_proc_stat_file("/proc/uptime");
227 struct pstat stat = read_pid_stat(pid);
228 int clk_tck = sysconf(_SC_CLK_TCK);
229 double start_time_seconds = stat.start_time / (double)clk_tck;
230 double user_time = stat.utime / (double)clk_tck;
231 double kernel_time = stat.stime / (double)clk_tck;
232 double total_time_seconds = user_time + kernel_time;
233 double uptime = std::stod(uptimes[0]);
234 double elapsed_time = uptime - start_time_seconds;
39ae355f 235 double idle_time = elapsed_time - total_time_seconds;
2a845540
TL
236 double usage = total_time_seconds * 100 / elapsed_time;
237
238 labels_t labels;
239 labels["ceph_daemon"] = quote(daemon_name);
240 add_metric(builder, stat.minflt, "ceph_exporter_minflt_total",
241 "Number of minor page faults of daemon", "counter", labels);
242 add_metric(builder, stat.majflt, "ceph_exporter_majflt_total",
243 "Number of major page faults of daemon", "counter", labels);
244 add_metric(builder, stat.num_threads, "ceph_exporter_num_threads",
245 "Number of threads used by daemon", "gauge", labels);
39ae355f
TL
246 add_metric(builder, usage, "ceph_exporter_cpu_usage",
247 "CPU usage of a daemon", "gauge", labels);
2a845540
TL
248
249 std::string cpu_time_desc = "Process time in kernel/user/idle mode";
250 labels_t cpu_total_labels;
251 cpu_total_labels["ceph_daemon"] = quote(daemon_name);
252 cpu_total_labels["mode"] = quote("kernel");
253 add_metric(builder, kernel_time, "ceph_exporter_cpu_total", cpu_time_desc,
254 "counter", cpu_total_labels);
255 cpu_total_labels["mode"] = quote("user");
256 add_metric(builder, user_time, "ceph_exporter_cpu_total", cpu_time_desc,
257 "counter", cpu_total_labels);
258 cpu_total_labels["mode"] = quote("idle");
259 add_metric(builder, idle_time, "ceph_exporter_cpu_total", cpu_time_desc,
260 "counter", cpu_total_labels);
39ae355f
TL
261 add_metric(builder, stat.vm_size, "ceph_exporter_vm_size",
262 "Virtual memory used in a daemon", "gauge", labels);
2a845540
TL
263 add_metric(builder, stat.resident_size, "ceph_exporter_resident_size",
264 "Resident memory in a daemon", "gauge", labels);
265 }
266}
267
268std::string DaemonMetricCollector::asok_request(AdminSocketClient &asok,
39ae355f
TL
269 std::string command,
270 std::string daemon_name) {
2a845540
TL
271 std::string request("{\"prefix\": \"" + command + "\"}");
272 std::string response;
273 std::string err = asok.do_request(request, &response);
274 if (err.length() > 0 || response.substr(0, 5) == "ERROR") {
39ae355f
TL
275 dout(1) << "command " << command << "failed for daemon " << daemon_name
276 << "with error: " << err << dendl;
2a845540
TL
277 return "";
278 }
279 return response;
280}
281
282std::pair<labels_t, std::string>
283DaemonMetricCollector::get_labels_and_metric_name(std::string daemon_name,
284 std::string metric_name) {
285 std::string new_metric_name;
286 labels_t labels;
287 new_metric_name = metric_name;
1e59de90
TL
288 // In vstart cluster socket files for rgw are stored as radosgw.<instance_id>.asok
289 if (daemon_name.find("radosgw") != std::string::npos) {
290 std::size_t pos = daemon_name.find_last_of('.');
291 std::string tmp = daemon_name.substr(pos+1);
292 labels["instance_id"] = quote(tmp);
293 }
294 else if (daemon_name.find("rgw") != std::string::npos) {
2a845540
TL
295 std::string tmp = daemon_name.substr(16, std::string::npos);
296 std::string::size_type pos = tmp.find('.');
297 labels["instance_id"] = quote("rgw." + tmp.substr(0, pos));
1e59de90
TL
298 }
299 else if (daemon_name.find("rbd-mirror") != std::string::npos) {
300 std::regex re(
301 "^rbd_mirror_image_([^/]+)/(?:(?:([^/]+)/"
302 ")?)(.*)\\.(replay(?:_bytes|_latency)?)$");
303 std::smatch match;
304 if (std::regex_search(daemon_name, match, re) == true) {
305 new_metric_name = "ceph_rbd_mirror_image_" + match.str(4);
306 labels["pool"] = quote(match.str(1));
307 labels["namespace"] = quote(match.str(2));
308 labels["image"] = quote(match.str(3));
2a845540
TL
309 }
310 }
311 return {labels, new_metric_name};
312}
313
1e59de90
TL
314// Add fixed name metrics from existing ones that have details in their names
315// that should be in labels (not in name). For backward compatibility,
316// a new fixed name metric is created (instead of replacing)and details are put
317// in new labels. Intended for RGW sync perf. counters but extendable as required.
318// See: https://tracker.ceph.com/issues/45311
319std::pair<labels_t, std::string>
320DaemonMetricCollector::add_fixed_name_metrics(std::string metric_name) {
321 std::string new_metric_name;
322 labels_t labels;
323 new_metric_name = metric_name;
324
325 std::regex re("^data_sync_from_(.*)\\.");
326 std::smatch match;
327 if (std::regex_search(metric_name, match, re) == true) {
328 new_metric_name = std::regex_replace(metric_name, re, "from_([^.]*)', 'from_zone");
329 labels["source_zone"] = quote(match.str(1));
330 return {labels, new_metric_name};
331 }
332 return {};
333}
334
2a845540
TL
335/*
336perf_values can be either a int/double or a json_object. Since
337 json_value is a wrapper of both we use that class.
338 */
339void DaemonMetricCollector::dump_asok_metric(json_object perf_info,
340 json_value perf_values,
341 std::string name,
342 labels_t labels) {
343 int64_t type = perf_info["type"].as_int64();
344 std::string metric_type =
39ae355f 345 boost_string_to_std(perf_info["metric_type"].as_string());
2a845540 346 std::string description =
39ae355f 347 boost_string_to_std(perf_info["description"].as_string());
2a845540
TL
348
349 if (type & PERFCOUNTER_LONGRUNAVG) {
350 int64_t count = perf_values.as_object()["avgcount"].as_int64();
351 add_metric(builder, count, name + "_count", description, metric_type,
352 labels);
353 json_value sum_value = perf_values.as_object()["sum"];
354 add_double_or_int_metric(builder, sum_value, name + "_sum", description,
355 metric_type, labels);
356 } else if (type & PERFCOUNTER_TIME) {
357 if (perf_values.is_int64()) {
358 double value = perf_values.as_int64() / 1000000000.0f;
359 add_metric(builder, value, name, description, metric_type, labels);
360 } else if (perf_values.is_double()) {
361 double value = perf_values.as_double() / 1000000000.0f;
362 add_metric(builder, value, name, description, metric_type, labels);
363 }
364 } else {
365 add_double_or_int_metric(builder, perf_values, name, description,
366 metric_type, labels);
367 }
368}
369
370void DaemonMetricCollector::update_sockets() {
371 std::string sock_dir = g_conf().get_val<std::string>("exporter_sock_dir");
372 clients.clear();
373 std::filesystem::path sock_path = sock_dir;
39ae355f 374 if (!std::filesystem::is_directory(sock_path.parent_path())) {
2a845540
TL
375 dout(1) << "ERROR: No such directory exist" << sock_dir << dendl;
376 return;
377 }
39ae355f 378 for (const auto &entry : std::filesystem::directory_iterator(sock_dir)) {
2a845540
TL
379 if (entry.path().extension() == ".asok") {
380 std::string daemon_socket_name = entry.path().filename().string();
381 std::string daemon_name =
39ae355f 382 daemon_socket_name.substr(0, daemon_socket_name.size() - 5);
2a845540
TL
383 if (clients.find(daemon_name) == clients.end() &&
384 !(daemon_name.find("mgr") != std::string::npos) &&
385 !(daemon_name.find("ceph-exporter") != std::string::npos)) {
386 AdminSocketClient sock(entry.path().string());
387 clients.insert({daemon_name, std::move(sock)});
388 }
389 }
390 }
391}
392
393void OrderedMetricsBuilder::add(std::string value, std::string name,
394 std::string description, std::string mtype,
395 labels_t labels) {
2a845540
TL
396 if (metrics.find(name) == metrics.end()) {
397 Metric metric(name, mtype, description);
398 metrics[name] = std::move(metric);
399 }
400 Metric &metric = metrics[name];
401 metric.add(labels, value);
402}
403
404std::string OrderedMetricsBuilder::dump() {
405 for (auto &[name, metric] : metrics) {
406 out += metric.dump() + "\n";
407 }
408 return out;
409}
410
411void UnorderedMetricsBuilder::add(std::string value, std::string name,
412 std::string description, std::string mtype,
413 labels_t labels) {
2a845540
TL
414 Metric metric(name, mtype, description);
415 metric.add(labels, value);
416 out += metric.dump() + "\n\n";
417}
418
419std::string UnorderedMetricsBuilder::dump() { return out; }
420
421void Metric::add(labels_t labels, std::string value) {
422 metric_entry entry;
423 entry.labels = labels;
424 entry.value = value;
425 entries.push_back(entry);
426}
427
428std::string Metric::dump() {
429 std::stringstream metric_ss;
430 metric_ss << "# HELP " << name << " " << description << "\n";
431 metric_ss << "# TYPE " << name << " " << mtype << "\n";
432 for (auto &entry : entries) {
433 std::stringstream labels_ss;
434 size_t i = 0;
435 for (auto &[label_name, label_value] : entry.labels) {
436 labels_ss << label_name << "=" << label_value;
437 if (i < entry.labels.size() - 1) {
438 labels_ss << ",";
439 }
440 i++;
441 }
442 metric_ss << name << "{" << labels_ss.str() << "} " << entry.value;
443 if (&entry != &entries.back()) {
444 metric_ss << "\n";
445 }
446 }
447 return metric_ss.str();
448}
449
450DaemonMetricCollector &collector_instance() {
451 static DaemonMetricCollector instance;
452 return instance;
453}