]>
Commit | Line | Data |
---|---|---|
9f95a23c TL |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | ||
4 | #include "osd_perf_counters.h" | |
5 | #include "include/common_fwd.h" | |
6 | ||
7 | ||
8 | PerfCounters *build_osd_logger(CephContext *cct) { | |
9 | PerfCountersBuilder osd_plb(cct, "osd", l_osd_first, l_osd_last); | |
10 | ||
11 | // Latency axis configuration for op histograms, values are in nanoseconds | |
12 | PerfHistogramCommon::axis_config_d op_hist_x_axis_config{ | |
13 | "Latency (usec)", | |
14 | PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale | |
15 | 0, ///< Start at 0 | |
16 | 100000, ///< Quantization unit is 100usec | |
17 | 32, ///< Enough to cover much longer than slow requests | |
18 | }; | |
19 | ||
20 | // Op size axis configuration for op histograms, values are in bytes | |
21 | PerfHistogramCommon::axis_config_d op_hist_y_axis_config{ | |
22 | "Request size (bytes)", | |
23 | PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale | |
24 | 0, ///< Start at 0 | |
25 | 512, ///< Quantization unit is 512 bytes | |
26 | 32, ///< Enough to cover requests larger than GB | |
27 | }; | |
28 | ||
29 | ||
30 | // All the basic OSD operation stats are to be considered useful | |
31 | osd_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL); | |
32 | ||
33 | osd_plb.add_u64( | |
34 | l_osd_op_wip, "op_wip", | |
35 | "Replication operations currently being processed (primary)"); | |
36 | osd_plb.add_u64_counter( | |
37 | l_osd_op, "op", | |
38 | "Client operations", | |
39 | "ops", PerfCountersBuilder::PRIO_CRITICAL); | |
40 | osd_plb.add_u64_counter( | |
41 | l_osd_op_inb, "op_in_bytes", | |
42 | "Client operations total write size", | |
43 | "wr", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES)); | |
44 | osd_plb.add_u64_counter( | |
45 | l_osd_op_outb, "op_out_bytes", | |
46 | "Client operations total read size", | |
47 | "rd", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES)); | |
48 | osd_plb.add_time_avg( | |
49 | l_osd_op_lat, "op_latency", | |
50 | "Latency of client operations (including queue time)", | |
51 | "l", 9); | |
52 | osd_plb.add_time_avg( | |
53 | l_osd_op_process_lat, "op_process_latency", | |
54 | "Latency of client operations (excluding queue time)"); | |
55 | osd_plb.add_time_avg( | |
56 | l_osd_op_prepare_lat, "op_prepare_latency", | |
57 | "Latency of client operations (excluding queue time and wait for finished)"); | |
58 | ||
59 | osd_plb.add_u64_counter( | |
60 | l_osd_op_r, "op_r", "Client read operations"); | |
61 | osd_plb.add_u64_counter( | |
62 | l_osd_op_r_outb, "op_r_out_bytes", "Client data read", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); | |
63 | osd_plb.add_time_avg( | |
64 | l_osd_op_r_lat, "op_r_latency", | |
65 | "Latency of read operation (including queue time)"); | |
66 | osd_plb.add_u64_counter_histogram( | |
67 | l_osd_op_r_lat_outb_hist, "op_r_latency_out_bytes_histogram", | |
68 | op_hist_x_axis_config, op_hist_y_axis_config, | |
69 | "Histogram of operation latency (including queue time) + data read"); | |
70 | osd_plb.add_time_avg( | |
71 | l_osd_op_r_process_lat, "op_r_process_latency", | |
72 | "Latency of read operation (excluding queue time)"); | |
73 | osd_plb.add_time_avg( | |
74 | l_osd_op_r_prepare_lat, "op_r_prepare_latency", | |
75 | "Latency of read operations (excluding queue time and wait for finished)"); | |
76 | osd_plb.add_u64_counter( | |
77 | l_osd_op_w, "op_w", "Client write operations"); | |
78 | osd_plb.add_u64_counter( | |
79 | l_osd_op_w_inb, "op_w_in_bytes", "Client data written"); | |
80 | osd_plb.add_time_avg( | |
81 | l_osd_op_w_lat, "op_w_latency", | |
82 | "Latency of write operation (including queue time)"); | |
83 | osd_plb.add_u64_counter_histogram( | |
84 | l_osd_op_w_lat_inb_hist, "op_w_latency_in_bytes_histogram", | |
85 | op_hist_x_axis_config, op_hist_y_axis_config, | |
86 | "Histogram of operation latency (including queue time) + data written"); | |
87 | osd_plb.add_time_avg( | |
88 | l_osd_op_w_process_lat, "op_w_process_latency", | |
89 | "Latency of write operation (excluding queue time)"); | |
90 | osd_plb.add_time_avg( | |
91 | l_osd_op_w_prepare_lat, "op_w_prepare_latency", | |
92 | "Latency of write operations (excluding queue time and wait for finished)"); | |
93 | osd_plb.add_u64_counter( | |
94 | l_osd_op_rw, "op_rw", | |
95 | "Client read-modify-write operations"); | |
96 | osd_plb.add_u64_counter( | |
97 | l_osd_op_rw_inb, "op_rw_in_bytes", | |
98 | "Client read-modify-write operations write in", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); | |
99 | osd_plb.add_u64_counter( | |
100 | l_osd_op_rw_outb,"op_rw_out_bytes", | |
101 | "Client read-modify-write operations read out ", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); | |
102 | osd_plb.add_time_avg( | |
103 | l_osd_op_rw_lat, "op_rw_latency", | |
104 | "Latency of read-modify-write operation (including queue time)"); | |
105 | osd_plb.add_u64_counter_histogram( | |
106 | l_osd_op_rw_lat_inb_hist, "op_rw_latency_in_bytes_histogram", | |
107 | op_hist_x_axis_config, op_hist_y_axis_config, | |
108 | "Histogram of rw operation latency (including queue time) + data written"); | |
109 | osd_plb.add_u64_counter_histogram( | |
110 | l_osd_op_rw_lat_outb_hist, "op_rw_latency_out_bytes_histogram", | |
111 | op_hist_x_axis_config, op_hist_y_axis_config, | |
112 | "Histogram of rw operation latency (including queue time) + data read"); | |
113 | osd_plb.add_time_avg( | |
114 | l_osd_op_rw_process_lat, "op_rw_process_latency", | |
115 | "Latency of read-modify-write operation (excluding queue time)"); | |
116 | osd_plb.add_time_avg( | |
117 | l_osd_op_rw_prepare_lat, "op_rw_prepare_latency", | |
118 | "Latency of read-modify-write operations (excluding queue time and wait for finished)"); | |
119 | ||
120 | // Now we move on to some more obscure stats, revert to assuming things | |
121 | // are low priority unless otherwise specified. | |
122 | osd_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY); | |
123 | ||
124 | osd_plb.add_time_avg(l_osd_op_before_queue_op_lat, "op_before_queue_op_lat", | |
125 | "Latency of IO before calling queue(before really queue into ShardedOpWq)"); // client io before queue op_wq latency | |
126 | osd_plb.add_time_avg(l_osd_op_before_dequeue_op_lat, "op_before_dequeue_op_lat", | |
127 | "Latency of IO before calling dequeue_op(already dequeued and get PG lock)"); // client io before dequeue_op latency | |
128 | ||
129 | osd_plb.add_u64_counter( | |
130 | l_osd_sop, "subop", "Suboperations"); | |
131 | osd_plb.add_u64_counter( | |
132 | l_osd_sop_inb, "subop_in_bytes", "Suboperations total size", NULL, 0, unit_t(UNIT_BYTES)); | |
133 | osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency", "Suboperations latency"); | |
134 | ||
135 | osd_plb.add_u64_counter(l_osd_sop_w, "subop_w", "Replicated writes"); | |
136 | osd_plb.add_u64_counter( | |
137 | l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size", NULL, 0, unit_t(UNIT_BYTES)); | |
138 | osd_plb.add_time_avg( | |
139 | l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency"); | |
140 | osd_plb.add_u64_counter( | |
141 | l_osd_sop_pull, "subop_pull", "Suboperations pull requests"); | |
142 | osd_plb.add_time_avg( | |
143 | l_osd_sop_pull_lat, "subop_pull_latency", "Suboperations pull latency"); | |
144 | osd_plb.add_u64_counter( | |
145 | l_osd_sop_push, "subop_push", "Suboperations push messages"); | |
146 | osd_plb.add_u64_counter( | |
147 | l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size", NULL, 0, unit_t(UNIT_BYTES)); | |
148 | osd_plb.add_time_avg( | |
149 | l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency"); | |
150 | ||
151 | osd_plb.add_u64_counter(l_osd_pull, "pull", "Pull requests sent"); | |
152 | osd_plb.add_u64_counter(l_osd_push, "push", "Push messages sent"); | |
153 | osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size", NULL, 0, unit_t(UNIT_BYTES)); | |
154 | ||
155 | osd_plb.add_u64_counter( | |
156 | l_osd_rop, "recovery_ops", | |
157 | "Started recovery operations", | |
158 | "rop", PerfCountersBuilder::PRIO_INTERESTING); | |
159 | ||
160 | osd_plb.add_u64_counter( | |
161 | l_osd_rbytes, "recovery_bytes", | |
162 | "recovery bytes", | |
163 | "rbt", PerfCountersBuilder::PRIO_INTERESTING); | |
164 | ||
165 | osd_plb.add_u64(l_osd_loadavg, "loadavg", "CPU load"); | |
166 | osd_plb.add_u64( | |
167 | l_osd_cached_crc, "cached_crc", "Total number getting crc from crc_cache"); | |
168 | osd_plb.add_u64( | |
169 | l_osd_cached_crc_adjusted, "cached_crc_adjusted", | |
170 | "Total number getting crc from crc_cache with adjusting"); | |
171 | osd_plb.add_u64(l_osd_missed_crc, "missed_crc", | |
172 | "Total number of crc cache misses"); | |
173 | ||
174 | osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups", | |
175 | "pgs", PerfCountersBuilder::PRIO_USEFUL); | |
176 | osd_plb.add_u64( | |
177 | l_osd_pg_primary, "numpg_primary", | |
178 | "Placement groups for which this osd is primary"); | |
179 | osd_plb.add_u64( | |
180 | l_osd_pg_replica, "numpg_replica", | |
181 | "Placement groups for which this osd is replica"); | |
182 | osd_plb.add_u64( | |
183 | l_osd_pg_stray, "numpg_stray", | |
184 | "Placement groups ready to be deleted from this osd"); | |
185 | osd_plb.add_u64( | |
186 | l_osd_pg_removing, "numpg_removing", | |
187 | "Placement groups queued for local deletion", "pgsr", | |
188 | PerfCountersBuilder::PRIO_USEFUL); | |
189 | osd_plb.add_u64( | |
190 | l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to"); | |
191 | osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages"); | |
192 | osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs"); | |
193 | osd_plb.add_u64_counter( | |
194 | l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates"); | |
195 | osd_plb.add_u64_counter( | |
196 | l_osd_waiting_for_map, "messages_delayed_for_map", | |
197 | "Operations waiting for OSD map"); | |
198 | ||
199 | osd_plb.add_u64_counter( | |
200 | l_osd_map_cache_hit, "osd_map_cache_hit", "osdmap cache hit"); | |
201 | osd_plb.add_u64_counter( | |
202 | l_osd_map_cache_miss, "osd_map_cache_miss", "osdmap cache miss"); | |
203 | osd_plb.add_u64_counter( | |
204 | l_osd_map_cache_miss_low, "osd_map_cache_miss_low", | |
205 | "osdmap cache miss below cache lower bound"); | |
206 | osd_plb.add_u64_avg( | |
207 | l_osd_map_cache_miss_low_avg, "osd_map_cache_miss_low_avg", | |
208 | "osdmap cache miss, avg distance below cache lower bound"); | |
209 | osd_plb.add_u64_counter( | |
210 | l_osd_map_bl_cache_hit, "osd_map_bl_cache_hit", | |
211 | "OSDMap buffer cache hits"); | |
212 | osd_plb.add_u64_counter( | |
213 | l_osd_map_bl_cache_miss, "osd_map_bl_cache_miss", | |
214 | "OSDMap buffer cache misses"); | |
215 | ||
216 | osd_plb.add_u64( | |
217 | l_osd_stat_bytes, "stat_bytes", "OSD size", "size", | |
218 | PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); | |
219 | osd_plb.add_u64( | |
220 | l_osd_stat_bytes_used, "stat_bytes_used", "Used space", "used", | |
221 | PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); | |
222 | osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space", NULL, 0, unit_t(UNIT_BYTES)); | |
223 | ||
224 | osd_plb.add_u64_counter( | |
225 | l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations"); | |
226 | ||
227 | osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote", "Tier promotions"); | |
228 | osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush", "Tier flushes"); | |
229 | osd_plb.add_u64_counter( | |
230 | l_osd_tier_flush_fail, "tier_flush_fail", "Failed tier flushes"); | |
231 | osd_plb.add_u64_counter( | |
232 | l_osd_tier_try_flush, "tier_try_flush", "Tier flush attempts"); | |
233 | osd_plb.add_u64_counter( | |
234 | l_osd_tier_try_flush_fail, "tier_try_flush_fail", | |
235 | "Failed tier flush attempts"); | |
236 | osd_plb.add_u64_counter( | |
237 | l_osd_tier_evict, "tier_evict", "Tier evictions"); | |
238 | osd_plb.add_u64_counter( | |
239 | l_osd_tier_whiteout, "tier_whiteout", "Tier whiteouts"); | |
240 | osd_plb.add_u64_counter( | |
241 | l_osd_tier_dirty, "tier_dirty", "Dirty tier flag set"); | |
242 | osd_plb.add_u64_counter( | |
243 | l_osd_tier_clean, "tier_clean", "Dirty tier flag cleaned"); | |
244 | osd_plb.add_u64_counter( | |
245 | l_osd_tier_delay, "tier_delay", "Tier delays (agent waiting)"); | |
246 | osd_plb.add_u64_counter( | |
247 | l_osd_tier_proxy_read, "tier_proxy_read", "Tier proxy reads"); | |
248 | osd_plb.add_u64_counter( | |
249 | l_osd_tier_proxy_write, "tier_proxy_write", "Tier proxy writes"); | |
250 | ||
251 | osd_plb.add_u64_counter( | |
252 | l_osd_agent_wake, "agent_wake", "Tiering agent wake up"); | |
253 | osd_plb.add_u64_counter( | |
254 | l_osd_agent_skip, "agent_skip", "Objects skipped by agent"); | |
255 | osd_plb.add_u64_counter( | |
256 | l_osd_agent_flush, "agent_flush", "Tiering agent flushes"); | |
257 | osd_plb.add_u64_counter( | |
258 | l_osd_agent_evict, "agent_evict", "Tiering agent evictions"); | |
259 | ||
260 | osd_plb.add_u64_counter( | |
261 | l_osd_object_ctx_cache_hit, "object_ctx_cache_hit", "Object context cache hits"); | |
262 | osd_plb.add_u64_counter( | |
263 | l_osd_object_ctx_cache_total, "object_ctx_cache_total", "Object context cache lookups"); | |
264 | ||
265 | osd_plb.add_u64_counter(l_osd_op_cache_hit, "op_cache_hit"); | |
266 | osd_plb.add_time_avg( | |
267 | l_osd_tier_flush_lat, "osd_tier_flush_lat", "Object flush latency"); | |
268 | osd_plb.add_time_avg( | |
269 | l_osd_tier_promote_lat, "osd_tier_promote_lat", "Object promote latency"); | |
270 | osd_plb.add_time_avg( | |
271 | l_osd_tier_r_lat, "osd_tier_r_lat", "Object proxy read latency"); | |
272 | ||
273 | osd_plb.add_u64_counter( | |
274 | l_osd_pg_info, "osd_pg_info", "PG updated its info (using any method)"); | |
275 | osd_plb.add_u64_counter( | |
276 | l_osd_pg_fastinfo, "osd_pg_fastinfo", | |
277 | "PG updated its info using fastinfo attr"); | |
278 | osd_plb.add_u64_counter( | |
279 | l_osd_pg_biginfo, "osd_pg_biginfo", "PG updated its biginfo attr"); | |
280 | ||
281 | return osd_plb.create_perf_counters(); | |
282 | } | |
283 | ||
284 | ||
285 | PerfCounters *build_recoverystate_perf(CephContext *cct) { | |
286 | PerfCountersBuilder rs_perf(cct, "recoverystate_perf", rs_first, rs_last); | |
287 | ||
288 | rs_perf.add_time_avg(rs_initial_latency, "initial_latency", "Initial recovery state latency"); | |
289 | rs_perf.add_time_avg(rs_started_latency, "started_latency", "Started recovery state latency"); | |
290 | rs_perf.add_time_avg(rs_reset_latency, "reset_latency", "Reset recovery state latency"); | |
291 | rs_perf.add_time_avg(rs_start_latency, "start_latency", "Start recovery state latency"); | |
292 | rs_perf.add_time_avg(rs_primary_latency, "primary_latency", "Primary recovery state latency"); | |
293 | rs_perf.add_time_avg(rs_peering_latency, "peering_latency", "Peering recovery state latency"); | |
294 | rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency", "Backfilling recovery state latency"); | |
295 | rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency"); | |
296 | rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency"); | |
297 | rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency", "Notbackfilling recovery state latency"); | |
298 | rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency", "Repnotrecovering recovery state latency"); | |
299 | rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency"); | |
300 | rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency"); | |
301 | rs_perf.add_time_avg(rs_reprecovering_latency, "reprecovering_latency", "RepRecovering recovery state latency"); | |
302 | rs_perf.add_time_avg(rs_activating_latency, "activating_latency", "Activating recovery state latency"); | |
303 | rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency"); | |
304 | rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency"); | |
305 | rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency", "Recovering recovery state latency"); | |
306 | rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency", "Recovered recovery state latency"); | |
307 | rs_perf.add_time_avg(rs_clean_latency, "clean_latency", "Clean recovery state latency"); | |
308 | rs_perf.add_time_avg(rs_active_latency, "active_latency", "Active recovery state latency"); | |
309 | rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency", "Replicaactive recovery state latency"); | |
310 | rs_perf.add_time_avg(rs_stray_latency, "stray_latency", "Stray recovery state latency"); | |
311 | rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency", "Getinfo recovery state latency"); | |
312 | rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency", "Getlog recovery state latency"); | |
313 | rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency", "Waitactingchange recovery state latency"); | |
314 | rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency", "Incomplete recovery state latency"); | |
315 | rs_perf.add_time_avg(rs_down_latency, "down_latency", "Down recovery state latency"); | |
316 | rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency"); | |
317 | rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency"); | |
318 | rs_perf.add_time_avg(rs_notrecovering_latency, "notrecovering_latency", "Notrecovering recovery state latency"); | |
319 | ||
320 | return rs_perf.create_perf_counters(); | |
321 | } |