1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
16 * Placement Group Map. Placement Groups are logical sets of objects
17 * that are replicated by the same set of devices. pgid=(r,hash(o)&m)
18 * where & is a bit-wise AND and m=2^k-1
24 #include "include/health.h"
25 #include "common/debug.h"
26 #include "common/TextTable.h"
27 #include "osd/osd_types.h"
28 #include "include/mempool.h"
29 #include "mon/health_check.h"
// Forward declaration of ceph::Formatter. The declarations visible in this
// listing only name the type through pointers (ceph::Formatter *f).
32 namespace ceph
{ class Formatter
; }
36 MEMPOOL_CLASS_HELPERS();
37 virtual ~PGMapDigest() {}
39 mempool::pgmap::vector
<uint64_t> osd_last_seq
;
41 mutable std::map
<int, int64_t> avail_space_by_rule
;
43 // aggregate state, populated by PGMap child
44 int64_t num_pg
= 0, num_osd
= 0;
45 int64_t num_pg_active
= 0;
46 int64_t num_pg_unknown
= 0;
47 mempool::pgmap::unordered_map
<int32_t,pool_stat_t
> pg_pool_sum
;
48 mempool::pgmap::map
<int64_t,int64_t> num_pg_by_pool
;
51 mempool::pgmap::map
<std::string
,osd_stat_t
> osd_sum_by_class
;
52 mempool::pgmap::unordered_map
<uint64_t,int32_t> num_pg_by_state
;
55 int32_t up_not_acting
= 0;
57 void encode(ceph::buffer::list
& bl
) const {
60 encode(up_not_acting
, bl
);
63 void decode(ceph::buffer::list::const_iterator
& p
) {
66 decode(up_not_acting
, p
);
70 mempool::pgmap::unordered_map
<int32_t,pg_count
> num_pg_by_osd
;
72 mempool::pgmap::map
<int64_t,interval_set
<snapid_t
>> purged_snaps
;
74 bool use_per_pool_stats() const {
75 return osd_sum
.num_osds
== osd_sum
.num_per_pool_osds
;
77 bool use_per_pool_omap_stats() const {
78 return osd_sum
.num_osds
== osd_sum
.num_per_pool_omap_osds
;
81 // recent deltas, and summation
83 * keep track of last deltas for each pool, calculated using
84 * @p pg_pool_sum as baseline.
86 mempool::pgmap::unordered_map
<int64_t, mempool::pgmap::list
<std::pair
<pool_stat_t
, utime_t
> > > per_pool_sum_deltas
;
88 * keep track of per-pool timestamp deltas, according to last update on
91 mempool::pgmap::unordered_map
<int64_t, utime_t
> per_pool_sum_deltas_stamps
;
93 * keep track of sum deltas, per-pool, taking into account any previous
94 * deltas existing in @p per_pool_sum_deltas. The utime_t as second member
95 * of the pair is the timestamp referring to the last update (i.e., the first
96 * member of the pair) for a given pool.
98 mempool::pgmap::unordered_map
<int64_t, std::pair
<pool_stat_t
,utime_t
> > per_pool_sum_delta
;
100 pool_stat_t pg_sum_delta
;
// Declaration only; the definition is not part of this listing.
// Const member taking four pointers to double.
// NOTE(review): the pointers are presumably out-parameters receiving the
// misplaced / degraded / inactive / unknown-PG ratios named by the
// parameters -- confirm against the definition (including null handling).
103 void get_recovery_stats(
104 double *misplaced_ratio
,
105 double *degraded_ratio
,
106 double *inactive_ratio
,
107 double *unknown_pgs_ratio
) const;
// Declaration only; the definition is not part of this listing.
// Const member taking a ceph::Formatter pointer and a std::ostream pointer.
// NOTE(review): presumably writes structured output through f when it is
// non-null and plain text to *out otherwise -- confirm against the definition.
109 void print_summary(ceph::Formatter
*f
, std::ostream
*out
) const;
// Declaration only; the definition is not part of this listing.
// Const member taking a ceph::Formatter pointer and a std::ostream pointer.
// The operator<< for PGMapDigest further down in this header calls it with a
// null formatter and the address of the target stream, so a null f must be
// accepted.
110 void print_oneline_summary(ceph::Formatter
*f
, std::ostream
*out
) const;
// Declaration only; the definition is not part of this listing.
// Const member taking a formatter pointer, a pointer to a list of strings,
// and the pool statistics to summarise by const reference.
// NOTE(review): presumably emits through f when non-null and otherwise
// appends human-readable lines to *psl -- confirm against the definition.
112 void recovery_summary(ceph::Formatter
*f
, std::list
<std::string
> *psl
,
113 const pool_stat_t
& pool_sum
) const;
// Declaration only; the definition is not part of this listing.
// Same f / psl parameters as recovery_summary(), without an explicit
// pool_stat_t argument.
// NOTE(review): presumably the cluster-wide variant that feeds the aggregate
// PG sum into recovery_summary() -- confirm against the definition.
114 void overall_recovery_summary(ceph::Formatter
*f
, std::list
<std::string
> *psl
) const;
// Declaration only; the definition is not part of this listing.
// Same f / psl parameters as recovery_summary(), plus a uint64_t pool id in
// place of an explicit pool_stat_t argument.
// NOTE(review): presumably the per-pool variant, looking up the statistics
// for poolid -- confirm against the definition (including unknown pool ids).
115 void pool_recovery_summary(ceph::Formatter
*f
, std::list
<std::string
> *psl
,
116 uint64_t poolid
) const;
// Declaration only; the definition is not part of this listing.
// Const member taking a formatter pointer, an output stream pointer, a
// pool_stat_t delta by const reference and a utime_t by value.
// The parameter list mirrors client_io_rate_summary() declared below.
// NOTE(review): presumably delta_sum is the statistics delta and delta_stamp
// the time span it covers, as described for client_io_rate_summary() --
// confirm against the definition.
117 void recovery_rate_summary(ceph::Formatter
*f
, std::ostream
*out
,
118 const pool_stat_t
& delta_sum
,
119 utime_t delta_stamp
) const;
// Declaration only; the definition is not part of this listing.
// Same f / out parameters as recovery_rate_summary(), without explicit
// delta arguments.
// NOTE(review): presumably the cluster-wide variant driven by the digest's
// own aggregate delta members -- confirm against the definition.
120 void overall_recovery_rate_summary(ceph::Formatter
*f
, std::ostream
*out
) const;
// Declaration only; the definition is not part of this listing.
// Same f / out parameters as recovery_rate_summary(), plus a uint64_t pool
// id in place of explicit delta arguments.
// NOTE(review): presumably the per-pool variant, reading the delta recorded
// for poolid -- confirm against the definition (including unknown pool ids).
121 void pool_recovery_rate_summary(ceph::Formatter
*f
, std::ostream
*out
,
122 uint64_t poolid
) const;
124 * Obtain a formatted/plain output for client I/O, source from stats for a
125 * given @p delta_sum pool over a given @p delta_stamp period of time.
127 void client_io_rate_summary(ceph::Formatter
*f
, std::ostream
*out
,
128 const pool_stat_t
& delta_sum
,
129 utime_t delta_stamp
) const;
131 * Obtain a formatted/plain output for the overall client I/O, which is
132 * calculated resorting to @p pg_sum_delta and @p stamp_delta.
134 void overall_client_io_rate_summary(ceph::Formatter
*f
, std::ostream
*out
) const;
136 * Obtain a formatted/plain output for client I/O over a given pool
137 * with id @p pool_id. We will then obtain pool-specific data
138 * from @p per_pool_sum_delta.
140 void pool_client_io_rate_summary(ceph::Formatter
*f
, std::ostream
*out
,
141 uint64_t poolid
) const;
143 * Obtain a formatted/plain output for cache tier IO, source from stats for a
144 * given @p delta_sum pool over a given @p delta_stamp period of time.
146 void cache_io_rate_summary(ceph::Formatter
*f
, std::ostream
*out
,
147 const pool_stat_t
& delta_sum
,
148 utime_t delta_stamp
) const;
150 * Obtain a formatted/plain output for the overall cache tier IO, which is
151 * calculated resorting to @p pg_sum_delta and @p stamp_delta.
153 void overall_cache_io_rate_summary(ceph::Formatter
*f
, std::ostream
*out
) const;
155 * Obtain a formatted/plain output for cache tier IO over a given pool
156 * with id @p pool_id. We will then obtain pool-specific data
157 * from @p per_pool_sum_delta.
159 void pool_cache_io_rate_summary(ceph::Formatter
*f
, std::ostream
*out
,
160 uint64_t poolid
) const;
163 * Return the number of additional bytes that can be stored in this
164 * pool before the first OSD fills up, accounting for PG overhead.
166 int64_t get_pool_free_space(const OSDMap
&osd_map
, int64_t poolid
) const;
170 * Dump pool usage and io ops/bytes, used by "ceph df" command
172 virtual void dump_pool_stats_full(const OSDMap
&osd_map
, std::stringstream
*ss
,
173 ceph::Formatter
*f
, bool verbose
) const;
174 void dump_cluster_stats(std::stringstream
*ss
, ceph::Formatter
*f
, bool verbose
) const;
175 static void dump_object_stat_sum(TextTable
&tbl
, ceph::Formatter
*f
,
176 const pool_stat_t
&pool_stat
,
182 const pg_pool_t
*pool
);
184 size_t get_num_pg_by_osd(int osd
) const {
185 auto p
= num_pg_by_osd
.find(osd
);
186 if (p
== num_pg_by_osd
.end())
189 return p
->second
.acting
;
191 int get_num_primary_pg_by_osd(int osd
) const {
192 auto p
= num_pg_by_osd
.find(osd
);
193 if (p
== num_pg_by_osd
.end())
196 return p
->second
.primary
;
// Declaration only; the definition is not part of this listing.
// Const member returning a ceph_statfs by value. Takes the OSDMap by
// non-const reference and an optional int64_t pool id.
// NOTE(review): presumably an unset data_pool means cluster-wide figures and
// a set one restricts the result to that pool -- confirm against the
// definition. Also check whether osdmap could be const&, as it is for the
// other OSDMap parameters in this header.
199 ceph_statfs
get_statfs(OSDMap
&osdmap
,
200 boost::optional
<int64_t> data_pool
) const;
202 int64_t get_rule_avail(int ruleno
) const {
203 auto i
= avail_space_by_rule
.find(ruleno
);
204 if (i
!= avail_space_by_rule
.end())
205 return avail_space_by_rule
[ruleno
];
210 // kill me post-mimic or -nautilus
211 bool definitely_converted_snapsets() const {
212 // false negative is okay; false positive is not!
215 num_pg_unknown
== 0 &&
216 pg_sum
.stats
.sum
.num_legacy_snapsets
== 0;
219 uint64_t get_last_osd_stat_seq(int osd
) {
220 if (osd
< (int)osd_last_seq
.size())
221 return osd_last_seq
[osd
];
// Declaration only; the definition is not part of this listing.
// Const member that takes the destination buffer list by reference and a
// 64-bit feature mask.
// NOTE(review): the feature mask presumably selects the wire format for the
// peer -- confirm against the definition.
225 void encode(ceph::buffer::list
& bl
, uint64_t features
) const;
// Declaration only; the definition is not part of this listing.
// Non-const member taking a buffer-list const_iterator by reference.
// NOTE(review): presumably the counterpart of encode(), advancing p past the
// consumed bytes -- confirm against the definition.
226 void decode(ceph::buffer::list::const_iterator
& p
);
// Declaration only; the definition is not part of this listing.
// Const member taking a ceph::Formatter pointer.
// NOTE(review): presumably writes the digest's fields through f; whether a
// null f is tolerated cannot be told from here.
227 void dump(ceph::Formatter
*f
) const;
// Declaration only; the definition is not part of this listing.
// Static member taking a list of PGMapDigest pointers by non-const reference.
// NOTE(review): presumably appends newly allocated sample instances to ls,
// with ownership passing to the caller -- confirm against the definition.
228 static void generate_test_instances(std::list
<PGMapDigest
*>& ls
);
230 WRITE_CLASS_ENCODER(PGMapDigest::pg_count
);
231 WRITE_CLASS_ENCODER_FEATURES(PGMapDigest
);
233 class PGMap
: public PGMapDigest
{
235 MEMPOOL_CLASS_HELPERS();
239 epoch_t last_osdmap_epoch
; // last osdmap epoch i applied to the pgmap
240 epoch_t last_pg_scan
; // osdmap epoch
241 mempool::pgmap::unordered_map
<int32_t,osd_stat_t
> osd_stat
;
242 mempool::pgmap::unordered_map
<pg_t
,pg_stat_t
> pg_stat
;
244 typedef mempool::pgmap::map
<
245 std::pair
<int64_t, int>, // <pool, osd>
247 per_osd_pool_statfs_t
;
249 per_osd_pool_statfs_t pool_statfs
;
253 MEMPOOL_CLASS_HELPERS();
255 mempool::pgmap::map
<pg_t
,pg_stat_t
> pg_stat_updates
;
256 epoch_t osdmap_epoch
;
257 epoch_t pg_scan
; // osdmap epoch
258 mempool::pgmap::set
<pg_t
> pg_remove
;
260 per_osd_pool_statfs_t pool_statfs_updates
;
263 mempool::pgmap::map
<int32_t,osd_stat_t
> osd_stat_updates
;
264 mempool::pgmap::set
<int32_t> osd_stat_rm
;
267 const mempool::pgmap::map
<int32_t, osd_stat_t
> &get_osd_stat_updates() const {
268 return osd_stat_updates
;
270 const mempool::pgmap::set
<int32_t> &get_osd_stat_rm() const {
273 template<typename OsdStat
>
274 void update_stat(int32_t osd
, OsdStat
&& stat
) {
275 osd_stat_updates
[osd
] = std::forward
<OsdStat
>(stat
);
277 void stat_osd_out(int32_t osd
) {
278 osd_stat_updates
[osd
] = osd_stat_t();
280 void stat_osd_down_up(int32_t osd
, const PGMap
& pg_map
) {
281 // 0 the op_queue_age_hist for this osd
282 auto p
= osd_stat_updates
.find(osd
);
283 if (p
!= osd_stat_updates
.end()) {
284 p
->second
.op_queue_age_hist
.clear();
287 auto q
= pg_map
.osd_stat
.find(osd
);
288 if (q
!= pg_map
.osd_stat
.end()) {
289 osd_stat_t
& t
= osd_stat_updates
[osd
] = q
->second
;
290 t
.op_queue_age_hist
.clear();
293 void rm_stat(int32_t osd
) {
294 osd_stat_rm
.insert(osd
);
295 osd_stat_updates
.erase(osd
);
297 void dump(ceph::Formatter
*f
) const;
298 static void generate_test_instances(std::list
<Incremental
*>& o
);
// Default constructor: sets version, osdmap_epoch and pg_scan to 0. Members
// not named in the initialiser list are left to their default initialisation.
300 Incremental() : version(0), osdmap_epoch(0), pg_scan(0) {}
304 // aggregate stats (soft state), generated by calc_stats()
305 mempool::pgmap::unordered_map
<int,std::set
<pg_t
> > pg_by_osd
;
306 mempool::pgmap::unordered_map
<int,int> blocked_by_sum
;
307 mempool::pgmap::list
<std::pair
<pool_stat_t
, utime_t
> > pg_sum_deltas
;
308 mempool::pgmap::unordered_map
<int64_t,mempool::pgmap::unordered_map
<uint64_t,int32_t>> num_pg_by_pool_state
;
312 void update_pool_deltas(
315 const mempool::pgmap::unordered_map
<int32_t, pool_stat_t
>& pg_pool_sum_old
);
318 void deleted_pool(int64_t pool
) {
319 for (auto i
= pool_statfs
.begin(); i
!= pool_statfs
.end();) {
320 if (i
->first
.first
== pool
) {
321 i
= pool_statfs
.erase(i
);
327 pg_pool_sum
.erase(pool
);
328 num_pg_by_pool_state
.erase(pool
);
329 num_pg_by_pool
.erase(pool
);
330 per_pool_sum_deltas
.erase(pool
);
331 per_pool_sum_deltas_stamps
.erase(pool
);
332 per_pool_sum_delta
.erase(pool
);
339 const pool_stat_t
& old_pool_sum
,
341 const pool_stat_t
& current_pool_sum
,
342 pool_stat_t
*result_pool_delta
,
343 utime_t
*result_ts_delta
,
344 mempool::pgmap::list
<std::pair
<pool_stat_t
,utime_t
> > *delta_avg_list
);
346 void update_one_pool_delta(CephContext
*cct
,
349 const pool_stat_t
& old_pool_sum
);
353 mempool::pgmap::set
<pg_t
> creating_pgs
;
354 mempool::pgmap::map
<int,std::map
<epoch_t
,std::set
<pg_t
> > > creating_pgs_by_osd_epoch
;
// Single-bit flags occupying bits 0 through 4, so any combination can be
// OR-ed into one int.
// NOTE(review): presumably the values accepted by the `int types` parameter
// of get_stuck_stats() / dump_stuck() / dump_stuck_plain() -- confirm.
356 // Bits that used to be enum StuckPG
357 static const int STUCK_INACTIVE
= (1<<0);
358 static const int STUCK_UNCLEAN
= (1<<1);
359 static const int STUCK_UNDERSIZED
= (1<<2);
360 static const int STUCK_DEGRADED
= (1<<3);
361 static const int STUCK_STALE
= (1<<4);
365 last_osdmap_epoch(0), last_pg_scan(0)
368 version_t
get_version() const {
371 void set_version(version_t v
) {
374 epoch_t
get_last_osdmap_epoch() const {
375 return last_osdmap_epoch
;
377 void set_last_osdmap_epoch(epoch_t e
) {
378 last_osdmap_epoch
= e
;
380 epoch_t
get_last_pg_scan() const {
383 void set_last_pg_scan(epoch_t e
) {
386 utime_t
get_stamp() const {
389 void set_stamp(utime_t s
) {
393 pool_stat_t
get_pg_pool_sum_stat(int64_t pool
) const {
394 auto p
= pg_pool_sum
.find(pool
);
395 if (p
!= pg_pool_sum
.end())
397 return pool_stat_t();
400 osd_stat_t
get_osd_sum(const std::set
<int>& osds
) const {
401 if (osds
.empty()) // all
404 for (auto i
: osds
) {
405 auto os
= get_osd_stat(i
);
412 const osd_stat_t
*get_osd_stat(int osd
) const {
413 auto i
= osd_stat
.find(osd
);
414 if (i
== osd_stat
.end()) {
421 void apply_incremental(CephContext
*cct
, const Incremental
& inc
);
423 void stat_pg_add(const pg_t
&pgid
, const pg_stat_t
&s
,
424 bool sameosds
=false);
425 bool stat_pg_sub(const pg_t
&pgid
, const pg_stat_t
&s
,
426 bool sameosds
=false);
427 void calc_purged_snaps();
428 void calc_osd_sum_by_class(const OSDMap
& osdmap
);
429 void stat_osd_add(int osd
, const osd_stat_t
&s
);
430 void stat_osd_sub(int osd
, const osd_stat_t
&s
);
432 void encode(ceph::buffer::list
&bl
, uint64_t features
=-1) const;
433 void decode(ceph::buffer::list::const_iterator
&bl
);
435 /// encode subset of our data to a PGMapDigest
436 void encode_digest(const OSDMap
& osdmap
,
437 ceph::buffer::list
& bl
, uint64_t features
);
439 int64_t get_rule_avail(const OSDMap
& osdmap
, int ruleno
) const;
440 void get_rules_avail(const OSDMap
& osdmap
,
441 std::map
<int,int64_t> *avail_map
) const;
442 void dump(ceph::Formatter
*f
, bool with_net
= true) const;
443 void dump_basic(ceph::Formatter
*f
) const;
444 void dump_pg_stats(ceph::Formatter
*f
, bool brief
) const;
445 void dump_pool_stats(ceph::Formatter
*f
) const;
446 void dump_osd_stats(ceph::Formatter
*f
, bool with_net
= true) const;
447 void dump_osd_ping_times(ceph::Formatter
*f
) const;
448 void dump_delta(ceph::Formatter
*f
) const;
449 void dump_filtered_pg_stats(ceph::Formatter
*f
, std::set
<pg_t
>& pgs
) const;
450 void dump_pool_stats_full(const OSDMap
&osd_map
, std::stringstream
*ss
,
451 ceph::Formatter
*f
, bool verbose
) const override
{
452 get_rules_avail(osd_map
, &avail_space_by_rule
);
453 PGMapDigest::dump_pool_stats_full(osd_map
, ss
, f
, verbose
);
457 * Dump client io rate, recovery io rate, cache io rate and recovery information.
458 * This function is used by the "ceph osd pool stats" command
460 void dump_pool_stats_and_io_rate(int64_t poolid
, const OSDMap
&osd_map
, ceph::Formatter
*f
,
461 std::stringstream
*ss
) const;
463 void dump_pg_stats_plain(
465 const mempool::pgmap::unordered_map
<pg_t
, pg_stat_t
>& pg_stats
,
467 void get_stuck_stats(
468 int types
, const utime_t cutoff
,
469 mempool::pgmap::unordered_map
<pg_t
, pg_stat_t
>& stuck_pgs
) const;
470 bool get_stuck_counts(const utime_t cutoff
, std::map
<std::string
, int>& note
) const;
471 void dump_stuck(ceph::Formatter
*f
, int types
, utime_t cutoff
) const;
472 void dump_stuck_plain(std::ostream
& ss
, int types
, utime_t cutoff
) const;
473 int dump_stuck_pg_stats(std::stringstream
&ds
,
476 std::vector
<std::string
>& args
) const;
477 void dump(std::ostream
& ss
) const;
478 void dump_basic(std::ostream
& ss
) const;
479 void dump_pg_stats(std::ostream
& ss
, bool brief
) const;
480 void dump_pg_sum_stats(std::ostream
& ss
, bool header
) const;
481 void dump_pool_stats(std::ostream
& ss
, bool header
) const;
482 void dump_osd_stats(std::ostream
& ss
) const;
483 void dump_osd_sum_stats(std::ostream
& ss
) const;
484 void dump_filtered_pg_stats(std::ostream
& ss
, std::set
<pg_t
>& pgs
) const;
486 void dump_osd_perf_stats(ceph::Formatter
*f
) const;
487 void print_osd_perf_stats(std::ostream
*ss
) const;
489 void dump_osd_blocked_by_stats(ceph::Formatter
*f
) const;
490 void print_osd_blocked_by_stats(std::ostream
*ss
) const;
492 void get_filtered_pg_stats(uint64_t state
, int64_t poolid
, int64_t osdid
,
493 bool primary
, std::set
<pg_t
>& pgs
) const;
495 set
<std::string
> osd_parentage(const OSDMap
& osdmap
, int id
) const;
496 void get_health_checks(
498 const OSDMap
& osdmap
,
499 health_check_map_t
*checks
) const;
500 void print_summary(ceph::Formatter
*f
, ostream
*out
) const;
502 static void generate_test_instances(std::list
<PGMap
*>& o
);
504 WRITE_CLASS_ENCODER_FEATURES(PGMap
)
506 inline std::ostream
& operator<<(std::ostream
& out
, const PGMapDigest
& m
) {
507 m
.print_oneline_summary(NULL
, &out
);
511 int process_pg_map_command(
512 const std::string
& prefix
,
513 const cmdmap_t
& cmdmap
,
515 const OSDMap
& osdmap
,
517 std::stringstream
*ss
,
518 ceph::buffer::list
*odata
);
523 static void check_osd_map(
525 const OSDMap
&osdmap
,
527 PGMap::Incremental
*pending_inc
);
529 // mark pg's state stale if its acting primary osd is down
530 static void check_down_pgs(
531 const OSDMap
&osd_map
,
534 const std::set
<int>& need_check_down_pg_osds
,
535 PGMap::Incremental
*pending_inc
);
539 /* Assign a lower weight to overloaded OSDs.
541 * The osds that will get a lower weight are those with a utilization
542 * percentage 'oload' percent greater than the average utilization.
544 int by_utilization(const OSDMap
&osd_map
,
549 bool by_pg
, const std::set
<int64_t> *pools
,
551 mempool::osdmap::map
<int32_t, uint32_t>* new_weights
,
552 std::stringstream
*ss
,
553 std::string
*out_str
,