1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
16 * Placement Group Map. Placement Groups are logical sets of objects
17 * that are replicated by the same set of devices. pgid=(r,hash(o)&m)
18 * where & is a bit-wise AND and m=2^k-1
24 #include "include/health.h"
25 #include "common/debug.h"
26 #include "common/TextTable.h"
27 #include "osd/osd_types.h"
28 #include "include/mempool.h"
29 #include "mon/health_check.h"
31 #include "mon/PGStatService.h"
33 // FIXME: don't like including this here to get OSDMap::Incremental, maybe
34 // PGMapUpdater needs its own header.
35 #include "osd/OSDMap.h"
// Forward declaration of ceph::Formatter; the declarations visible in this
// header only take it by pointer (Formatter *f).
37 namespace ceph
{ class Formatter
; }
41 MEMPOOL_CLASS_HELPERS();
42 virtual ~PGMapDigest() {}
44 mempool::pgmap::vector
<uint64_t> osd_last_seq
;
46 mutable std::map
<int, int64_t> avail_space_by_rule
;
48 // aggregate state, populated by PGMap child
49 int64_t num_pg
= 0, num_osd
= 0;
50 int64_t num_pg_active
= 0;
51 int64_t num_pg_unknown
= 0;
52 mempool::pgmap::unordered_map
<int32_t,pool_stat_t
> pg_pool_sum
;
53 mempool::pgmap::map
<int64_t,int64_t> num_pg_by_pool
;
56 mempool::pgmap::unordered_map
<int32_t,int32_t> num_pg_by_state
;
61 void encode(bufferlist
& bl
) const {
64 ::encode(primary
, bl
);
66 void decode(bufferlist::iterator
& p
) {
72 mempool::pgmap::unordered_map
<int32_t,pg_count
> num_pg_by_osd
;
74 // recent deltas, and summation
76 * keep track of last deltas for each pool, calculated using
77 * @p pg_pool_sum as baseline.
79 mempool::pgmap::unordered_map
<uint64_t, mempool::pgmap::list
< pair
<pool_stat_t
, utime_t
> > > per_pool_sum_deltas
;
81 * keep track of per-pool timestamp deltas, according to last update on
84 mempool::pgmap::unordered_map
<uint64_t, utime_t
> per_pool_sum_deltas_stamps
;
86 * keep track of sum deltas, per-pool, taking into account any previous
87 * deltas existing in @p per_pool_sum_deltas. The utime_t as second member
88 * of the pair is the timestamp referring to the last update (i.e., the first
89 * member of the pair) for a given pool.
91 mempool::pgmap::unordered_map
<uint64_t, pair
<pool_stat_t
,utime_t
> > per_pool_sum_delta
;
93 pool_stat_t pg_sum_delta
;
97 void print_summary(Formatter
*f
, ostream
*out
) const;
98 void print_oneline_summary(Formatter
*f
, ostream
*out
) const;
100 void recovery_summary(Formatter
*f
, list
<string
> *psl
,
101 const pool_stat_t
& delta_sum
) const;
102 void overall_recovery_summary(Formatter
*f
, list
<string
> *psl
) const;
103 void pool_recovery_summary(Formatter
*f
, list
<string
> *psl
,
104 uint64_t poolid
) const;
105 void recovery_rate_summary(Formatter
*f
, ostream
*out
,
106 const pool_stat_t
& delta_sum
,
107 utime_t delta_stamp
) const;
108 void overall_recovery_rate_summary(Formatter
*f
, ostream
*out
) const;
109 void pool_recovery_rate_summary(Formatter
*f
, ostream
*out
,
110 uint64_t poolid
) const;
112 * Obtain a formatted/plain output for client I/O, source from stats for a
113 * given @p delta_sum pool over a given @p delta_stamp period of time.
115 void client_io_rate_summary(Formatter
*f
, ostream
*out
,
116 const pool_stat_t
& delta_sum
,
117 utime_t delta_stamp
) const;
119 * Obtain a formatted/plain output for the overall client I/O, which is
120 * calculated resorting to @p pg_sum_delta and @p stamp_delta.
122 void overall_client_io_rate_summary(Formatter
*f
, ostream
*out
) const;
124 * Obtain a formatted/plain output for client I/O over a given pool
125 * with id @p pool_id. We will then obtain pool-specific data
126 * from @p per_pool_sum_delta.
128 void pool_client_io_rate_summary(Formatter
*f
, ostream
*out
,
129 uint64_t poolid
) const;
131 * Obtain a formatted/plain output for cache tier IO, source from stats for a
132 * given @p delta_sum pool over a given @p delta_stamp period of time.
134 void cache_io_rate_summary(Formatter
*f
, ostream
*out
,
135 const pool_stat_t
& delta_sum
,
136 utime_t delta_stamp
) const;
138 * Obtain a formatted/plain output for the overall cache tier IO, which is
139 * calculated resorting to @p pg_sum_delta and @p stamp_delta.
141 void overall_cache_io_rate_summary(Formatter
*f
, ostream
*out
) const;
143 * Obtain a formatted/plain output for cache tier IO over a given pool
144 * with id @p pool_id. We will then obtain pool-specific data
145 * from @p per_pool_sum_delta.
147 void pool_cache_io_rate_summary(Formatter
*f
, ostream
*out
,
148 uint64_t poolid
) const;
151 * Return the number of additional bytes that can be stored in this
152 * pool before the first OSD fills up, accounting for PG overhead.
154 int64_t get_pool_free_space(const OSDMap
&osd_map
, int64_t poolid
) const;
156 virtual void dump_pool_stats_full(const OSDMap
&osd_map
, stringstream
*ss
,
157 Formatter
*f
, bool verbose
) const;
158 void dump_fs_stats(stringstream
*ss
, Formatter
*f
, bool verbose
) const;
159 static void dump_object_stat_sum(TextTable
&tbl
, Formatter
*f
,
160 const object_stat_sum_t
&sum
,
163 bool verbose
, const pg_pool_t
*pool
);
165 size_t get_num_pg_by_osd(int osd
) const {
166 auto p
= num_pg_by_osd
.find(osd
);
167 if (p
== num_pg_by_osd
.end())
170 return p
->second
.acting
;
172 int get_num_primary_pg_by_osd(int osd
) const {
173 auto p
= num_pg_by_osd
.find(osd
);
174 if (p
== num_pg_by_osd
.end())
177 return p
->second
.primary
;
180 ceph_statfs
get_statfs(OSDMap
&osdmap
,
181 boost::optional
<int64_t> data_pool
) const;
183 int64_t get_rule_avail(int ruleno
) const {
184 auto i
= avail_space_by_rule
.find(ruleno
);
185 if (i
!= avail_space_by_rule
.end())
186 return avail_space_by_rule
[ruleno
];
191 // kill me post-luminous:
192 virtual float get_fallback_full_ratio() const {
196 uint64_t get_last_osd_stat_seq(int osd
) {
197 if (osd
< (int)osd_last_seq
.size())
198 return osd_last_seq
[osd
];
202 void encode(bufferlist
& bl
, uint64_t features
) const;
203 void decode(bufferlist::iterator
& p
);
204 void dump(Formatter
*f
) const;
205 static void generate_test_instances(list
<PGMapDigest
*>& ls
);
207 WRITE_CLASS_ENCODER(PGMapDigest::pg_count
);
208 WRITE_CLASS_ENCODER_FEATURES(PGMapDigest
);
210 class PGMap
: public PGMapDigest
{
212 MEMPOOL_CLASS_HELPERS();
216 epoch_t last_osdmap_epoch
; // last osdmap epoch i applied to the pgmap
217 epoch_t last_pg_scan
; // osdmap epoch
218 mempool::pgmap::unordered_map
<int32_t,osd_stat_t
> osd_stat
;
219 mempool::pgmap::unordered_map
<pg_t
,pg_stat_t
> pg_stat
;
220 mempool::pgmap::set
<int32_t> full_osds
; // for pre-luminous only
221 mempool::pgmap::set
<int32_t> nearfull_osds
; // for pre-luminous only
223 float nearfull_ratio
;
225 // mapping of osd to most recently reported osdmap epoch
226 mempool::pgmap::unordered_map
<int32_t,epoch_t
> osd_epochs
;
230 MEMPOOL_CLASS_HELPERS();
232 mempool::pgmap::map
<pg_t
,pg_stat_t
> pg_stat_updates
;
233 epoch_t osdmap_epoch
;
234 epoch_t pg_scan
; // osdmap epoch
235 mempool::pgmap::set
<pg_t
> pg_remove
;
237 float nearfull_ratio
;
241 mempool::pgmap::map
<int32_t,osd_stat_t
> osd_stat_updates
;
242 mempool::pgmap::set
<int32_t> osd_stat_rm
;
244 // mapping of osd to most recently reported osdmap epoch.
245 // 1:1 with osd_stat_updates.
246 mempool::pgmap::map
<int32_t,epoch_t
> osd_epochs
;
249 const mempool::pgmap::map
<int32_t, osd_stat_t
> &get_osd_stat_updates() const {
250 return osd_stat_updates
;
252 const mempool::pgmap::set
<int32_t> &get_osd_stat_rm() const {
255 const mempool::pgmap::map
<int32_t, epoch_t
> &get_osd_epochs() const {
// Record a pending stat update for one osd: `stat` is forwarded with
// std::forward into osd_stat_updates[osd] and `epoch` is stored in
// osd_epochs[osd]. The assert then checks that both maps still hold the
// same number of entries.
259 template<typename OsdStat
>
260 void update_stat(int32_t osd
, epoch_t epoch
, OsdStat
&& stat
) {
261 osd_stat_updates
[osd
] = std::forward
<OsdStat
>(stat
);
262 osd_epochs
[osd
] = epoch
;
263 assert(osd_epochs
.size() == osd_stat_updates
.size());
// Zero the pending stats for an osd: osd_stat_updates[osd] is overwritten
// with a default-constructed osd_stat_t, while osd_epochs[osd] is set to
// `epoch` only when no entry for that osd exists yet.
265 void stat_osd_out(int32_t osd
, epoch_t epoch
) {
266 // 0 the stats for the osd
267 osd_stat_updates
[osd
] = osd_stat_t();
268 // only fill in the epoch if the osd didn't already report this
269 // epoch. that way we zero the stat but still preserve a reported epoch that is already recorded.
271 if (!osd_epochs
.count(osd
))
272 osd_epochs
[osd
] = epoch
;
273 // ...and maintain our invariant (one osd_epochs entry per osd_stat_updates entry).
274 assert(osd_epochs
.size() == osd_stat_updates
.size());
276 void stat_osd_down_up(int32_t osd
, epoch_t epoch
, const PGMap
& pg_map
) {
277 // 0 the op_queue_age_hist for this osd
278 auto p
= osd_stat_updates
.find(osd
);
279 if (p
!= osd_stat_updates
.end()) {
280 p
->second
.op_queue_age_hist
.clear();
283 auto q
= pg_map
.osd_stat
.find(osd
);
284 if (q
!= pg_map
.osd_stat
.end()) {
285 osd_stat_t
& t
= osd_stat_updates
[osd
] = q
->second
;
286 t
.op_queue_age_hist
.clear();
287 osd_epochs
[osd
] = epoch
;
// Mark an osd's stats for removal: the osd id is added to osd_stat_rm and any
// pending entry for it is dropped from both osd_epochs and osd_stat_updates.
290 void rm_stat(int32_t osd
) {
291 osd_stat_rm
.insert(osd
);
292 osd_epochs
.erase(osd
);
293 osd_stat_updates
.erase(osd
);
295 void encode(bufferlist
&bl
, uint64_t features
=-1) const;
296 void decode(bufferlist::iterator
&bl
);
297 void dump(Formatter
*f
) const;
298 static void generate_test_instances(list
<Incremental
*>& o
);
// Default constructor: version, osdmap_epoch, pg_scan, full_ratio and
// nearfull_ratio all start at 0.
300 Incremental() : version(0), osdmap_epoch(0), pg_scan(0),
301 full_ratio(0), nearfull_ratio(0) {}
305 // aggregate stats (soft state), generated by calc_stats()
306 mutable epoch_t min_last_epoch_clean
= 0;
307 mempool::pgmap::unordered_map
<int,set
<pg_t
> > pg_by_osd
;
308 mempool::pgmap::unordered_map
<int,int> blocked_by_sum
;
309 mempool::pgmap::list
< pair
<pool_stat_t
, utime_t
> > pg_sum_deltas
;
313 void update_global_delta(
315 const utime_t ts
, const pool_stat_t
& pg_sum_old
);
316 void update_pool_deltas(
319 const mempool::pgmap::unordered_map
<uint64_t, pool_stat_t
>& pg_pool_sum_old
);
// Forget a pool: erase its entry from pg_pool_sum, num_pg_by_pool and the
// three per-pool delta containers (per_pool_sum_deltas,
// per_pool_sum_deltas_stamps, per_pool_sum_delta).
322 void deleted_pool(int64_t pool
) {
323 pg_pool_sum
.erase(pool
);
324 num_pg_by_pool
.erase(pool
);
325 per_pool_sum_deltas
.erase(pool
);
326 per_pool_sum_deltas_stamps
.erase(pool
);
327 per_pool_sum_delta
.erase(pool
);
334 const pool_stat_t
& old_pool_sum
,
336 const pool_stat_t
& current_pool_sum
,
337 pool_stat_t
*result_pool_delta
,
338 utime_t
*result_ts_delta
,
339 mempool::pgmap::list
<pair
<pool_stat_t
,utime_t
> > *delta_avg_list
);
341 void update_one_pool_delta(CephContext
*cct
,
344 const pool_stat_t
& old_pool_sum
);
346 epoch_t
calc_min_last_epoch_clean() const;
350 mempool::pgmap::set
<pg_t
> creating_pgs
;
351 mempool::pgmap::map
<int,map
<epoch_t
,set
<pg_t
> > > creating_pgs_by_osd_epoch
;
353 // Bits that used to be enum StuckPG
354 static const int STUCK_INACTIVE
= (1<<0);
355 static const int STUCK_UNCLEAN
= (1<<1);
356 static const int STUCK_UNDERSIZED
= (1<<2);
357 static const int STUCK_DEGRADED
= (1<<3);
358 static const int STUCK_STALE
= (1<<4);
362 last_osdmap_epoch(0), last_pg_scan(0),
363 full_ratio(0), nearfull_ratio(0)
366 void set_full_ratios(float full
, float nearfull
) {
367 if (full_ratio
== full
&& nearfull_ratio
== nearfull
)
370 nearfull_ratio
= nearfull
;
374 version_t
get_version() const {
377 void set_version(version_t v
) {
380 epoch_t
get_last_osdmap_epoch() const {
381 return last_osdmap_epoch
;
383 void set_last_osdmap_epoch(epoch_t e
) {
384 last_osdmap_epoch
= e
;
386 epoch_t
get_last_pg_scan() const {
389 void set_last_pg_scan(epoch_t e
) {
392 utime_t
get_stamp() const {
395 void set_stamp(utime_t s
) {
399 pool_stat_t
get_pg_pool_sum_stat(int64_t pool
) const {
400 auto p
= pg_pool_sum
.find(pool
);
401 if (p
!= pg_pool_sum
.end())
403 return pool_stat_t();
407 void update_pg(pg_t pgid
, bufferlist
& bl
);
408 void remove_pg(pg_t pgid
);
409 void update_osd(int osd
, bufferlist
& bl
);
410 void remove_osd(int osd
);
412 void apply_incremental(CephContext
*cct
, const Incremental
& inc
);
413 void redo_full_sets();
414 void register_nearfull_status(int osd
, const osd_stat_t
& s
);
416 void stat_pg_add(const pg_t
&pgid
, const pg_stat_t
&s
,
417 bool sameosds
=false);
418 void stat_pg_sub(const pg_t
&pgid
, const pg_stat_t
&s
,
419 bool sameosds
=false);
420 void stat_pg_update(const pg_t pgid
, pg_stat_t
&prev
, bufferlist::iterator
& blp
);
421 void stat_osd_add(int osd
, const osd_stat_t
&s
);
422 void stat_osd_sub(int osd
, const osd_stat_t
&s
);
424 void encode(bufferlist
&bl
, uint64_t features
=-1) const;
425 void decode(bufferlist::iterator
&bl
);
427 /// encode subset of our data to a PGMapDigest
428 void encode_digest(const OSDMap
& osdmap
,
429 bufferlist
& bl
, uint64_t features
) const;
431 void dirty_all(Incremental
& inc
);
433 int64_t get_rule_avail(const OSDMap
& osdmap
, int ruleno
) const;
434 void get_rules_avail(const OSDMap
& osdmap
,
435 std::map
<int,int64_t> *avail_map
) const;
436 void dump(Formatter
*f
) const;
437 void dump_basic(Formatter
*f
) const;
438 void dump_pg_stats(Formatter
*f
, bool brief
) const;
439 void dump_pool_stats(Formatter
*f
) const;
440 void dump_osd_stats(Formatter
*f
) const;
441 void dump_delta(Formatter
*f
) const;
442 void dump_filtered_pg_stats(Formatter
*f
, set
<pg_t
>& pgs
) const;
// Override of PGMapDigest::dump_pool_stats_full: first refills
// avail_space_by_rule through get_rules_avail(osd_map, ...), then delegates
// to the PGMapDigest implementation with the same arguments.
443 void dump_pool_stats_full(const OSDMap
&osd_map
, stringstream
*ss
,
444 Formatter
*f
, bool verbose
) const override
{
445 get_rules_avail(osd_map
, &avail_space_by_rule
);
446 PGMapDigest::dump_pool_stats_full(osd_map
, ss
, f
, verbose
);
449 void dump_pg_stats_plain(
451 const mempool::pgmap::unordered_map
<pg_t
, pg_stat_t
>& pg_stats
,
453 void get_stuck_stats(
454 int types
, const utime_t cutoff
,
455 mempool::pgmap::unordered_map
<pg_t
, pg_stat_t
>& stuck_pgs
) const;
456 bool get_stuck_counts(const utime_t cutoff
, map
<string
, int>& note
) const;
457 void dump_stuck(Formatter
*f
, int types
, utime_t cutoff
) const;
458 void dump_stuck_plain(ostream
& ss
, int types
, utime_t cutoff
) const;
459 int dump_stuck_pg_stats(stringstream
&ds
,
462 vector
<string
>& args
) const;
463 void dump(ostream
& ss
) const;
464 void dump_basic(ostream
& ss
) const;
465 void dump_pg_stats(ostream
& ss
, bool brief
) const;
466 void dump_pg_sum_stats(ostream
& ss
, bool header
) const;
467 void dump_pool_stats(ostream
& ss
, bool header
) const;
468 void dump_osd_stats(ostream
& ss
) const;
469 void dump_osd_sum_stats(ostream
& ss
) const;
470 void dump_filtered_pg_stats(ostream
& ss
, set
<pg_t
>& pgs
) const;
472 void dump_osd_perf_stats(Formatter
*f
) const;
473 void print_osd_perf_stats(std::ostream
*ss
) const;
475 void dump_osd_blocked_by_stats(Formatter
*f
) const;
476 void print_osd_blocked_by_stats(std::ostream
*ss
) const;
478 void get_filtered_pg_stats(uint32_t state
, int64_t poolid
, int64_t osdid
,
479 bool primary
, set
<pg_t
>& pgs
) const;
// Return min_last_epoch_clean. When the stored value is zero it is first
// recomputed with calc_min_last_epoch_clean() and stored back into the
// member before being returned.
481 epoch_t
get_min_last_epoch_clean() const {
482 if (!min_last_epoch_clean
)
483 min_last_epoch_clean
= calc_min_last_epoch_clean();
484 return min_last_epoch_clean
;
487 float get_fallback_full_ratio() const override
{
488 if (full_ratio
> 0) {
494 void get_health(CephContext
*cct
,
495 const OSDMap
& osdmap
,
496 list
<pair
<health_status_t
,string
> >& summary
,
497 list
<pair
<health_status_t
,string
> > *detail
) const;
499 void get_health_checks(
501 const OSDMap
& osdmap
,
502 health_check_map_t
*checks
) const;
504 static void generate_test_instances(list
<PGMap
*>& o
);
506 WRITE_CLASS_ENCODER_FEATURES(PGMap::Incremental
)
507 WRITE_CLASS_ENCODER_FEATURES(PGMap
)
509 inline ostream
& operator<<(ostream
& out
, const PGMapDigest
& m
) {
510 m
.print_oneline_summary(NULL
, &out
);
514 int process_pg_map_command(
515 const string
& prefix
,
516 const map
<string
,cmd_vartype
>& cmdmap
,
518 const OSDMap
& osdmap
,
526 static void check_osd_map(
527 const OSDMap::Incremental
&osd_inc
,
528 std::set
<int> *need_check_down_pg_osds
,
529 std::map
<int,utime_t
> *last_osd_report
,
531 PGMap::Incremental
*pending_inc
);
533 static void check_osd_map(
535 const OSDMap
&osdmap
,
537 PGMap::Incremental
*pending_inc
);
539 * check latest osdmap for new pgs to register
541 static void register_new_pgs(
542 const OSDMap
&osd_map
,
544 PGMap::Incremental
*pending_inc
);
547 * recalculate creating pg mappings
549 static void update_creating_pgs(
550 const OSDMap
&osd_map
,
552 PGMap::Incremental
*pending_inc
);
554 static void register_pg(
555 const OSDMap
&osd_map
,
556 pg_t pgid
, epoch_t epoch
,
559 PGMap::Incremental
*pending_inc
);
561 // mark pg's state stale if its acting primary osd is down
562 static void check_down_pgs(
563 const OSDMap
&osd_map
,
566 const set
<int>& need_check_down_pg_osds
,
567 PGMap::Incremental
*pending_inc
);
571 /* Assign a lower weight to overloaded OSDs.
573 * The osds that will get a lower weight are those with a utilization
574 * percentage 'oload' percent greater than the average utilization.
576 int by_utilization(const OSDMap
&osd_map
,
581 bool by_pg
, const set
<int64_t> *pools
,
583 mempool::osdmap::map
<int32_t, uint32_t>* new_weights
,
584 std::stringstream
*ss
,
585 std::string
*out_str
,
590 class PGMapStatService
: virtual public PGStatService
{
594 PGMapStatService(const PGMap
& o
)
// This implementation unconditionally reports itself as readable.
597 bool is_readable() const override
{ return true; }
599 const pool_stat_t
* get_pool_stat(int64_t poolid
) const override
{
600 auto i
= pgmap
.pg_pool_sum
.find(poolid
);
601 if (i
!= pgmap
.pg_pool_sum
.end()) {
607 const osd_stat_t
& get_osd_sum() const override
{ return pgmap
.osd_sum
; }
609 const osd_stat_t
*get_osd_stat(int osd
) const override
{
610 auto i
= pgmap
.osd_stat
.find(osd
);
611 if (i
== pgmap
.osd_stat
.end()) {
616 const mempool::pgmap::unordered_map
<int32_t,osd_stat_t
>& get_osd_stat() const override
{
617 return pgmap
.osd_stat
;
619 float get_full_ratio() const override
{ return pgmap
.full_ratio
; }
620 float get_nearfull_ratio() const override
{ return pgmap
.nearfull_ratio
; }
// True when pgmap.creating_pgs is non-empty.
622 bool have_creating_pgs() const override
{
623 return !pgmap
.creating_pgs
.empty();
// True when `pgid` is present in pgmap.creating_pgs; the count() result is
// converted to bool by the return type.
625 bool is_creating_pg(pg_t pgid
) const override
{
626 return pgmap
.creating_pgs
.count(pgid
);
629 epoch_t
get_min_last_epoch_clean() const override
{
630 return pgmap
.get_min_last_epoch_clean();
633 bool have_full_osds() const override
{ return !pgmap
.full_osds
.empty(); }
634 bool have_nearfull_osds() const override
{
635 return !pgmap
.nearfull_osds
.empty();
638 size_t get_num_pg_by_osd(int osd
) const override
{
639 return pgmap
.get_num_pg_by_osd(osd
);
641 ceph_statfs
get_statfs(OSDMap
& osd_map
,
642 boost::optional
<int64_t> data_pool
) const override
{
644 statfs
.kb
= pgmap
.osd_sum
.kb
;
645 statfs
.kb_used
= pgmap
.osd_sum
.kb_used
;
646 statfs
.kb_avail
= pgmap
.osd_sum
.kb_avail
;
647 statfs
.num_objects
= pgmap
.pg_sum
.stats
.sum
.num_objects
;
650 void print_summary(Formatter
*f
, ostream
*out
) const override
{
651 pgmap
.print_summary(f
, out
);
653 virtual void dump_info(Formatter
*f
) const override
{
654 f
->dump_object("pgmap", pgmap
);
656 void dump_fs_stats(stringstream
*ss
,
658 bool verbose
) const override
{
659 pgmap
.dump_fs_stats(ss
, f
, verbose
);
661 void dump_pool_stats(const OSDMap
& osdm
, stringstream
*ss
, Formatter
*f
,
662 bool verbose
) const override
{
663 pgmap
.dump_pool_stats_full(osdm
, ss
, f
, verbose
);
666 int process_pg_command(const string
& prefix
,
667 const map
<string
,cmd_vartype
>& cmdmap
,
668 const OSDMap
& osdmap
,
671 bufferlist
*odata
) const override
{
672 return process_pg_map_command(prefix
, cmdmap
, pgmap
, osdmap
, f
, ss
, odata
);