1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

/*
 * Placement Group Map. Placement Groups are logical sets of objects
 * that are replicated by the same set of devices. pgid=(r,hash(o)&m)
 * where & is a bit-wise AND and m=2^k-1
 */
24 #include "common/debug.h"
25 #include "common/TextTable.h"
26 #include "osd/osd_types.h"
29 // FIXME: don't like including this here to get OSDMap::Incremental, maybe
30 // PGMapUpdater needs its own header.
31 #include "osd/OSDMap.h"
// Forward declaration: this header only passes Formatter around by
// pointer, so the full definition is not needed here.
namespace ceph { class Formatter; }
epoch_t last_osdmap_epoch;   // last osdmap epoch i applied to the pgmap
epoch_t last_pg_scan;  // osdmap epoch
// last-reported stats, keyed by pg and by osd
ceph::unordered_map<pg_t,pg_stat_t> pg_stat;
ceph::unordered_map<int32_t,osd_stat_t> osd_stat;
// presumably the OSDs currently at/over the full (resp. nearfull)
// ratio, maintained by redo_full_sets()/register_nearfull_status()
// declared below — TODO confirm
set<int32_t> full_osds;
set<int32_t> nearfull_osds;
// NOTE(review): some fields appear lost in extraction around here
// (e.g. version / full_ratio / nearfull_ratio, which the constructor
// fragment below initializes) — restore from upstream before building.

// mapping of osd to most recently reported osdmap epoch
ceph::unordered_map<int32_t,epoch_t> osd_epochs;
// Pending per-PG stat updates to fold into the PGMap.
map<pg_t,pg_stat_t> pg_stat_updates;
epoch_t pg_scan;  // osdmap epoch

// Pending per-OSD stat updates, and OSDs whose stats are to be removed.
map<int32_t,osd_stat_t> osd_stat_updates;
set<int32_t> osd_stat_rm;

// mapping of osd to most recently reported osdmap epoch
// (kept 1:1 with osd_stat_updates — see the assert in update_stat()).
map<int32_t,epoch_t> osd_epochs;
70 const map
<int32_t, osd_stat_t
> &get_osd_stat_updates() const {
71 return osd_stat_updates
;
73 const set
<int32_t> &get_osd_stat_rm() const {
76 const map
<int32_t, epoch_t
> &get_osd_epochs() const {
80 void update_stat(int32_t osd
, epoch_t epoch
, const osd_stat_t
&stat
) {
81 osd_stat_updates
[osd
] = stat
;
82 osd_epochs
[osd
] = epoch
;
83 assert(osd_epochs
.size() == osd_stat_updates
.size());
85 void stat_osd_out(int32_t osd
) {
86 // 0 the stats for the osd
87 osd_stat_updates
[osd
] = osd_stat_t();
89 void stat_osd_down_up(int32_t osd
, PGMap
& pg_map
) {
90 // 0 the op_queue_age_hist for this osd
91 map
<int32_t,osd_stat_t
>::iterator p
= osd_stat_updates
.find(osd
);
92 if (p
!= osd_stat_updates
.end()) {
93 p
->second
.op_queue_age_hist
.clear();
96 ceph::unordered_map
<int32_t,osd_stat_t
>::iterator q
=
97 pg_map
.osd_stat
.find(osd
);
98 if (q
!= pg_map
.osd_stat
.end()) {
99 osd_stat_t
& t
= osd_stat_updates
[osd
] = q
->second
;
100 t
.op_queue_age_hist
.clear();
103 void rm_stat(int32_t osd
) {
104 osd_stat_rm
.insert(osd
);
105 osd_epochs
.erase(osd
);
106 osd_stat_updates
.erase(osd
);
// Serialization and introspection (defined out of line).
// features=-1 on a uint64_t means "all feature bits set".
void encode(bufferlist &bl, uint64_t features=-1) const;
void decode(bufferlist::iterator &bl);
void dump(Formatter *f) const;
static void generate_test_instances(list<Incremental*>& o);

// Default: empty incremental against epoch 0.
Incremental() : version(0), osdmap_epoch(0), pg_scan(0),
                full_ratio(0), nearfull_ratio(0) {}
// aggregate stats (soft state), generated by calc_stats()
ceph::unordered_map<int,int> num_pg_by_state;
int64_t num_pg = 0, num_osd = 0;
int64_t num_pg_active = 0;
// per-pool aggregate of pg stats
ceph::unordered_map<int,pool_stat_t> pg_pool_sum;
// lazily-computed cache; mutable so the const getter below can fill it
// (see get_min_last_epoch_clean()); 0 means "not yet computed"
mutable epoch_t min_last_epoch_clean = 0;
ceph::unordered_map<int,int> blocked_by_sum;
// which pgs each osd participates in, and for how many it is primary
ceph::unordered_map<int,set<pg_t> > pg_by_osd;
ceph::unordered_map<int,int> num_primary_pg_by_osd;
// recent deltas, and summation
/**
 * keep track of last deltas for each pool, calculated using
 * @p pg_pool_sum as baseline.
 */
ceph::unordered_map<uint64_t, list< pair<pool_stat_t, utime_t> > > per_pool_sum_deltas;
/**
 * keep track of per-pool timestamp deltas, according to last update on
 * each pool.
 */
ceph::unordered_map<uint64_t, utime_t> per_pool_sum_deltas_stamps;
/**
 * keep track of sum deltas, per-pool, taking into account any previous
 * deltas existing in @p per_pool_sum_deltas.  The utime_t as second member
 * of the pair is the timestamp referring to the last update (i.e., the first
 * member of the pair) for a given pool.
 */
ceph::unordered_map<uint64_t, pair<pool_stat_t,utime_t> > per_pool_sum_delta;

// cluster-wide equivalents of the per-pool structures above
list< pair<pool_stat_t, utime_t> > pg_sum_deltas;
pool_stat_t pg_sum_delta;
// NOTE(review): a companion 'stamp_delta' field is referenced by the
// doc comments further below but is not visible here — it appears to
// have been lost in extraction; restore from upstream.
// Fold the difference against a previous cluster-wide sum snapshot
// into pg_sum_delta / pg_sum_deltas.
void update_global_delta(CephContext *cct,
                         const utime_t ts, const pool_stat_t& pg_sum_old);
// Per-pool variant, against a previous per-pool sum snapshot.
// NOTE(review): a parameter (likely the timestamp, mirroring
// update_global_delta above) appears lost in extraction here.
void update_pool_deltas(CephContext *cct,
                        const ceph::unordered_map<uint64_t, pool_stat_t>& pg_pool_sum_old);
162 void deleted_pool(int64_t pool
) {
163 pg_pool_sum
.erase(pool
);
164 per_pool_sum_deltas
.erase(pool
);
165 per_pool_sum_deltas_stamps
.erase(pool
);
166 per_pool_sum_delta
.erase(pool
);
/**
 * Compute the delta between @p old_pool_sum and @p current_pool_sum and
 * store it in the given result slots / rolling average list.
 * NOTE(review): some parameters (e.g. the timestamps) appear lost in
 * extraction — restore from upstream before relying on this signature.
 */
void update_delta(CephContext *cct,
                  const pool_stat_t& old_pool_sum,
                  const pool_stat_t& current_pool_sum,
                  pool_stat_t *result_pool_delta,
                  utime_t *result_ts_delta,
                  list<pair<pool_stat_t,utime_t> > *delta_avg_list);

// Per-pool variant of update_delta().
void update_one_pool_delta(CephContext *cct,
                           const pool_stat_t& old_pool_sum);

// Recompute the value cached in min_last_epoch_clean
// (see get_min_last_epoch_clean() below).
epoch_t calc_min_last_epoch_clean() const;

// presumably: available bytes under the given CRUSH rule — the
// implementation lives out of line; verify semantics there.
int64_t get_rule_avail(const OSDMap& osdmap, int ruleno) const;
// PGs still being created, and the same set indexed by the osd/epoch
// they were registered against.
set<pg_t> creating_pgs;
map<int,map<epoch_t,set<pg_t> > > creating_pgs_by_osd_epoch;

// Bits that used to be enum StuckPG
static const int STUCK_INACTIVE = (1<<0);
static const int STUCK_UNCLEAN = (1<<1);
static const int STUCK_UNDERSIZED = (1<<2);
static const int STUCK_DEGRADED = (1<<3);
static const int STUCK_STALE = (1<<4);
202 last_osdmap_epoch(0), last_pg_scan(0),
203 full_ratio(0), nearfull_ratio(0)
206 void set_full_ratios(float full
, float nearfull
) {
207 if (full_ratio
== full
&& nearfull_ratio
== nearfull
)
210 nearfull_ratio
= nearfull
;
214 version_t
get_version() const {
217 void set_version(version_t v
) {
220 epoch_t
get_last_osdmap_epoch() const {
221 return last_osdmap_epoch
;
223 void set_last_osdmap_epoch(epoch_t e
) {
224 last_osdmap_epoch
= e
;
226 epoch_t
get_last_pg_scan() const {
229 void set_last_pg_scan(epoch_t e
) {
232 utime_t
get_stamp() const {
235 void set_stamp(utime_t s
) {
239 size_t get_num_pg_by_osd(int osd
) const {
240 ceph::unordered_map
<int,set
<pg_t
> >::const_iterator p
= pg_by_osd
.find(osd
);
241 if (p
== pg_by_osd
.end())
244 return p
->second
.size();
247 pool_stat_t
get_pg_pool_sum_stat(int64_t pool
) const {
248 ceph::unordered_map
<int,pool_stat_t
>::const_iterator p
=
249 pg_pool_sum
.find(pool
);
250 if (p
!= pg_pool_sum
.end())
252 return pool_stat_t();
255 int get_num_primary_pg_by_osd(int osd
) const {
258 auto it
= num_primary_pg_by_osd
.find(osd
);
259 if (it
!= num_primary_pg_by_osd
.end())
// Apply an encoded stat update for a single pg / osd, or remove one.
void update_pg(pg_t pgid, bufferlist& bl);
void remove_pg(pg_t pgid);
void update_osd(int osd, bufferlist& bl);
void remove_osd(int osd);

// Fold a whole Incremental into this map.
void apply_incremental(CephContext *cct, const Incremental& inc);
// Rebuild the cached full_osds/nearfull_osds sets from scratch.
void redo_full_sets();
void register_nearfull_status(int osd, const osd_stat_t& s);
// Add/subtract one pg's stats to/from the aggregates; sameosds skips
// the per-osd bookkeeping when the pg's osd set did not change.
void stat_pg_add(const pg_t &pgid, const pg_stat_t &s,
                 bool sameosds=false);
void stat_pg_sub(const pg_t &pgid, const pg_stat_t &s,
                 bool sameosds=false);
void stat_pg_update(const pg_t pgid, pg_stat_t &prev, bufferlist::iterator& blp);
void stat_osd_add(const osd_stat_t &s);
void stat_osd_sub(const osd_stat_t &s);

// Serialization (features=-1 on a uint64_t means all feature bits set).
void encode(bufferlist &bl, uint64_t features=-1) const;
void decode(bufferlist::iterator &bl);

// Populate @p inc so that applying it regenerates this entire map.
void dirty_all(Incremental& inc);

void dump(Formatter *f) const;
// Formatter/stream dump helpers.
// NOTE(review): trailing parameter(s) and the closing ');' of this
// declaration were lost in extraction — restore from upstream.
void dump_pool_stats(const OSDMap &osd_map, stringstream *ss, Formatter *f,
void dump_fs_stats(stringstream *ss, Formatter *f, bool verbose) const;
// NOTE(review): middle parameters appear lost in extraction here.
static void dump_object_stat_sum(TextTable &tbl, Formatter *f,
                                 const object_stat_sum_t &sum,
                                 bool verbose, const pg_pool_t *pool);
void dump_basic(Formatter *f) const;
void dump_pg_stats(Formatter *f, bool brief) const;
void dump_pool_stats(Formatter *f) const;
void dump_osd_stats(Formatter *f) const;
void dump_delta(Formatter *f) const;
void dump_filtered_pg_stats(Formatter *f, set<pg_t>& pgs) const;
// NOTE(review): trailing parameter(s) and ');' lost in extraction.
void dump_pg_stats_plain(ostream& ss,
                         const ceph::unordered_map<pg_t, pg_stat_t>& pg_stats,

// "Stuck" pg reporting: pgs stuck in one of the STUCK_* states since
// before @p cutoff.
void get_stuck_stats(int types, const utime_t cutoff,
                     ceph::unordered_map<pg_t, pg_stat_t>& stuck_pgs) const;
bool get_stuck_counts(const utime_t cutoff, map<string, int>& note) const;
void dump_stuck(Formatter *f, int types, utime_t cutoff) const;
void dump_stuck_plain(ostream& ss, int types, utime_t cutoff) const;
// NOTE(review): middle parameters appear lost in extraction here.
int dump_stuck_pg_stats(stringstream &ds,
                        vector<string>& args) const;
// Plain-text (ostream) counterparts of the Formatter dumpers above.
void dump(ostream& ss) const;
void dump_basic(ostream& ss) const;
void dump_pg_stats(ostream& ss, bool brief) const;
void dump_pg_sum_stats(ostream& ss, bool header) const;
void dump_pool_stats(ostream& ss, bool header) const;
void dump_osd_stats(ostream& ss) const;
void dump_osd_sum_stats(ostream& ss) const;
void dump_filtered_pg_stats(ostream& ss, set<pg_t>& pgs) const;

// Per-osd perf (commit/apply latency) reporting.
void dump_osd_perf_stats(Formatter *f) const;
void print_osd_perf_stats(std::ostream *ss) const;

// Per-osd blocked-by reporting (see blocked_by_sum above).
void dump_osd_blocked_by_stats(Formatter *f) const;
void print_osd_blocked_by_stats(std::ostream *ss) const;
// Select pgs matching a state mask and/or pool/osd filter
// (primary=true restricts to pgs where @p osdid is primary).
void get_filtered_pg_stats(uint32_t state, int64_t poolid, int64_t osdid,
                           bool primary, set<pg_t>& pgs) const;

// Recovery progress summaries, overall and per pool; the *_rate_*
// variants report rates computed from the given delta/stamp.
void recovery_summary(Formatter *f, list<string> *psl,
                      const pool_stat_t& delta_sum) const;
void overall_recovery_summary(Formatter *f, list<string> *psl) const;
void pool_recovery_summary(Formatter *f, list<string> *psl,
                           uint64_t poolid) const;
void recovery_rate_summary(Formatter *f, ostream *out,
                           const pool_stat_t& delta_sum,
                           utime_t delta_stamp) const;
void overall_recovery_rate_summary(Formatter *f, ostream *out) const;
void pool_recovery_rate_summary(Formatter *f, ostream *out,
                                uint64_t poolid) const;
/**
 * Obtain a formatted/plain output for client I/O, source from stats for a
 * given @p delta_sum pool over a given @p delta_stamp period of time.
 */
void client_io_rate_summary(Formatter *f, ostream *out,
                            const pool_stat_t& delta_sum,
                            utime_t delta_stamp) const;
/**
 * Obtain a formatted/plain output for the overall client I/O, which is
 * calculated resorting to @p pg_sum_delta and @p stamp_delta.
 */
void overall_client_io_rate_summary(Formatter *f, ostream *out) const;
/**
 * Obtain a formatted/plain output for client I/O over a given pool
 * with id @p pool_id. We will then obtain pool-specific data
 * from @p per_pool_sum_delta.
 */
void pool_client_io_rate_summary(Formatter *f, ostream *out,
                                 uint64_t poolid) const;
/**
 * Obtain a formatted/plain output for cache tier IO, source from stats for a
 * given @p delta_sum pool over a given @p delta_stamp period of time.
 */
void cache_io_rate_summary(Formatter *f, ostream *out,
                           const pool_stat_t& delta_sum,
                           utime_t delta_stamp) const;
/**
 * Obtain a formatted/plain output for the overall cache tier IO, which is
 * calculated resorting to @p pg_sum_delta and @p stamp_delta.
 */
void overall_cache_io_rate_summary(Formatter *f, ostream *out) const;
/**
 * Obtain a formatted/plain output for cache tier IO over a given pool
 * with id @p pool_id. We will then obtain pool-specific data
 * from @p per_pool_sum_delta.
 */
void pool_cache_io_rate_summary(Formatter *f, ostream *out,
                                uint64_t poolid) const;

// Health-style summaries combining the pieces above.
void print_summary(Formatter *f, ostream *out) const;
void print_oneline_summary(Formatter *f, ostream *out) const;
384 epoch_t
get_min_last_epoch_clean() const {
385 if (!min_last_epoch_clean
)
386 min_last_epoch_clean
= calc_min_last_epoch_clean();
387 return min_last_epoch_clean
;
// presumably: produce sample instances for encode/decode testing —
// mirrors Incremental::generate_test_instances above.
static void generate_test_instances(list<PGMap*>& o);

// NOTE(review): the class's closing '};' appears to have been lost in
// extraction just before these macros.
WRITE_CLASS_ENCODER_FEATURES(PGMap::Incremental)
WRITE_CLASS_ENCODER_FEATURES(PGMap)
395 inline ostream
& operator<<(ostream
& out
, const PGMap
& m
) {
396 m
.print_oneline_summary(NULL
, &out
);
// Dispatch a mon command that operates on the pg map.
// NOTE(review): the remaining parameters and the closing ');' of this
// declaration were lost in extraction — restore from upstream.
int process_pg_map_command(
  const string& prefix,
  const map<string,cmd_vartype>& cmdmap,
  const OSDMap& osdmap,
// React to an osdmap incremental: collect osds that need a down-pg
// check and note stat/epoch changes in @p pending_inc.
// NOTE(review): one parameter line appears lost in extraction here.
static void check_osd_map(
  const OSDMap::Incremental &osd_inc,
  std::set<int> *need_check_down_pg_osds,
  std::map<int,utime_t> *last_osd_report,
  PGMap::Incremental *pending_inc);

/**
 * check latest osdmap for new pgs to register
 * NOTE(review): a parameter line appears lost in extraction.
 */
static void register_new_pgs(
  const OSDMap &osd_map,
  PGMap::Incremental *pending_inc);

/**
 * recalculate creating pg mappings
 * NOTE(review): a parameter line appears lost in extraction.
 */
static void update_creating_pgs(
  const OSDMap &osd_map,
  PGMap::Incremental *pending_inc);

// Register a single pg as creating at the given epoch.
// NOTE(review): parameter lines appear lost in extraction.
static void register_pg(
  const OSDMap &osd_map,
  pg_t pgid, epoch_t epoch,
  PGMap::Incremental *pending_inc);

// mark pg's state stale if its acting primary osd is down
// NOTE(review): parameter lines appear lost in extraction.
static void check_down_pgs(
  const OSDMap &osd_map,
  const set<int>& need_check_down_pg_osds,
  PGMap::Incremental *pending_inc);
/* Assign a lower weight to overloaded OSDs.
 *
 * The osds that will get a lower weight are those with a utilization
 * percentage 'oload' percent greater than the average utilization.
 */
457 int by_utilization(const OSDMap
&osd_map
,
462 bool by_pg
, const set
<int64_t> *pools
,
464 mempool::osdmap::map
<int32_t, uint32_t>* new_weights
,
465 std::stringstream
*ss
,
466 std::string
*out_str
,