// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

/*
 * Placement Group Map. Placement Groups are logical sets of objects
 * that are replicated by the same set of devices. pgid=(r,hash(o)&m)
 * where & is a bit-wise AND and m=2^k-1
 */
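
/*
 * Worked example (illustrative, not part of the original header): for a pool
 * with pg_num = 8 we have k = 3 and m = 0b111, so an object in pool r = 1
 * whose name hashes to 0x9c3a maps to pg 1.(0x9c3a & 0b111) = 1.2.
 */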

#ifndef CEPH_PGMAP_H
#define CEPH_PGMAP_H

#include "common/debug.h"
#include "common/TextTable.h"
#include "osd/osd_types.h"
#include <sstream>

// FIXME: don't like including this here to get OSDMap::Incremental, maybe
// PGMapUpdater needs its own header.
#include "osd/OSDMap.h"

namespace ceph { class Formatter; }

class PGMap {
public:
  // the map
  version_t version;
  epoch_t last_osdmap_epoch;   // last osdmap epoch applied to the pgmap
  epoch_t last_pg_scan;        // osdmap epoch
  ceph::unordered_map<pg_t,pg_stat_t> pg_stat;
  ceph::unordered_map<int32_t,osd_stat_t> osd_stat;
  set<int32_t> full_osds;
  set<int32_t> nearfull_osds;
  float full_ratio;
  float nearfull_ratio;

  // mapping of osd to most recently reported osdmap epoch
  ceph::unordered_map<int32_t,epoch_t> osd_epochs;

  class Incremental {
  public:
    version_t version;
    map<pg_t,pg_stat_t> pg_stat_updates;
    epoch_t osdmap_epoch;
    epoch_t pg_scan;  // osdmap epoch
    set<pg_t> pg_remove;
    float full_ratio;
    float nearfull_ratio;
    utime_t stamp;

  private:
    map<int32_t,osd_stat_t> osd_stat_updates;
    set<int32_t> osd_stat_rm;

    // mapping of osd to most recently reported osdmap epoch
    map<int32_t,epoch_t> osd_epochs;
  public:

    const map<int32_t, osd_stat_t> &get_osd_stat_updates() const {
      return osd_stat_updates;
    }
    const set<int32_t> &get_osd_stat_rm() const {
      return osd_stat_rm;
    }
    const map<int32_t, epoch_t> &get_osd_epochs() const {
      return osd_epochs;
    }

    void update_stat(int32_t osd, epoch_t epoch, const osd_stat_t &stat) {
      osd_stat_updates[osd] = stat;
      osd_epochs[osd] = epoch;
      assert(osd_epochs.size() == osd_stat_updates.size());
    }
    void stat_osd_out(int32_t osd) {
      // zero the stats for this osd
      osd_stat_updates[osd] = osd_stat_t();
    }
    void stat_osd_down_up(int32_t osd, PGMap& pg_map) {
      // zero the op_queue_age_hist for this osd
      map<int32_t,osd_stat_t>::iterator p = osd_stat_updates.find(osd);
      if (p != osd_stat_updates.end()) {
        p->second.op_queue_age_hist.clear();
        return;
      }
      ceph::unordered_map<int32_t,osd_stat_t>::iterator q =
        pg_map.osd_stat.find(osd);
      if (q != pg_map.osd_stat.end()) {
        osd_stat_t& t = osd_stat_updates[osd] = q->second;
        t.op_queue_age_hist.clear();
      }
    }
    void rm_stat(int32_t osd) {
      osd_stat_rm.insert(osd);
      osd_epochs.erase(osd);
      osd_stat_updates.erase(osd);
    }
    void encode(bufferlist &bl, uint64_t features=-1) const;
    void decode(bufferlist::iterator &bl);
    void dump(Formatter *f) const;
    static void generate_test_instances(list<Incremental*>& o);

    Incremental() : version(0), osdmap_epoch(0), pg_scan(0),
                    full_ratio(0), nearfull_ratio(0) {}
  };
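
  /*
   * Usage sketch (illustrative only, not part of the original header): a
   * caller typically fills in an Incremental and hands it to
   * apply_incremental(), declared further below, e.g.
   *
   *   PGMap::Incremental inc;
   *   inc.version = pg_map.version + 1;
   *   inc.update_stat(osd_id, reported_osdmap_epoch, new_osd_stat);
   *   pg_map.apply_incremental(cct, inc);
   *
   * where osd_id, reported_osdmap_epoch, new_osd_stat and cct are
   * caller-supplied values.
   */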

  // aggregate stats (soft state), generated by calc_stats()
  ceph::unordered_map<int,int> num_pg_by_state;
  int64_t num_pg = 0, num_osd = 0;
  int64_t num_pg_active = 0;
  ceph::unordered_map<int,pool_stat_t> pg_pool_sum;
  pool_stat_t pg_sum;
  osd_stat_t osd_sum;
  mutable epoch_t min_last_epoch_clean = 0;
  ceph::unordered_map<int,int> blocked_by_sum;
  ceph::unordered_map<int,set<pg_t> > pg_by_osd;
  ceph::unordered_map<int,int> num_primary_pg_by_osd;

  utime_t stamp;

  // recent deltas, and summation
  /**
   * keep track of the last deltas for each pool, calculated using
   * @p pg_pool_sum as baseline.
   */
  ceph::unordered_map<uint64_t, list< pair<pool_stat_t, utime_t> > > per_pool_sum_deltas;
  /**
   * keep track of per-pool timestamp deltas, according to the last update on
   * each pool.
   */
  ceph::unordered_map<uint64_t, utime_t> per_pool_sum_deltas_stamps;
  /**
   * keep track of sum deltas, per-pool, taking into account any previous
   * deltas existing in @p per_pool_sum_deltas. The utime_t second member
   * of the pair is the timestamp referring to the last update (i.e., to the
   * first member of the pair) for a given pool.
   */
  ceph::unordered_map<uint64_t, pair<pool_stat_t,utime_t> > per_pool_sum_delta;

  list< pair<pool_stat_t, utime_t> > pg_sum_deltas;
  pool_stat_t pg_sum_delta;
  utime_t stamp_delta;
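
  /*
   * Note (illustrative, not from the original header): the rate summaries
   * declared further below are effectively "delta divided by elapsed time";
   * the overall variants draw on pg_sum_delta and stamp_delta, while the
   * per-pool variants look up the pool's entry in per_pool_sum_delta.
   */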

  void update_global_delta(CephContext *cct,
                           const utime_t ts, const pool_stat_t& pg_sum_old);
  void update_pool_deltas(CephContext *cct,
                          const utime_t ts,
                          const ceph::unordered_map<uint64_t, pool_stat_t>& pg_pool_sum_old);
  void clear_delta();

  void deleted_pool(int64_t pool) {
    pg_pool_sum.erase(pool);
    per_pool_sum_deltas.erase(pool);
    per_pool_sum_deltas_stamps.erase(pool);
    per_pool_sum_delta.erase(pool);
  }

private:
  void update_delta(CephContext *cct,
                    const utime_t ts,
                    const pool_stat_t& old_pool_sum,
                    utime_t *last_ts,
                    const pool_stat_t& current_pool_sum,
                    pool_stat_t *result_pool_delta,
                    utime_t *result_ts_delta,
                    list<pair<pool_stat_t,utime_t> > *delta_avg_list);

  void update_one_pool_delta(CephContext *cct,
                             const utime_t ts,
                             const uint64_t pool,
                             const pool_stat_t& old_pool_sum);

  epoch_t calc_min_last_epoch_clean() const;

  int64_t get_rule_avail(const OSDMap& osdmap, int ruleno) const;

public:

  set<pg_t> creating_pgs;
  map<int,map<epoch_t,set<pg_t> > > creating_pgs_by_osd_epoch;

  // Bits that used to be enum StuckPG
  static const int STUCK_INACTIVE = (1<<0);
  static const int STUCK_UNCLEAN = (1<<1);
  static const int STUCK_UNDERSIZED = (1<<2);
  static const int STUCK_DEGRADED = (1<<3);
  static const int STUCK_STALE = (1<<4);

  PGMap()
    : version(0),
      last_osdmap_epoch(0), last_pg_scan(0),
      full_ratio(0), nearfull_ratio(0)
  {}

  void set_full_ratios(float full, float nearfull) {
    if (full_ratio == full && nearfull_ratio == nearfull)
      return;
    full_ratio = full;
    nearfull_ratio = nearfull;
    redo_full_sets();
  }

  version_t get_version() const {
    return version;
  }
  void set_version(version_t v) {
    version = v;
  }
  epoch_t get_last_osdmap_epoch() const {
    return last_osdmap_epoch;
  }
  void set_last_osdmap_epoch(epoch_t e) {
    last_osdmap_epoch = e;
  }
  epoch_t get_last_pg_scan() const {
    return last_pg_scan;
  }
  void set_last_pg_scan(epoch_t e) {
    last_pg_scan = e;
  }
  utime_t get_stamp() const {
    return stamp;
  }
  void set_stamp(utime_t s) {
    stamp = s;
  }

  size_t get_num_pg_by_osd(int osd) const {
    ceph::unordered_map<int,set<pg_t> >::const_iterator p = pg_by_osd.find(osd);
    if (p == pg_by_osd.end())
      return 0;
    else
      return p->second.size();
  }

  pool_stat_t get_pg_pool_sum_stat(int64_t pool) const {
    ceph::unordered_map<int,pool_stat_t>::const_iterator p =
      pg_pool_sum.find(pool);
    if (p != pg_pool_sum.end())
      return p->second;
    return pool_stat_t();
  }

  int get_num_primary_pg_by_osd(int osd) const {
    assert(osd >= 0);
    int num = 0;
    auto it = num_primary_pg_by_osd.find(osd);
    if (it != num_primary_pg_by_osd.end())
      num = it->second;
    return num;
  }

  void update_pg(pg_t pgid, bufferlist& bl);
  void remove_pg(pg_t pgid);
  void update_osd(int osd, bufferlist& bl);
  void remove_osd(int osd);

  void apply_incremental(CephContext *cct, const Incremental& inc);
  void redo_full_sets();
  void register_nearfull_status(int osd, const osd_stat_t& s);
  void calc_stats();
  void stat_pg_add(const pg_t &pgid, const pg_stat_t &s,
                   bool sameosds=false);
  void stat_pg_sub(const pg_t &pgid, const pg_stat_t &s,
                   bool sameosds=false);
  void stat_pg_update(const pg_t pgid, pg_stat_t &prev, bufferlist::iterator& blp);
  void stat_osd_add(const osd_stat_t &s);
  void stat_osd_sub(const osd_stat_t &s);

  void encode(bufferlist &bl, uint64_t features=-1) const;
  void decode(bufferlist::iterator &bl);

  void dirty_all(Incremental& inc);

  void dump(Formatter *f) const;
  void dump_pool_stats(const OSDMap &osd_map, stringstream *ss, Formatter *f,
                       bool verbose) const;
  void dump_fs_stats(stringstream *ss, Formatter *f, bool verbose) const;
  static void dump_object_stat_sum(TextTable &tbl, Formatter *f,
                                   const object_stat_sum_t &sum,
                                   uint64_t avail,
                                   float raw_used_rate,
                                   bool verbose, const pg_pool_t *pool);
  void dump_basic(Formatter *f) const;
  void dump_pg_stats(Formatter *f, bool brief) const;
  void dump_pool_stats(Formatter *f) const;
  void dump_osd_stats(Formatter *f) const;
  void dump_delta(Formatter *f) const;
  void dump_filtered_pg_stats(Formatter *f, set<pg_t>& pgs) const;

  void dump_pg_stats_plain(ostream& ss,
                           const ceph::unordered_map<pg_t, pg_stat_t>& pg_stats,
                           bool brief) const;
  void get_stuck_stats(int types, const utime_t cutoff,
                       ceph::unordered_map<pg_t, pg_stat_t>& stuck_pgs) const;
  bool get_stuck_counts(const utime_t cutoff, map<string, int>& note) const;
  void dump_stuck(Formatter *f, int types, utime_t cutoff) const;
  void dump_stuck_plain(ostream& ss, int types, utime_t cutoff) const;
  int dump_stuck_pg_stats(stringstream &ds,
                          Formatter *f,
                          int threshold,
                          vector<string>& args) const;
  void dump(ostream& ss) const;
  void dump_basic(ostream& ss) const;
  void dump_pg_stats(ostream& ss, bool brief) const;
  void dump_pg_sum_stats(ostream& ss, bool header) const;
  void dump_pool_stats(ostream& ss, bool header) const;
  void dump_osd_stats(ostream& ss) const;
  void dump_osd_sum_stats(ostream& ss) const;
  void dump_filtered_pg_stats(ostream& ss, set<pg_t>& pgs) const;

  void dump_osd_perf_stats(Formatter *f) const;
  void print_osd_perf_stats(std::ostream *ss) const;

  void dump_osd_blocked_by_stats(Formatter *f) const;
  void print_osd_blocked_by_stats(std::ostream *ss) const;

  void get_filtered_pg_stats(uint32_t state, int64_t poolid, int64_t osdid,
                             bool primary, set<pg_t>& pgs) const;
  void recovery_summary(Formatter *f, list<string> *psl,
                        const pool_stat_t& delta_sum) const;
  void overall_recovery_summary(Formatter *f, list<string> *psl) const;
  void pool_recovery_summary(Formatter *f, list<string> *psl,
                             uint64_t poolid) const;
  void recovery_rate_summary(Formatter *f, ostream *out,
                             const pool_stat_t& delta_sum,
                             utime_t delta_stamp) const;
  void overall_recovery_rate_summary(Formatter *f, ostream *out) const;
  void pool_recovery_rate_summary(Formatter *f, ostream *out,
                                  uint64_t poolid) const;
  /**
   * Obtain a formatted/plain output for client I/O, sourced from the stats
   * in @p delta_sum for a given pool over a given @p delta_stamp period of
   * time.
   */
  void client_io_rate_summary(Formatter *f, ostream *out,
                              const pool_stat_t& delta_sum,
                              utime_t delta_stamp) const;
  /**
   * Obtain a formatted/plain output for the overall client I/O, which is
   * calculated from @p pg_sum_delta and @p stamp_delta.
   */
  void overall_client_io_rate_summary(Formatter *f, ostream *out) const;
  /**
   * Obtain a formatted/plain output for client I/O over a given pool
   * with id @p poolid, obtaining the pool-specific data
   * from @p per_pool_sum_delta.
   */
  void pool_client_io_rate_summary(Formatter *f, ostream *out,
                                   uint64_t poolid) const;
  /**
   * Obtain a formatted/plain output for cache tier I/O, sourced from the
   * stats in @p delta_sum for a given pool over a given @p delta_stamp
   * period of time.
   */
  void cache_io_rate_summary(Formatter *f, ostream *out,
                             const pool_stat_t& delta_sum,
                             utime_t delta_stamp) const;
  /**
   * Obtain a formatted/plain output for the overall cache tier I/O, which is
   * calculated from @p pg_sum_delta and @p stamp_delta.
   */
  void overall_cache_io_rate_summary(Formatter *f, ostream *out) const;
  /**
   * Obtain a formatted/plain output for cache tier I/O over a given pool
   * with id @p poolid, obtaining the pool-specific data
   * from @p per_pool_sum_delta.
   */
  void pool_cache_io_rate_summary(Formatter *f, ostream *out,
                                  uint64_t poolid) const;
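
  /*
   * Usage sketch (illustrative, not part of the original header): these
   * summaries emit structured output via the Formatter or plain text via the
   * ostream, mirroring operator<< below, which passes a NULL Formatter, e.g.
   *
   *   std::ostringstream out;
   *   pg_map.overall_client_io_rate_summary(nullptr, &out);   // plain text
   *   pg_map.pool_client_io_rate_summary(f, nullptr, poolid); // via Formatter
   *
   * where f and poolid are caller-supplied.
   */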

  void print_summary(Formatter *f, ostream *out) const;
  void print_oneline_summary(Formatter *f, ostream *out) const;

  epoch_t get_min_last_epoch_clean() const {
    if (!min_last_epoch_clean)
      min_last_epoch_clean = calc_min_last_epoch_clean();
    return min_last_epoch_clean;
  }

  static void generate_test_instances(list<PGMap*>& o);
};
WRITE_CLASS_ENCODER_FEATURES(PGMap::Incremental)
WRITE_CLASS_ENCODER_FEATURES(PGMap)

inline ostream& operator<<(ostream& out, const PGMap& m) {
  m.print_oneline_summary(NULL, &out);
  return out;
}

int process_pg_map_command(
  const string& prefix,
  const map<string,cmd_vartype>& cmdmap,
  const PGMap& pg_map,
  const OSDMap& osdmap,
  Formatter *f,
  stringstream *ss,
  bufferlist *odata);

class PGMapUpdater
{
public:
  static void check_osd_map(
    const OSDMap::Incremental &osd_inc,
    std::set<int> *need_check_down_pg_osds,
    std::map<int,utime_t> *last_osd_report,
    PGMap *pg_map,
    PGMap::Incremental *pending_inc);

  /**
   * check latest osdmap for new pgs to register
   */
  static void register_new_pgs(
    const OSDMap &osd_map,
    const PGMap &pg_map,
    PGMap::Incremental *pending_inc);

  /**
   * recalculate creating pg mappings
   */
  static void update_creating_pgs(
    const OSDMap &osd_map,
    const PGMap &pg_map,
    PGMap::Incremental *pending_inc);

  static void register_pg(
    const OSDMap &osd_map,
    pg_t pgid, epoch_t epoch,
    bool new_pool,
    const PGMap &pg_map,
    PGMap::Incremental *pending_inc);

  // mark pg's state stale if its acting primary osd is down
  static void check_down_pgs(
    const OSDMap &osd_map,
    const PGMap &pg_map,
    bool check_all,
    const set<int>& need_check_down_pg_osds,
    PGMap::Incremental *pending_inc);
};

namespace reweight {
/* Assign a lower weight to overloaded OSDs.
 *
 * The osds that will get a lower weight are those whose utilization
 * is more than 'oload' percent of the average utilization.
 */
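/* Illustrative example (a sketch of the intent, not a precise statement of
 * the algorithm): with oload = 120 and an average utilization of 50%, only
 * OSDs above 60% utilization are candidates for reweighting; at most
 * max_osds weights are changed, no single weight moves by more than
 * max_changef, and no_increasing forbids raising any weight.
 */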
int by_utilization(const OSDMap &osd_map,
                   const PGMap &pg_map,
                   int oload,
                   double max_changef,
                   int max_osds,
                   bool by_pg, const set<int64_t> *pools,
                   bool no_increasing,
                   mempool::osdmap::map<int32_t, uint32_t>* new_weights,
                   std::stringstream *ss,
                   std::string *out_str,
                   Formatter *f);
}

#endif