// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

/*
 * Placement Group Map. Placement Groups are logical sets of objects
 * that are replicated by the same set of devices. pgid=(r,hash(o)&m)
 * where & is a bit-wise AND and m=2^k-1
 */
20 | ||
#ifndef CEPH_PGMAP_H
#define CEPH_PGMAP_H

#include <sstream>

#include "common/debug.h"
#include "common/TextTable.h"
#include "osd/osd_types.h"

// FIXME: don't like including this here to get OSDMap::Incremental, maybe
// PGMapUpdater needs its own header.
#include "osd/OSDMap.h"
33 | namespace ceph { class Formatter; } | |
34 | ||
35 | class PGMap { | |
36 | public: | |
37 | // the map | |
38 | version_t version; | |
39 | epoch_t last_osdmap_epoch; // last osdmap epoch i applied to the pgmap | |
40 | epoch_t last_pg_scan; // osdmap epoch | |
41 | ceph::unordered_map<pg_t,pg_stat_t> pg_stat; | |
42 | ceph::unordered_map<int32_t,osd_stat_t> osd_stat; | |
43 | set<int32_t> full_osds; | |
44 | set<int32_t> nearfull_osds; | |
45 | float full_ratio; | |
46 | float nearfull_ratio; | |
47 | ||
48 | // mapping of osd to most recently reported osdmap epoch | |
49 | ceph::unordered_map<int32_t,epoch_t> osd_epochs; | |
50 | ||
51 | class Incremental { | |
52 | public: | |
53 | version_t version; | |
54 | map<pg_t,pg_stat_t> pg_stat_updates; | |
55 | epoch_t osdmap_epoch; | |
56 | epoch_t pg_scan; // osdmap epoch | |
57 | set<pg_t> pg_remove; | |
58 | float full_ratio; | |
59 | float nearfull_ratio; | |
60 | utime_t stamp; | |
61 | ||
62 | private: | |
63 | map<int32_t,osd_stat_t> osd_stat_updates; | |
64 | set<int32_t> osd_stat_rm; | |
65 | ||
66 | // mapping of osd to most recently reported osdmap epoch | |
67 | map<int32_t,epoch_t> osd_epochs; | |
68 | public: | |
69 | ||
70 | const map<int32_t, osd_stat_t> &get_osd_stat_updates() const { | |
71 | return osd_stat_updates; | |
72 | } | |
73 | const set<int32_t> &get_osd_stat_rm() const { | |
74 | return osd_stat_rm; | |
75 | } | |
76 | const map<int32_t, epoch_t> &get_osd_epochs() const { | |
77 | return osd_epochs; | |
78 | } | |
79 | ||
80 | void update_stat(int32_t osd, epoch_t epoch, const osd_stat_t &stat) { | |
81 | osd_stat_updates[osd] = stat; | |
82 | osd_epochs[osd] = epoch; | |
83 | assert(osd_epochs.size() == osd_stat_updates.size()); | |
84 | } | |
85 | void stat_osd_out(int32_t osd) { | |
86 | // 0 the stats for the osd | |
87 | osd_stat_updates[osd] = osd_stat_t(); | |
88 | } | |
89 | void stat_osd_down_up(int32_t osd, PGMap& pg_map) { | |
90 | // 0 the op_queue_age_hist for this osd | |
91 | map<int32_t,osd_stat_t>::iterator p = osd_stat_updates.find(osd); | |
92 | if (p != osd_stat_updates.end()) { | |
93 | p->second.op_queue_age_hist.clear(); | |
94 | return; | |
95 | } | |
96 | ceph::unordered_map<int32_t,osd_stat_t>::iterator q = | |
97 | pg_map.osd_stat.find(osd); | |
98 | if (q != pg_map.osd_stat.end()) { | |
99 | osd_stat_t& t = osd_stat_updates[osd] = q->second; | |
100 | t.op_queue_age_hist.clear(); | |
101 | } | |
102 | } | |
103 | void rm_stat(int32_t osd) { | |
104 | osd_stat_rm.insert(osd); | |
105 | osd_epochs.erase(osd); | |
106 | osd_stat_updates.erase(osd); | |
107 | } | |
108 | void encode(bufferlist &bl, uint64_t features=-1) const; | |
109 | void decode(bufferlist::iterator &bl); | |
110 | void dump(Formatter *f) const; | |
111 | static void generate_test_instances(list<Incremental*>& o); | |
112 | ||
113 | Incremental() : version(0), osdmap_epoch(0), pg_scan(0), | |
114 | full_ratio(0), nearfull_ratio(0) {} | |
115 | }; | |
116 | ||
117 | ||
118 | // aggregate stats (soft state), generated by calc_stats() | |
119 | ceph::unordered_map<int,int> num_pg_by_state; | |
120 | int64_t num_pg = 0, num_osd = 0; | |
121 | int64_t num_pg_active = 0; | |
122 | ceph::unordered_map<int,pool_stat_t> pg_pool_sum; | |
123 | pool_stat_t pg_sum; | |
124 | osd_stat_t osd_sum; | |
125 | mutable epoch_t min_last_epoch_clean = 0; | |
126 | ceph::unordered_map<int,int> blocked_by_sum; | |
127 | ceph::unordered_map<int,set<pg_t> > pg_by_osd; | |
128 | ceph::unordered_map<int,int> num_primary_pg_by_osd; | |
129 | ||
130 | utime_t stamp; | |
131 | ||
132 | // recent deltas, and summation | |
133 | /** | |
134 | * keep track of last deltas for each pool, calculated using | |
135 | * @p pg_pool_sum as baseline. | |
136 | */ | |
137 | ceph::unordered_map<uint64_t, list< pair<pool_stat_t, utime_t> > > per_pool_sum_deltas; | |
138 | /** | |
139 | * keep track of per-pool timestamp deltas, according to last update on | |
140 | * each pool. | |
141 | */ | |
142 | ceph::unordered_map<uint64_t, utime_t> per_pool_sum_deltas_stamps; | |
143 | /** | |
144 | * keep track of sum deltas, per-pool, taking into account any previous | |
145 | * deltas existing in @p per_pool_sum_deltas. The utime_t as second member | |
146 | * of the pair is the timestamp refering to the last update (i.e., the first | |
147 | * member of the pair) for a given pool. | |
148 | */ | |
149 | ceph::unordered_map<uint64_t, pair<pool_stat_t,utime_t> > per_pool_sum_delta; | |
150 | ||
151 | list< pair<pool_stat_t, utime_t> > pg_sum_deltas; | |
152 | pool_stat_t pg_sum_delta; | |
153 | utime_t stamp_delta; | |
154 | ||
155 | void update_global_delta(CephContext *cct, | |
156 | const utime_t ts, const pool_stat_t& pg_sum_old); | |
157 | void update_pool_deltas(CephContext *cct, | |
158 | const utime_t ts, | |
159 | const ceph::unordered_map<uint64_t, pool_stat_t>& pg_pool_sum_old); | |
160 | void clear_delta(); | |
161 | ||
162 | void deleted_pool(int64_t pool) { | |
163 | pg_pool_sum.erase(pool); | |
164 | per_pool_sum_deltas.erase(pool); | |
165 | per_pool_sum_deltas_stamps.erase(pool); | |
166 | per_pool_sum_delta.erase(pool); | |
167 | } | |
168 | ||
169 | private: | |
170 | void update_delta(CephContext *cct, | |
171 | const utime_t ts, | |
172 | const pool_stat_t& old_pool_sum, | |
173 | utime_t *last_ts, | |
174 | const pool_stat_t& current_pool_sum, | |
175 | pool_stat_t *result_pool_delta, | |
176 | utime_t *result_ts_delta, | |
177 | list<pair<pool_stat_t,utime_t> > *delta_avg_list); | |
178 | ||
179 | void update_one_pool_delta(CephContext *cct, | |
180 | const utime_t ts, | |
181 | const uint64_t pool, | |
182 | const pool_stat_t& old_pool_sum); | |
183 | ||
184 | epoch_t calc_min_last_epoch_clean() const; | |
185 | ||
186 | int64_t get_rule_avail(const OSDMap& osdmap, int ruleno) const; | |
187 | ||
188 | public: | |
189 | ||
190 | set<pg_t> creating_pgs; | |
191 | map<int,map<epoch_t,set<pg_t> > > creating_pgs_by_osd_epoch; | |
192 | ||
193 | // Bits that use to be enum StuckPG | |
194 | static const int STUCK_INACTIVE = (1<<0); | |
195 | static const int STUCK_UNCLEAN = (1<<1); | |
196 | static const int STUCK_UNDERSIZED = (1<<2); | |
197 | static const int STUCK_DEGRADED = (1<<3); | |
198 | static const int STUCK_STALE = (1<<4); | |
199 | ||
200 | PGMap() | |
201 | : version(0), | |
202 | last_osdmap_epoch(0), last_pg_scan(0), | |
203 | full_ratio(0), nearfull_ratio(0) | |
204 | {} | |
205 | ||
206 | void set_full_ratios(float full, float nearfull) { | |
207 | if (full_ratio == full && nearfull_ratio == nearfull) | |
208 | return; | |
209 | full_ratio = full; | |
210 | nearfull_ratio = nearfull; | |
211 | redo_full_sets(); | |
212 | } | |
213 | ||
214 | version_t get_version() const { | |
215 | return version; | |
216 | } | |
217 | void set_version(version_t v) { | |
218 | version = v; | |
219 | } | |
220 | epoch_t get_last_osdmap_epoch() const { | |
221 | return last_osdmap_epoch; | |
222 | } | |
223 | void set_last_osdmap_epoch(epoch_t e) { | |
224 | last_osdmap_epoch = e; | |
225 | } | |
226 | epoch_t get_last_pg_scan() const { | |
227 | return last_pg_scan; | |
228 | } | |
229 | void set_last_pg_scan(epoch_t e) { | |
230 | last_pg_scan = e; | |
231 | } | |
232 | utime_t get_stamp() const { | |
233 | return stamp; | |
234 | } | |
235 | void set_stamp(utime_t s) { | |
236 | stamp = s; | |
237 | } | |
238 | ||
239 | size_t get_num_pg_by_osd(int osd) const { | |
240 | ceph::unordered_map<int,set<pg_t> >::const_iterator p = pg_by_osd.find(osd); | |
241 | if (p == pg_by_osd.end()) | |
242 | return 0; | |
243 | else | |
244 | return p->second.size(); | |
245 | } | |
246 | ||
247 | pool_stat_t get_pg_pool_sum_stat(int64_t pool) const { | |
248 | ceph::unordered_map<int,pool_stat_t>::const_iterator p = | |
249 | pg_pool_sum.find(pool); | |
250 | if (p != pg_pool_sum.end()) | |
251 | return p->second; | |
252 | return pool_stat_t(); | |
253 | } | |
254 | ||
255 | int get_num_primary_pg_by_osd(int osd) const { | |
256 | assert(osd >= 0); | |
257 | int num = 0; | |
258 | auto it = num_primary_pg_by_osd.find(osd); | |
259 | if (it != num_primary_pg_by_osd.end()) | |
260 | num = it->second; | |
261 | return num; | |
262 | } | |
263 | ||
264 | void update_pg(pg_t pgid, bufferlist& bl); | |
265 | void remove_pg(pg_t pgid); | |
266 | void update_osd(int osd, bufferlist& bl); | |
267 | void remove_osd(int osd); | |
268 | ||
269 | void apply_incremental(CephContext *cct, const Incremental& inc); | |
270 | void redo_full_sets(); | |
271 | void register_nearfull_status(int osd, const osd_stat_t& s); | |
272 | void calc_stats(); | |
273 | void stat_pg_add(const pg_t &pgid, const pg_stat_t &s, | |
274 | bool sameosds=false); | |
275 | void stat_pg_sub(const pg_t &pgid, const pg_stat_t &s, | |
276 | bool sameosds=false); | |
277 | void stat_pg_update(const pg_t pgid, pg_stat_t &prev, bufferlist::iterator& blp); | |
278 | void stat_osd_add(const osd_stat_t &s); | |
279 | void stat_osd_sub(const osd_stat_t &s); | |
280 | ||
281 | void encode(bufferlist &bl, uint64_t features=-1) const; | |
282 | void decode(bufferlist::iterator &bl); | |
283 | ||
284 | void dirty_all(Incremental& inc); | |
285 | ||
286 | void dump(Formatter *f) const; | |
287 | void dump_pool_stats(const OSDMap &osd_map, stringstream *ss, Formatter *f, | |
288 | bool verbose) const; | |
289 | void dump_fs_stats(stringstream *ss, Formatter *f, bool verbose) const; | |
290 | static void dump_object_stat_sum(TextTable &tbl, Formatter *f, | |
291 | const object_stat_sum_t &sum, | |
292 | uint64_t avail, | |
293 | float raw_used_rate, | |
294 | bool verbose, const pg_pool_t *pool); | |
295 | void dump_basic(Formatter *f) const; | |
296 | void dump_pg_stats(Formatter *f, bool brief) const; | |
297 | void dump_pool_stats(Formatter *f) const; | |
298 | void dump_osd_stats(Formatter *f) const; | |
299 | void dump_delta(Formatter *f) const; | |
300 | void dump_filtered_pg_stats(Formatter *f, set<pg_t>& pgs) const; | |
301 | ||
302 | void dump_pg_stats_plain(ostream& ss, | |
303 | const ceph::unordered_map<pg_t, pg_stat_t>& pg_stats, | |
304 | bool brief) const; | |
305 | void get_stuck_stats(int types, const utime_t cutoff, | |
306 | ceph::unordered_map<pg_t, pg_stat_t>& stuck_pgs) const; | |
307 | bool get_stuck_counts(const utime_t cutoff, map<string, int>& note) const; | |
308 | void dump_stuck(Formatter *f, int types, utime_t cutoff) const; | |
309 | void dump_stuck_plain(ostream& ss, int types, utime_t cutoff) const; | |
310 | int dump_stuck_pg_stats(stringstream &ds, | |
311 | Formatter *f, | |
312 | int threshold, | |
313 | vector<string>& args) const; | |
314 | void dump(ostream& ss) const; | |
315 | void dump_basic(ostream& ss) const; | |
316 | void dump_pg_stats(ostream& ss, bool brief) const; | |
317 | void dump_pg_sum_stats(ostream& ss, bool header) const; | |
318 | void dump_pool_stats(ostream& ss, bool header) const; | |
319 | void dump_osd_stats(ostream& ss) const; | |
320 | void dump_osd_sum_stats(ostream& ss) const; | |
321 | void dump_filtered_pg_stats(ostream& ss, set<pg_t>& pgs) const; | |
322 | ||
323 | void dump_osd_perf_stats(Formatter *f) const; | |
324 | void print_osd_perf_stats(std::ostream *ss) const; | |
325 | ||
326 | void dump_osd_blocked_by_stats(Formatter *f) const; | |
327 | void print_osd_blocked_by_stats(std::ostream *ss) const; | |
328 | ||
329 | void get_filtered_pg_stats(uint32_t state, int64_t poolid, int64_t osdid, | |
330 | bool primary, set<pg_t>& pgs) const; | |
331 | void recovery_summary(Formatter *f, list<string> *psl, | |
332 | const pool_stat_t& delta_sum) const; | |
333 | void overall_recovery_summary(Formatter *f, list<string> *psl) const; | |
334 | void pool_recovery_summary(Formatter *f, list<string> *psl, | |
335 | uint64_t poolid) const; | |
336 | void recovery_rate_summary(Formatter *f, ostream *out, | |
337 | const pool_stat_t& delta_sum, | |
338 | utime_t delta_stamp) const; | |
339 | void overall_recovery_rate_summary(Formatter *f, ostream *out) const; | |
340 | void pool_recovery_rate_summary(Formatter *f, ostream *out, | |
341 | uint64_t poolid) const; | |
342 | /** | |
343 | * Obtain a formatted/plain output for client I/O, source from stats for a | |
344 | * given @p delta_sum pool over a given @p delta_stamp period of time. | |
345 | */ | |
346 | void client_io_rate_summary(Formatter *f, ostream *out, | |
347 | const pool_stat_t& delta_sum, | |
348 | utime_t delta_stamp) const; | |
349 | /** | |
350 | * Obtain a formatted/plain output for the overall client I/O, which is | |
351 | * calculated resorting to @p pg_sum_delta and @p stamp_delta. | |
352 | */ | |
353 | void overall_client_io_rate_summary(Formatter *f, ostream *out) const; | |
354 | /** | |
355 | * Obtain a formatted/plain output for client I/O over a given pool | |
356 | * with id @p pool_id. We will then obtain pool-specific data | |
357 | * from @p per_pool_sum_delta. | |
358 | */ | |
359 | void pool_client_io_rate_summary(Formatter *f, ostream *out, | |
360 | uint64_t poolid) const; | |
361 | /** | |
362 | * Obtain a formatted/plain output for cache tier IO, source from stats for a | |
363 | * given @p delta_sum pool over a given @p delta_stamp period of time. | |
364 | */ | |
365 | void cache_io_rate_summary(Formatter *f, ostream *out, | |
366 | const pool_stat_t& delta_sum, | |
367 | utime_t delta_stamp) const; | |
368 | /** | |
369 | * Obtain a formatted/plain output for the overall cache tier IO, which is | |
370 | * calculated resorting to @p pg_sum_delta and @p stamp_delta. | |
371 | */ | |
372 | void overall_cache_io_rate_summary(Formatter *f, ostream *out) const; | |
373 | /** | |
374 | * Obtain a formatted/plain output for cache tier IO over a given pool | |
375 | * with id @p pool_id. We will then obtain pool-specific data | |
376 | * from @p per_pool_sum_delta. | |
377 | */ | |
378 | void pool_cache_io_rate_summary(Formatter *f, ostream *out, | |
379 | uint64_t poolid) const; | |
380 | ||
381 | void print_summary(Formatter *f, ostream *out) const; | |
382 | void print_oneline_summary(Formatter *f, ostream *out) const; | |
383 | ||
384 | epoch_t get_min_last_epoch_clean() const { | |
385 | if (!min_last_epoch_clean) | |
386 | min_last_epoch_clean = calc_min_last_epoch_clean(); | |
387 | return min_last_epoch_clean; | |
388 | } | |
389 | ||
390 | static void generate_test_instances(list<PGMap*>& o); | |
391 | }; | |
392 | WRITE_CLASS_ENCODER_FEATURES(PGMap::Incremental) | |
393 | WRITE_CLASS_ENCODER_FEATURES(PGMap) | |
394 | ||
395 | inline ostream& operator<<(ostream& out, const PGMap& m) { | |
396 | m.print_oneline_summary(NULL, &out); | |
397 | return out; | |
398 | } | |
399 | ||
400 | int process_pg_map_command( | |
401 | const string& prefix, | |
402 | const map<string,cmd_vartype>& cmdmap, | |
403 | const PGMap& pg_map, | |
404 | const OSDMap& osdmap, | |
405 | Formatter *f, | |
406 | stringstream *ss, | |
407 | bufferlist *odata); | |
408 | ||
409 | class PGMapUpdater | |
410 | { | |
411 | public: | |
412 | static void check_osd_map( | |
413 | const OSDMap::Incremental &osd_inc, | |
414 | std::set<int> *need_check_down_pg_osds, | |
415 | std::map<int,utime_t> *last_osd_report, | |
416 | PGMap *pg_map, | |
417 | PGMap::Incremental *pending_inc); | |
418 | ||
419 | /** | |
420 | * check latest osdmap for new pgs to register | |
421 | */ | |
422 | static void register_new_pgs( | |
423 | const OSDMap &osd_map, | |
424 | const PGMap &pg_map, | |
425 | PGMap::Incremental *pending_inc); | |
426 | ||
427 | /** | |
428 | * recalculate creating pg mappings | |
429 | */ | |
430 | static void update_creating_pgs( | |
431 | const OSDMap &osd_map, | |
432 | const PGMap &pg_map, | |
433 | PGMap::Incremental *pending_inc); | |
434 | ||
435 | static void register_pg( | |
436 | const OSDMap &osd_map, | |
437 | pg_t pgid, epoch_t epoch, | |
438 | bool new_pool, | |
439 | const PGMap &pg_map, | |
440 | PGMap::Incremental *pending_inc); | |
441 | ||
442 | // mark pg's state stale if its acting primary osd is down | |
443 | static void check_down_pgs( | |
444 | const OSDMap &osd_map, | |
445 | const PGMap &pg_map, | |
446 | bool check_all, | |
447 | const set<int>& need_check_down_pg_osds, | |
448 | PGMap::Incremental *pending_inc); | |
449 | }; | |
450 | ||
451 | namespace reweight { | |
452 | /* Assign a lower weight to overloaded OSDs. | |
453 | * | |
454 | * The osds that will get a lower weight are those with with a utilization | |
455 | * percentage 'oload' percent greater than the average utilization. | |
456 | */ | |
457 | int by_utilization(const OSDMap &osd_map, | |
458 | const PGMap &pg_map, | |
459 | int oload, | |
460 | double max_changef, | |
461 | int max_osds, | |
462 | bool by_pg, const set<int64_t> *pools, | |
463 | bool no_increasing, | |
464 | mempool::osdmap::map<int32_t, uint32_t>* new_weights, | |
465 | std::stringstream *ss, | |
466 | std::string *out_str, | |
467 | Formatter *f); | |
468 | } | |
469 | ||
470 | #endif |