]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com> | |
8 | * | |
9 | * Author: Loic Dachary <loic@dachary.org> | |
10 | * | |
11 | * This is free software; you can redistribute it and/or | |
12 | * modify it under the terms of the GNU Lesser General Public | |
13 | * License version 2.1, as published by the Free Software | |
14 | * Foundation. See file COPYING. | |
15 | * | |
16 | */ | |
17 | ||
18 | /* Object Store Device (OSD) Monitor | |
19 | */ | |
20 | ||
21 | #ifndef CEPH_OSDMONITOR_H | |
22 | #define CEPH_OSDMONITOR_H | |
23 | ||
24 | #include <map> | |
25 | #include <set> | |
7c673cae FG |
26 | |
27 | #include "include/types.h" | |
11fdf7f2 | 28 | #include "include/encoding.h" |
7c673cae | 29 | #include "common/simple_cache.hpp" |
eafe8130 | 30 | #include "common/PriorityCache.h" |
7c673cae FG |
31 | #include "msg/Messenger.h" |
32 | ||
33 | #include "osd/OSDMap.h" | |
34 | #include "osd/OSDMapMapping.h" | |
35 | ||
36 | #include "CreatingPGs.h" | |
37 | #include "PaxosService.h" | |
38 | ||
39 | class Monitor; | |
40 | class PGMap; | |
41 | class MonSession; | |
42 | class MOSDMap; | |
43 | ||
44 | #include "erasure-code/ErasureCodeInterface.h" | |
45 | #include "mon/MonOpRequest.h" | |
28e407b8 AA |
46 | #include <boost/functional/hash.hpp> |
47 | // re-include our assert to clobber the system one; fix dout: | |
11fdf7f2 | 48 | #include "include/ceph_assert.h" |
7c673cae | 49 | |
7c673cae FG |
50 | /// information about a particular peer's failure reports for one osd |
51 | struct failure_reporter_t { | |
52 | utime_t failed_since; ///< when they think it failed | |
53 | MonOpRequestRef op; ///< failure op request | |
54 | ||
55 | failure_reporter_t() {} | |
56 | explicit failure_reporter_t(utime_t s) : failed_since(s) {} | |
57 | ~failure_reporter_t() { } | |
58 | }; | |
59 | ||
60 | /// information about all failure reports for one osd | |
61 | struct failure_info_t { | |
62 | map<int, failure_reporter_t> reporters; ///< reporter -> failed_since etc | |
63 | utime_t max_failed_since; ///< most recent failed_since | |
64 | ||
65 | failure_info_t() {} | |
66 | ||
67 | utime_t get_failed_since() { | |
68 | if (max_failed_since == utime_t() && !reporters.empty()) { | |
69 | // the old max must have canceled; recalculate. | |
70 | for (map<int, failure_reporter_t>::iterator p = reporters.begin(); | |
71 | p != reporters.end(); | |
72 | ++p) | |
73 | if (p->second.failed_since > max_failed_since) | |
74 | max_failed_since = p->second.failed_since; | |
75 | } | |
76 | return max_failed_since; | |
77 | } | |
78 | ||
79 | // set the message for the latest report. return any old op request we had, | |
80 | // if any, so we can discard it. | |
81 | MonOpRequestRef add_report(int who, utime_t failed_since, | |
82 | MonOpRequestRef op) { | |
83 | map<int, failure_reporter_t>::iterator p = reporters.find(who); | |
84 | if (p == reporters.end()) { | |
91327a77 | 85 | if (max_failed_since != utime_t() && max_failed_since < failed_since) |
7c673cae FG |
86 | max_failed_since = failed_since; |
87 | p = reporters.insert(map<int, failure_reporter_t>::value_type(who, failure_reporter_t(failed_since))).first; | |
88 | } | |
89 | ||
90 | MonOpRequestRef ret = p->second.op; | |
91 | p->second.op = op; | |
92 | return ret; | |
93 | } | |
94 | ||
95 | void take_report_messages(list<MonOpRequestRef>& ls) { | |
96 | for (map<int, failure_reporter_t>::iterator p = reporters.begin(); | |
97 | p != reporters.end(); | |
98 | ++p) { | |
99 | if (p->second.op) { | |
100 | ls.push_back(p->second.op); | |
101 | p->second.op.reset(); | |
102 | } | |
103 | } | |
104 | } | |
105 | ||
106 | MonOpRequestRef cancel_report(int who) { | |
107 | map<int, failure_reporter_t>::iterator p = reporters.find(who); | |
108 | if (p == reporters.end()) | |
109 | return MonOpRequestRef(); | |
110 | MonOpRequestRef ret = p->second.op; | |
111 | reporters.erase(p); | |
91327a77 | 112 | max_failed_since = utime_t(); |
7c673cae FG |
113 | return ret; |
114 | } | |
115 | }; | |
116 | ||
117 | ||
118 | class LastEpochClean { | |
119 | struct Lec { | |
120 | vector<epoch_t> epoch_by_pg; | |
121 | ps_t next_missing = 0; | |
122 | epoch_t floor = std::numeric_limits<epoch_t>::max(); | |
123 | void report(ps_t pg, epoch_t last_epoch_clean); | |
124 | }; | |
125 | std::map<uint64_t, Lec> report_by_pool; | |
126 | public: | |
127 | void report(const pg_t& pg, epoch_t last_epoch_clean); | |
128 | void remove_pool(uint64_t pool); | |
129 | epoch_t get_lower_bound(const OSDMap& latest) const; | |
130 | }; | |
131 | ||
132 | ||
11fdf7f2 TL |
133 | struct osdmap_manifest_t { |
134 | // all the maps we have pinned -- i.e., won't be removed unless | |
135 | // they are inside a trim interval. | |
136 | set<version_t> pinned; | |
137 | ||
138 | osdmap_manifest_t() {} | |
139 | ||
140 | version_t get_last_pinned() const | |
141 | { | |
142 | set<version_t>::const_reverse_iterator it = pinned.crbegin(); | |
143 | if (it == pinned.crend()) { | |
144 | return 0; | |
145 | } | |
146 | return *it; | |
147 | } | |
148 | ||
149 | version_t get_first_pinned() const | |
150 | { | |
151 | set<version_t>::const_iterator it = pinned.cbegin(); | |
152 | if (it == pinned.cend()) { | |
153 | return 0; | |
154 | } | |
155 | return *it; | |
156 | } | |
157 | ||
158 | bool is_pinned(version_t v) const | |
159 | { | |
160 | return pinned.find(v) != pinned.end(); | |
161 | } | |
162 | ||
163 | void pin(version_t v) | |
164 | { | |
165 | pinned.insert(v); | |
166 | } | |
167 | ||
168 | version_t get_lower_closest_pinned(version_t v) const { | |
169 | set<version_t>::const_iterator p = pinned.lower_bound(v); | |
170 | if (p == pinned.cend()) { | |
171 | return 0; | |
172 | } else if (*p > v) { | |
173 | if (p == pinned.cbegin()) { | |
174 | return 0; | |
175 | } | |
176 | --p; | |
177 | } | |
178 | return *p; | |
179 | } | |
180 | ||
181 | void encode(bufferlist& bl) const | |
182 | { | |
183 | ENCODE_START(1, 1, bl); | |
184 | encode(pinned, bl); | |
185 | ENCODE_FINISH(bl); | |
186 | } | |
187 | ||
188 | void decode(bufferlist::const_iterator& bl) | |
189 | { | |
190 | DECODE_START(1, bl); | |
191 | decode(pinned, bl); | |
192 | DECODE_FINISH(bl); | |
193 | } | |
194 | ||
195 | void decode(bufferlist& bl) { | |
196 | auto p = bl.cbegin(); | |
197 | decode(p); | |
198 | } | |
199 | ||
200 | void dump(Formatter *f) { | |
201 | f->dump_unsigned("first_pinned", get_first_pinned()); | |
202 | f->dump_unsigned("last_pinned", get_last_pinned()); | |
203 | f->open_array_section("pinned_maps"); | |
204 | for (auto& i : pinned) { | |
205 | f->dump_unsigned("epoch", i); | |
206 | } | |
207 | f->close_section(); | |
208 | } | |
209 | }; | |
210 | WRITE_CLASS_ENCODER(osdmap_manifest_t); | |
211 | ||
eafe8130 TL |
212 | class OSDMonitor : public PaxosService, |
213 | public md_config_obs_t { | |
7c673cae FG |
214 | CephContext *cct; |
215 | ||
216 | public: | |
217 | OSDMap osdmap; | |
218 | ||
eafe8130 TL |
219 | // config observer |
220 | const char** get_tracked_conf_keys() const override; | |
221 | void handle_conf_change(const ConfigProxy& conf, | |
222 | const std::set<std::string> &changed) override; | |
7c673cae FG |
223 | // [leader] |
224 | OSDMap::Incremental pending_inc; | |
225 | map<int, bufferlist> pending_metadata; | |
226 | set<int> pending_metadata_rm; | |
227 | map<int, failure_info_t> failure_info; | |
228 | map<int,utime_t> down_pending_out; // osd down -> out | |
81eedcae | 229 | bool priority_convert = false; |
eafe8130 TL |
230 | std::shared_ptr<PriorityCache::PriCache> rocksdb_binned_kv_cache = nullptr; |
231 | std::shared_ptr<PriorityCache::Manager> pcm = nullptr; | |
232 | ceph::mutex balancer_lock = ceph::make_mutex("OSDMonitor::balancer_lock"); | |
7c673cae FG |
233 | |
234 | map<int,double> osd_weight; | |
235 | ||
28e407b8 AA |
236 | using osdmap_key_t = std::pair<version_t, uint64_t>; |
237 | using osdmap_cache_t = SimpleLRU<osdmap_key_t, | |
238 | bufferlist, | |
239 | std::less<osdmap_key_t>, | |
240 | boost::hash<osdmap_key_t>>; | |
241 | osdmap_cache_t inc_osd_cache; | |
242 | osdmap_cache_t full_osd_cache; | |
7c673cae | 243 | |
11fdf7f2 TL |
244 | bool has_osdmap_manifest; |
245 | osdmap_manifest_t osdmap_manifest; | |
246 | ||
7c673cae FG |
247 | bool check_failures(utime_t now); |
248 | bool check_failure(utime_t now, int target_osd, failure_info_t& fi); | |
224ce89b | 249 | void force_failure(int target_osd, int by); |
7c673cae | 250 | |
7c673cae FG |
251 | bool _have_pending_crush(); |
252 | CrushWrapper &_get_stable_crush(); | |
253 | void _get_pending_crush(CrushWrapper& newcrush); | |
254 | ||
255 | enum FastReadType { | |
256 | FAST_READ_OFF, | |
257 | FAST_READ_ON, | |
258 | FAST_READ_DEFAULT | |
259 | }; | |
260 | ||
494da23a TL |
261 | struct CleanUpmapJob : public ParallelPGMapper::Job { |
262 | CephContext *cct; | |
263 | const OSDMap& osdmap; | |
264 | OSDMap::Incremental& pending_inc; | |
265 | // lock to protect pending_inc form changing | |
266 | // when checking is done | |
267 | Mutex pending_inc_lock = {"CleanUpmapJob::pending_inc_lock"}; | |
268 | ||
269 | CleanUpmapJob(CephContext *cct, const OSDMap& om, OSDMap::Incremental& pi) | |
270 | : ParallelPGMapper::Job(&om), | |
271 | cct(cct), | |
272 | osdmap(om), | |
273 | pending_inc(pi) {} | |
274 | ||
275 | void process(const vector<pg_t>& to_check) override { | |
276 | vector<pg_t> to_cancel; | |
277 | map<pg_t, mempool::osdmap::vector<pair<int,int>>> to_remap; | |
278 | osdmap.check_pg_upmaps(cct, to_check, &to_cancel, &to_remap); | |
279 | // don't bother taking lock if nothing changes | |
280 | if (!to_cancel.empty() || !to_remap.empty()) { | |
281 | std::lock_guard l(pending_inc_lock); | |
282 | osdmap.clean_pg_upmaps(cct, &pending_inc, to_cancel, to_remap); | |
283 | } | |
284 | } | |
285 | ||
286 | void process(int64_t poolid, unsigned ps_begin, unsigned ps_end) override {} | |
287 | void complete() override {} | |
288 | }; // public as this will need to be accessible from TestTestOSDMap.cc | |
289 | ||
7c673cae | 290 | // svc |
11fdf7f2 | 291 | public: |
7c673cae | 292 | void create_initial() override; |
11fdf7f2 | 293 | void get_store_prefixes(std::set<string>& s) const override; |
7c673cae FG |
294 | |
295 | private: | |
296 | void update_from_paxos(bool *need_bootstrap) override; | |
297 | void create_pending() override; // prepare a new pending | |
298 | void encode_pending(MonitorDBStore::TransactionRef t) override; | |
299 | void on_active() override; | |
300 | void on_restart() override; | |
301 | void on_shutdown() override; | |
11fdf7f2 TL |
302 | |
303 | /* osdmap full map prune */ | |
304 | void load_osdmap_manifest(); | |
305 | bool should_prune() const; | |
306 | void _prune_update_trimmed( | |
307 | MonitorDBStore::TransactionRef tx, | |
308 | version_t first); | |
309 | void prune_init(osdmap_manifest_t& manifest); | |
310 | bool _prune_sanitize_options() const; | |
311 | bool is_prune_enabled() const; | |
312 | bool is_prune_supported() const; | |
313 | bool do_prune(MonitorDBStore::TransactionRef tx); | |
314 | ||
eafe8130 TL |
315 | // Priority cache control |
316 | uint32_t mon_osd_cache_size = 0; ///< Number of cached OSDMaps | |
317 | uint64_t rocksdb_cache_size = 0; ///< Cache for kv Db | |
318 | double cache_kv_ratio = 0; ///< Cache ratio dedicated to kv | |
319 | double cache_inc_ratio = 0; ///< Cache ratio dedicated to inc | |
320 | double cache_full_ratio = 0; ///< Cache ratio dedicated to full | |
321 | uint64_t mon_memory_base = 0; ///< Mon base memory for cache autotuning | |
322 | double mon_memory_fragmentation = 0; ///< Expected memory fragmentation | |
323 | uint64_t mon_memory_target = 0; ///< Mon target memory for cache autotuning | |
324 | uint64_t mon_memory_min = 0; ///< Min memory to cache osdmaps | |
325 | bool mon_memory_autotune = false; ///< Cache auto tune setting | |
326 | int register_cache_with_pcm(); | |
327 | int _set_cache_sizes(); | |
328 | int _set_cache_ratios(); | |
329 | void _set_new_cache_sizes(); | |
330 | void _set_cache_autotuning(); | |
331 | int _update_mon_cache_settings(); | |
332 | ||
333 | friend struct OSDMemCache; | |
334 | friend struct IncCache; | |
335 | friend struct FullCache; | |
336 | ||
7c673cae FG |
337 | /** |
338 | * we haven't delegated full version stashing to paxosservice for some time | |
339 | * now, making this function useless in current context. | |
340 | */ | |
341 | void encode_full(MonitorDBStore::TransactionRef t) override { } | |
342 | /** | |
343 | * do not let paxosservice periodically stash full osdmaps, or we will break our | |
344 | * locally-managed full maps. (update_from_paxos loads the latest and writes them | |
345 | * out going forward from there, but if we just synced that may mean we skip some.) | |
346 | */ | |
347 | bool should_stash_full() override { | |
348 | return false; | |
349 | } | |
350 | ||
351 | /** | |
352 | * hook into trim to include the oldest full map in the trim transaction | |
353 | * | |
354 | * This ensures that anyone post-sync will have enough to rebuild their | |
355 | * full osdmaps. | |
356 | */ | |
357 | void encode_trim_extra(MonitorDBStore::TransactionRef tx, version_t first) override; | |
358 | ||
359 | void update_msgr_features(); | |
360 | int check_cluster_features(uint64_t features, stringstream &ss); | |
361 | /** | |
362 | * check if the cluster supports the features required by the | |
363 | * given crush map. Outputs the daemons which don't support it | |
364 | * to the stringstream. | |
365 | * | |
366 | * @returns true if the map is passable, false otherwise | |
367 | */ | |
368 | bool validate_crush_against_features(const CrushWrapper *newcrush, | |
369 | stringstream &ss); | |
370 | void check_osdmap_subs(); | |
371 | void share_map_with_random_osd(); | |
372 | ||
373 | Mutex prime_pg_temp_lock = {"OSDMonitor::prime_pg_temp_lock"}; | |
374 | struct PrimeTempJob : public ParallelPGMapper::Job { | |
375 | OSDMonitor *osdmon; | |
376 | PrimeTempJob(const OSDMap& om, OSDMonitor *m) | |
377 | : ParallelPGMapper::Job(&om), osdmon(m) {} | |
378 | void process(int64_t pool, unsigned ps_begin, unsigned ps_end) override { | |
379 | for (unsigned ps = ps_begin; ps < ps_end; ++ps) { | |
380 | pg_t pgid(ps, pool); | |
381 | osdmon->prime_pg_temp(*osdmap, pgid); | |
382 | } | |
383 | } | |
494da23a | 384 | void process(const vector<pg_t>& pgs) override {} |
7c673cae FG |
385 | void complete() override {} |
386 | }; | |
387 | void maybe_prime_pg_temp(); | |
388 | void prime_pg_temp(const OSDMap& next, pg_t pgid); | |
389 | ||
390 | ParallelPGMapper mapper; ///< for background pg work | |
391 | OSDMapMapping mapping; ///< pg <-> osd mappings | |
392 | unique_ptr<ParallelPGMapper::Job> mapping_job; ///< background mapping job | |
393 | void start_mapping(); | |
394 | ||
395 | void update_logger(); | |
396 | ||
397 | void handle_query(PaxosServiceMessage *m); | |
398 | bool preprocess_query(MonOpRequestRef op) override; // true if processed. | |
399 | bool prepare_update(MonOpRequestRef op) override; | |
400 | bool should_propose(double &delay) override; | |
401 | ||
11fdf7f2 | 402 | version_t get_trim_to() const override; |
7c673cae FG |
403 | |
404 | bool can_mark_down(int o); | |
405 | bool can_mark_up(int o); | |
406 | bool can_mark_out(int o); | |
407 | bool can_mark_in(int o); | |
408 | ||
409 | // ... | |
28e407b8 AA |
410 | MOSDMap *build_latest_full(uint64_t features); |
411 | MOSDMap *build_incremental(epoch_t first, epoch_t last, uint64_t features); | |
7c673cae FG |
412 | void send_full(MonOpRequestRef op); |
413 | void send_incremental(MonOpRequestRef op, epoch_t first); | |
414 | public: | |
415 | // @param req an optional op request; pass it when the osdmaps are sent in | |
416 | // reply to that request, so that @c Monitor::send_reply() can mark_event with it. | |
417 | void send_incremental(epoch_t first, MonSession *session, bool onetime, | |
418 | MonOpRequestRef req = MonOpRequestRef()); | |
419 | ||
420 | private: | |
421 | void print_utilization(ostream &out, Formatter *f, bool tree) const; | |
422 | ||
11fdf7f2 | 423 | bool check_source(MonOpRequestRef op, uuid_d fsid); |
7c673cae FG |
424 | |
425 | bool preprocess_get_osdmap(MonOpRequestRef op); | |
426 | ||
427 | bool preprocess_mark_me_down(MonOpRequestRef op); | |
428 | ||
429 | friend class C_AckMarkedDown; | |
430 | bool preprocess_failure(MonOpRequestRef op); | |
431 | bool prepare_failure(MonOpRequestRef op); | |
432 | bool prepare_mark_me_down(MonOpRequestRef op); | |
433 | void process_failures(); | |
434 | void take_all_failures(list<MonOpRequestRef>& ls); | |
435 | ||
436 | bool preprocess_full(MonOpRequestRef op); | |
437 | bool prepare_full(MonOpRequestRef op); | |
438 | ||
439 | bool preprocess_boot(MonOpRequestRef op); | |
440 | bool prepare_boot(MonOpRequestRef op); | |
441 | void _booted(MonOpRequestRef op, bool logit); | |
442 | ||
443 | void update_up_thru(int from, epoch_t up_thru); | |
444 | bool preprocess_alive(MonOpRequestRef op); | |
445 | bool prepare_alive(MonOpRequestRef op); | |
446 | void _reply_map(MonOpRequestRef op, epoch_t e); | |
447 | ||
448 | bool preprocess_pgtemp(MonOpRequestRef op); | |
449 | bool prepare_pgtemp(MonOpRequestRef op); | |
450 | ||
451 | bool preprocess_pg_created(MonOpRequestRef op); | |
452 | bool prepare_pg_created(MonOpRequestRef op); | |
453 | ||
11fdf7f2 TL |
454 | bool preprocess_pg_ready_to_merge(MonOpRequestRef op); |
455 | bool prepare_pg_ready_to_merge(MonOpRequestRef op); | |
456 | ||
7c673cae FG |
457 | int _check_remove_pool(int64_t pool_id, const pg_pool_t &pool, ostream *ss); |
458 | bool _check_become_tier( | |
459 | int64_t tier_pool_id, const pg_pool_t *tier_pool, | |
460 | int64_t base_pool_id, const pg_pool_t *base_pool, | |
461 | int *err, ostream *ss) const; | |
462 | bool _check_remove_tier( | |
463 | int64_t base_pool_id, const pg_pool_t *base_pool, const pg_pool_t *tier_pool, | |
464 | int *err, ostream *ss) const; | |
465 | ||
466 | int _prepare_remove_pool(int64_t pool, ostream *ss, bool no_fake); | |
467 | int _prepare_rename_pool(int64_t pool, string newname); | |
468 | ||
28e407b8 | 469 | bool enforce_pool_op_caps(MonOpRequestRef op); |
7c673cae FG |
470 | bool preprocess_pool_op (MonOpRequestRef op); |
471 | bool preprocess_pool_op_create (MonOpRequestRef op); | |
472 | bool prepare_pool_op (MonOpRequestRef op); | |
473 | bool prepare_pool_op_create (MonOpRequestRef op); | |
474 | bool prepare_pool_op_delete(MonOpRequestRef op); | |
475 | int crush_rename_bucket(const string& srcname, | |
476 | const string& dstname, | |
477 | ostream *ss); | |
478 | void check_legacy_ec_plugin(const string& plugin, | |
479 | const string& profile) const; | |
480 | int normalize_profile(const string& profilename, | |
481 | ErasureCodeProfile &profile, | |
482 | bool force, | |
483 | ostream *ss); | |
31f18b77 FG |
484 | int crush_rule_create_erasure(const string &name, |
485 | const string &profile, | |
486 | int *rule, | |
487 | ostream *ss); | |
488 | int get_crush_rule(const string &rule_name, | |
489 | int *crush_rule, | |
7c673cae FG |
490 | ostream *ss); |
491 | int get_erasure_code(const string &erasure_code_profile, | |
492 | ErasureCodeInterfaceRef *erasure_code, | |
493 | ostream *ss) const; | |
31f18b77 | 494 | int prepare_pool_crush_rule(const unsigned pool_type, |
7c673cae | 495 | const string &erasure_code_profile, |
31f18b77 FG |
496 | const string &rule_name, |
497 | int *crush_rule, | |
7c673cae FG |
498 | ostream *ss); |
499 | bool erasure_code_profile_in_use( | |
500 | const mempool::osdmap::map<int64_t, pg_pool_t> &pools, | |
501 | const string &profile, | |
502 | ostream *ss); | |
503 | int parse_erasure_code_profile(const vector<string> &erasure_code_profile, | |
504 | map<string,string> *erasure_code_profile_map, | |
505 | ostream *ss); | |
506 | int prepare_pool_size(const unsigned pool_type, | |
507 | const string &erasure_code_profile, | |
11fdf7f2 | 508 | uint8_t repl_size, |
7c673cae FG |
509 | unsigned *size, unsigned *min_size, |
510 | ostream *ss); | |
511 | int prepare_pool_stripe_width(const unsigned pool_type, | |
512 | const string &erasure_code_profile, | |
513 | unsigned *stripe_width, | |
514 | ostream *ss); | |
3efd9988 | 515 | int check_pg_num(int64_t pool, int pg_num, int size, ostream* ss); |
11fdf7f2 | 516 | int prepare_new_pool(string& name, |
31f18b77 FG |
517 | int crush_rule, |
518 | const string &crush_rule_name, | |
7c673cae | 519 | unsigned pg_num, unsigned pgp_num, |
11fdf7f2 TL |
520 | unsigned pg_num_min, |
521 | uint64_t repl_size, | |
522 | const uint64_t target_size_bytes, | |
523 | const float target_size_ratio, | |
7c673cae FG |
524 | const string &erasure_code_profile, |
525 | const unsigned pool_type, | |
526 | const uint64_t expected_num_objects, | |
527 | FastReadType fast_read, | |
528 | ostream *ss); | |
529 | int prepare_new_pool(MonOpRequestRef op); | |
530 | ||
3efd9988 FG |
531 | void set_pool_flags(int64_t pool_id, uint64_t flags); |
532 | void clear_pool_flags(int64_t pool_id, uint64_t flags); | |
7c673cae | 533 | bool update_pools_status(); |
7c673cae | 534 | |
11fdf7f2 TL |
535 | string make_snap_epoch_key(int64_t pool, epoch_t epoch); |
536 | string make_snap_key(int64_t pool, snapid_t snap); | |
537 | string make_snap_key_value(int64_t pool, snapid_t snap, snapid_t num, | |
538 | epoch_t epoch, bufferlist *v); | |
539 | string make_snap_purged_key(int64_t pool, snapid_t snap); | |
540 | string make_snap_purged_key_value(int64_t pool, snapid_t snap, snapid_t num, | |
541 | epoch_t epoch, bufferlist *v); | |
542 | bool try_prune_purged_snaps(); | |
543 | int lookup_pruned_snap(int64_t pool, snapid_t snap, | |
544 | snapid_t *begin, snapid_t *end); | |
545 | ||
7c673cae FG |
546 | bool prepare_set_flag(MonOpRequestRef op, int flag); |
547 | bool prepare_unset_flag(MonOpRequestRef op, int flag); | |
548 | ||
549 | void _pool_op_reply(MonOpRequestRef op, | |
550 | int ret, epoch_t epoch, bufferlist *blp=NULL); | |
551 | ||
552 | struct C_Booted : public C_MonOp { | |
553 | OSDMonitor *cmon; | |
554 | bool logit; | |
555 | C_Booted(OSDMonitor *cm, MonOpRequestRef op_, bool l=true) : | |
556 | C_MonOp(op_), cmon(cm), logit(l) {} | |
557 | void _finish(int r) override { | |
558 | if (r >= 0) | |
559 | cmon->_booted(op, logit); | |
560 | else if (r == -ECANCELED) | |
561 | return; | |
562 | else if (r == -EAGAIN) | |
563 | cmon->dispatch(op); | |
564 | else | |
11fdf7f2 | 565 | ceph_abort_msg("bad C_Booted return value"); |
7c673cae FG |
566 | } |
567 | }; | |
568 | ||
569 | struct C_ReplyMap : public C_MonOp { | |
570 | OSDMonitor *osdmon; | |
571 | epoch_t e; | |
572 | C_ReplyMap(OSDMonitor *o, MonOpRequestRef op_, epoch_t ee) | |
573 | : C_MonOp(op_), osdmon(o), e(ee) {} | |
574 | void _finish(int r) override { | |
575 | if (r >= 0) | |
576 | osdmon->_reply_map(op, e); | |
577 | else if (r == -ECANCELED) | |
578 | return; | |
579 | else if (r == -EAGAIN) | |
580 | osdmon->dispatch(op); | |
581 | else | |
11fdf7f2 | 582 | ceph_abort_msg("bad C_ReplyMap return value"); |
7c673cae FG |
583 | } |
584 | }; | |
585 | struct C_PoolOp : public C_MonOp { | |
586 | OSDMonitor *osdmon; | |
587 | int replyCode; | |
588 | int epoch; | |
589 | bufferlist reply_data; | |
590 | C_PoolOp(OSDMonitor * osd, MonOpRequestRef op_, int rc, int e, bufferlist *rd=NULL) : | |
591 | C_MonOp(op_), osdmon(osd), replyCode(rc), epoch(e) { | |
592 | if (rd) | |
593 | reply_data = *rd; | |
594 | } | |
595 | void _finish(int r) override { | |
596 | if (r >= 0) | |
597 | osdmon->_pool_op_reply(op, replyCode, epoch, &reply_data); | |
598 | else if (r == -ECANCELED) | |
599 | return; | |
600 | else if (r == -EAGAIN) | |
601 | osdmon->dispatch(op); | |
602 | else | |
11fdf7f2 | 603 | ceph_abort_msg("bad C_PoolOp return value"); |
7c673cae FG |
604 | } |
605 | }; | |
606 | ||
607 | bool preprocess_remove_snaps(MonOpRequestRef op); | |
608 | bool prepare_remove_snaps(MonOpRequestRef op); | |
609 | ||
7c673cae | 610 | int load_metadata(int osd, map<string, string>& m, ostream *err); |
31f18b77 | 611 | void count_metadata(const string& field, Formatter *f); |
28e407b8 AA |
612 | |
613 | void reencode_incremental_map(bufferlist& bl, uint64_t features); | |
614 | void reencode_full_map(bufferlist& bl, uint64_t features); | |
c07f9fc5 FG |
615 | public: |
616 | void count_metadata(const string& field, map<string,int> *out); | |
617 | protected: | |
7c673cae FG |
618 | int get_osd_objectstore_type(int osd, std::string *type); |
619 | bool is_pool_currently_all_bluestore(int64_t pool_id, const pg_pool_t &pool, | |
620 | ostream *err); | |
621 | ||
622 | // when we last received PG stats from each osd | |
623 | map<int,utime_t> last_osd_report; | |
624 | // TODO: use last_osd_report to store the osd report epochs, once we don't | |
625 | // need to upgrade from pre-luminous releases. | |
626 | map<int,epoch_t> osd_epochs; | |
627 | LastEpochClean last_epoch_clean; | |
628 | bool preprocess_beacon(MonOpRequestRef op); | |
629 | bool prepare_beacon(MonOpRequestRef op); | |
630 | epoch_t get_min_last_epoch_clean() const; | |
631 | ||
632 | friend class C_UpdateCreatingPGs; | |
11fdf7f2 | 633 | std::map<int, std::map<epoch_t, std::set<spg_t>>> creating_pgs_by_osd_epoch; |
7c673cae FG |
634 | std::vector<pg_t> pending_created_pgs; |
635 | // the epoch when the pg mapping was calculated | |
636 | epoch_t creating_pgs_epoch = 0; | |
637 | creating_pgs_t creating_pgs; | |
c07f9fc5 | 638 | mutable std::mutex creating_pgs_lock; |
7c673cae | 639 | |
94b18763 FG |
640 | creating_pgs_t update_pending_pgs(const OSDMap::Incremental& inc, |
641 | const OSDMap& nextmap); | |
31f18b77 | 642 | unsigned scan_for_creating_pgs( |
7c673cae FG |
643 | const mempool::osdmap::map<int64_t,pg_pool_t>& pools, |
644 | const mempool::osdmap::set<int64_t>& removed_pools, | |
645 | utime_t modified, | |
646 | creating_pgs_t* creating_pgs) const; | |
647 | pair<int32_t, pg_t> get_parent_pg(pg_t pgid) const; | |
648 | void update_creating_pgs(); | |
649 | void check_pg_creates_subs(); | |
c07f9fc5 | 650 | epoch_t send_pg_creates(int osd, Connection *con, epoch_t next) const; |
7c673cae | 651 | |
31f18b77 FG |
652 | int32_t _allocate_osd_id(int32_t* existing_id); |
653 | ||
7c673cae FG |
654 | public: |
655 | OSDMonitor(CephContext *cct, Monitor *mn, Paxos *p, const string& service_name); | |
656 | ||
657 | void tick() override; // check state, take actions | |
658 | ||
7c673cae FG |
659 | bool preprocess_command(MonOpRequestRef op); |
660 | bool prepare_command(MonOpRequestRef op); | |
11fdf7f2 | 661 | bool prepare_command_impl(MonOpRequestRef op, const cmdmap_t& cmdmap); |
7c673cae | 662 | |
31f18b77 FG |
663 | int validate_osd_create( |
664 | const int32_t id, | |
665 | const uuid_d& uuid, | |
666 | const bool check_osd_exists, | |
667 | int32_t* existing_id, | |
668 | stringstream& ss); | |
669 | int prepare_command_osd_create( | |
670 | const int32_t id, | |
671 | const uuid_d& uuid, | |
672 | int32_t* existing_id, | |
673 | stringstream& ss); | |
3a9019d9 FG |
674 | void do_osd_create(const int32_t id, const uuid_d& uuid, |
675 | const string& device_class, | |
676 | int32_t* new_id); | |
31f18b77 FG |
677 | int prepare_command_osd_purge(int32_t id, stringstream& ss); |
678 | int prepare_command_osd_destroy(int32_t id, stringstream& ss); | |
679 | int _prepare_command_osd_crush_remove( | |
680 | CrushWrapper &newcrush, | |
681 | int32_t id, | |
682 | int32_t ancestor, | |
683 | bool has_ancestor, | |
684 | bool unlink_only); | |
685 | void do_osd_crush_remove(CrushWrapper& newcrush); | |
686 | int prepare_command_osd_crush_remove( | |
687 | CrushWrapper &newcrush, | |
688 | int32_t id, | |
689 | int32_t ancestor, | |
690 | bool has_ancestor, | |
691 | bool unlink_only); | |
692 | int prepare_command_osd_remove(int32_t id); | |
693 | int prepare_command_osd_new( | |
694 | MonOpRequestRef op, | |
11fdf7f2 | 695 | const cmdmap_t& cmdmap, |
31f18b77 FG |
696 | const map<string,string>& secrets, |
697 | stringstream &ss, | |
698 | Formatter *f); | |
699 | ||
11fdf7f2 | 700 | int prepare_command_pool_set(const cmdmap_t& cmdmap, |
7c673cae | 701 | stringstream& ss); |
11fdf7f2 | 702 | |
c07f9fc5 | 703 | int prepare_command_pool_application(const string &prefix, |
11fdf7f2 | 704 | const cmdmap_t& cmdmap, |
c07f9fc5 | 705 | stringstream& ss); |
11fdf7f2 TL |
706 | int preprocess_command_pool_application(const string &prefix, |
707 | const cmdmap_t& cmdmap, | |
708 | stringstream& ss, | |
709 | bool *modified); | |
710 | int _command_pool_application(const string &prefix, | |
711 | const cmdmap_t& cmdmap, | |
712 | stringstream& ss, | |
713 | bool *modified, | |
714 | bool preparing); | |
7c673cae FG |
715 | |
716 | bool handle_osd_timeouts(const utime_t &now, | |
717 | std::map<int,utime_t> &last_osd_report); | |
718 | ||
719 | void send_latest(MonOpRequestRef op, epoch_t start=0); | |
720 | void send_latest_now_nodelete(MonOpRequestRef op, epoch_t start=0) { | |
721 | op->mark_osdmon_event(__func__); | |
722 | send_incremental(op, start); | |
723 | } | |
724 | ||
11fdf7f2 TL |
725 | void get_removed_snaps_range( |
726 | epoch_t start, epoch_t end, | |
727 | mempool::osdmap::map<int64_t,OSDMap::snap_interval_set_t> *gap_removed_snaps); | |
728 | ||
7c673cae | 729 | int get_version(version_t ver, bufferlist& bl) override; |
28e407b8 AA |
730 | int get_version(version_t ver, uint64_t feature, bufferlist& bl); |
731 | ||
732 | int get_version_full(version_t ver, uint64_t feature, bufferlist& bl); | |
7c673cae | 733 | int get_version_full(version_t ver, bufferlist& bl) override; |
11fdf7f2 TL |
734 | int get_inc(version_t ver, OSDMap::Incremental& inc); |
735 | int get_full_from_pinned_map(version_t ver, bufferlist& bl); | |
7c673cae | 736 | |
11fdf7f2 TL |
737 | epoch_t blacklist(const entity_addrvec_t& av, utime_t until); |
738 | epoch_t blacklist(entity_addr_t a, utime_t until); | |
7c673cae FG |
739 | |
740 | void dump_info(Formatter *f); | |
741 | int dump_osd_metadata(int osd, Formatter *f, ostream *err); | |
742 | void print_nodes(Formatter *f); | |
743 | ||
744 | void check_osdmap_sub(Subscription *sub); | |
745 | void check_pg_creates_sub(Subscription *sub); | |
746 | ||
11fdf7f2 TL |
747 | void do_application_enable(int64_t pool_id, const std::string &app_name, |
748 | const std::string &app_key="", | |
749 | const std::string &app_value=""); | |
494da23a TL |
750 | void do_set_pool_opt(int64_t pool_id, pool_opts_t::key_t opt, |
751 | pool_opts_t::value_t); | |
c07f9fc5 | 752 | |
7c673cae FG |
753 | void add_flag(int flag) { |
754 | if (!(osdmap.flags & flag)) { | |
755 | if (pending_inc.new_flags < 0) | |
756 | pending_inc.new_flags = osdmap.flags; | |
757 | pending_inc.new_flags |= flag; | |
758 | } | |
759 | } | |
760 | ||
761 | void remove_flag(int flag) { | |
762 | if(osdmap.flags & flag) { | |
763 | if (pending_inc.new_flags < 0) | |
764 | pending_inc.new_flags = osdmap.flags; | |
765 | pending_inc.new_flags &= ~flag; | |
766 | } | |
767 | } | |
81eedcae | 768 | void convert_pool_priorities(void); |
7c673cae FG |
769 | }; |
770 | ||
771 | #endif |