// ceph/src/mon/OSDMonitor.h
// (scraped from git.proxmox.com, blob 255902169d9bd8fe42dd2c92ed18552c123918f4)
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18 /* Object Store Device (OSD) Monitor
19 */
20
21 #ifndef CEPH_OSDMONITOR_H
22 #define CEPH_OSDMONITOR_H
23
24 #include <map>
25 #include <set>
26 #include <utility>
27
28 #include "include/types.h"
29 #include "include/encoding.h"
30 #include "common/simple_cache.hpp"
31 #include "common/PriorityCache.h"
32 #include "msg/Messenger.h"
33
34 #include "osd/OSDMap.h"
35 #include "osd/OSDMapMapping.h"
36
37 #include "CreatingPGs.h"
38 #include "PaxosService.h"
39
40 #include "erasure-code/ErasureCodeInterface.h"
41 #include "mon/MonOpRequest.h"
42 #include <boost/functional/hash.hpp>
43
44 class Monitor;
45 class PGMap;
46 struct MonSession;
47 class MOSDMap;
48
49
50 /// information about a particular peer's failure reports for one osd
51 struct failure_reporter_t {
52 utime_t failed_since; ///< when they think it failed
53 MonOpRequestRef op; ///< failure op request
54
55 failure_reporter_t() {}
56 explicit failure_reporter_t(utime_t s) : failed_since(s) {}
57 ~failure_reporter_t() { }
58 };
59
60 /// information about all failure reports for one osd
61 struct failure_info_t {
62 std::map<int, failure_reporter_t> reporters; ///< reporter -> failed_since etc
63 utime_t max_failed_since; ///< most recent failed_since
64
65 failure_info_t() {}
66
67 utime_t get_failed_since() {
68 if (max_failed_since == utime_t() && !reporters.empty()) {
69 // the old max must have canceled; recalculate.
70 for (auto p = reporters.begin(); p != reporters.end(); ++p)
71 if (p->second.failed_since > max_failed_since)
72 max_failed_since = p->second.failed_since;
73 }
74 return max_failed_since;
75 }
76
77 // set the message for the latest report. return any old op request we had,
78 // if any, so we can discard it.
79 MonOpRequestRef add_report(int who, utime_t failed_since,
80 MonOpRequestRef op) {
81 auto p = reporters.find(who);
82 if (p == reporters.end()) {
83 if (max_failed_since != utime_t() && max_failed_since < failed_since)
84 max_failed_since = failed_since;
85 p = reporters.insert(std::map<int, failure_reporter_t>::value_type(who, failure_reporter_t(failed_since))).first;
86 }
87
88 MonOpRequestRef ret = p->second.op;
89 p->second.op = op;
90 return ret;
91 }
92
93 void take_report_messages(std::list<MonOpRequestRef>& ls) {
94 for (auto p = reporters.begin(); p != reporters.end(); ++p) {
95 if (p->second.op) {
96 ls.push_back(p->second.op);
97 p->second.op.reset();
98 }
99 }
100 }
101
102 MonOpRequestRef cancel_report(int who) {
103 auto p = reporters.find(who);
104 if (p == reporters.end())
105 return MonOpRequestRef();
106 MonOpRequestRef ret = p->second.op;
107 reporters.erase(p);
108 max_failed_since = utime_t();
109 return ret;
110 }
111 };
112
113
/**
 * Tracks, per pool, the last epoch at which PGs were reported clean, so the
 * monitor can compute a lower bound for trimming old osdmaps.
 * (Method bodies live in OSDMonitor.cc.)
 */
class LastEpochClean {
  struct Lec {
    // last-clean epoch per PG, indexed by PG number within the pool
    std::vector<epoch_t> epoch_by_pg;
    // presumably the lowest PG number not yet reported -- see OSDMonitor.cc
    ps_t next_missing = 0;
    // minimum last-clean epoch observed; starts at max so any report lowers it
    epoch_t floor = std::numeric_limits<epoch_t>::max();
    void report(ps_t pg, epoch_t last_epoch_clean);
  };
  // pool id -> per-pool last-epoch-clean state
  std::map<uint64_t, Lec> report_by_pool;
public:
  void report(const pg_t& pg, epoch_t last_epoch_clean);
  void remove_pool(uint64_t pool);
  // lowest last-clean epoch across pools, relative to the given map
  epoch_t get_lower_bound(const OSDMap& latest) const;

  void dump(Formatter *f) const;
};
129
130
131 struct osdmap_manifest_t {
132 // all the maps we have pinned -- i.e., won't be removed unless
133 // they are inside a trim interval.
134 std::set<version_t> pinned;
135
136 osdmap_manifest_t() {}
137
138 version_t get_last_pinned() const
139 {
140 auto it = pinned.crbegin();
141 if (it == pinned.crend()) {
142 return 0;
143 }
144 return *it;
145 }
146
147 version_t get_first_pinned() const
148 {
149 auto it = pinned.cbegin();
150 if (it == pinned.cend()) {
151 return 0;
152 }
153 return *it;
154 }
155
156 bool is_pinned(version_t v) const
157 {
158 return pinned.find(v) != pinned.end();
159 }
160
161 void pin(version_t v)
162 {
163 pinned.insert(v);
164 }
165
166 version_t get_lower_closest_pinned(version_t v) const {
167 auto p = pinned.lower_bound(v);
168 if (p == pinned.cend()) {
169 return 0;
170 } else if (*p > v) {
171 if (p == pinned.cbegin()) {
172 return 0;
173 }
174 --p;
175 }
176 return *p;
177 }
178
179 void encode(ceph::buffer::list& bl) const
180 {
181 ENCODE_START(1, 1, bl);
182 encode(pinned, bl);
183 ENCODE_FINISH(bl);
184 }
185
186 void decode(ceph::buffer::list::const_iterator& bl)
187 {
188 DECODE_START(1, bl);
189 decode(pinned, bl);
190 DECODE_FINISH(bl);
191 }
192
193 void decode(ceph::buffer::list& bl) {
194 auto p = bl.cbegin();
195 decode(p);
196 }
197
198 void dump(ceph::Formatter *f) {
199 f->dump_unsigned("first_pinned", get_first_pinned());
200 f->dump_unsigned("last_pinned", get_last_pinned());
201 f->open_array_section("pinned_maps");
202 for (auto& i : pinned) {
203 f->dump_unsigned("epoch", i);
204 }
205 f->close_section();
206 }
207 };
208 WRITE_CLASS_ENCODER(osdmap_manifest_t);
209
/**
 * The OSD monitor: the PaxosService that owns the cluster's OSDMap.
 *
 * Handles OSD boot/failure/beacon traffic, pool and CRUSH commands, osdmap
 * distribution and trimming/pruning, PG-creation bookkeeping, and stretch
 * mode.  Most method bodies live in OSDMonitor.cc; this header is mostly
 * declarations plus a few small inline helpers.
 */
class OSDMonitor : public PaxosService,
		   public md_config_obs_t {
  CephContext *cct;

public:
  OSDMap osdmap;  ///< the latest committed osdmap

  // config observer
  const char** get_tracked_conf_keys() const override;
  void handle_conf_change(const ConfigProxy& conf,
			  const std::set<std::string> &changed) override;
  // [leader] -- pending state accumulated between proposals
  OSDMap::Incremental pending_inc;
  std::map<int, ceph::buffer::list> pending_metadata;
  std::set<int> pending_metadata_rm;
  std::map<int, failure_info_t> failure_info;
  std::map<int,utime_t> down_pending_out; // osd down -> out
  bool priority_convert = false;
  std::map<int64_t,std::set<snapid_t>> pending_pseudo_purged_snaps;
  std::shared_ptr<PriorityCache::PriCache> rocksdb_binned_kv_cache = nullptr;
  std::shared_ptr<PriorityCache::Manager> pcm = nullptr;
  ceph::mutex balancer_lock = ceph::make_mutex("OSDMonitor::balancer_lock");

  std::map<int,double> osd_weight;

  // LRU caches of encoded incremental/full maps, keyed by (epoch, features)
  using osdmap_key_t = std::pair<version_t, uint64_t>;
  using osdmap_cache_t = SimpleLRU<osdmap_key_t,
                                   ceph::buffer::list,
                                   std::less<osdmap_key_t>,
                                   boost::hash<osdmap_key_t>>;
  osdmap_cache_t inc_osd_cache;
  osdmap_cache_t full_osd_cache;

  bool has_osdmap_manifest;
  osdmap_manifest_t osdmap_manifest;

  bool check_failures(utime_t now);
  bool check_failure(utime_t now, int target_osd, failure_info_t& fi);
  void force_failure(int target_osd, int by);

  // CRUSH map access helpers; pending vs. stable distinction per the names --
  // implementations in OSDMonitor.cc
  bool _have_pending_crush();
  CrushWrapper &_get_stable_crush();
  void _get_pending_crush(CrushWrapper& newcrush);

  enum FastReadType {
    FAST_READ_OFF,
    FAST_READ_ON,
    FAST_READ_DEFAULT
  };

  /// Background job that cancels/remaps invalid pg_upmap entries.
  struct CleanUpmapJob : public ParallelPGMapper::Job {
    CephContext *cct;
    const OSDMap& osdmap;
    OSDMap::Incremental& pending_inc;
    // lock to protect pending_inc from changing
    // when checking is done
    ceph::mutex pending_inc_lock =
      ceph::make_mutex("CleanUpmapJob::pending_inc_lock");

    CleanUpmapJob(CephContext *cct, const OSDMap& om, OSDMap::Incremental& pi)
      : ParallelPGMapper::Job(&om),
	cct(cct),
	osdmap(om),
	pending_inc(pi) {}

    void process(const std::vector<pg_t>& to_check) override {
      std::vector<pg_t> to_cancel;
      std::map<pg_t, mempool::osdmap::vector<std::pair<int,int>>> to_remap;
      osdmap.check_pg_upmaps(cct, to_check, &to_cancel, &to_remap);
      // don't bother taking lock if nothing changes
      if (!to_cancel.empty() || !to_remap.empty()) {
	std::lock_guard l(pending_inc_lock);
	osdmap.clean_pg_upmaps(cct, &pending_inc, to_cancel, to_remap);
      }
    }

    void process(int64_t poolid, unsigned ps_begin, unsigned ps_end) override {}
    void complete() override {}
  }; // public as this will need to be accessible from TestTestOSDMap.cc

  // svc
public:
  void create_initial() override;
  void get_store_prefixes(std::set<std::string>& s) const override;

private:
  void update_from_paxos(bool *need_bootstrap) override;
  void create_pending() override;  // prepare a new pending
  void encode_pending(MonitorDBStore::TransactionRef t) override;
  void on_active() override;
  void on_restart() override;
  void on_shutdown() override;

  /* osdmap full map prune */
  void load_osdmap_manifest();
  bool should_prune() const;
  void _prune_update_trimmed(
      MonitorDBStore::TransactionRef tx,
      version_t first);
  void prune_init(osdmap_manifest_t& manifest);
  bool _prune_sanitize_options() const;
  bool is_prune_enabled() const;
  bool is_prune_supported() const;
  bool do_prune(MonitorDBStore::TransactionRef tx);

  // Priority cache control
  uint32_t mon_osd_cache_size = 0;      ///< Number of cached OSDMaps
  uint64_t rocksdb_cache_size = 0;      ///< Cache for kv Db
  double cache_kv_ratio = 0;            ///< Cache ratio dedicated to kv
  double cache_inc_ratio = 0;           ///< Cache ratio dedicated to inc
  double cache_full_ratio = 0;          ///< Cache ratio dedicated to full
  uint64_t mon_memory_base = 0;         ///< Mon base memory for cache autotuning
  double mon_memory_fragmentation = 0;  ///< Expected memory fragmentation
  uint64_t mon_memory_target = 0;       ///< Mon target memory for cache autotuning
  uint64_t mon_memory_min = 0;          ///< Min memory to cache osdmaps
  bool mon_memory_autotune = false;     ///< Cache auto tune setting
  int register_cache_with_pcm();
  int _set_cache_sizes();
  int _set_cache_ratios();
  void _set_new_cache_sizes();
  void _set_cache_autotuning();
  int _update_mon_cache_settings();

  friend struct OSDMemCache;
  friend struct IncCache;
  friend struct FullCache;

  /**
   * we haven't delegated full version stashing to paxosservice for some time
   * now, making this function useless in current context.
   */
  void encode_full(MonitorDBStore::TransactionRef t) override { }
  /**
   * do not let paxosservice periodically stash full osdmaps, or we will break our
   * locally-managed full maps.  (update_from_paxos loads the latest and writes them
   * out going forward from there, but if we just synced that may mean we skip some.)
   */
  bool should_stash_full() override {
    return false;
  }

  /**
   * hook into trim to include the oldest full map in the trim transaction
   *
   * This ensures that anyone post-sync will have enough to rebuild their
   * full osdmaps.
   */
  void encode_trim_extra(MonitorDBStore::TransactionRef tx, version_t first) override;

  void update_msgr_features();
  /**
   * check if the cluster supports the features required by the
   * given crush map. Outputs the daemons which don't support it
   * to the stringstream.
   *
   * @returns true if the map is passable, false otherwise
   */
  bool validate_crush_against_features(const CrushWrapper *newcrush,
				       std::stringstream &ss);
  void check_osdmap_subs();
  void share_map_with_random_osd();

  ceph::mutex prime_pg_temp_lock =
    ceph::make_mutex("OSDMonitor::prime_pg_temp_lock");
  /// Background job that primes pg_temp entries for a range of PGs.
  struct PrimeTempJob : public ParallelPGMapper::Job {
    OSDMonitor *osdmon;
    PrimeTempJob(const OSDMap& om, OSDMonitor *m)
      : ParallelPGMapper::Job(&om), osdmon(m) {}
    void process(int64_t pool, unsigned ps_begin, unsigned ps_end) override {
      for (unsigned ps = ps_begin; ps < ps_end; ++ps) {
	pg_t pgid(ps, pool);
	osdmon->prime_pg_temp(*osdmap, pgid);
      }
    }
    void process(const std::vector<pg_t>& pgs) override {}
    void complete() override {}
  };
  void maybe_prime_pg_temp();
  void prime_pg_temp(const OSDMap& next, pg_t pgid);

  ParallelPGMapper mapper;                        ///< for background pg work
  OSDMapMapping mapping;                          ///< pg <-> osd mappings
  std::unique_ptr<ParallelPGMapper::Job> mapping_job;  ///< background mapping job
  void start_mapping();

  void update_logger();

  void handle_query(PaxosServiceMessage *m);
  bool preprocess_query(MonOpRequestRef op) override;  // true if processed.
  bool prepare_update(MonOpRequestRef op) override;
  bool should_propose(double &delay) override;

  version_t get_trim_to() const override;

  bool can_mark_down(int o);
  bool can_mark_up(int o);
  bool can_mark_out(int o);
  bool can_mark_in(int o);

  // osdmap distribution to peers/clients
  MOSDMap *build_latest_full(uint64_t features);
  MOSDMap *build_incremental(epoch_t first, epoch_t last, uint64_t features);
  void send_full(MonOpRequestRef op);
  void send_incremental(MonOpRequestRef op, epoch_t first);
public:
  /**
   * Make sure the existing (up) OSDs support the given features
   * @return 0 on success, or an error code if any OSDs are missing features.
   * @param ss Filled in with an explanation of failure, if any
   */
  int check_cluster_features(uint64_t features, std::stringstream &ss);
  // @param req an optional op request, if the osdmaps are replies to it.  so
  //            @c Monitor::send_reply() can mark_event with it.
  void send_incremental(epoch_t first, MonSession *session, bool onetime,
			MonOpRequestRef req = MonOpRequestRef());

private:
  void print_utilization(std::ostream &out, ceph::Formatter *f, bool tree) const;

  bool check_source(MonOpRequestRef op, uuid_d fsid);

  bool preprocess_get_osdmap(MonOpRequestRef op);

  bool preprocess_mark_me_down(MonOpRequestRef op);

  friend class C_AckMarkedDown;
  // failure report handling (preprocess_* = read-only checks,
  // prepare_* = mutate pending state; pattern per PaxosService)
  bool preprocess_failure(MonOpRequestRef op);
  bool prepare_failure(MonOpRequestRef op);
  bool prepare_mark_me_down(MonOpRequestRef op);
  void process_failures();
  void take_all_failures(std::list<MonOpRequestRef>& ls);

  bool preprocess_mark_me_dead(MonOpRequestRef op);
  bool prepare_mark_me_dead(MonOpRequestRef op);

  bool preprocess_full(MonOpRequestRef op);
  bool prepare_full(MonOpRequestRef op);

  bool preprocess_boot(MonOpRequestRef op);
  bool prepare_boot(MonOpRequestRef op);
  void _booted(MonOpRequestRef op, bool logit);

  void update_up_thru(int from, epoch_t up_thru);
  bool preprocess_alive(MonOpRequestRef op);
  bool prepare_alive(MonOpRequestRef op);
  void _reply_map(MonOpRequestRef op, epoch_t e);

  bool preprocess_pgtemp(MonOpRequestRef op);
  bool prepare_pgtemp(MonOpRequestRef op);

  bool preprocess_pg_created(MonOpRequestRef op);
  bool prepare_pg_created(MonOpRequestRef op);

  bool preprocess_pg_ready_to_merge(MonOpRequestRef op);
  bool prepare_pg_ready_to_merge(MonOpRequestRef op);

  // pool / tier validation helpers
  int _check_remove_pool(int64_t pool_id, const pg_pool_t &pool, std::ostream *ss);
  bool _check_become_tier(
      int64_t tier_pool_id, const pg_pool_t *tier_pool,
      int64_t base_pool_id, const pg_pool_t *base_pool,
      int *err, std::ostream *ss) const;
  bool _check_remove_tier(
      int64_t base_pool_id, const pg_pool_t *base_pool, const pg_pool_t *tier_pool,
      int *err, std::ostream *ss) const;

  int _prepare_remove_pool(int64_t pool, std::ostream *ss, bool no_fake);
  int _prepare_rename_pool(int64_t pool, std::string newname);

  bool enforce_pool_op_caps(MonOpRequestRef op);
  bool preprocess_pool_op (MonOpRequestRef op);
  bool preprocess_pool_op_create (MonOpRequestRef op);
  bool prepare_pool_op (MonOpRequestRef op);
  bool prepare_pool_op_create (MonOpRequestRef op);
  bool prepare_pool_op_delete(MonOpRequestRef op);
  int crush_rename_bucket(const std::string& srcname,
			  const std::string& dstname,
			  std::ostream *ss);
  void check_legacy_ec_plugin(const std::string& plugin,
			      const std::string& profile) const;
  int normalize_profile(const std::string& profilename,
			ceph::ErasureCodeProfile &profile,
			bool force,
			std::ostream *ss);
  int crush_rule_create_erasure(const std::string &name,
				const std::string &profile,
				int *rule,
				std::ostream *ss);
  int get_crush_rule(const std::string &rule_name,
		     int *crush_rule,
		     std::ostream *ss);
  int get_erasure_code(const std::string &erasure_code_profile,
		       ceph::ErasureCodeInterfaceRef *erasure_code,
		       std::ostream *ss) const;
  int prepare_pool_crush_rule(const unsigned pool_type,
			      const std::string &erasure_code_profile,
			      const std::string &rule_name,
			      int *crush_rule,
			      std::ostream *ss);
  bool erasure_code_profile_in_use(
      const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
      const std::string &profile,
      std::ostream *ss);
  int parse_erasure_code_profile(const std::vector<std::string> &erasure_code_profile,
				 std::map<std::string,std::string> *erasure_code_profile_map,
				 std::ostream *ss);
  int prepare_pool_size(const unsigned pool_type,
			const std::string &erasure_code_profile,
			uint8_t repl_size,
			unsigned *size, unsigned *min_size,
			std::ostream *ss);
  int prepare_pool_stripe_width(const unsigned pool_type,
				const std::string &erasure_code_profile,
				unsigned *stripe_width,
				std::ostream *ss);
  int check_pg_num(int64_t pool, int pg_num, int size, std::ostream* ss);
  int prepare_new_pool(std::string& name,
		       int crush_rule,
		       const std::string &crush_rule_name,
		       unsigned pg_num, unsigned pgp_num,
		       unsigned pg_num_min,
		       uint64_t repl_size,
		       const uint64_t target_size_bytes,
		       const float target_size_ratio,
		       const std::string &erasure_code_profile,
		       const unsigned pool_type,
		       const uint64_t expected_num_objects,
		       FastReadType fast_read,
		       const std::string& pg_autoscale_mode,
		       std::ostream *ss);
  int prepare_new_pool(MonOpRequestRef op);

  void set_pool_flags(int64_t pool_id, uint64_t flags);
  void clear_pool_flags(int64_t pool_id, uint64_t flags);
  bool update_pools_status();

  bool _is_removed_snap(int64_t pool_id, snapid_t snapid);
  bool _is_pending_removed_snap(int64_t pool_id, snapid_t snapid);

  // key builders for the purged-snaps records in the mon store
  std::string make_purged_snap_epoch_key(epoch_t epoch);
  std::string make_purged_snap_key(int64_t pool, snapid_t snap);
  std::string make_purged_snap_key_value(int64_t pool, snapid_t snap, snapid_t num,
					 epoch_t epoch, ceph::buffer::list *v);

  bool try_prune_purged_snaps();
  int lookup_purged_snap(int64_t pool, snapid_t snap,
			 snapid_t *begin, snapid_t *end);

  void insert_purged_snap_update(
      int64_t pool,
      snapid_t start, snapid_t end,
      epoch_t epoch,
      MonitorDBStore::TransactionRef t);

  bool prepare_set_flag(MonOpRequestRef op, int flag);
  bool prepare_unset_flag(MonOpRequestRef op, int flag);

  void _pool_op_reply(MonOpRequestRef op,
		      int ret, epoch_t epoch, ceph::buffer::list *blp=NULL);

  /// Completion for boot ops: re-run _booted once the map commits.
  struct C_Booted : public C_MonOp {
    OSDMonitor *cmon;
    bool logit;
    C_Booted(OSDMonitor *cm, MonOpRequestRef op_, bool l=true) :
      C_MonOp(op_), cmon(cm), logit(l) {}
    void _finish(int r) override {
      if (r >= 0)
	cmon->_booted(op, logit);
      else if (r == -ECANCELED)
	return;
      else if (r == -EAGAIN)
	cmon->dispatch(op);
      else
	ceph_abort_msg("bad C_Booted return value");
    }
  };

  /// Completion that replies with the map at epoch e once it commits.
  struct C_ReplyMap : public C_MonOp {
    OSDMonitor *osdmon;
    epoch_t e;
    C_ReplyMap(OSDMonitor *o, MonOpRequestRef op_, epoch_t ee)
      : C_MonOp(op_), osdmon(o), e(ee) {}
    void _finish(int r) override {
      if (r >= 0)
	osdmon->_reply_map(op, e);
      else if (r == -ECANCELED)
	return;
      else if (r == -EAGAIN)
	osdmon->dispatch(op);
      else
	ceph_abort_msg("bad C_ReplyMap return value");
    }
  };
  /// Completion that sends a pool-op reply once the change commits.
  struct C_PoolOp : public C_MonOp {
    OSDMonitor *osdmon;
    int replyCode;
    int epoch;
    ceph::buffer::list reply_data;
    C_PoolOp(OSDMonitor * osd, MonOpRequestRef op_, int rc, int e, ceph::buffer::list *rd=NULL) :
      C_MonOp(op_), osdmon(osd), replyCode(rc), epoch(e) {
      if (rd)
	reply_data = *rd;
    }
    void _finish(int r) override {
      if (r >= 0)
	osdmon->_pool_op_reply(op, replyCode, epoch, &reply_data);
      else if (r == -ECANCELED)
	return;
      else if (r == -EAGAIN)
	osdmon->dispatch(op);
      else
	ceph_abort_msg("bad C_PoolOp return value");
    }
  };

  bool preprocess_remove_snaps(MonOpRequestRef op);
  bool prepare_remove_snaps(MonOpRequestRef op);

  bool preprocess_get_purged_snaps(MonOpRequestRef op);

  int load_metadata(int osd, std::map<std::string, std::string>& m,
		    std::ostream *err);
  void count_metadata(const std::string& field, ceph::Formatter *f);

  void reencode_incremental_map(ceph::buffer::list& bl, uint64_t features);
  void reencode_full_map(ceph::buffer::list& bl, uint64_t features);
public:
  void count_metadata(const std::string& field, std::map<std::string,int> *out);
  void get_versions(std::map<std::string, std::list<std::string>> &versions);
protected:
  int get_osd_objectstore_type(int osd, std::string *type);
  bool is_pool_currently_all_bluestore(int64_t pool_id, const pg_pool_t &pool,
				       std::ostream *err);

  // when we last received PG stats from each osd and the osd's osd_beacon_report_interval
  std::map<int, std::pair<utime_t, int>> last_osd_report;
  // TODO: use last_osd_report to store the osd report epochs, once we don't
  // need to upgrade from pre-luminous releases.
  std::map<int,epoch_t> osd_epochs;
  LastEpochClean last_epoch_clean;
  bool preprocess_beacon(MonOpRequestRef op);
  bool prepare_beacon(MonOpRequestRef op);
  epoch_t get_min_last_epoch_clean() const;

  friend class C_UpdateCreatingPGs;
  std::map<int, std::map<epoch_t, std::set<spg_t>>> creating_pgs_by_osd_epoch;
  std::vector<pg_t> pending_created_pgs;
  // the epoch when the pg mapping was calculated
  epoch_t creating_pgs_epoch = 0;
  creating_pgs_t creating_pgs;
  mutable std::mutex creating_pgs_lock;

  creating_pgs_t update_pending_pgs(const OSDMap::Incremental& inc,
				    const OSDMap& nextmap);
  unsigned scan_for_creating_pgs(
      const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
      const mempool::osdmap::set<int64_t>& removed_pools,
      utime_t modified,
      creating_pgs_t* creating_pgs) const;
  std::pair<int32_t, pg_t> get_parent_pg(pg_t pgid) const;
  void update_creating_pgs();
  void check_pg_creates_subs();
  epoch_t send_pg_creates(int osd, Connection *con, epoch_t next) const;

  int32_t _allocate_osd_id(int32_t* existing_id);

  int get_grace_interval_threshold();
  bool grace_interval_threshold_exceeded(int last_failed);
  void set_default_laggy_params(int target_osd);

public:
  OSDMonitor(CephContext *cct, Monitor &mn, Paxos &p, const std::string& service_name);

  void tick() override;  // check state, take actions

  bool preprocess_command(MonOpRequestRef op);
  bool prepare_command(MonOpRequestRef op);
  bool prepare_command_impl(MonOpRequestRef op, const cmdmap_t& cmdmap);

  int validate_osd_create(
      const int32_t id,
      const uuid_d& uuid,
      const bool check_osd_exists,
      int32_t* existing_id,
      std::stringstream& ss);
  int prepare_command_osd_create(
      const int32_t id,
      const uuid_d& uuid,
      int32_t* existing_id,
      std::stringstream& ss);
  void do_osd_create(const int32_t id, const uuid_d& uuid,
		     const std::string& device_class,
		     int32_t* new_id);
  int prepare_command_osd_purge(int32_t id, std::stringstream& ss);
  int prepare_command_osd_destroy(int32_t id, std::stringstream& ss);
  int _prepare_command_osd_crush_remove(
      CrushWrapper &newcrush,
      int32_t id,
      int32_t ancestor,
      bool has_ancestor,
      bool unlink_only);
  void do_osd_crush_remove(CrushWrapper& newcrush);
  int prepare_command_osd_crush_remove(
      CrushWrapper &newcrush,
      int32_t id,
      int32_t ancestor,
      bool has_ancestor,
      bool unlink_only);
  int prepare_command_osd_remove(int32_t id);
  int prepare_command_osd_new(
      MonOpRequestRef op,
      const cmdmap_t& cmdmap,
      const std::map<std::string,std::string>& secrets,
      std::stringstream &ss,
      ceph::Formatter *f);

  int prepare_command_pool_set(const cmdmap_t& cmdmap,
			       std::stringstream& ss);

  int prepare_command_pool_application(const std::string &prefix,
				       const cmdmap_t& cmdmap,
				       std::stringstream& ss);
  int preprocess_command_pool_application(const std::string &prefix,
					  const cmdmap_t& cmdmap,
					  std::stringstream& ss,
					  bool *modified);
  int _command_pool_application(const std::string &prefix,
				const cmdmap_t& cmdmap,
				std::stringstream& ss,
				bool *modified,
				bool preparing);

  bool handle_osd_timeouts(const utime_t &now,
			   std::map<int, std::pair<utime_t, int>> &last_osd_report);

  void send_latest(MonOpRequestRef op, epoch_t start=0);
  void send_latest_now_nodelete(MonOpRequestRef op, epoch_t start=0) {
    op->mark_osdmon_event(__func__);
    send_incremental(op, start);
  }

  int get_version(version_t ver, ceph::buffer::list& bl) override;
  int get_version(version_t ver, uint64_t feature, ceph::buffer::list& bl);

  int get_version_full(version_t ver, uint64_t feature, ceph::buffer::list& bl);
  int get_version_full(version_t ver, ceph::buffer::list& bl) override;
  int get_inc(version_t ver, OSDMap::Incremental& inc);
  int get_full_from_pinned_map(version_t ver, ceph::buffer::list& bl);

  epoch_t blocklist(const entity_addrvec_t& av, utime_t until);
  epoch_t blocklist(entity_addr_t a, utime_t until);

  void dump_info(ceph::Formatter *f);
  int dump_osd_metadata(int osd, ceph::Formatter *f, std::ostream *err);
  void print_nodes(ceph::Formatter *f);

  void check_osdmap_sub(Subscription *sub);
  void check_pg_creates_sub(Subscription *sub);

  void do_application_enable(int64_t pool_id, const std::string &app_name,
			     const std::string &app_key="",
			     const std::string &app_value="",
			     bool force=false);
  void do_set_pool_opt(int64_t pool_id, pool_opts_t::key_t opt,
		       pool_opts_t::value_t);

  // Set a cluster-wide osdmap flag in the pending incremental.
  // new_flags < 0 means it hasn't been initialized from osdmap.flags yet.
  void add_flag(int flag) {
    if (!(osdmap.flags & flag)) {
      if (pending_inc.new_flags < 0)
	pending_inc.new_flags = osdmap.flags;
      pending_inc.new_flags |= flag;
    }
  }

  // Clear a cluster-wide osdmap flag in the pending incremental.
  void remove_flag(int flag) {
    if(osdmap.flags & flag) {
      if (pending_inc.new_flags < 0)
	pending_inc.new_flags = osdmap.flags;
      pending_inc.new_flags &= ~flag;
    }
  }
  void convert_pool_priorities(void);
  /**
   * Find the pools which are requested to be put into stretch mode,
   * validate that they are allowed to be in stretch mode (eg, are replicated)
   * and place copies of them in the pools set.
   * This does not make any changes to the pools or state; it's just
   * a safety-check-and-collect function.
   */
  void try_enable_stretch_mode_pools(stringstream& ss, bool *okay,
				     int *errcode,
				     set<pg_pool_t*>* pools, const string& new_crush_rule);
  /**
   * Check validity of inputs and OSD/CRUSH state to
   * engage stretch mode. Designed to be used with
   * MonmapMonitor::try_enable_stretch_mode() where we call both twice,
   * first with commit=false to validate.
   * @param ss: a stringstream to write errors into
   * @param okay: Filled to true if okay, false if validation fails
   * @param errcode: filled with -errno if there's a problem
   * @param commit: true if we should commit the change, false if just testing
   * @param dividing_bucket: the bucket type (eg 'dc') that divides the cluster
   * @param bucket_count: The number of buckets required in peering.
   *  Currently must be 2.
   * @param pools: The pg_pool_ts which are being set to stretch mode (obtained
   *  from try_enable_stretch_mode_pools()).
   * @param new_crush_rule: The crush rule to set the pools to.
   */
  void try_enable_stretch_mode(stringstream& ss, bool *okay,
			       int *errcode, bool commit,
			       const string& dividing_bucket,
			       uint32_t bucket_count,
			       const set<pg_pool_t*>& pools,
			       const string& new_crush_rule);
  /**
   * Check the input dead_buckets mapping (buckets->dead monitors) to see
   * if the OSDs are also down. If so, fill in really_down_buckets and
   * really_down_mons and return true; else return false.
   */
  bool check_for_dead_crush_zones(const map<string,set<string>>& dead_buckets,
				  set<int> *really_down_buckets,
				  set<string> *really_down_mons);
  /**
   * Set degraded mode in the OSDMap, adding the given dead buckets to the dead set
   * and using the live_zones (should presently be size 1)
   */
  void trigger_degraded_stretch_mode(const set<int>& dead_buckets,
				     const set<string>& live_zones);
  /**
   * Set recovery stretch mode in the OSDMap, resetting pool size back to normal
   */
  void trigger_recovery_stretch_mode();
  /**
   * Tells the OSD there's a new pg digest, in case it's interested.
   * (It's interested when in recovering stretch mode.)
   */
  void notify_new_pg_digest();
  /**
   * Check if we can exit recovery stretch mode and go back to normal.
   * @param force If true, we will force the exit through once it is legal,
   *  without regard to the reported PG status.
   */
  void try_end_recovery_stretch_mode(bool force);
  /**
   * Sets the osdmap and pg_pool_t values back to healthy stretch mode status.
   */
  void trigger_healthy_stretch_mode();
private:
  utime_t stretch_recovery_triggered; // what time we committed a switch to recovery mode
};
859
860 #endif