// Source: ceph.git (git.proxmox.com mirror) — ceph/src/mon/OSDMonitor.h
// Imported from Ceph Pacific 16.2.5.
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18 /* Object Store Device (OSD) Monitor
19 */
20
21 #ifndef CEPH_OSDMONITOR_H
22 #define CEPH_OSDMONITOR_H
23
24 #include <map>
25 #include <set>
26 #include <utility>
27
28 #include "include/types.h"
29 #include "include/encoding.h"
30 #include "common/simple_cache.hpp"
31 #include "common/PriorityCache.h"
32 #include "msg/Messenger.h"
33
34 #include "osd/OSDMap.h"
35 #include "osd/OSDMapMapping.h"
36
37 #include "CreatingPGs.h"
38 #include "PaxosService.h"
39
40 #include "erasure-code/ErasureCodeInterface.h"
41 #include "mon/MonOpRequest.h"
42 #include <boost/functional/hash.hpp>
43
44 class Monitor;
45 class PGMap;
46 struct MonSession;
47 class MOSDMap;
48
49
50 /// information about a particular peer's failure reports for one osd
51 struct failure_reporter_t {
52 utime_t failed_since; ///< when they think it failed
53 MonOpRequestRef op; ///< failure op request
54
55 failure_reporter_t() {}
56 failure_reporter_t(utime_t s, MonOpRequestRef op)
57 : failed_since(s), op(op) {}
58 ~failure_reporter_t() { }
59 };
60
61 /// information about all failure reports for one osd
62 struct failure_info_t {
63 std::map<int, failure_reporter_t> reporters; ///< reporter -> failed_since etc
64 utime_t max_failed_since; ///< most recent failed_since
65
66 failure_info_t() {}
67
68 utime_t get_failed_since() {
69 if (max_failed_since == utime_t() && !reporters.empty()) {
70 // the old max must have canceled; recalculate.
71 for (auto p = reporters.begin(); p != reporters.end(); ++p)
72 if (p->second.failed_since > max_failed_since)
73 max_failed_since = p->second.failed_since;
74 }
75 return max_failed_since;
76 }
77
78 // set the message for the latest report.
79 void add_report(int who, utime_t failed_since, MonOpRequestRef op) {
80 [[maybe_unused]] auto [it, new_reporter] =
81 reporters.insert_or_assign(who, failure_reporter_t{failed_since, op});
82 if (new_reporter) {
83 if (max_failed_since != utime_t() && max_failed_since < failed_since) {
84 max_failed_since = failed_since;
85 }
86 }
87 }
88
89 void take_report_messages(std::list<MonOpRequestRef>& ls) {
90 for (auto p = reporters.begin(); p != reporters.end(); ++p) {
91 if (p->second.op) {
92 ls.push_back(p->second.op);
93 p->second.op.reset();
94 }
95 }
96 }
97
98 void cancel_report(int who) {
99 reporters.erase(who);
100 max_failed_since = utime_t();
101 }
102 };
103
104
/// Tracks the last epoch at which each PG was reported clean, bucketed by
/// pool. NOTE(review): presumably feeds the osdmap trim lower bound (see
/// get_trim_to() in OSDMonitor) — member bodies live in OSDMonitor.cc.
105 class LastEpochClean {
106   struct Lec {
    // per-PG last-clean epochs for one pool, indexed by placement seed (ps)
107     std::vector<epoch_t> epoch_by_pg;
    // NOTE(review): presumably the first ps with no report yet — confirm in .cc
108     ps_t next_missing = 0;
    // starts at max; presumably lowered to the min reported epoch — confirm in .cc
109     epoch_t floor = std::numeric_limits<epoch_t>::max();
110     void report(ps_t pg, epoch_t last_epoch_clean);
111   };
  // pool id -> that pool's per-PG last-clean state
112   std::map<uint64_t, Lec> report_by_pool;
113  public:
114   void report(const pg_t& pg, epoch_t last_epoch_clean);
115   void remove_pool(uint64_t pool);
  /// lower bound over all pools/PGs present in 'latest' (defined in .cc)
116   epoch_t get_lower_bound(const OSDMap& latest) const;
117 
118   void dump(Formatter *f) const;
119 };
120
121
122 struct osdmap_manifest_t {
123 // all the maps we have pinned -- i.e., won't be removed unless
124 // they are inside a trim interval.
125 std::set<version_t> pinned;
126
127 osdmap_manifest_t() {}
128
129 version_t get_last_pinned() const
130 {
131 auto it = pinned.crbegin();
132 if (it == pinned.crend()) {
133 return 0;
134 }
135 return *it;
136 }
137
138 version_t get_first_pinned() const
139 {
140 auto it = pinned.cbegin();
141 if (it == pinned.cend()) {
142 return 0;
143 }
144 return *it;
145 }
146
147 bool is_pinned(version_t v) const
148 {
149 return pinned.find(v) != pinned.end();
150 }
151
152 void pin(version_t v)
153 {
154 pinned.insert(v);
155 }
156
157 version_t get_lower_closest_pinned(version_t v) const {
158 auto p = pinned.lower_bound(v);
159 if (p == pinned.cend()) {
160 return 0;
161 } else if (*p > v) {
162 if (p == pinned.cbegin()) {
163 return 0;
164 }
165 --p;
166 }
167 return *p;
168 }
169
170 void encode(ceph::buffer::list& bl) const
171 {
172 ENCODE_START(1, 1, bl);
173 encode(pinned, bl);
174 ENCODE_FINISH(bl);
175 }
176
177 void decode(ceph::buffer::list::const_iterator& bl)
178 {
179 DECODE_START(1, bl);
180 decode(pinned, bl);
181 DECODE_FINISH(bl);
182 }
183
184 void decode(ceph::buffer::list& bl) {
185 auto p = bl.cbegin();
186 decode(p);
187 }
188
189 void dump(ceph::Formatter *f) {
190 f->dump_unsigned("first_pinned", get_first_pinned());
191 f->dump_unsigned("last_pinned", get_last_pinned());
192 f->open_array_section("pinned_maps");
193 for (auto& i : pinned) {
194 f->dump_unsigned("epoch", i);
195 }
196 f->close_section();
197 }
198 };
199 WRITE_CLASS_ENCODER(osdmap_manifest_t);
200
// The OSD monitor: the PaxosService that owns the OSDMap. It processes osd
// failure reports/beacons, builds and distributes incremental and full
// osdmaps, manages pools/CRUSH/erasure-code profiles, drives pg creation,
// osdmap trimming/pruning, and stretch-mode transitions.
201 class OSDMonitor : public PaxosService,
202 public md_config_obs_t {
203 CephContext *cct;
204 
205 public:
206 OSDMap osdmap;
207 
208 // config observer
209 const char** get_tracked_conf_keys() const override;
210 void handle_conf_change(const ConfigProxy& conf,
211 const std::set<std::string> &changed) override;
212 // [leader]
213 OSDMap::Incremental pending_inc;
214 std::map<int, ceph::buffer::list> pending_metadata;
215 std::set<int> pending_metadata_rm;
216 std::map<int, failure_info_t> failure_info;
217 std::map<int,utime_t> down_pending_out; // osd down -> out
218 bool priority_convert = false;
219 std::map<int64_t,std::set<snapid_t>> pending_pseudo_purged_snaps;
220 std::shared_ptr<PriorityCache::PriCache> rocksdb_binned_kv_cache = nullptr;
221 std::shared_ptr<PriorityCache::Manager> pcm = nullptr;
222 ceph::mutex balancer_lock = ceph::make_mutex("OSDMonitor::balancer_lock");
223 
224 std::map<int,double> osd_weight;
225 
  // LRU caches of encoded maps keyed by (epoch, features) -- see
  // get_version()/get_version_full() feature-aware overloads below
226 using osdmap_key_t = std::pair<version_t, uint64_t>;
227 using osdmap_cache_t = SimpleLRU<osdmap_key_t,
228 ceph::buffer::list,
229 std::less<osdmap_key_t>,
230 boost::hash<osdmap_key_t>>;
231 osdmap_cache_t inc_osd_cache;
232 osdmap_cache_t full_osd_cache;
233 
234 bool has_osdmap_manifest;
235 osdmap_manifest_t osdmap_manifest;
236 
237 bool check_failures(utime_t now);
238 bool check_failure(utime_t now, int target_osd, failure_info_t& fi);
239 utime_t get_grace_time(utime_t now, int target_osd, failure_info_t& fi) const;
240 bool is_failure_stale(utime_t now, failure_info_t& fi) const;
241 void force_failure(int target_osd, int by);
242 
243 bool _have_pending_crush();
244 CrushWrapper &_get_stable_crush();
245 void _get_pending_crush(CrushWrapper& newcrush);
246 
247 enum FastReadType {
248 FAST_READ_OFF,
249 FAST_READ_ON,
250 FAST_READ_DEFAULT
251 };
252 
  // Background job that scans pg upmaps and drops invalid ones into
  // pending_inc (via clean_pg_upmaps) under pending_inc_lock.
253 struct CleanUpmapJob : public ParallelPGMapper::Job {
254 CephContext *cct;
255 const OSDMap& osdmap;
256 OSDMap::Incremental& pending_inc;
257 // lock to protect pending_inc form changing
258 // when checking is done
259 ceph::mutex pending_inc_lock =
260 ceph::make_mutex("CleanUpmapJob::pending_inc_lock");
261 
262 CleanUpmapJob(CephContext *cct, const OSDMap& om, OSDMap::Incremental& pi)
263 : ParallelPGMapper::Job(&om),
264 cct(cct),
265 osdmap(om),
266 pending_inc(pi) {}
267 
268 void process(const std::vector<pg_t>& to_check) override {
269 std::vector<pg_t> to_cancel;
270 std::map<pg_t, mempool::osdmap::vector<std::pair<int,int>>> to_remap;
271 osdmap.check_pg_upmaps(cct, to_check, &to_cancel, &to_remap);
272 // don't bother taking lock if nothing changes
273 if (!to_cancel.empty() || !to_remap.empty()) {
274 std::lock_guard l(pending_inc_lock);
275 osdmap.clean_pg_upmaps(cct, &pending_inc, to_cancel, to_remap);
276 }
277 }
278 
279 void process(int64_t poolid, unsigned ps_begin, unsigned ps_end) override {}
280 void complete() override {}
281 }; // public as this will need to be accessible from TestTestOSDMap.cc
282 
283 // svc
284 public:
285 void create_initial() override;
286 void get_store_prefixes(std::set<std::string>& s) const override;
287 
288 private:
289 void update_from_paxos(bool *need_bootstrap) override;
290 void create_pending() override; // prepare a new pending
291 void encode_pending(MonitorDBStore::TransactionRef t) override;
292 void on_active() override;
293 void on_restart() override;
294 void on_shutdown() override;
295 
296 /* osdmap full map prune */
297 void load_osdmap_manifest();
298 bool should_prune() const;
299 void _prune_update_trimmed(
300 MonitorDBStore::TransactionRef tx,
301 version_t first);
302 void prune_init(osdmap_manifest_t& manifest);
303 bool _prune_sanitize_options() const;
304 bool is_prune_enabled() const;
305 bool is_prune_supported() const;
306 bool do_prune(MonitorDBStore::TransactionRef tx);
307 
308 // Priority cache control
309 uint32_t mon_osd_cache_size = 0; ///< Number of cached OSDMaps
310 uint64_t rocksdb_cache_size = 0; ///< Cache for kv Db
311 double cache_kv_ratio = 0; ///< Cache ratio dedicated to kv
312 double cache_inc_ratio = 0; ///< Cache ratio dedicated to inc
313 double cache_full_ratio = 0; ///< Cache ratio dedicated to full
314 uint64_t mon_memory_base = 0; ///< Mon base memory for cache autotuning
315 double mon_memory_fragmentation = 0; ///< Expected memory fragmentation
316 uint64_t mon_memory_target = 0; ///< Mon target memory for cache autotuning
317 uint64_t mon_memory_min = 0; ///< Min memory to cache osdmaps
318 bool mon_memory_autotune = false; ///< Cache auto tune setting
319 int register_cache_with_pcm();
320 int _set_cache_sizes();
321 int _set_cache_ratios();
322 void _set_new_cache_sizes();
323 void _set_cache_autotuning();
324 int _update_mon_cache_settings();
325 
326 friend struct OSDMemCache;
327 friend struct IncCache;
328 friend struct FullCache;
329 
330 /**
331 * we haven't delegated full version stashing to paxosservice for some time
332 * now, making this function useless in current context.
333 */
334 void encode_full(MonitorDBStore::TransactionRef t) override { }
335 /**
336 * do not let paxosservice periodically stash full osdmaps, or we will break our
337 * locally-managed full maps. (update_from_paxos loads the latest and writes them
338 * out going forward from there, but if we just synced that may mean we skip some.)
339 */
340 bool should_stash_full() override {
341 return false;
342 }
343 
344 /**
345 * hook into trim to include the oldest full map in the trim transaction
346 *
347 * This ensures that anyone post-sync will have enough to rebuild their
348 * full osdmaps.
349 */
350 void encode_trim_extra(MonitorDBStore::TransactionRef tx, version_t first) override;
351 
352 void update_msgr_features();
353 /**
354 * check if the cluster supports the features required by the
355 * given crush map. Outputs the daemons which don't support it
356 * to the stringstream.
357 *
358 * @returns true if the map is passable, false otherwise
359 */
360 bool validate_crush_against_features(const CrushWrapper *newcrush,
361 std::stringstream &ss);
362 void check_osdmap_subs();
363 void share_map_with_random_osd();
364 
365 ceph::mutex prime_pg_temp_lock =
366 ceph::make_mutex("OSDMonitor::prime_pg_temp_lock");
  // Background job that calls prime_pg_temp() for each pg in a ps range
367 struct PrimeTempJob : public ParallelPGMapper::Job {
368 OSDMonitor *osdmon;
369 PrimeTempJob(const OSDMap& om, OSDMonitor *m)
370 : ParallelPGMapper::Job(&om), osdmon(m) {}
371 void process(int64_t pool, unsigned ps_begin, unsigned ps_end) override {
372 for (unsigned ps = ps_begin; ps < ps_end; ++ps) {
373 pg_t pgid(ps, pool);
374 osdmon->prime_pg_temp(*osdmap, pgid);
375 }
376 }
377 void process(const std::vector<pg_t>& pgs) override {}
378 void complete() override {}
379 };
380 void maybe_prime_pg_temp();
381 void prime_pg_temp(const OSDMap& next, pg_t pgid);
382 
383 ParallelPGMapper mapper; ///< for background pg work
384 OSDMapMapping mapping; ///< pg <-> osd mappings
385 std::unique_ptr<ParallelPGMapper::Job> mapping_job; ///< background mapping job
386 void start_mapping();
387 
388 void update_logger();
389 
390 void handle_query(PaxosServiceMessage *m);
391 bool preprocess_query(MonOpRequestRef op) override; // true if processed.
392 bool prepare_update(MonOpRequestRef op) override;
393 bool should_propose(double &delay) override;
394 
395 version_t get_trim_to() const override;
396 
397 bool can_mark_down(int o);
398 bool can_mark_up(int o);
399 bool can_mark_out(int o);
400 bool can_mark_in(int o);
401 
402 // ...
403 MOSDMap *build_latest_full(uint64_t features);
404 MOSDMap *build_incremental(epoch_t first, epoch_t last, uint64_t features);
405 void send_full(MonOpRequestRef op);
406 void send_incremental(MonOpRequestRef op, epoch_t first);
407 public:
408 /**
409 * Make sure the existing (up) OSDs support the given features
410 * @return 0 on success, or an error code if any OSDs re missing features.
411 * @param ss Filled in with ane explanation of failure, if any
412 */
413 int check_cluster_features(uint64_t features, std::stringstream &ss);
414 // @param req an optional op request, if the osdmaps are replies to it. so
415 // @c Monitor::send_reply() can mark_event with it.
416 void send_incremental(epoch_t first, MonSession *session, bool onetime,
417 MonOpRequestRef req = MonOpRequestRef());
418 
419 private:
420 void print_utilization(std::ostream &out, ceph::Formatter *f, bool tree) const;
421 
422 bool check_source(MonOpRequestRef op, uuid_d fsid);
423 
424 bool preprocess_get_osdmap(MonOpRequestRef op);
425 
426 bool preprocess_mark_me_down(MonOpRequestRef op);
427 
428 friend class C_AckMarkedDown;
429 bool preprocess_failure(MonOpRequestRef op);
430 bool prepare_failure(MonOpRequestRef op);
431 bool prepare_mark_me_down(MonOpRequestRef op);
432 void process_failures();
433 void take_all_failures(std::list<MonOpRequestRef>& ls);
434 
435 bool preprocess_mark_me_dead(MonOpRequestRef op);
436 bool prepare_mark_me_dead(MonOpRequestRef op);
437 
438 bool preprocess_full(MonOpRequestRef op);
439 bool prepare_full(MonOpRequestRef op);
440 
441 bool preprocess_boot(MonOpRequestRef op);
442 bool prepare_boot(MonOpRequestRef op);
443 void _booted(MonOpRequestRef op, bool logit);
444 
445 void update_up_thru(int from, epoch_t up_thru);
446 bool preprocess_alive(MonOpRequestRef op);
447 bool prepare_alive(MonOpRequestRef op);
448 void _reply_map(MonOpRequestRef op, epoch_t e);
449 
450 bool preprocess_pgtemp(MonOpRequestRef op);
451 bool prepare_pgtemp(MonOpRequestRef op);
452 
453 bool preprocess_pg_created(MonOpRequestRef op);
454 bool prepare_pg_created(MonOpRequestRef op);
455 
456 bool preprocess_pg_ready_to_merge(MonOpRequestRef op);
457 bool prepare_pg_ready_to_merge(MonOpRequestRef op);
458 
459 int _check_remove_pool(int64_t pool_id, const pg_pool_t &pool, std::ostream *ss);
460 bool _check_become_tier(
461 int64_t tier_pool_id, const pg_pool_t *tier_pool,
462 int64_t base_pool_id, const pg_pool_t *base_pool,
463 int *err, std::ostream *ss) const;
464 bool _check_remove_tier(
465 int64_t base_pool_id, const pg_pool_t *base_pool, const pg_pool_t *tier_pool,
466 int *err, std::ostream *ss) const;
467 
468 int _prepare_remove_pool(int64_t pool, std::ostream *ss, bool no_fake);
469 int _prepare_rename_pool(int64_t pool, std::string newname);
470 
471 bool enforce_pool_op_caps(MonOpRequestRef op);
472 bool preprocess_pool_op (MonOpRequestRef op);
473 bool preprocess_pool_op_create (MonOpRequestRef op);
474 bool prepare_pool_op (MonOpRequestRef op);
475 bool prepare_pool_op_create (MonOpRequestRef op);
476 bool prepare_pool_op_delete(MonOpRequestRef op);
477 int crush_rename_bucket(const std::string& srcname,
478 const std::string& dstname,
479 std::ostream *ss);
480 void check_legacy_ec_plugin(const std::string& plugin,
481 const std::string& profile) const;
482 int normalize_profile(const std::string& profilename,
483 ceph::ErasureCodeProfile &profile,
484 bool force,
485 std::ostream *ss);
486 int crush_rule_create_erasure(const std::string &name,
487 const std::string &profile,
488 int *rule,
489 std::ostream *ss);
490 int get_crush_rule(const std::string &rule_name,
491 int *crush_rule,
492 std::ostream *ss);
493 int get_erasure_code(const std::string &erasure_code_profile,
494 ceph::ErasureCodeInterfaceRef *erasure_code,
495 std::ostream *ss) const;
496 int prepare_pool_crush_rule(const unsigned pool_type,
497 const std::string &erasure_code_profile,
498 const std::string &rule_name,
499 int *crush_rule,
500 std::ostream *ss);
501 bool erasure_code_profile_in_use(
502 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
503 const std::string &profile,
504 std::ostream *ss);
505 int parse_erasure_code_profile(const std::vector<std::string> &erasure_code_profile,
506 std::map<std::string,std::string> *erasure_code_profile_map,
507 std::ostream *ss);
508 int prepare_pool_size(const unsigned pool_type,
509 const std::string &erasure_code_profile,
510 uint8_t repl_size,
511 unsigned *size, unsigned *min_size,
512 std::ostream *ss);
513 int prepare_pool_stripe_width(const unsigned pool_type,
514 const std::string &erasure_code_profile,
515 unsigned *stripe_width,
516 std::ostream *ss);
517 int check_pg_num(int64_t pool, int pg_num, int size, std::ostream* ss);
518 int prepare_new_pool(std::string& name,
519 int crush_rule,
520 const std::string &crush_rule_name,
521 unsigned pg_num, unsigned pgp_num,
522 unsigned pg_num_min,
523 uint64_t repl_size,
524 const uint64_t target_size_bytes,
525 const float target_size_ratio,
526 const std::string &erasure_code_profile,
527 const unsigned pool_type,
528 const uint64_t expected_num_objects,
529 FastReadType fast_read,
530 const std::string& pg_autoscale_mode,
531 std::ostream *ss);
532 int prepare_new_pool(MonOpRequestRef op);
533 
534 void set_pool_flags(int64_t pool_id, uint64_t flags);
535 void clear_pool_flags(int64_t pool_id, uint64_t flags);
536 bool update_pools_status();
537 
538 bool _is_removed_snap(int64_t pool_id, snapid_t snapid);
539 bool _is_pending_removed_snap(int64_t pool_id, snapid_t snapid);
540 
541 std::string make_purged_snap_epoch_key(epoch_t epoch);
542 std::string make_purged_snap_key(int64_t pool, snapid_t snap);
543 std::string make_purged_snap_key_value(int64_t pool, snapid_t snap, snapid_t num,
544 epoch_t epoch, ceph::buffer::list *v);
545 
546 bool try_prune_purged_snaps();
547 int lookup_purged_snap(int64_t pool, snapid_t snap,
548 snapid_t *begin, snapid_t *end);
549 
550 void insert_purged_snap_update(
551 int64_t pool,
552 snapid_t start, snapid_t end,
553 epoch_t epoch,
554 MonitorDBStore::TransactionRef t);
555 
556 bool prepare_set_flag(MonOpRequestRef op, int flag);
557 bool prepare_unset_flag(MonOpRequestRef op, int flag);
558 
559 void _pool_op_reply(MonOpRequestRef op,
560 int ret, epoch_t epoch, ceph::buffer::list *blp=NULL);
561 
  // Paxos completion callbacks: each retries the op via dispatch() on
  // -EAGAIN, silently drops it on -ECANCELED, and aborts on any other error.
562 struct C_Booted : public C_MonOp {
563 OSDMonitor *cmon;
564 bool logit;
565 C_Booted(OSDMonitor *cm, MonOpRequestRef op_, bool l=true) :
566 C_MonOp(op_), cmon(cm), logit(l) {}
567 void _finish(int r) override {
568 if (r >= 0)
569 cmon->_booted(op, logit);
570 else if (r == -ECANCELED)
571 return;
572 else if (r == -EAGAIN)
573 cmon->dispatch(op);
574 else
575 ceph_abort_msg("bad C_Booted return value");
576 }
577 };
578 
579 struct C_ReplyMap : public C_MonOp {
580 OSDMonitor *osdmon;
581 epoch_t e;
582 C_ReplyMap(OSDMonitor *o, MonOpRequestRef op_, epoch_t ee)
583 : C_MonOp(op_), osdmon(o), e(ee) {}
584 void _finish(int r) override {
585 if (r >= 0)
586 osdmon->_reply_map(op, e);
587 else if (r == -ECANCELED)
588 return;
589 else if (r == -EAGAIN)
590 osdmon->dispatch(op);
591 else
592 ceph_abort_msg("bad C_ReplyMap return value");
593 }
594 };
595 struct C_PoolOp : public C_MonOp {
596 OSDMonitor *osdmon;
597 int replyCode;
598 int epoch;
599 ceph::buffer::list reply_data;
600 C_PoolOp(OSDMonitor * osd, MonOpRequestRef op_, int rc, int e, ceph::buffer::list *rd=NULL) :
601 C_MonOp(op_), osdmon(osd), replyCode(rc), epoch(e) {
602 if (rd)
603 reply_data = *rd;
604 }
605 void _finish(int r) override {
606 if (r >= 0)
607 osdmon->_pool_op_reply(op, replyCode, epoch, &reply_data);
608 else if (r == -ECANCELED)
609 return;
610 else if (r == -EAGAIN)
611 osdmon->dispatch(op);
612 else
613 ceph_abort_msg("bad C_PoolOp return value");
614 }
615 };
616 
617 bool preprocess_remove_snaps(MonOpRequestRef op);
618 bool prepare_remove_snaps(MonOpRequestRef op);
619 
620 bool preprocess_get_purged_snaps(MonOpRequestRef op);
621 
622 int load_metadata(int osd, std::map<std::string, std::string>& m,
623 std::ostream *err);
624 void count_metadata(const std::string& field, ceph::Formatter *f);
625 
626 void reencode_incremental_map(ceph::buffer::list& bl, uint64_t features);
627 void reencode_full_map(ceph::buffer::list& bl, uint64_t features);
628 public:
629 void count_metadata(const std::string& field, std::map<std::string,int> *out);
630 void get_versions(std::map<std::string, std::list<std::string>> &versions);
631 protected:
632 int get_osd_objectstore_type(int osd, std::string *type);
633 bool is_pool_currently_all_bluestore(int64_t pool_id, const pg_pool_t &pool,
634 std::ostream *err);
635 
636 // when we last received PG stats from each osd and the osd's osd_beacon_report_interval
637 std::map<int, std::pair<utime_t, int>> last_osd_report;
638 // TODO: use last_osd_report to store the osd report epochs, once we don't
639 // need to upgrade from pre-luminous releases.
640 std::map<int,epoch_t> osd_epochs;
641 LastEpochClean last_epoch_clean;
642 bool preprocess_beacon(MonOpRequestRef op);
643 bool prepare_beacon(MonOpRequestRef op);
644 epoch_t get_min_last_epoch_clean() const;
645 
646 friend class C_UpdateCreatingPGs;
647 std::map<int, std::map<epoch_t, std::set<spg_t>>> creating_pgs_by_osd_epoch;
648 std::vector<pg_t> pending_created_pgs;
649 // the epoch when the pg mapping was calculated
650 epoch_t creating_pgs_epoch = 0;
651 creating_pgs_t creating_pgs;
652 mutable std::mutex creating_pgs_lock;
653 
654 creating_pgs_t update_pending_pgs(const OSDMap::Incremental& inc,
655 const OSDMap& nextmap);
656 unsigned scan_for_creating_pgs(
657 const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
658 const mempool::osdmap::set<int64_t>& removed_pools,
659 utime_t modified,
660 creating_pgs_t* creating_pgs) const;
661 std::pair<int32_t, pg_t> get_parent_pg(pg_t pgid) const;
662 void update_creating_pgs();
663 void check_pg_creates_subs();
664 epoch_t send_pg_creates(int osd, Connection *con, epoch_t next) const;
665 
666 int32_t _allocate_osd_id(int32_t* existing_id);
667 
668 int get_grace_interval_threshold();
669 bool grace_interval_threshold_exceeded(int last_failed);
670 void set_default_laggy_params(int target_osd);
671 
672 public:
673 OSDMonitor(CephContext *cct, Monitor &mn, Paxos &p, const std::string& service_name);
674 
675 void tick() override; // check state, take actions
676 
677 bool preprocess_command(MonOpRequestRef op);
678 bool prepare_command(MonOpRequestRef op);
679 bool prepare_command_impl(MonOpRequestRef op, const cmdmap_t& cmdmap);
680 
681 int validate_osd_create(
682 const int32_t id,
683 const uuid_d& uuid,
684 const bool check_osd_exists,
685 int32_t* existing_id,
686 std::stringstream& ss);
687 int prepare_command_osd_create(
688 const int32_t id,
689 const uuid_d& uuid,
690 int32_t* existing_id,
691 std::stringstream& ss);
692 void do_osd_create(const int32_t id, const uuid_d& uuid,
693 const std::string& device_class,
694 int32_t* new_id);
695 int prepare_command_osd_purge(int32_t id, std::stringstream& ss);
696 int prepare_command_osd_destroy(int32_t id, std::stringstream& ss);
697 int _prepare_command_osd_crush_remove(
698 CrushWrapper &newcrush,
699 int32_t id,
700 int32_t ancestor,
701 bool has_ancestor,
702 bool unlink_only);
703 void do_osd_crush_remove(CrushWrapper& newcrush);
704 int prepare_command_osd_crush_remove(
705 CrushWrapper &newcrush,
706 int32_t id,
707 int32_t ancestor,
708 bool has_ancestor,
709 bool unlink_only);
710 int prepare_command_osd_remove(int32_t id);
711 int prepare_command_osd_new(
712 MonOpRequestRef op,
713 const cmdmap_t& cmdmap,
714 const std::map<std::string,std::string>& secrets,
715 std::stringstream &ss,
716 ceph::Formatter *f);
717 
718 int prepare_command_pool_set(const cmdmap_t& cmdmap,
719 std::stringstream& ss);
720 
721 int prepare_command_pool_application(const std::string &prefix,
722 const cmdmap_t& cmdmap,
723 std::stringstream& ss);
724 int preprocess_command_pool_application(const std::string &prefix,
725 const cmdmap_t& cmdmap,
726 std::stringstream& ss,
727 bool *modified);
728 int _command_pool_application(const std::string &prefix,
729 const cmdmap_t& cmdmap,
730 std::stringstream& ss,
731 bool *modified,
732 bool preparing);
733 
734 bool handle_osd_timeouts(const utime_t &now,
735 std::map<int, std::pair<utime_t, int>> &last_osd_report);
736 
737 void send_latest(MonOpRequestRef op, epoch_t start=0);
738 void send_latest_now_nodelete(MonOpRequestRef op, epoch_t start=0) {
739 op->mark_osdmon_event(__func__);
740 send_incremental(op, start);
741 }
742 
743 int get_version(version_t ver, ceph::buffer::list& bl) override;
744 int get_version(version_t ver, uint64_t feature, ceph::buffer::list& bl);
745 
746 int get_version_full(version_t ver, uint64_t feature, ceph::buffer::list& bl);
747 int get_version_full(version_t ver, ceph::buffer::list& bl) override;
748 int get_inc(version_t ver, OSDMap::Incremental& inc);
749 int get_full_from_pinned_map(version_t ver, ceph::buffer::list& bl);
750 
751 epoch_t blocklist(const entity_addrvec_t& av, utime_t until);
752 epoch_t blocklist(entity_addr_t a, utime_t until);
753 
754 void dump_info(ceph::Formatter *f);
755 int dump_osd_metadata(int osd, ceph::Formatter *f, std::ostream *err);
756 void print_nodes(ceph::Formatter *f);
757 
758 void check_osdmap_sub(Subscription *sub);
759 void check_pg_creates_sub(Subscription *sub);
760 
761 void do_application_enable(int64_t pool_id, const std::string &app_name,
762 const std::string &app_key="",
763 const std::string &app_value="",
764 bool force=false);
765 void do_set_pool_opt(int64_t pool_id, pool_opts_t::key_t opt,
766 pool_opts_t::value_t);
767 
  // Stage a cluster-wide osdmap flag in the pending incremental; new_flags
  // is seeded from the current map's flags on the first change this epoch
  // (a negative new_flags means "not yet set").
768 void add_flag(int flag) {
769 if (!(osdmap.flags & flag)) {
770 if (pending_inc.new_flags < 0)
771 pending_inc.new_flags = osdmap.flags;
772 pending_inc.new_flags |= flag;
773 }
774 }
775 
  // Counterpart of add_flag(): clear a flag in the pending incremental
776 void remove_flag(int flag) {
777 if(osdmap.flags & flag) {
778 if (pending_inc.new_flags < 0)
779 pending_inc.new_flags = osdmap.flags;
780 pending_inc.new_flags &= ~flag;
781 }
782 }
783 void convert_pool_priorities(void);
784 /**
785 * Find the pools which are requested to be put into stretch mode,
786 * validate that they are allowed to be in stretch mode (eg, are replicated)
787 * and place copies of them in the pools set.
788 * This does not make any changes to the pools or state; it's just
789 * a safety-check-and-collect function.
790 */
791 void try_enable_stretch_mode_pools(stringstream& ss, bool *okay,
792 int *errcode,
793 set<pg_pool_t*>* pools, const string& new_crush_rule);
794 /**
795 * Check validity of inputs and OSD/CRUSH state to
796 * engage stretch mode. Designed to be used with
797 * MonmapMonitor::try_enable_stretch_mode() where we call both twice,
798 * first with commit=false to validate.
799 * @param ss: a stringstream to write errors into
800 * @param okay: Filled to true if okay, false if validation fails
801 * @param errcode: filled with -errno if there's a problem
802 * @param commit: true if we should commit the change, false if just testing
803 * @param dividing_bucket: the bucket type (eg 'dc') that divides the cluster
804 * @param bucket_count: The number of buckets required in peering.
805 * Currently must be 2.
806 * @param pools: The pg_pool_ts which are being set to stretch mode (obtained
807 * from try_enable_stretch_mode_pools()).
808 * @param new_crush_rule: The crush rule to set the pools to.
809 */
810 void try_enable_stretch_mode(stringstream& ss, bool *okay,
811 int *errcode, bool commit,
812 const string& dividing_bucket,
813 uint32_t bucket_count,
814 const set<pg_pool_t*>& pools,
815 const string& new_crush_rule);
816 /**
817 * Check the input dead_buckets mapping (buckets->dead monitors) to see
818 * if the OSDs are also down. If so, fill in really_down_buckets and
819 * really_down_mons and return true; else return false.
820 */
821 bool check_for_dead_crush_zones(const map<string,set<string>>& dead_buckets,
822 set<int> *really_down_buckets,
823 set<string> *really_down_mons);
824 /**
825 * Set degraded mode in the OSDMap, adding the given dead buckets to the dead set
826 * and using the live_zones (should presently be size 1)
827 */
828 void trigger_degraded_stretch_mode(const set<int>& dead_buckets,
829 const set<string>& live_zones);
830 /**
831 * This is just to maintain stretch_recovery_triggered; below
832 */
833 void set_degraded_stretch_mode();
834 /**
835 * Set recovery stretch mode in the OSDMap, resetting pool size back to normal
836 */
837 void trigger_recovery_stretch_mode();
838 /**
839 * This is just to maintain stretch_recovery_triggered; below
840 */
841 void set_recovery_stretch_mode();
842 /**
843 * This is just to maintain stretch_recovery_triggered; below
844 */
845 void set_healthy_stretch_mode();
846 /**
847 * Tells the OSD there's a new pg digest, in case it's interested.
848 * (It's interested when in recovering stretch mode.)
849 */
850 void notify_new_pg_digest();
851 /**
852 * Check if we can exit recovery stretch mode and go back to normal.
853 * @param force If true, we will force the exit through once it is legal,
854 * without regard to the reported PG status.
855 */
856 void try_end_recovery_stretch_mode(bool force);
857 /**
858 * Sets the osdmap and pg_pool_t values back to healthy stretch mode status.
859 */
860 void trigger_healthy_stretch_mode();
861 private:
862 utime_t stretch_recovery_triggered; // what time we committed a switch to recovery mode
863 };
864
865 #endif