]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/OSDMonitor.h
fb941a6a0e56ef740a4dd75f514b92e7a7491a13
[ceph.git] / ceph / src / mon / OSDMonitor.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18 /* Object Store Device (OSD) Monitor
19 */
20
21 #ifndef CEPH_OSDMONITOR_H
22 #define CEPH_OSDMONITOR_H
23
24 #include <map>
25 #include <set>
26
27 #include "include/types.h"
28 #include "include/encoding.h"
29 #include "common/simple_cache.hpp"
30 #include "common/PriorityCache.h"
31 #include "msg/Messenger.h"
32
33 #include "osd/OSDMap.h"
34 #include "osd/OSDMapMapping.h"
35
36 #include "CreatingPGs.h"
37 #include "PaxosService.h"
38
39 class Monitor;
40 class PGMap;
41 class MonSession;
42 class MOSDMap;
43
44 #include "erasure-code/ErasureCodeInterface.h"
45 #include "mon/MonOpRequest.h"
46 #include <boost/functional/hash.hpp>
47 // re-include our assert to clobber the system one; fix dout:
48 #include "include/ceph_assert.h"
49
50 /// information about a particular peer's failure reports for one osd
51 struct failure_reporter_t {
52 utime_t failed_since; ///< when they think it failed
53 MonOpRequestRef op; ///< failure op request
54
55 failure_reporter_t() {}
56 failure_reporter_t(utime_t s, MonOpRequestRef op)
57 : failed_since(s), op(op) {}
58 ~failure_reporter_t() { }
59 };
60
61 /// information about all failure reports for one osd
62 struct failure_info_t {
63 map<int, failure_reporter_t> reporters; ///< reporter -> failed_since etc
64 utime_t max_failed_since; ///< most recent failed_since
65
66 failure_info_t() {}
67
68 utime_t get_failed_since() {
69 if (max_failed_since == utime_t() && !reporters.empty()) {
70 // the old max must have canceled; recalculate.
71 for (map<int, failure_reporter_t>::iterator p = reporters.begin();
72 p != reporters.end();
73 ++p)
74 if (p->second.failed_since > max_failed_since)
75 max_failed_since = p->second.failed_since;
76 }
77 return max_failed_since;
78 }
79
80 // set the message for the latest report.
81 void add_report(int who, utime_t failed_since, MonOpRequestRef op) {
82 [[maybe_unused]] auto [it, new_reporter] =
83 reporters.insert_or_assign(who, failure_reporter_t{failed_since, op});
84 if (new_reporter) {
85 if (max_failed_since != utime_t() && max_failed_since < failed_since) {
86 max_failed_since = failed_since;
87 }
88 }
89 }
90
91 void take_report_messages(list<MonOpRequestRef>& ls) {
92 for (map<int, failure_reporter_t>::iterator p = reporters.begin();
93 p != reporters.end();
94 ++p) {
95 if (p->second.op) {
96 ls.push_back(p->second.op);
97 p->second.op.reset();
98 }
99 }
100 }
101
102 void cancel_report(int who) {
103 reporters.erase(who);
104 max_failed_since = utime_t();
105 }
106 };
107
108
109 class LastEpochClean {
110 struct Lec {
111 vector<epoch_t> epoch_by_pg;
112 ps_t next_missing = 0;
113 epoch_t floor = std::numeric_limits<epoch_t>::max();
114 void report(ps_t pg, epoch_t last_epoch_clean);
115 };
116 std::map<uint64_t, Lec> report_by_pool;
117 public:
118 void report(const pg_t& pg, epoch_t last_epoch_clean);
119 void remove_pool(uint64_t pool);
120 epoch_t get_lower_bound(const OSDMap& latest) const;
121
122 void dump(Formatter *f) const;
123 };
124
125
126 struct osdmap_manifest_t {
127 // all the maps we have pinned -- i.e., won't be removed unless
128 // they are inside a trim interval.
129 set<version_t> pinned;
130
131 osdmap_manifest_t() {}
132
133 version_t get_last_pinned() const
134 {
135 set<version_t>::const_reverse_iterator it = pinned.crbegin();
136 if (it == pinned.crend()) {
137 return 0;
138 }
139 return *it;
140 }
141
142 version_t get_first_pinned() const
143 {
144 set<version_t>::const_iterator it = pinned.cbegin();
145 if (it == pinned.cend()) {
146 return 0;
147 }
148 return *it;
149 }
150
151 bool is_pinned(version_t v) const
152 {
153 return pinned.find(v) != pinned.end();
154 }
155
156 void pin(version_t v)
157 {
158 pinned.insert(v);
159 }
160
161 version_t get_lower_closest_pinned(version_t v) const {
162 set<version_t>::const_iterator p = pinned.lower_bound(v);
163 if (p == pinned.cend()) {
164 return 0;
165 } else if (*p > v) {
166 if (p == pinned.cbegin()) {
167 return 0;
168 }
169 --p;
170 }
171 return *p;
172 }
173
174 void encode(bufferlist& bl) const
175 {
176 ENCODE_START(1, 1, bl);
177 encode(pinned, bl);
178 ENCODE_FINISH(bl);
179 }
180
181 void decode(bufferlist::const_iterator& bl)
182 {
183 DECODE_START(1, bl);
184 decode(pinned, bl);
185 DECODE_FINISH(bl);
186 }
187
188 void decode(bufferlist& bl) {
189 auto p = bl.cbegin();
190 decode(p);
191 }
192
193 void dump(Formatter *f) {
194 f->dump_unsigned("first_pinned", get_first_pinned());
195 f->dump_unsigned("last_pinned", get_last_pinned());
196 f->open_array_section("pinned_maps");
197 for (auto& i : pinned) {
198 f->dump_unsigned("epoch", i);
199 }
200 f->close_section();
201 }
202 };
203 WRITE_CLASS_ENCODER(osdmap_manifest_t);
204
205 class OSDMonitor : public PaxosService,
206 public md_config_obs_t {
207 CephContext *cct;
208
209 public:
210 OSDMap osdmap;
211
212 // config observer
213 const char** get_tracked_conf_keys() const override;
214 void handle_conf_change(const ConfigProxy& conf,
215 const std::set<std::string> &changed) override;
216 // [leader]
217 OSDMap::Incremental pending_inc;
218 map<int, bufferlist> pending_metadata;
219 set<int> pending_metadata_rm;
220 map<int, failure_info_t> failure_info;
221 map<int,utime_t> down_pending_out; // osd down -> out
222 bool priority_convert = false;
223 map<int64_t,set<snapid_t>> pending_pseudo_purged_snaps;
224 std::shared_ptr<PriorityCache::PriCache> rocksdb_binned_kv_cache = nullptr;
225 std::shared_ptr<PriorityCache::Manager> pcm = nullptr;
226 ceph::mutex balancer_lock = ceph::make_mutex("OSDMonitor::balancer_lock");
227
228 map<int,double> osd_weight;
229
230 using osdmap_key_t = std::pair<version_t, uint64_t>;
231 using osdmap_cache_t = SimpleLRU<osdmap_key_t,
232 bufferlist,
233 std::less<osdmap_key_t>,
234 boost::hash<osdmap_key_t>>;
235 osdmap_cache_t inc_osd_cache;
236 osdmap_cache_t full_osd_cache;
237
238 bool has_osdmap_manifest;
239 osdmap_manifest_t osdmap_manifest;
240
241 bool check_failures(utime_t now);
242 bool check_failure(utime_t now, int target_osd, failure_info_t& fi);
243 utime_t get_grace_time(utime_t now, int target_osd, failure_info_t& fi) const;
244 bool is_failure_stale(utime_t now, failure_info_t& fi) const;
245 void force_failure(int target_osd, int by);
246
247 bool _have_pending_crush();
248 CrushWrapper &_get_stable_crush();
249 void _get_pending_crush(CrushWrapper& newcrush);
250
251 enum FastReadType {
252 FAST_READ_OFF,
253 FAST_READ_ON,
254 FAST_READ_DEFAULT
255 };
256
257 struct CleanUpmapJob : public ParallelPGMapper::Job {
258 CephContext *cct;
259 const OSDMap& osdmap;
260 OSDMap::Incremental& pending_inc;
261 // lock to protect pending_inc form changing
262 // when checking is done
263 ceph::mutex pending_inc_lock =
264 ceph::make_mutex("CleanUpmapJob::pending_inc_lock");
265
266 CleanUpmapJob(CephContext *cct, const OSDMap& om, OSDMap::Incremental& pi)
267 : ParallelPGMapper::Job(&om),
268 cct(cct),
269 osdmap(om),
270 pending_inc(pi) {}
271
272 void process(const vector<pg_t>& to_check) override {
273 vector<pg_t> to_cancel;
274 map<pg_t, mempool::osdmap::vector<pair<int,int>>> to_remap;
275 osdmap.check_pg_upmaps(cct, to_check, &to_cancel, &to_remap);
276 // don't bother taking lock if nothing changes
277 if (!to_cancel.empty() || !to_remap.empty()) {
278 std::lock_guard l(pending_inc_lock);
279 osdmap.clean_pg_upmaps(cct, &pending_inc, to_cancel, to_remap);
280 }
281 }
282
283 void process(int64_t poolid, unsigned ps_begin, unsigned ps_end) override {}
284 void complete() override {}
285 }; // public as this will need to be accessible from TestTestOSDMap.cc
286
287 // svc
288 public:
289 void create_initial() override;
290 void get_store_prefixes(std::set<string>& s) const override;
291
292 private:
293 void update_from_paxos(bool *need_bootstrap) override;
294 void create_pending() override; // prepare a new pending
295 void encode_pending(MonitorDBStore::TransactionRef t) override;
296 void on_active() override;
297 void on_restart() override;
298 void on_shutdown() override;
299
300 /* osdmap full map prune */
301 void load_osdmap_manifest();
302 bool should_prune() const;
303 void _prune_update_trimmed(
304 MonitorDBStore::TransactionRef tx,
305 version_t first);
306 void prune_init(osdmap_manifest_t& manifest);
307 bool _prune_sanitize_options() const;
308 bool is_prune_enabled() const;
309 bool is_prune_supported() const;
310 bool do_prune(MonitorDBStore::TransactionRef tx);
311
312 // Priority cache control
313 uint32_t mon_osd_cache_size = 0; ///< Number of cached OSDMaps
314 uint64_t rocksdb_cache_size = 0; ///< Cache for kv Db
315 double cache_kv_ratio = 0; ///< Cache ratio dedicated to kv
316 double cache_inc_ratio = 0; ///< Cache ratio dedicated to inc
317 double cache_full_ratio = 0; ///< Cache ratio dedicated to full
318 uint64_t mon_memory_base = 0; ///< Mon base memory for cache autotuning
319 double mon_memory_fragmentation = 0; ///< Expected memory fragmentation
320 uint64_t mon_memory_target = 0; ///< Mon target memory for cache autotuning
321 uint64_t mon_memory_min = 0; ///< Min memory to cache osdmaps
322 bool mon_memory_autotune = false; ///< Cache auto tune setting
323 int register_cache_with_pcm();
324 int _set_cache_sizes();
325 int _set_cache_ratios();
326 void _set_new_cache_sizes();
327 void _set_cache_autotuning();
328 int _update_mon_cache_settings();
329
330 friend struct OSDMemCache;
331 friend struct IncCache;
332 friend struct FullCache;
333
334 /**
335 * we haven't delegated full version stashing to paxosservice for some time
336 * now, making this function useless in current context.
337 */
338 void encode_full(MonitorDBStore::TransactionRef t) override { }
339 /**
340 * do not let paxosservice periodically stash full osdmaps, or we will break our
341 * locally-managed full maps. (update_from_paxos loads the latest and writes them
342 * out going forward from there, but if we just synced that may mean we skip some.)
343 */
344 bool should_stash_full() override {
345 return false;
346 }
347
348 /**
349 * hook into trim to include the oldest full map in the trim transaction
350 *
351 * This ensures that anyone post-sync will have enough to rebuild their
352 * full osdmaps.
353 */
354 void encode_trim_extra(MonitorDBStore::TransactionRef tx, version_t first) override;
355
356 void update_msgr_features();
357 int check_cluster_features(uint64_t features, stringstream &ss);
358 /**
359 * check if the cluster supports the features required by the
360 * given crush map. Outputs the daemons which don't support it
361 * to the stringstream.
362 *
363 * @returns true if the map is passable, false otherwise
364 */
365 bool validate_crush_against_features(const CrushWrapper *newcrush,
366 stringstream &ss);
367 void check_osdmap_subs();
368 void share_map_with_random_osd();
369
370 ceph::mutex prime_pg_temp_lock =
371 ceph::make_mutex("OSDMonitor::prime_pg_temp_lock");
372 struct PrimeTempJob : public ParallelPGMapper::Job {
373 OSDMonitor *osdmon;
374 PrimeTempJob(const OSDMap& om, OSDMonitor *m)
375 : ParallelPGMapper::Job(&om), osdmon(m) {}
376 void process(int64_t pool, unsigned ps_begin, unsigned ps_end) override {
377 for (unsigned ps = ps_begin; ps < ps_end; ++ps) {
378 pg_t pgid(ps, pool);
379 osdmon->prime_pg_temp(*osdmap, pgid);
380 }
381 }
382 void process(const vector<pg_t>& pgs) override {}
383 void complete() override {}
384 };
385 void maybe_prime_pg_temp();
386 void prime_pg_temp(const OSDMap& next, pg_t pgid);
387
388 ParallelPGMapper mapper; ///< for background pg work
389 OSDMapMapping mapping; ///< pg <-> osd mappings
390 unique_ptr<ParallelPGMapper::Job> mapping_job; ///< background mapping job
391 void start_mapping();
392
393 void update_logger();
394
395 void handle_query(PaxosServiceMessage *m);
396 bool preprocess_query(MonOpRequestRef op) override; // true if processed.
397 bool prepare_update(MonOpRequestRef op) override;
398 bool should_propose(double &delay) override;
399
400 version_t get_trim_to() const override;
401
402 bool can_mark_down(int o);
403 bool can_mark_up(int o);
404 bool can_mark_out(int o);
405 bool can_mark_in(int o);
406
407 // ...
408 MOSDMap *build_latest_full(uint64_t features);
409 MOSDMap *build_incremental(epoch_t first, epoch_t last, uint64_t features);
410 void send_full(MonOpRequestRef op);
411 void send_incremental(MonOpRequestRef op, epoch_t first);
412 public:
413 // @param req an optional op request; if the osdmaps are sent in reply to
414 // it, @c Monitor::send_reply() can mark_event with it.
415 void send_incremental(epoch_t first, MonSession *session, bool onetime,
416 MonOpRequestRef req = MonOpRequestRef());
417
418 private:
419 void print_utilization(ostream &out, Formatter *f, bool tree) const;
420
421 bool check_source(MonOpRequestRef op, uuid_d fsid);
422
423 bool preprocess_get_osdmap(MonOpRequestRef op);
424
425 bool preprocess_mark_me_down(MonOpRequestRef op);
426
427 friend class C_AckMarkedDown;
428 bool preprocess_failure(MonOpRequestRef op);
429 bool prepare_failure(MonOpRequestRef op);
430 bool prepare_mark_me_down(MonOpRequestRef op);
431 void process_failures();
432 void take_all_failures(list<MonOpRequestRef>& ls);
433
434 bool preprocess_mark_me_dead(MonOpRequestRef op);
435 bool prepare_mark_me_dead(MonOpRequestRef op);
436
437 bool preprocess_full(MonOpRequestRef op);
438 bool prepare_full(MonOpRequestRef op);
439
440 bool preprocess_boot(MonOpRequestRef op);
441 bool prepare_boot(MonOpRequestRef op);
442 void _booted(MonOpRequestRef op, bool logit);
443
444 void update_up_thru(int from, epoch_t up_thru);
445 bool preprocess_alive(MonOpRequestRef op);
446 bool prepare_alive(MonOpRequestRef op);
447 void _reply_map(MonOpRequestRef op, epoch_t e);
448
449 bool preprocess_pgtemp(MonOpRequestRef op);
450 bool prepare_pgtemp(MonOpRequestRef op);
451
452 bool preprocess_pg_created(MonOpRequestRef op);
453 bool prepare_pg_created(MonOpRequestRef op);
454
455 bool preprocess_pg_ready_to_merge(MonOpRequestRef op);
456 bool prepare_pg_ready_to_merge(MonOpRequestRef op);
457
458 int _check_remove_pool(int64_t pool_id, const pg_pool_t &pool, ostream *ss);
459 bool _check_become_tier(
460 int64_t tier_pool_id, const pg_pool_t *tier_pool,
461 int64_t base_pool_id, const pg_pool_t *base_pool,
462 int *err, ostream *ss) const;
463 bool _check_remove_tier(
464 int64_t base_pool_id, const pg_pool_t *base_pool, const pg_pool_t *tier_pool,
465 int *err, ostream *ss) const;
466
467 int _prepare_remove_pool(int64_t pool, ostream *ss, bool no_fake);
468 int _prepare_rename_pool(int64_t pool, string newname);
469
470 bool enforce_pool_op_caps(MonOpRequestRef op);
471 bool preprocess_pool_op (MonOpRequestRef op);
472 bool preprocess_pool_op_create (MonOpRequestRef op);
473 bool prepare_pool_op (MonOpRequestRef op);
474 bool prepare_pool_op_create (MonOpRequestRef op);
475 bool prepare_pool_op_delete(MonOpRequestRef op);
476 int crush_rename_bucket(const string& srcname,
477 const string& dstname,
478 ostream *ss);
479 void check_legacy_ec_plugin(const string& plugin,
480 const string& profile) const;
481 int normalize_profile(const string& profilename,
482 ErasureCodeProfile &profile,
483 bool force,
484 ostream *ss);
485 int crush_rule_create_erasure(const string &name,
486 const string &profile,
487 int *rule,
488 ostream *ss);
489 int get_crush_rule(const string &rule_name,
490 int *crush_rule,
491 ostream *ss);
492 int get_erasure_code(const string &erasure_code_profile,
493 ErasureCodeInterfaceRef *erasure_code,
494 ostream *ss) const;
495 int prepare_pool_crush_rule(const unsigned pool_type,
496 const string &erasure_code_profile,
497 const string &rule_name,
498 int *crush_rule,
499 ostream *ss);
500 bool erasure_code_profile_in_use(
501 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
502 const string &profile,
503 ostream *ss);
504 int parse_erasure_code_profile(const vector<string> &erasure_code_profile,
505 map<string,string> *erasure_code_profile_map,
506 ostream *ss);
507 int prepare_pool_size(const unsigned pool_type,
508 const string &erasure_code_profile,
509 uint8_t repl_size,
510 unsigned *size, unsigned *min_size,
511 ostream *ss);
512 int prepare_pool_stripe_width(const unsigned pool_type,
513 const string &erasure_code_profile,
514 unsigned *stripe_width,
515 ostream *ss);
516 int check_pg_num(int64_t pool, int pg_num, int size, ostream* ss);
517 int prepare_new_pool(string& name,
518 int crush_rule,
519 const string &crush_rule_name,
520 unsigned pg_num, unsigned pgp_num,
521 unsigned pg_num_min,
522 uint64_t repl_size,
523 const uint64_t target_size_bytes,
524 const float target_size_ratio,
525 const string &erasure_code_profile,
526 const unsigned pool_type,
527 const uint64_t expected_num_objects,
528 FastReadType fast_read,
529 const string& pg_autoscale_mode,
530 ostream *ss);
531 int prepare_new_pool(MonOpRequestRef op);
532
533 void set_pool_flags(int64_t pool_id, uint64_t flags);
534 void clear_pool_flags(int64_t pool_id, uint64_t flags);
535 bool update_pools_status();
536
537 bool _is_removed_snap(int64_t pool_id, snapid_t snapid);
538 bool _is_pending_removed_snap(int64_t pool_id, snapid_t snapid);
539
540 string make_purged_snap_epoch_key(epoch_t epoch);
541 string make_purged_snap_key(int64_t pool, snapid_t snap);
542 string make_purged_snap_key_value(int64_t pool, snapid_t snap, snapid_t num,
543 epoch_t epoch, bufferlist *v);
544
545 bool try_prune_purged_snaps();
546 int lookup_purged_snap(int64_t pool, snapid_t snap,
547 snapid_t *begin, snapid_t *end);
548
549 void insert_purged_snap_update(
550 int64_t pool,
551 snapid_t start, snapid_t end,
552 epoch_t epoch,
553 MonitorDBStore::TransactionRef t);
554
555 bool prepare_set_flag(MonOpRequestRef op, int flag);
556 bool prepare_unset_flag(MonOpRequestRef op, int flag);
557
558 void _pool_op_reply(MonOpRequestRef op,
559 int ret, epoch_t epoch, bufferlist *blp=NULL);
560
561 struct C_Booted : public C_MonOp {
562 OSDMonitor *cmon;
563 bool logit;
564 C_Booted(OSDMonitor *cm, MonOpRequestRef op_, bool l=true) :
565 C_MonOp(op_), cmon(cm), logit(l) {}
566 void _finish(int r) override {
567 if (r >= 0)
568 cmon->_booted(op, logit);
569 else if (r == -ECANCELED)
570 return;
571 else if (r == -EAGAIN)
572 cmon->dispatch(op);
573 else
574 ceph_abort_msg("bad C_Booted return value");
575 }
576 };
577
578 struct C_ReplyMap : public C_MonOp {
579 OSDMonitor *osdmon;
580 epoch_t e;
581 C_ReplyMap(OSDMonitor *o, MonOpRequestRef op_, epoch_t ee)
582 : C_MonOp(op_), osdmon(o), e(ee) {}
583 void _finish(int r) override {
584 if (r >= 0)
585 osdmon->_reply_map(op, e);
586 else if (r == -ECANCELED)
587 return;
588 else if (r == -EAGAIN)
589 osdmon->dispatch(op);
590 else
591 ceph_abort_msg("bad C_ReplyMap return value");
592 }
593 };
594 struct C_PoolOp : public C_MonOp {
595 OSDMonitor *osdmon;
596 int replyCode;
597 int epoch;
598 bufferlist reply_data;
599 C_PoolOp(OSDMonitor * osd, MonOpRequestRef op_, int rc, int e, bufferlist *rd=NULL) :
600 C_MonOp(op_), osdmon(osd), replyCode(rc), epoch(e) {
601 if (rd)
602 reply_data = *rd;
603 }
604 void _finish(int r) override {
605 if (r >= 0)
606 osdmon->_pool_op_reply(op, replyCode, epoch, &reply_data);
607 else if (r == -ECANCELED)
608 return;
609 else if (r == -EAGAIN)
610 osdmon->dispatch(op);
611 else
612 ceph_abort_msg("bad C_PoolOp return value");
613 }
614 };
615
616 bool preprocess_remove_snaps(MonOpRequestRef op);
617 bool prepare_remove_snaps(MonOpRequestRef op);
618
619 bool preprocess_get_purged_snaps(MonOpRequestRef op);
620
621 int load_metadata(int osd, map<string, string>& m, ostream *err);
622 void count_metadata(const string& field, Formatter *f);
623
624 void reencode_incremental_map(bufferlist& bl, uint64_t features);
625 void reencode_full_map(bufferlist& bl, uint64_t features);
626 public:
627 void count_metadata(const string& field, map<string,int> *out);
628 protected:
629 int get_osd_objectstore_type(int osd, std::string *type);
630 bool is_pool_currently_all_bluestore(int64_t pool_id, const pg_pool_t &pool,
631 ostream *err);
632
633 // when we last received PG stats from each osd
634 map<int,utime_t> last_osd_report;
635 // TODO: use last_osd_report to store the osd report epochs, once we don't
636 // need to upgrade from pre-luminous releases.
637 map<int,epoch_t> osd_epochs;
638 LastEpochClean last_epoch_clean;
639 bool preprocess_beacon(MonOpRequestRef op);
640 bool prepare_beacon(MonOpRequestRef op);
641 epoch_t get_min_last_epoch_clean() const;
642
643 friend class C_UpdateCreatingPGs;
644 std::map<int, std::map<epoch_t, std::set<spg_t>>> creating_pgs_by_osd_epoch;
645 std::vector<pg_t> pending_created_pgs;
646 // the epoch when the pg mapping was calculated
647 epoch_t creating_pgs_epoch = 0;
648 creating_pgs_t creating_pgs;
649 mutable std::mutex creating_pgs_lock;
650
651 creating_pgs_t update_pending_pgs(const OSDMap::Incremental& inc,
652 const OSDMap& nextmap);
653 unsigned scan_for_creating_pgs(
654 const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
655 const mempool::osdmap::set<int64_t>& removed_pools,
656 utime_t modified,
657 creating_pgs_t* creating_pgs) const;
658 pair<int32_t, pg_t> get_parent_pg(pg_t pgid) const;
659 void update_creating_pgs();
660 void check_pg_creates_subs();
661 epoch_t send_pg_creates(int osd, Connection *con, epoch_t next) const;
662
663 int32_t _allocate_osd_id(int32_t* existing_id);
664
665 int get_grace_interval_threshold();
666 bool grace_interval_threshold_exceeded(int last_failed);
667 void set_default_laggy_params(int target_osd);
668
669 public:
670 OSDMonitor(CephContext *cct, Monitor *mn, Paxos *p, const string& service_name);
671
672 void tick() override; // check state, take actions
673
674 bool preprocess_command(MonOpRequestRef op);
675 bool prepare_command(MonOpRequestRef op);
676 bool prepare_command_impl(MonOpRequestRef op, const cmdmap_t& cmdmap);
677
678 int validate_osd_create(
679 const int32_t id,
680 const uuid_d& uuid,
681 const bool check_osd_exists,
682 int32_t* existing_id,
683 stringstream& ss);
684 int prepare_command_osd_create(
685 const int32_t id,
686 const uuid_d& uuid,
687 int32_t* existing_id,
688 stringstream& ss);
689 void do_osd_create(const int32_t id, const uuid_d& uuid,
690 const string& device_class,
691 int32_t* new_id);
692 int prepare_command_osd_purge(int32_t id, stringstream& ss);
693 int prepare_command_osd_destroy(int32_t id, stringstream& ss);
694 int _prepare_command_osd_crush_remove(
695 CrushWrapper &newcrush,
696 int32_t id,
697 int32_t ancestor,
698 bool has_ancestor,
699 bool unlink_only);
700 void do_osd_crush_remove(CrushWrapper& newcrush);
701 int prepare_command_osd_crush_remove(
702 CrushWrapper &newcrush,
703 int32_t id,
704 int32_t ancestor,
705 bool has_ancestor,
706 bool unlink_only);
707 int prepare_command_osd_remove(int32_t id);
708 int prepare_command_osd_new(
709 MonOpRequestRef op,
710 const cmdmap_t& cmdmap,
711 const map<string,string>& secrets,
712 stringstream &ss,
713 Formatter *f);
714
715 int prepare_command_pool_set(const cmdmap_t& cmdmap,
716 stringstream& ss);
717
718 int prepare_command_pool_application(const string &prefix,
719 const cmdmap_t& cmdmap,
720 stringstream& ss);
721 int preprocess_command_pool_application(const string &prefix,
722 const cmdmap_t& cmdmap,
723 stringstream& ss,
724 bool *modified);
725 int _command_pool_application(const string &prefix,
726 const cmdmap_t& cmdmap,
727 stringstream& ss,
728 bool *modified,
729 bool preparing);
730
731 bool handle_osd_timeouts(const utime_t &now,
732 std::map<int,utime_t> &last_osd_report);
733
734 void send_latest(MonOpRequestRef op, epoch_t start=0);
735 void send_latest_now_nodelete(MonOpRequestRef op, epoch_t start=0) {
736 op->mark_osdmon_event(__func__);
737 send_incremental(op, start);
738 }
739
740 int get_version(version_t ver, bufferlist& bl) override;
741 int get_version(version_t ver, uint64_t feature, bufferlist& bl);
742
743 int get_version_full(version_t ver, uint64_t feature, bufferlist& bl);
744 int get_version_full(version_t ver, bufferlist& bl) override;
745 int get_inc(version_t ver, OSDMap::Incremental& inc);
746 int get_full_from_pinned_map(version_t ver, bufferlist& bl);
747
748 epoch_t blacklist(const entity_addrvec_t& av, utime_t until);
749 epoch_t blacklist(entity_addr_t a, utime_t until);
750
751 void dump_info(Formatter *f);
752 int dump_osd_metadata(int osd, Formatter *f, ostream *err);
753 void print_nodes(Formatter *f);
754
755 void check_osdmap_sub(Subscription *sub);
756 void check_pg_creates_sub(Subscription *sub);
757
758 void do_application_enable(int64_t pool_id, const std::string &app_name,
759 const std::string &app_key="",
760 const std::string &app_value="",
761 bool force=false);
762 void do_set_pool_opt(int64_t pool_id, pool_opts_t::key_t opt,
763 pool_opts_t::value_t);
764
765 void add_flag(int flag) {
766 if (!(osdmap.flags & flag)) {
767 if (pending_inc.new_flags < 0)
768 pending_inc.new_flags = osdmap.flags;
769 pending_inc.new_flags |= flag;
770 }
771 }
772
773 void remove_flag(int flag) {
774 if(osdmap.flags & flag) {
775 if (pending_inc.new_flags < 0)
776 pending_inc.new_flags = osdmap.flags;
777 pending_inc.new_flags &= ~flag;
778 }
779 }
780 void convert_pool_priorities(void);
781 };
782
783 #endif