]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/OSDMonitor.h
86878aa315ae70df7c27aeb15717b312d53284db
[ceph.git] / ceph / src / mon / OSDMonitor.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18 /* Object Store Device (OSD) Monitor
19 */
20
21 #ifndef CEPH_OSDMONITOR_H
22 #define CEPH_OSDMONITOR_H
23
24 #include <map>
25 #include <set>
26
27 #include "include/types.h"
28 #include "include/encoding.h"
29 #include "common/simple_cache.hpp"
30 #include "common/PriorityCache.h"
31 #include "msg/Messenger.h"
32
33 #include "osd/OSDMap.h"
34 #include "osd/OSDMapMapping.h"
35
36 #include "CreatingPGs.h"
37 #include "PaxosService.h"
38
39 class Monitor;
40 class PGMap;
41 class MonSession;
42 class MOSDMap;
43
44 #include "erasure-code/ErasureCodeInterface.h"
45 #include "mon/MonOpRequest.h"
46 #include <boost/functional/hash.hpp>
47 // re-include our assert to clobber the system one; fix dout:
48 #include "include/ceph_assert.h"
49
50 /// information about a particular peer's failure reports for one osd
51 struct failure_reporter_t {
52 utime_t failed_since; ///< when they think it failed
53 MonOpRequestRef op; ///< failure op request
54
55 failure_reporter_t() {}
56 explicit failure_reporter_t(utime_t s) : failed_since(s) {}
57 ~failure_reporter_t() { }
58 };
59
/// information about all failure reports for one osd
///
/// Invariant: max_failed_since is either utime_t() (meaning "stale, must be
/// recomputed lazily from reporters") or the maximum failed_since across all
/// current reporters.  cancel_report() deliberately resets it to utime_t()
/// rather than rescanning, and get_failed_since() repairs it on demand.
struct failure_info_t {
  map<int, failure_reporter_t> reporters;  ///< reporter -> failed_since etc
  utime_t max_failed_since;  ///< most recent failed_since

  failure_info_t() {}

  /// return the most recent failed_since across all reporters, recomputing
  /// it first if a cancel_report() invalidated the cached maximum.
  utime_t get_failed_since() {
    if (max_failed_since == utime_t() && !reporters.empty()) {
      // the old max must have canceled; recalculate.
      for (map<int, failure_reporter_t>::iterator p = reporters.begin();
	   p != reporters.end();
	   ++p)
	if (p->second.failed_since > max_failed_since)
	  max_failed_since = p->second.failed_since;
    }
    return max_failed_since;
  }

  // set the message for the latest report. return any old op request we had,
  // if any, so we can discard it.
  MonOpRequestRef add_report(int who, utime_t failed_since,
			     MonOpRequestRef op) {
    map<int, failure_reporter_t>::iterator p = reporters.find(who);
    if (p == reporters.end()) {
      // only bump the cached max when it is valid (non-zero); when it is
      // stale, get_failed_since() will recompute it from scratch anyway.
      if (max_failed_since != utime_t() && max_failed_since < failed_since)
	max_failed_since = failed_since;
      p = reporters.insert(map<int, failure_reporter_t>::value_type(who, failure_reporter_t(failed_since))).first;
    }

    // swap in the new op; hand the previous one (possibly null) back to the
    // caller so it can be discarded.
    MonOpRequestRef ret = p->second.op;
    p->second.op = op;
    return ret;
  }

  /// move every outstanding report op into ls, leaving the reporter entries
  /// (and their failed_since times) in place.
  void take_report_messages(list<MonOpRequestRef>& ls) {
    for (map<int, failure_reporter_t>::iterator p = reporters.begin();
	 p != reporters.end();
	 ++p) {
      if (p->second.op) {
	ls.push_back(p->second.op);
	p->second.op.reset();
      }
    }
  }

  /// drop the given reporter entirely; returns its op (possibly null) so the
  /// caller can discard it.  Invalidates the cached max (see class comment).
  MonOpRequestRef cancel_report(int who) {
    map<int, failure_reporter_t>::iterator p = reporters.find(who);
    if (p == reporters.end())
      return MonOpRequestRef();
    MonOpRequestRef ret = p->second.op;
    reporters.erase(p);
    max_failed_since = utime_t();
    return ret;
  }
};
116
117
// Tracks the last-epoch-clean reported for each pg, grouped by pool, so the
// monitor can compute a lower bound for osdmap trimming.  Method bodies live
// in OSDMonitor.cc.
class LastEpochClean {
  struct Lec {
    vector<epoch_t> epoch_by_pg;  // per-pg last-epoch-clean, indexed by ps
    ps_t next_missing = 0;        // first pg with no report yet
    epoch_t floor = std::numeric_limits<epoch_t>::max();  // min over reported pgs
    void report(ps_t pg, epoch_t last_epoch_clean);
  };
  std::map<uint64_t, Lec> report_by_pool;  // pool id -> per-pool state
public:
  // record a last-epoch-clean report for one pg
  void report(const pg_t& pg, epoch_t last_epoch_clean);
  // forget all state for a deleted pool
  void remove_pool(uint64_t pool);
  // epoch below which it is safe to trim, given the latest osdmap
  // (exact semantics defined by the implementation in OSDMonitor.cc)
  epoch_t get_lower_bound(const OSDMap& latest) const;

  void dump(Formatter *f) const;
};
133
134
135 struct osdmap_manifest_t {
136 // all the maps we have pinned -- i.e., won't be removed unless
137 // they are inside a trim interval.
138 set<version_t> pinned;
139
140 osdmap_manifest_t() {}
141
142 version_t get_last_pinned() const
143 {
144 set<version_t>::const_reverse_iterator it = pinned.crbegin();
145 if (it == pinned.crend()) {
146 return 0;
147 }
148 return *it;
149 }
150
151 version_t get_first_pinned() const
152 {
153 set<version_t>::const_iterator it = pinned.cbegin();
154 if (it == pinned.cend()) {
155 return 0;
156 }
157 return *it;
158 }
159
160 bool is_pinned(version_t v) const
161 {
162 return pinned.find(v) != pinned.end();
163 }
164
165 void pin(version_t v)
166 {
167 pinned.insert(v);
168 }
169
170 version_t get_lower_closest_pinned(version_t v) const {
171 set<version_t>::const_iterator p = pinned.lower_bound(v);
172 if (p == pinned.cend()) {
173 return 0;
174 } else if (*p > v) {
175 if (p == pinned.cbegin()) {
176 return 0;
177 }
178 --p;
179 }
180 return *p;
181 }
182
183 void encode(bufferlist& bl) const
184 {
185 ENCODE_START(1, 1, bl);
186 encode(pinned, bl);
187 ENCODE_FINISH(bl);
188 }
189
190 void decode(bufferlist::const_iterator& bl)
191 {
192 DECODE_START(1, bl);
193 decode(pinned, bl);
194 DECODE_FINISH(bl);
195 }
196
197 void decode(bufferlist& bl) {
198 auto p = bl.cbegin();
199 decode(p);
200 }
201
202 void dump(Formatter *f) {
203 f->dump_unsigned("first_pinned", get_first_pinned());
204 f->dump_unsigned("last_pinned", get_last_pinned());
205 f->open_array_section("pinned_maps");
206 for (auto& i : pinned) {
207 f->dump_unsigned("epoch", i);
208 }
209 f->close_section();
210 }
211 };
212 WRITE_CLASS_ENCODER(osdmap_manifest_t);
213
/**
 * Paxos service managing the cluster's OSDMap: osd up/down/in/out state,
 * failure reporting, pool creation/deletion, CRUSH changes, snapshot
 * trimming, and distribution of full/incremental maps to clients and osds.
 * Also observes mon config changes (md_config_obs_t) for its cache settings.
 */
class OSDMonitor : public PaxosService,
		   public md_config_obs_t {
  CephContext *cct;

public:
  OSDMap osdmap;  ///< the latest committed osdmap

  // config observer
  const char** get_tracked_conf_keys() const override;
  void handle_conf_change(const ConfigProxy& conf,
                          const std::set<std::string> &changed) override;
  // [leader] state accumulated for the next proposed epoch
  OSDMap::Incremental pending_inc;
  map<int, bufferlist> pending_metadata;   ///< osd -> metadata to store
  set<int> pending_metadata_rm;            ///< osd metadata to remove
  map<int, failure_info_t> failure_info;   ///< osd -> outstanding failure reports
  map<int,utime_t> down_pending_out;       ///< osd down -> out
  bool priority_convert = false;
  map<int64_t,set<snapid_t>> pending_pseudo_purged_snaps;
  std::shared_ptr<PriorityCache::PriCache> rocksdb_binned_kv_cache = nullptr;
  std::shared_ptr<PriorityCache::Manager> pcm = nullptr;
  ceph::mutex balancer_lock = ceph::make_mutex("OSDMonitor::balancer_lock");

  map<int,double> osd_weight;

  // cache key: (map version, feature set it was encoded for)
  using osdmap_key_t = std::pair<version_t, uint64_t>;
  using osdmap_cache_t = SimpleLRU<osdmap_key_t,
                                   bufferlist,
                                   std::less<osdmap_key_t>,
                                   boost::hash<osdmap_key_t>>;
  osdmap_cache_t inc_osd_cache;   ///< encoded incremental maps
  osdmap_cache_t full_osd_cache;  ///< encoded full maps

  bool has_osdmap_manifest;
  osdmap_manifest_t osdmap_manifest;  ///< pinned full maps (see struct above)

  bool check_failures(utime_t now);
  bool check_failure(utime_t now, int target_osd, failure_info_t& fi);
  void force_failure(int target_osd, int by);

  bool _have_pending_crush();
  CrushWrapper &_get_stable_crush();
  void _get_pending_crush(CrushWrapper& newcrush);

  enum FastReadType {
    FAST_READ_OFF,
    FAST_READ_ON,
    FAST_READ_DEFAULT
  };

  /**
   * Parallel job that finds and cancels pg_upmap entries which are no
   * longer valid for the given osdmap, recording removals in pending_inc.
   */
  struct CleanUpmapJob : public ParallelPGMapper::Job {
    CephContext *cct;
    const OSDMap& osdmap;
    OSDMap::Incremental& pending_inc;
    // lock to protect pending_inc from changing
    // when checking is done
    ceph::mutex pending_inc_lock =
      ceph::make_mutex("CleanUpmapJob::pending_inc_lock");

    CleanUpmapJob(CephContext *cct, const OSDMap& om, OSDMap::Incremental& pi)
      : ParallelPGMapper::Job(&om),
        cct(cct),
        osdmap(om),
        pending_inc(pi) {}

    void process(const vector<pg_t>& to_check) override {
      vector<pg_t> to_cancel;
      map<pg_t, mempool::osdmap::vector<pair<int,int>>> to_remap;
      osdmap.check_pg_upmaps(cct, to_check, &to_cancel, &to_remap);
      // don't bother taking lock if nothing changes
      if (!to_cancel.empty() || !to_remap.empty()) {
        std::lock_guard l(pending_inc_lock);
        osdmap.clean_pg_upmaps(cct, &pending_inc, to_cancel, to_remap);
      }
    }

    void process(int64_t poolid, unsigned ps_begin, unsigned ps_end) override {}
    void complete() override {}
  }; // public as this will need to be accessible from TestTestOSDMap.cc

  // svc
public:
  void create_initial() override;
  void get_store_prefixes(std::set<string>& s) const override;

private:
  void update_from_paxos(bool *need_bootstrap) override;
  void create_pending() override;  // prepare a new pending
  void encode_pending(MonitorDBStore::TransactionRef t) override;
  void on_active() override;
  void on_restart() override;
  void on_shutdown() override;

  /* osdmap full map prune */
  void load_osdmap_manifest();
  bool should_prune() const;
  void _prune_update_trimmed(
      MonitorDBStore::TransactionRef tx,
      version_t first);
  void prune_init(osdmap_manifest_t& manifest);
  bool _prune_sanitize_options() const;
  bool is_prune_enabled() const;
  bool is_prune_supported() const;
  bool do_prune(MonitorDBStore::TransactionRef tx);

  // Priority cache control
  uint32_t mon_osd_cache_size = 0;   ///< Number of cached OSDMaps
  uint64_t rocksdb_cache_size = 0;   ///< Cache for kv Db
  double cache_kv_ratio = 0;         ///< Cache ratio dedicated to kv
  double cache_inc_ratio = 0;        ///< Cache ratio dedicated to inc
  double cache_full_ratio = 0;       ///< Cache ratio dedicated to full
  uint64_t mon_memory_base = 0;      ///< Mon base memory for cache autotuning
  double mon_memory_fragmentation = 0;  ///< Expected memory fragmentation
  uint64_t mon_memory_target = 0;    ///< Mon target memory for cache autotuning
  uint64_t mon_memory_min = 0;       ///< Min memory to cache osdmaps
  bool mon_memory_autotune = false;  ///< Cache auto tune setting
  int register_cache_with_pcm();
  int _set_cache_sizes();
  int _set_cache_ratios();
  void _set_new_cache_sizes();
  void _set_cache_autotuning();
  int _update_mon_cache_settings();

  friend struct OSDMemCache;
  friend struct IncCache;
  friend struct FullCache;

  /**
   * we haven't delegated full version stashing to paxosservice for some time
   * now, making this function useless in current context.
   */
  void encode_full(MonitorDBStore::TransactionRef t) override { }
  /**
   * do not let paxosservice periodically stash full osdmaps, or we will break our
   * locally-managed full maps.  (update_from_paxos loads the latest and writes them
   * out going forward from there, but if we just synced that may mean we skip some.)
   */
  bool should_stash_full() override {
    return false;
  }

  /**
   * hook into trim to include the oldest full map in the trim transaction
   *
   * This ensures that anyone post-sync will have enough to rebuild their
   * full osdmaps.
   */
  void encode_trim_extra(MonitorDBStore::TransactionRef tx, version_t first) override;

  void update_msgr_features();
  int check_cluster_features(uint64_t features, stringstream &ss);
  /**
   * check if the cluster supports the features required by the
   * given crush map. Outputs the daemons which don't support it
   * to the stringstream.
   *
   * @returns true if the map is passable, false otherwise
   */
  bool validate_crush_against_features(const CrushWrapper *newcrush,
                                       stringstream &ss);
  void check_osdmap_subs();
  void share_map_with_random_osd();

  ceph::mutex prime_pg_temp_lock =
    ceph::make_mutex("OSDMonitor::prime_pg_temp_lock");
  /**
   * Parallel job that calls prime_pg_temp() for each pg in a pool shard
   * range, pre-populating pg_temp entries for the next map.
   */
  struct PrimeTempJob : public ParallelPGMapper::Job {
    OSDMonitor *osdmon;
    PrimeTempJob(const OSDMap& om, OSDMonitor *m)
      : ParallelPGMapper::Job(&om), osdmon(m) {}
    void process(int64_t pool, unsigned ps_begin, unsigned ps_end) override {
      for (unsigned ps = ps_begin; ps < ps_end; ++ps) {
        pg_t pgid(ps, pool);
        osdmon->prime_pg_temp(*osdmap, pgid);
      }
    }
    void process(const vector<pg_t>& pgs) override {}
    void complete() override {}
  };
  void maybe_prime_pg_temp();
  void prime_pg_temp(const OSDMap& next, pg_t pgid);

  ParallelPGMapper mapper;                        ///< for background pg work
  OSDMapMapping mapping;                          ///< pg <-> osd mappings
  unique_ptr<ParallelPGMapper::Job> mapping_job;  ///< background mapping job
  void start_mapping();

  void update_logger();

  void handle_query(PaxosServiceMessage *m);
  bool preprocess_query(MonOpRequestRef op) override;  // true if processed.
  bool prepare_update(MonOpRequestRef op) override;
  bool should_propose(double &delay) override;

  version_t get_trim_to() const override;

  // policy checks gating state transitions for a single osd
  bool can_mark_down(int o);
  bool can_mark_up(int o);
  bool can_mark_out(int o);
  bool can_mark_in(int o);

  // map distribution to clients/osds
  MOSDMap *build_latest_full(uint64_t features);
  MOSDMap *build_incremental(epoch_t first, epoch_t last, uint64_t features);
  void send_full(MonOpRequestRef op);
  void send_incremental(MonOpRequestRef op, epoch_t first);
public:
  // @param req an optional op request, if the osdmaps are replies to it. so
  // @c Monitor::send_reply() can mark_event with it.
  void send_incremental(epoch_t first, MonSession *session, bool onetime,
                        MonOpRequestRef req = MonOpRequestRef());

private:
  void print_utilization(ostream &out, Formatter *f, bool tree) const;

  bool check_source(MonOpRequestRef op, uuid_d fsid);

  bool preprocess_get_osdmap(MonOpRequestRef op);

  bool preprocess_mark_me_down(MonOpRequestRef op);

  friend class C_AckMarkedDown;
  // failure report handling (preprocess_* = read-only fast path,
  // prepare_* = may mutate pending state; same convention throughout)
  bool preprocess_failure(MonOpRequestRef op);
  bool prepare_failure(MonOpRequestRef op);
  bool prepare_mark_me_down(MonOpRequestRef op);
  void process_failures();
  void take_all_failures(list<MonOpRequestRef>& ls);

  bool preprocess_mark_me_dead(MonOpRequestRef op);
  bool prepare_mark_me_dead(MonOpRequestRef op);

  bool preprocess_full(MonOpRequestRef op);
  bool prepare_full(MonOpRequestRef op);

  bool preprocess_boot(MonOpRequestRef op);
  bool prepare_boot(MonOpRequestRef op);
  void _booted(MonOpRequestRef op, bool logit);

  void update_up_thru(int from, epoch_t up_thru);
  bool preprocess_alive(MonOpRequestRef op);
  bool prepare_alive(MonOpRequestRef op);
  void _reply_map(MonOpRequestRef op, epoch_t e);

  bool preprocess_pgtemp(MonOpRequestRef op);
  bool prepare_pgtemp(MonOpRequestRef op);

  bool preprocess_pg_created(MonOpRequestRef op);
  bool prepare_pg_created(MonOpRequestRef op);

  bool preprocess_pg_ready_to_merge(MonOpRequestRef op);
  bool prepare_pg_ready_to_merge(MonOpRequestRef op);

  // pool / tier validation helpers
  int _check_remove_pool(int64_t pool_id, const pg_pool_t &pool, ostream *ss);
  bool _check_become_tier(
      int64_t tier_pool_id, const pg_pool_t *tier_pool,
      int64_t base_pool_id, const pg_pool_t *base_pool,
      int *err, ostream *ss) const;
  bool _check_remove_tier(
      int64_t base_pool_id, const pg_pool_t *base_pool, const pg_pool_t *tier_pool,
      int *err, ostream *ss) const;

  int _prepare_remove_pool(int64_t pool, ostream *ss, bool no_fake);
  int _prepare_rename_pool(int64_t pool, string newname);

  bool enforce_pool_op_caps(MonOpRequestRef op);
  bool preprocess_pool_op (MonOpRequestRef op);
  bool preprocess_pool_op_create (MonOpRequestRef op);
  bool prepare_pool_op (MonOpRequestRef op);
  bool prepare_pool_op_create (MonOpRequestRef op);
  bool prepare_pool_op_delete(MonOpRequestRef op);
  int crush_rename_bucket(const string& srcname,
                          const string& dstname,
                          ostream *ss);
  void check_legacy_ec_plugin(const string& plugin,
                              const string& profile) const;
  int normalize_profile(const string& profilename,
                        ErasureCodeProfile &profile,
                        bool force,
                        ostream *ss);
  int crush_rule_create_erasure(const string &name,
                                const string &profile,
                                int *rule,
                                ostream *ss);
  int get_crush_rule(const string &rule_name,
                     int *crush_rule,
                     ostream *ss);
  int get_erasure_code(const string &erasure_code_profile,
                       ErasureCodeInterfaceRef *erasure_code,
                       ostream *ss) const;
  int prepare_pool_crush_rule(const unsigned pool_type,
                              const string &erasure_code_profile,
                              const string &rule_name,
                              int *crush_rule,
                              ostream *ss);
  bool erasure_code_profile_in_use(
      const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
      const string &profile,
      ostream *ss);
  int parse_erasure_code_profile(const vector<string> &erasure_code_profile,
                                 map<string,string> *erasure_code_profile_map,
                                 ostream *ss);
  int prepare_pool_size(const unsigned pool_type,
                        const string &erasure_code_profile,
                        uint8_t repl_size,
                        unsigned *size, unsigned *min_size,
                        ostream *ss);
  int prepare_pool_stripe_width(const unsigned pool_type,
                                const string &erasure_code_profile,
                                unsigned *stripe_width,
                                ostream *ss);
  int check_pg_num(int64_t pool, int pg_num, int size, ostream* ss);
  int prepare_new_pool(string& name,
                       int crush_rule,
                       const string &crush_rule_name,
                       unsigned pg_num, unsigned pgp_num,
                       unsigned pg_num_min,
                       uint64_t repl_size,
                       const uint64_t target_size_bytes,
                       const float target_size_ratio,
                       const string &erasure_code_profile,
                       const unsigned pool_type,
                       const uint64_t expected_num_objects,
                       FastReadType fast_read,
                       const string& pg_autoscale_mode,
                       ostream *ss);
  int prepare_new_pool(MonOpRequestRef op);

  void set_pool_flags(int64_t pool_id, uint64_t flags);
  void clear_pool_flags(int64_t pool_id, uint64_t flags);
  bool update_pools_status();

  bool _is_removed_snap(int64_t pool_id, snapid_t snapid);
  bool _is_pending_removed_snap(int64_t pool_id, snapid_t snapid);

  // kv-store key construction for purged-snap records
  string make_purged_snap_epoch_key(epoch_t epoch);
  string make_purged_snap_key(int64_t pool, snapid_t snap);
  string make_purged_snap_key_value(int64_t pool, snapid_t snap, snapid_t num,
                                    epoch_t epoch, bufferlist *v);

  bool try_prune_purged_snaps();
  int lookup_purged_snap(int64_t pool, snapid_t snap,
                         snapid_t *begin, snapid_t *end);

  void insert_purged_snap_update(
      int64_t pool,
      snapid_t start, snapid_t end,
      epoch_t epoch,
      MonitorDBStore::TransactionRef t);

  bool prepare_set_flag(MonOpRequestRef op, int flag);
  bool prepare_unset_flag(MonOpRequestRef op, int flag);

  void _pool_op_reply(MonOpRequestRef op,
                      int ret, epoch_t epoch, bufferlist *blp=NULL);

  /**
   * Completion run once a boot proposal resolves: success re-invokes
   * _booted(), -EAGAIN re-dispatches the op, -ECANCELED drops it.
   */
  struct C_Booted : public C_MonOp {
    OSDMonitor *cmon;
    bool logit;
    C_Booted(OSDMonitor *cm, MonOpRequestRef op_, bool l=true) :
      C_MonOp(op_), cmon(cm), logit(l) {}
    void _finish(int r) override {
      if (r >= 0)
        cmon->_booted(op, logit);
      else if (r == -ECANCELED)
        return;
      else if (r == -EAGAIN)
        cmon->dispatch(op);
      else
        ceph_abort_msg("bad C_Booted return value");
    }
  };

  /// completion that replies with the osdmap at epoch e (same r-code
  /// convention as C_Booted)
  struct C_ReplyMap : public C_MonOp {
    OSDMonitor *osdmon;
    epoch_t e;
    C_ReplyMap(OSDMonitor *o, MonOpRequestRef op_, epoch_t ee)
      : C_MonOp(op_), osdmon(o), e(ee) {}
    void _finish(int r) override {
      if (r >= 0)
        osdmon->_reply_map(op, e);
      else if (r == -ECANCELED)
        return;
      else if (r == -EAGAIN)
        osdmon->dispatch(op);
      else
        ceph_abort_msg("bad C_ReplyMap return value");
    }
  };
  /// completion that sends a pool-op reply carrying replyCode/epoch/data
  /// (same r-code convention as C_Booted)
  struct C_PoolOp : public C_MonOp {
    OSDMonitor *osdmon;
    int replyCode;
    int epoch;
    bufferlist reply_data;
    C_PoolOp(OSDMonitor * osd, MonOpRequestRef op_, int rc, int e, bufferlist *rd=NULL) :
      C_MonOp(op_), osdmon(osd), replyCode(rc), epoch(e) {
      if (rd)
        reply_data = *rd;
    }
    void _finish(int r) override {
      if (r >= 0)
        osdmon->_pool_op_reply(op, replyCode, epoch, &reply_data);
      else if (r == -ECANCELED)
        return;
      else if (r == -EAGAIN)
        osdmon->dispatch(op);
      else
        ceph_abort_msg("bad C_PoolOp return value");
    }
  };

  bool preprocess_remove_snaps(MonOpRequestRef op);
  bool prepare_remove_snaps(MonOpRequestRef op);

  bool preprocess_get_purged_snaps(MonOpRequestRef op);

  int load_metadata(int osd, map<string, string>& m, ostream *err);
  void count_metadata(const string& field, Formatter *f);

  void reencode_incremental_map(bufferlist& bl, uint64_t features);
  void reencode_full_map(bufferlist& bl, uint64_t features);
public:
  void count_metadata(const string& field, map<string,int> *out);
protected:
  int get_osd_objectstore_type(int osd, std::string *type);
  bool is_pool_currently_all_bluestore(int64_t pool_id, const pg_pool_t &pool,
                                       ostream *err);

  // when we last received PG stats from each osd
  map<int,utime_t> last_osd_report;
  // TODO: use last_osd_report to store the osd report epochs, once we don't
  // need to upgrade from pre-luminous releases.
  map<int,epoch_t> osd_epochs;
  LastEpochClean last_epoch_clean;
  bool preprocess_beacon(MonOpRequestRef op);
  bool prepare_beacon(MonOpRequestRef op);
  epoch_t get_min_last_epoch_clean() const;

  friend class C_UpdateCreatingPGs;
  // pg-creation bookkeeping; guarded by creating_pgs_lock where mutated
  // from background jobs (see C_UpdateCreatingPGs / OSDMonitor.cc)
  std::map<int, std::map<epoch_t, std::set<spg_t>>> creating_pgs_by_osd_epoch;
  std::vector<pg_t> pending_created_pgs;
  // the epoch when the pg mapping was calculated
  epoch_t creating_pgs_epoch = 0;
  creating_pgs_t creating_pgs;
  mutable std::mutex creating_pgs_lock;

  creating_pgs_t update_pending_pgs(const OSDMap::Incremental& inc,
                                    const OSDMap& nextmap);
  unsigned scan_for_creating_pgs(
    const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
    const mempool::osdmap::set<int64_t>& removed_pools,
    utime_t modified,
    creating_pgs_t* creating_pgs) const;
  pair<int32_t, pg_t> get_parent_pg(pg_t pgid) const;
  void update_creating_pgs();
  void check_pg_creates_subs();
  epoch_t send_pg_creates(int osd, Connection *con, epoch_t next) const;

  int32_t _allocate_osd_id(int32_t* existing_id);

public:
  OSDMonitor(CephContext *cct, Monitor *mn, Paxos *p, const string& service_name);

  void tick() override;  // check state, take actions

  bool preprocess_command(MonOpRequestRef op);
  bool prepare_command(MonOpRequestRef op);
  bool prepare_command_impl(MonOpRequestRef op, const cmdmap_t& cmdmap);

  // "osd create/new/destroy/purge" command plumbing
  int validate_osd_create(
      const int32_t id,
      const uuid_d& uuid,
      const bool check_osd_exists,
      int32_t* existing_id,
      stringstream& ss);
  int prepare_command_osd_create(
      const int32_t id,
      const uuid_d& uuid,
      int32_t* existing_id,
      stringstream& ss);
  void do_osd_create(const int32_t id, const uuid_d& uuid,
                     const string& device_class,
                     int32_t* new_id);
  int prepare_command_osd_purge(int32_t id, stringstream& ss);
  int prepare_command_osd_destroy(int32_t id, stringstream& ss);
  int _prepare_command_osd_crush_remove(
      CrushWrapper &newcrush,
      int32_t id,
      int32_t ancestor,
      bool has_ancestor,
      bool unlink_only);
  void do_osd_crush_remove(CrushWrapper& newcrush);
  int prepare_command_osd_crush_remove(
      CrushWrapper &newcrush,
      int32_t id,
      int32_t ancestor,
      bool has_ancestor,
      bool unlink_only);
  int prepare_command_osd_remove(int32_t id);
  int prepare_command_osd_new(
      MonOpRequestRef op,
      const cmdmap_t& cmdmap,
      const map<string,string>& secrets,
      stringstream &ss,
      Formatter *f);

  int prepare_command_pool_set(const cmdmap_t& cmdmap,
                               stringstream& ss);

  int prepare_command_pool_application(const string &prefix,
                                       const cmdmap_t& cmdmap,
                                       stringstream& ss);
  int preprocess_command_pool_application(const string &prefix,
                                          const cmdmap_t& cmdmap,
                                          stringstream& ss,
                                          bool *modified);
  int _command_pool_application(const string &prefix,
                                const cmdmap_t& cmdmap,
                                stringstream& ss,
                                bool *modified,
                                bool preparing);

  bool handle_osd_timeouts(const utime_t &now,
                           std::map<int,utime_t> &last_osd_report);

  void send_latest(MonOpRequestRef op, epoch_t start=0);
  void send_latest_now_nodelete(MonOpRequestRef op, epoch_t start=0) {
    op->mark_osdmon_event(__func__);
    send_incremental(op, start);
  }

  int get_version(version_t ver, bufferlist& bl) override;
  int get_version(version_t ver, uint64_t feature, bufferlist& bl);

  int get_version_full(version_t ver, uint64_t feature, bufferlist& bl);
  int get_version_full(version_t ver, bufferlist& bl) override;
  int get_inc(version_t ver, OSDMap::Incremental& inc);
  int get_full_from_pinned_map(version_t ver, bufferlist& bl);

  epoch_t blacklist(const entity_addrvec_t& av, utime_t until);
  epoch_t blacklist(entity_addr_t a, utime_t until);

  void dump_info(Formatter *f);
  int dump_osd_metadata(int osd, Formatter *f, ostream *err);
  void print_nodes(Formatter *f);

  void check_osdmap_sub(Subscription *sub);
  void check_pg_creates_sub(Subscription *sub);

  void do_application_enable(int64_t pool_id, const std::string &app_name,
                             const std::string &app_key="",
                             const std::string &app_value="",
                             bool force=false);
  void do_set_pool_opt(int64_t pool_id, pool_opts_t::key_t opt,
                       pool_opts_t::value_t);

  /// stage setting an osdmap flag in pending_inc, seeding new_flags from
  /// the current map's flags the first time it is touched (new_flags < 0
  /// means "unset" in the incremental)
  void add_flag(int flag) {
    if (!(osdmap.flags & flag)) {
      if (pending_inc.new_flags < 0)
        pending_inc.new_flags = osdmap.flags;
      pending_inc.new_flags |= flag;
    }
  }

  /// stage clearing an osdmap flag in pending_inc (mirror of add_flag)
  void remove_flag(int flag) {
    if(osdmap.flags & flag) {
      if (pending_inc.new_flags < 0)
        pending_inc.new_flags = osdmap.flags;
      pending_inc.new_flags &= ~flag;
    }
  }
  void convert_pool_priorities(void);
};
785
786 #endif