// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

/* Object Store Device (OSD) Monitor
 */

#ifndef CEPH_OSDMONITOR_H
#define CEPH_OSDMONITOR_H

#include <map>
#include <set>

#include "include/types.h"
#include "include/encoding.h"
#include "common/simple_cache.hpp"
#include "common/PriorityCache.h"
#include "msg/Messenger.h"

#include "osd/OSDMap.h"
#include "osd/OSDMapMapping.h"

#include "CreatingPGs.h"
#include "PaxosService.h"

class Monitor;
class PGMap;
class MonSession;
class MOSDMap;

#include "erasure-code/ErasureCodeInterface.h"
#include "mon/MonOpRequest.h"
#include <boost/functional/hash.hpp>
// re-include our assert to clobber the system one; fix dout:
#include "include/ceph_assert.h"

/// information about a particular peer's failure reports for one osd
struct failure_reporter_t {
  utime_t failed_since;     ///< when they think it failed
  MonOpRequestRef op;       ///< failure op request

  failure_reporter_t() {}
  explicit failure_reporter_t(utime_t s) : failed_since(s) {}
  ~failure_reporter_t() { }
};

/// information about all failure reports for one osd
struct failure_info_t {
  map<int, failure_reporter_t> reporters;  ///< reporter -> failed_since etc
  utime_t max_failed_since;                ///< most recent failed_since

  failure_info_t() {}

  utime_t get_failed_since() {
    if (max_failed_since == utime_t() && !reporters.empty()) {
      // the old max must have been canceled; recalculate.
      for (map<int, failure_reporter_t>::iterator p = reporters.begin();
           p != reporters.end();
           ++p)
        if (p->second.failed_since > max_failed_since)
          max_failed_since = p->second.failed_since;
    }
    return max_failed_since;
  }

  // set the message for the latest report. return any old op request we had,
  // if any, so we can discard it.
  MonOpRequestRef add_report(int who, utime_t failed_since,
                             MonOpRequestRef op) {
    map<int, failure_reporter_t>::iterator p = reporters.find(who);
    if (p == reporters.end()) {
      if (max_failed_since != utime_t() && max_failed_since < failed_since)
        max_failed_since = failed_since;
      p = reporters.insert(
        map<int, failure_reporter_t>::value_type(
          who, failure_reporter_t(failed_since))).first;
    }

    MonOpRequestRef ret = p->second.op;
    p->second.op = op;
    return ret;
  }

  void take_report_messages(list<MonOpRequestRef>& ls) {
    for (map<int, failure_reporter_t>::iterator p = reporters.begin();
         p != reporters.end();
         ++p) {
      if (p->second.op) {
        ls.push_back(p->second.op);
        p->second.op.reset();
      }
    }
  }

  MonOpRequestRef cancel_report(int who) {
    map<int, failure_reporter_t>::iterator p = reporters.find(who);
    if (p == reporters.end())
      return MonOpRequestRef();
    MonOpRequestRef ret = p->second.op;
    reporters.erase(p);
    max_failed_since = utime_t();
    return ret;
  }
};
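
// Usage sketch (illustrative only; `fi`, `t3`, `op3`, etc. are hypothetical
// names, not part of this header):
//
//   failure_info_t fi;
//   MonOpRequestRef old = fi.add_report(3, t3, op3);  // first report -> null
//   old = fi.add_report(3, t3b, op3b);                // returns op3 to discard
//   utime_t when = fi.get_failed_since();             // max over all reporters
//   old = fi.cancel_report(3);                        // erases the reporter and
//                                                     // resets max_failed_since,
//                                                     // to be recalculated lazily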


class LastEpochClean {
  struct Lec {
    vector<epoch_t> epoch_by_pg;
    ps_t next_missing = 0;
    epoch_t floor = std::numeric_limits<epoch_t>::max();
    void report(ps_t pg, epoch_t last_epoch_clean);
  };
  std::map<uint64_t, Lec> report_by_pool;
public:
  void report(const pg_t& pg, epoch_t last_epoch_clean);
  void remove_pool(uint64_t pool);
  epoch_t get_lower_bound(const OSDMap& latest) const;
};
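
// Usage sketch (illustrative; `lec`, `ps`, `poolid` are hypothetical names):
// PGs report their last-clean epoch, and the monitor uses the aggregate as a
// floor when deciding how far back osdmaps may be trimmed:
//
//   LastEpochClean lec;
//   lec.report(pg_t(ps, poolid), last_epoch_clean);  // per-PG, e.g. from beacons
//   epoch_t floor = lec.get_lower_bound(osdmap);     // safe osdmap trim bound
//   lec.remove_pool(poolid);                         // drop state of a deleted pool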


struct osdmap_manifest_t {
  // all the maps we have pinned -- i.e., won't be removed unless
  // they are inside a trim interval.
  set<version_t> pinned;

  osdmap_manifest_t() {}

  version_t get_last_pinned() const
  {
    set<version_t>::const_reverse_iterator it = pinned.crbegin();
    if (it == pinned.crend()) {
      return 0;
    }
    return *it;
  }

  version_t get_first_pinned() const
  {
    set<version_t>::const_iterator it = pinned.cbegin();
    if (it == pinned.cend()) {
      return 0;
    }
    return *it;
  }

  bool is_pinned(version_t v) const
  {
    return pinned.find(v) != pinned.end();
  }

  void pin(version_t v)
  {
    pinned.insert(v);
  }

  version_t get_lower_closest_pinned(version_t v) const {
    set<version_t>::const_iterator p = pinned.lower_bound(v);
    if (p == pinned.cend()) {
      return 0;
    } else if (*p > v) {
      if (p == pinned.cbegin()) {
        return 0;
      }
      --p;
    }
    return *p;
  }

  void encode(bufferlist& bl) const
  {
    ENCODE_START(1, 1, bl);
    encode(pinned, bl);
    ENCODE_FINISH(bl);
  }

  void decode(bufferlist::const_iterator& bl)
  {
    DECODE_START(1, bl);
    decode(pinned, bl);
    DECODE_FINISH(bl);
  }

  void decode(bufferlist& bl) {
    auto p = bl.cbegin();
    decode(p);
  }

  void dump(Formatter *f) {
    f->dump_unsigned("first_pinned", get_first_pinned());
    f->dump_unsigned("last_pinned", get_last_pinned());
    f->open_array_section("pinned_maps");
    for (auto& i : pinned) {
      f->dump_unsigned("epoch", i);
    }
    f->close_section();
  }
};
WRITE_CLASS_ENCODER(osdmap_manifest_t);
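
// Semantics sketch (illustrative values): pinned epochs let pruned full maps
// be rebuilt from the closest pinned map plus the following incrementals.
//
//   osdmap_manifest_t m;
//   m.pin(10); m.pin(20); m.pin(30);
//   m.get_first_pinned();            // -> 10
//   m.get_last_pinned();             // -> 30
//   m.is_pinned(20);                 // -> true
//   m.get_lower_closest_pinned(25);  // -> 20 (closest pinned epoch <= 25)
//   m.get_lower_closest_pinned(5);   // -> 0  (nothing pinned at or below 5)
//
//   bufferlist bl;
//   m.encode(bl);                    // versioned encoding via ENCODE_START
//   osdmap_manifest_t m2;
//   m2.decode(bl);                   // convenience overload over cbegin()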

class OSDMonitor : public PaxosService,
                   public md_config_obs_t {
  CephContext *cct;

public:
  OSDMap osdmap;

  // config observer
  const char** get_tracked_conf_keys() const override;
  void handle_conf_change(const ConfigProxy& conf,
                          const std::set<std::string> &changed) override;
  // [leader]
  OSDMap::Incremental pending_inc;
  map<int, bufferlist> pending_metadata;
  set<int> pending_metadata_rm;
  map<int, failure_info_t> failure_info;
  map<int,utime_t> down_pending_out;  // osd down -> out
  bool priority_convert = false;
  std::shared_ptr<PriorityCache::PriCache> rocksdb_binned_kv_cache = nullptr;
  std::shared_ptr<PriorityCache::Manager> pcm = nullptr;
  ceph::mutex balancer_lock = ceph::make_mutex("OSDMonitor::balancer_lock");

  map<int,double> osd_weight;

  using osdmap_key_t = std::pair<version_t, uint64_t>;
  using osdmap_cache_t = SimpleLRU<osdmap_key_t,
                                   bufferlist,
                                   std::less<osdmap_key_t>,
                                   boost::hash<osdmap_key_t>>;
  osdmap_cache_t inc_osd_cache;
  osdmap_cache_t full_osd_cache;
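
  // Lookup sketch (assumed flow; the real logic lives in the get_version()
  // paths in OSDMonitor.cc): encodings are cached per (epoch, feature set),
  // since the same map may be reencoded differently for peers with different
  // feature bits.
  //
  //   bufferlist bl;
  //   if (!inc_osd_cache.lookup({ver, features}, &bl)) {
  //     // miss: read from the mon store, reencode for `features` if needed,
  //     inc_osd_cache.add({ver, features}, bl);
  //   }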

  bool has_osdmap_manifest;
  osdmap_manifest_t osdmap_manifest;

  bool check_failures(utime_t now);
  bool check_failure(utime_t now, int target_osd, failure_info_t& fi);
  void force_failure(int target_osd, int by);

  bool _have_pending_crush();
  CrushWrapper &_get_stable_crush();
  void _get_pending_crush(CrushWrapper& newcrush);

  enum FastReadType {
    FAST_READ_OFF,
    FAST_READ_ON,
    FAST_READ_DEFAULT
  };

  struct CleanUpmapJob : public ParallelPGMapper::Job {
    CephContext *cct;
    const OSDMap& osdmap;
    OSDMap::Incremental& pending_inc;
    // lock to protect pending_inc from changing
    // when checking is done
    Mutex pending_inc_lock = {"CleanUpmapJob::pending_inc_lock"};

    CleanUpmapJob(CephContext *cct, const OSDMap& om, OSDMap::Incremental& pi)
      : ParallelPGMapper::Job(&om),
        cct(cct),
        osdmap(om),
        pending_inc(pi) {}

    void process(const vector<pg_t>& to_check) override {
      vector<pg_t> to_cancel;
      map<pg_t, mempool::osdmap::vector<pair<int,int>>> to_remap;
      osdmap.check_pg_upmaps(cct, to_check, &to_cancel, &to_remap);
      // don't bother taking lock if nothing changes
      if (!to_cancel.empty() || !to_remap.empty()) {
        std::lock_guard l(pending_inc_lock);
        osdmap.clean_pg_upmaps(cct, &pending_inc, to_cancel, to_remap);
      }
    }

    void process(int64_t poolid, unsigned ps_begin, unsigned ps_end) override {}
    void complete() override {}
  }; // public as this will need to be accessible from TestOSDMap.cc
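
  // Dispatch sketch (hypothetical call site; `chunk_size` and `pgs_to_check`
  // are illustrative): the job is fed through the background ParallelPGMapper,
  // which shards the PGs across worker threads; each shard runs process(),
  // and cancellations are folded into pending_inc under pending_inc_lock.
  //
  //   CleanUpmapJob job(cct, osdmap, pending_inc);
  //   mapper.queue(&job, chunk_size, pgs_to_check);
  //   job.wait();  // returns once complete() has run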

  // svc
public:
  void create_initial() override;
  void get_store_prefixes(std::set<string>& s) const override;

private:
  void update_from_paxos(bool *need_bootstrap) override;
  void create_pending() override;  // prepare a new pending
  void encode_pending(MonitorDBStore::TransactionRef t) override;
  void on_active() override;
  void on_restart() override;
  void on_shutdown() override;

  /* osdmap full map prune */
  void load_osdmap_manifest();
  bool should_prune() const;
  void _prune_update_trimmed(
    MonitorDBStore::TransactionRef tx,
    version_t first);
  void prune_init(osdmap_manifest_t& manifest);
  bool _prune_sanitize_options() const;
  bool is_prune_enabled() const;
  bool is_prune_supported() const;
  bool do_prune(MonitorDBStore::TransactionRef tx);

  // Priority cache control
  uint32_t mon_osd_cache_size = 0;      ///< Number of cached OSDMaps
  uint64_t rocksdb_cache_size = 0;      ///< Cache for kv Db
  double cache_kv_ratio = 0;            ///< Cache ratio dedicated to kv
  double cache_inc_ratio = 0;           ///< Cache ratio dedicated to inc
  double cache_full_ratio = 0;          ///< Cache ratio dedicated to full
  uint64_t mon_memory_base = 0;         ///< Mon base memory for cache autotuning
  double mon_memory_fragmentation = 0;  ///< Expected memory fragmentation
  uint64_t mon_memory_target = 0;       ///< Mon target memory for cache autotuning
  uint64_t mon_memory_min = 0;          ///< Min memory to cache osdmaps
  bool mon_memory_autotune = false;     ///< Cache auto tune setting
  int register_cache_with_pcm();
  int _set_cache_sizes();
  int _set_cache_ratios();
  void _set_new_cache_sizes();
  void _set_cache_autotuning();
  int _update_mon_cache_settings();

  friend struct OSDMemCache;
  friend struct IncCache;
  friend struct FullCache;

  /**
   * we have not delegated full version stashing to paxosservice for some
   * time now, making this function a no-op in the current context.
   */
  void encode_full(MonitorDBStore::TransactionRef t) override { }
  /**
   * do not let paxosservice periodically stash full osdmaps, or we will break
   * our locally-managed full maps. (update_from_paxos loads the latest and
   * writes them out going forward from there, but if we just synced that may
   * mean we skip some.)
   */
  bool should_stash_full() override {
    return false;
  }

  /**
   * hook into trim to include the oldest full map in the trim transaction
   *
   * This ensures that anyone post-sync will have enough to rebuild their
   * full osdmaps.
   */
  void encode_trim_extra(MonitorDBStore::TransactionRef tx, version_t first) override;

  void update_msgr_features();
  int check_cluster_features(uint64_t features, stringstream &ss);
  /**
   * check if the cluster supports the features required by the
   * given crush map. Outputs the daemons which don't support it
   * to the stringstream.
   *
   * @returns true if the map is passable, false otherwise
   */
  bool validate_crush_against_features(const CrushWrapper *newcrush,
                                       stringstream &ss);
  void check_osdmap_subs();
  void share_map_with_random_osd();

  Mutex prime_pg_temp_lock = {"OSDMonitor::prime_pg_temp_lock"};
  struct PrimeTempJob : public ParallelPGMapper::Job {
    OSDMonitor *osdmon;
    PrimeTempJob(const OSDMap& om, OSDMonitor *m)
      : ParallelPGMapper::Job(&om), osdmon(m) {}
    void process(int64_t pool, unsigned ps_begin, unsigned ps_end) override {
      for (unsigned ps = ps_begin; ps < ps_end; ++ps) {
        pg_t pgid(ps, pool);
        osdmon->prime_pg_temp(*osdmap, pgid);
      }
    }
    void process(const vector<pg_t>& pgs) override {}
    void complete() override {}
  };
  void maybe_prime_pg_temp();
  void prime_pg_temp(const OSDMap& next, pg_t pgid);

  ParallelPGMapper mapper;                        ///< for background pg work
  OSDMapMapping mapping;                          ///< pg <-> osd mappings
  unique_ptr<ParallelPGMapper::Job> mapping_job;  ///< background mapping job
  void start_mapping();

  void update_logger();

  void handle_query(PaxosServiceMessage *m);
  bool preprocess_query(MonOpRequestRef op) override;  // true if processed.
  bool prepare_update(MonOpRequestRef op) override;
  bool should_propose(double &delay) override;

  version_t get_trim_to() const override;

  bool can_mark_down(int o);
  bool can_mark_up(int o);
  bool can_mark_out(int o);
  bool can_mark_in(int o);

  // ...
  MOSDMap *build_latest_full(uint64_t features);
  MOSDMap *build_incremental(epoch_t first, epoch_t last, uint64_t features);
  void send_full(MonOpRequestRef op);
  void send_incremental(MonOpRequestRef op, epoch_t first);
public:
  // @param req an optional op request; if the osdmaps are replies to it,
  // @c Monitor::send_reply() can mark_event with it.
  void send_incremental(epoch_t first, MonSession *session, bool onetime,
                        MonOpRequestRef req = MonOpRequestRef());

private:
  void print_utilization(ostream &out, Formatter *f, bool tree) const;

  bool check_source(MonOpRequestRef op, uuid_d fsid);

  bool preprocess_get_osdmap(MonOpRequestRef op);

  bool preprocess_mark_me_down(MonOpRequestRef op);

  friend class C_AckMarkedDown;
  bool preprocess_failure(MonOpRequestRef op);
  bool prepare_failure(MonOpRequestRef op);
  bool prepare_mark_me_down(MonOpRequestRef op);
  void process_failures();
  void take_all_failures(list<MonOpRequestRef>& ls);

  bool preprocess_full(MonOpRequestRef op);
  bool prepare_full(MonOpRequestRef op);

  bool preprocess_boot(MonOpRequestRef op);
  bool prepare_boot(MonOpRequestRef op);
  void _booted(MonOpRequestRef op, bool logit);

  void update_up_thru(int from, epoch_t up_thru);
  bool preprocess_alive(MonOpRequestRef op);
  bool prepare_alive(MonOpRequestRef op);
  void _reply_map(MonOpRequestRef op, epoch_t e);

  bool preprocess_pgtemp(MonOpRequestRef op);
  bool prepare_pgtemp(MonOpRequestRef op);

  bool preprocess_pg_created(MonOpRequestRef op);
  bool prepare_pg_created(MonOpRequestRef op);

  bool preprocess_pg_ready_to_merge(MonOpRequestRef op);
  bool prepare_pg_ready_to_merge(MonOpRequestRef op);

  int _check_remove_pool(int64_t pool_id, const pg_pool_t &pool, ostream *ss);
  bool _check_become_tier(
    int64_t tier_pool_id, const pg_pool_t *tier_pool,
    int64_t base_pool_id, const pg_pool_t *base_pool,
    int *err, ostream *ss) const;
  bool _check_remove_tier(
    int64_t base_pool_id, const pg_pool_t *base_pool, const pg_pool_t *tier_pool,
    int *err, ostream *ss) const;

  int _prepare_remove_pool(int64_t pool, ostream *ss, bool no_fake);
  int _prepare_rename_pool(int64_t pool, string newname);

  bool enforce_pool_op_caps(MonOpRequestRef op);
  bool preprocess_pool_op(MonOpRequestRef op);
  bool preprocess_pool_op_create(MonOpRequestRef op);
  bool prepare_pool_op(MonOpRequestRef op);
  bool prepare_pool_op_create(MonOpRequestRef op);
  bool prepare_pool_op_delete(MonOpRequestRef op);
  int crush_rename_bucket(const string& srcname,
                          const string& dstname,
                          ostream *ss);
  void check_legacy_ec_plugin(const string& plugin,
                              const string& profile) const;
  int normalize_profile(const string& profilename,
                        ErasureCodeProfile &profile,
                        bool force,
                        ostream *ss);
  int crush_rule_create_erasure(const string &name,
                                const string &profile,
                                int *rule,
                                ostream *ss);
  int get_crush_rule(const string &rule_name,
                     int *crush_rule,
                     ostream *ss);
  int get_erasure_code(const string &erasure_code_profile,
                       ErasureCodeInterfaceRef *erasure_code,
                       ostream *ss) const;
  int prepare_pool_crush_rule(const unsigned pool_type,
                              const string &erasure_code_profile,
                              const string &rule_name,
                              int *crush_rule,
                              ostream *ss);
  bool erasure_code_profile_in_use(
    const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
    const string &profile,
    ostream *ss);
  int parse_erasure_code_profile(const vector<string> &erasure_code_profile,
                                 map<string,string> *erasure_code_profile_map,
                                 ostream *ss);
  int prepare_pool_size(const unsigned pool_type,
                        const string &erasure_code_profile,
                        uint8_t repl_size,
                        unsigned *size, unsigned *min_size,
                        ostream *ss);
  int prepare_pool_stripe_width(const unsigned pool_type,
                                const string &erasure_code_profile,
                                unsigned *stripe_width,
                                ostream *ss);
  int check_pg_num(int64_t pool, int pg_num, int size, ostream* ss);
  int prepare_new_pool(string& name,
                       int crush_rule,
                       const string &crush_rule_name,
                       unsigned pg_num, unsigned pgp_num,
                       unsigned pg_num_min,
                       uint64_t repl_size,
                       const uint64_t target_size_bytes,
                       const float target_size_ratio,
                       const string &erasure_code_profile,
                       const unsigned pool_type,
                       const uint64_t expected_num_objects,
                       FastReadType fast_read,
                       ostream *ss);
  int prepare_new_pool(MonOpRequestRef op);

  void set_pool_flags(int64_t pool_id, uint64_t flags);
  void clear_pool_flags(int64_t pool_id, uint64_t flags);
  bool update_pools_status();

  string make_snap_epoch_key(int64_t pool, epoch_t epoch);
  string make_snap_key(int64_t pool, snapid_t snap);
  string make_snap_key_value(int64_t pool, snapid_t snap, snapid_t num,
                             epoch_t epoch, bufferlist *v);
  string make_snap_purged_key(int64_t pool, snapid_t snap);
  string make_snap_purged_key_value(int64_t pool, snapid_t snap, snapid_t num,
                                    epoch_t epoch, bufferlist *v);
  bool try_prune_purged_snaps();
  int lookup_pruned_snap(int64_t pool, snapid_t snap,
                         snapid_t *begin, snapid_t *end);

  bool prepare_set_flag(MonOpRequestRef op, int flag);
  bool prepare_unset_flag(MonOpRequestRef op, int flag);

  void _pool_op_reply(MonOpRequestRef op,
                      int ret, epoch_t epoch, bufferlist *blp=NULL);

  struct C_Booted : public C_MonOp {
    OSDMonitor *cmon;
    bool logit;
    C_Booted(OSDMonitor *cm, MonOpRequestRef op_, bool l=true) :
      C_MonOp(op_), cmon(cm), logit(l) {}
    void _finish(int r) override {
      if (r >= 0)
        cmon->_booted(op, logit);
      else if (r == -ECANCELED)
        return;
      else if (r == -EAGAIN)
        cmon->dispatch(op);
      else
        ceph_abort_msg("bad C_Booted return value");
    }
  };

  struct C_ReplyMap : public C_MonOp {
    OSDMonitor *osdmon;
    epoch_t e;
    C_ReplyMap(OSDMonitor *o, MonOpRequestRef op_, epoch_t ee)
      : C_MonOp(op_), osdmon(o), e(ee) {}
    void _finish(int r) override {
      if (r >= 0)
        osdmon->_reply_map(op, e);
      else if (r == -ECANCELED)
        return;
      else if (r == -EAGAIN)
        osdmon->dispatch(op);
      else
        ceph_abort_msg("bad C_ReplyMap return value");
    }
  };
  struct C_PoolOp : public C_MonOp {
    OSDMonitor *osdmon;
    int replyCode;
    int epoch;
    bufferlist reply_data;
    C_PoolOp(OSDMonitor *osd, MonOpRequestRef op_, int rc, int e, bufferlist *rd=NULL) :
      C_MonOp(op_), osdmon(osd), replyCode(rc), epoch(e) {
      if (rd)
        reply_data = *rd;
    }
    void _finish(int r) override {
      if (r >= 0)
        osdmon->_pool_op_reply(op, replyCode, epoch, &reply_data);
      else if (r == -ECANCELED)
        return;
      else if (r == -EAGAIN)
        osdmon->dispatch(op);
      else
        ceph_abort_msg("bad C_PoolOp return value");
    }
  };
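
  // The three completion contexts above share one retry pattern: r >= 0
  // delivers the deferred reply, -ECANCELED drops the op, and -EAGAIN
  // requeues the op through dispatch() so it is retried against the new
  // map state; anything else aborts.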

  bool preprocess_remove_snaps(MonOpRequestRef op);
  bool prepare_remove_snaps(MonOpRequestRef op);

  int load_metadata(int osd, map<string, string>& m, ostream *err);
  void count_metadata(const string& field, Formatter *f);

  void reencode_incremental_map(bufferlist& bl, uint64_t features);
  void reencode_full_map(bufferlist& bl, uint64_t features);
public:
  void count_metadata(const string& field, map<string,int> *out);
protected:
  int get_osd_objectstore_type(int osd, std::string *type);
  bool is_pool_currently_all_bluestore(int64_t pool_id, const pg_pool_t &pool,
                                       ostream *err);

  // when we last received PG stats from each osd
  map<int,utime_t> last_osd_report;
  // TODO: use last_osd_report to store the osd report epochs, once we don't
  // need to upgrade from pre-luminous releases.
  map<int,epoch_t> osd_epochs;
  LastEpochClean last_epoch_clean;
  bool preprocess_beacon(MonOpRequestRef op);
  bool prepare_beacon(MonOpRequestRef op);
  epoch_t get_min_last_epoch_clean() const;

  friend class C_UpdateCreatingPGs;
  std::map<int, std::map<epoch_t, std::set<spg_t>>> creating_pgs_by_osd_epoch;
  std::vector<pg_t> pending_created_pgs;
  // the epoch when the pg mapping was calculated
  epoch_t creating_pgs_epoch = 0;
  creating_pgs_t creating_pgs;
  mutable std::mutex creating_pgs_lock;

  creating_pgs_t update_pending_pgs(const OSDMap::Incremental& inc,
                                    const OSDMap& nextmap);
  unsigned scan_for_creating_pgs(
    const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
    const mempool::osdmap::set<int64_t>& removed_pools,
    utime_t modified,
    creating_pgs_t* creating_pgs) const;
  pair<int32_t, pg_t> get_parent_pg(pg_t pgid) const;
  void update_creating_pgs();
  void check_pg_creates_subs();
  epoch_t send_pg_creates(int osd, Connection *con, epoch_t next) const;

  int32_t _allocate_osd_id(int32_t* existing_id);

public:
  OSDMonitor(CephContext *cct, Monitor *mn, Paxos *p, const string& service_name);

  void tick() override;  // check state, take actions

  bool preprocess_command(MonOpRequestRef op);
  bool prepare_command(MonOpRequestRef op);
  bool prepare_command_impl(MonOpRequestRef op, const cmdmap_t& cmdmap);

  int validate_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const bool check_osd_exists,
    int32_t* existing_id,
    stringstream& ss);
  int prepare_command_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    int32_t* existing_id,
    stringstream& ss);
  void do_osd_create(const int32_t id, const uuid_d& uuid,
                     const string& device_class,
                     int32_t* new_id);
  int prepare_command_osd_purge(int32_t id, stringstream& ss);
  int prepare_command_osd_destroy(int32_t id, stringstream& ss);
  int _prepare_command_osd_crush_remove(
    CrushWrapper &newcrush,
    int32_t id,
    int32_t ancestor,
    bool has_ancestor,
    bool unlink_only);
  void do_osd_crush_remove(CrushWrapper& newcrush);
  int prepare_command_osd_crush_remove(
    CrushWrapper &newcrush,
    int32_t id,
    int32_t ancestor,
    bool has_ancestor,
    bool unlink_only);
  int prepare_command_osd_remove(int32_t id);
  int prepare_command_osd_new(
    MonOpRequestRef op,
    const cmdmap_t& cmdmap,
    const map<string,string>& secrets,
    stringstream &ss,
    Formatter *f);

  int prepare_command_pool_set(const cmdmap_t& cmdmap,
                               stringstream& ss);

  int prepare_command_pool_application(const string &prefix,
                                       const cmdmap_t& cmdmap,
                                       stringstream& ss);
  int preprocess_command_pool_application(const string &prefix,
                                          const cmdmap_t& cmdmap,
                                          stringstream& ss,
                                          bool *modified);
  int _command_pool_application(const string &prefix,
                                const cmdmap_t& cmdmap,
                                stringstream& ss,
                                bool *modified,
                                bool preparing);

  bool handle_osd_timeouts(const utime_t &now,
                           std::map<int,utime_t> &last_osd_report);

  void send_latest(MonOpRequestRef op, epoch_t start=0);
  void send_latest_now_nodelete(MonOpRequestRef op, epoch_t start=0) {
    op->mark_osdmon_event(__func__);
    send_incremental(op, start);
  }

  void get_removed_snaps_range(
    epoch_t start, epoch_t end,
    mempool::osdmap::map<int64_t,OSDMap::snap_interval_set_t> *gap_removed_snaps);

  int get_version(version_t ver, bufferlist& bl) override;
  int get_version(version_t ver, uint64_t feature, bufferlist& bl);

  int get_version_full(version_t ver, uint64_t feature, bufferlist& bl);
  int get_version_full(version_t ver, bufferlist& bl) override;
  int get_inc(version_t ver, OSDMap::Incremental& inc);
  int get_full_from_pinned_map(version_t ver, bufferlist& bl);

  epoch_t blacklist(const entity_addrvec_t& av, utime_t until);
  epoch_t blacklist(entity_addr_t a, utime_t until);

  void dump_info(Formatter *f);
  int dump_osd_metadata(int osd, Formatter *f, ostream *err);
  void print_nodes(Formatter *f);

  void check_osdmap_sub(Subscription *sub);
  void check_pg_creates_sub(Subscription *sub);

  void do_application_enable(int64_t pool_id, const std::string &app_name,
                             const std::string &app_key="",
                             const std::string &app_value="");
  void do_set_pool_opt(int64_t pool_id, pool_opts_t::key_t opt,
                       pool_opts_t::value_t);

  void add_flag(int flag) {
    if (!(osdmap.flags & flag)) {
      if (pending_inc.new_flags < 0)
        pending_inc.new_flags = osdmap.flags;
      pending_inc.new_flags |= flag;
    }
  }

  void remove_flag(int flag) {
    if (osdmap.flags & flag) {
      if (pending_inc.new_flags < 0)
        pending_inc.new_flags = osdmap.flags;
      pending_inc.new_flags &= ~flag;
    }
  }
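
  // Flag-delta sketch (illustrative): on the first change the current flag
  // word is copied into pending_inc.new_flags (which is -1 while unset), and
  // then individual bits are OR'd in or masked out, e.g.:
  //
  //   add_flag(CEPH_OSDMAP_NOOUT);     // new_flags |= CEPH_OSDMAP_NOOUT
  //   remove_flag(CEPH_OSDMAP_NOOUT);  // new_flags &= ~CEPH_OSDMAP_NOOUT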
  void convert_pool_priorities(void);
};

#endif