// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

/* Object Store Device (OSD) Monitor
 */

#ifndef CEPH_OSDMONITOR_H
#define CEPH_OSDMONITOR_H

#include <map>
#include <set>
using namespace std;

#include "include/types.h"
#include "common/simple_cache.hpp"
#include "msg/Messenger.h"

#include "osd/OSDMap.h"
#include "osd/OSDMapMapping.h"

#include "CreatingPGs.h"
#include "PaxosService.h"

class Monitor;
class PGMap;
class MonSession;
class MOSDMap;

#include "erasure-code/ErasureCodeInterface.h"
#include "mon/MonOpRequest.h"
/// information about a particular peer's failure reports for one osd
struct failure_reporter_t {
  utime_t failed_since;  ///< when they think it failed
  MonOpRequestRef op;    ///< failure op request

  failure_reporter_t() {}
  explicit failure_reporter_t(utime_t s) : failed_since(s) {}
  ~failure_reporter_t() { }
};

/// information about all failure reports for one osd
struct failure_info_t {
  map<int, failure_reporter_t> reporters;  ///< reporter -> failed_since etc
  utime_t max_failed_since;                ///< most recent failed_since

  failure_info_t() {}

  utime_t get_failed_since() {
    if (max_failed_since == utime_t() && !reporters.empty()) {
      // the old max must have been canceled; recalculate.
      for (map<int, failure_reporter_t>::iterator p = reporters.begin();
           p != reporters.end();
           ++p)
        if (p->second.failed_since > max_failed_since)
          max_failed_since = p->second.failed_since;
    }
    return max_failed_since;
  }

  // set the message for the latest report. return any old op request we had,
  // if any, so we can discard it.
  MonOpRequestRef add_report(int who, utime_t failed_since,
                             MonOpRequestRef op) {
    map<int, failure_reporter_t>::iterator p = reporters.find(who);
    if (p == reporters.end()) {
      if (max_failed_since < failed_since)
        max_failed_since = failed_since;
      p = reporters.insert(
        map<int, failure_reporter_t>::value_type(
          who, failure_reporter_t(failed_since))).first;
    }

    MonOpRequestRef ret = p->second.op;
    p->second.op = op;
    return ret;
  }

  void take_report_messages(list<MonOpRequestRef>& ls) {
    for (map<int, failure_reporter_t>::iterator p = reporters.begin();
         p != reporters.end();
         ++p) {
      if (p->second.op) {
        ls.push_back(p->second.op);
        p->second.op.reset();
      }
    }
  }

  MonOpRequestRef cancel_report(int who) {
    map<int, failure_reporter_t>::iterator p = reporters.find(who);
    if (p == reporters.end())
      return MonOpRequestRef();
    MonOpRequestRef ret = p->second.op;
    reporters.erase(p);
    return ret;
  }
};
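
// Roughly how the monitor is expected to use this struct (an illustrative
// sketch; prepare_failure() and check_failure() hold the authoritative flow):
//
//   failure_info_t& fi = failure_info[target_osd];
//   MonOpRequestRef old_op = fi.add_report(reporter, failed_since, op);
//   // old_op, if non-null, is a superseded report from the same reporter and
//   // can be dropped; check_failure() then weighs fi.reporters against the
//   // mon_osd_min_down_reporters threshold before marking the osd down.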

class LastEpochClean {
  struct Lec {
    vector<epoch_t> epoch_by_pg;
    ps_t next_missing = 0;
    epoch_t floor = std::numeric_limits<epoch_t>::max();
    void report(ps_t pg, epoch_t last_epoch_clean);
  };
  std::map<uint64_t, Lec> report_by_pool;
public:
  void report(const pg_t& pg, epoch_t last_epoch_clean);
  void remove_pool(uint64_t pool);
  epoch_t get_lower_bound(const OSDMap& latest) const;
};
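
// Illustrative note: get_lower_bound() yields the oldest epoch that some
// not-yet-clean PG may still need, which is what lets the monitor treat it
// as a ceiling when trimming old osdmaps, e.g. roughly:
//
//   epoch_t floor = last_epoch_clean.get_lower_bound(osdmap);
//   // do not trim osdmap epochs at or beyond 'floor'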

class OSDMonitor : public PaxosService {
  CephContext *cct;

public:
  OSDMap osdmap;

  // [leader]
  OSDMap::Incremental pending_inc;
  map<int, bufferlist> pending_metadata;
  set<int> pending_metadata_rm;
  map<int, failure_info_t> failure_info;
  map<int,utime_t> down_pending_out;  // osd down -> out

  map<int,double> osd_weight;

  SimpleLRU<version_t, bufferlist> inc_osd_cache;
  SimpleLRU<version_t, bufferlist> full_osd_cache;

  bool check_failures(utime_t now);
  bool check_failure(utime_t now, int target_osd, failure_info_t& fi);
  void force_failure(int target_osd, int by);

  // the time of the last message (MSG_ALIVE or MSG_PGTEMP) proposed without delay
  utime_t last_attempted_minwait_time;

  bool _have_pending_crush();
  CrushWrapper &_get_stable_crush();
  void _get_pending_crush(CrushWrapper& newcrush);

  enum FastReadType {
    FAST_READ_OFF,
    FAST_READ_ON,
    FAST_READ_DEFAULT
  };

  // svc
public:
  void create_initial() override;
  void get_store_prefixes(std::set<string>& s) override;

private:
  void update_from_paxos(bool *need_bootstrap) override;
  void create_pending() override;  // prepare a new pending
  void encode_pending(MonitorDBStore::TransactionRef t) override;
  void on_active() override;
  void on_restart() override;
  void on_shutdown() override;
  /**
   * we haven't delegated full version stashing to paxosservice for some time
   * now, so in the current context this function is a no-op.
   */
  void encode_full(MonitorDBStore::TransactionRef t) override { }
  /**
   * do not let paxosservice periodically stash full osdmaps, or we will
   * break our locally-managed full maps. (update_from_paxos loads the latest
   * and writes them out going forward from there, but if we just synced that
   * may mean we skip some.)
   */
  bool should_stash_full() override {
    return false;
  }

  /**
   * hook into trim to include the oldest full map in the trim transaction
   *
   * This ensures that anyone post-sync will have enough to rebuild their
   * full osdmaps.
   */
  void encode_trim_extra(MonitorDBStore::TransactionRef tx,
                         version_t first) override;

  void update_msgr_features();
  int check_cluster_features(uint64_t features, stringstream &ss);
  /**
   * check if the cluster supports the features required by the
   * given crush map. Outputs the daemons which don't support it
   * to the stringstream.
   *
   * @returns true if the map is passable, false otherwise
   */
  bool validate_crush_against_features(const CrushWrapper *newcrush,
                                       stringstream &ss);
  void check_osdmap_subs();
  void share_map_with_random_osd();

  Mutex prime_pg_temp_lock = {"OSDMonitor::prime_pg_temp_lock"};
  struct PrimeTempJob : public ParallelPGMapper::Job {
    OSDMonitor *osdmon;
    PrimeTempJob(const OSDMap& om, OSDMonitor *m)
      : ParallelPGMapper::Job(&om), osdmon(m) {}
    void process(int64_t pool, unsigned ps_begin, unsigned ps_end) override {
      for (unsigned ps = ps_begin; ps < ps_end; ++ps) {
        pg_t pgid(ps, pool);
        osdmon->prime_pg_temp(*osdmap, pgid);
      }
    }
    void complete() override {}
  };
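  // Illustrative sketch of how this job is expected to be driven (the
  // authoritative logic lives in maybe_prime_pg_temp()): the background
  // mapper shards the pg space and invokes process() once per shard,
  // roughly:
  //
  //   PrimeTempJob job(next, this);
  //   mapper.queue(&job, pgs_per_chunk);  // pgs_per_chunk: assumed shard size
  //   // ...wait for the job, then fold the primed pg_temps into pending_inc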
  void maybe_prime_pg_temp();
  void prime_pg_temp(const OSDMap& next, pg_t pgid);

  ParallelPGMapper mapper;                        ///< for background pg work
  OSDMapMapping mapping;                          ///< pg <-> osd mappings
  unique_ptr<ParallelPGMapper::Job> mapping_job;  ///< background mapping job
  void start_mapping();

  void update_logger();

  void handle_query(PaxosServiceMessage *m);
  bool preprocess_query(MonOpRequestRef op) override;  // true if processed.
  bool prepare_update(MonOpRequestRef op) override;
  bool should_propose(double &delay) override;

  version_t get_trim_to() override;

  bool can_mark_down(int o);
  bool can_mark_up(int o);
  bool can_mark_out(int o);
  bool can_mark_in(int o);

  // ...
  MOSDMap *build_latest_full();
  MOSDMap *build_incremental(epoch_t first, epoch_t last);
  void send_full(MonOpRequestRef op);
  void send_incremental(MonOpRequestRef op, epoch_t first);
public:
  // @param req an optional op request; if the osdmaps are replies to it,
  // @c Monitor::send_reply() can mark_event with it.
  void send_incremental(epoch_t first, MonSession *session, bool onetime,
                        MonOpRequestRef req = MonOpRequestRef());

private:
  void print_utilization(ostream &out, Formatter *f, bool tree) const;

  bool check_source(PaxosServiceMessage *m, uuid_d fsid);

  bool preprocess_get_osdmap(MonOpRequestRef op);

  bool preprocess_mark_me_down(MonOpRequestRef op);

  friend class C_AckMarkedDown;
  bool preprocess_failure(MonOpRequestRef op);
  bool prepare_failure(MonOpRequestRef op);
  bool prepare_mark_me_down(MonOpRequestRef op);
  void process_failures();
  void take_all_failures(list<MonOpRequestRef>& ls);

  bool preprocess_full(MonOpRequestRef op);
  bool prepare_full(MonOpRequestRef op);

  bool preprocess_boot(MonOpRequestRef op);
  bool prepare_boot(MonOpRequestRef op);
  void _booted(MonOpRequestRef op, bool logit);

  void update_up_thru(int from, epoch_t up_thru);
  bool preprocess_alive(MonOpRequestRef op);
  bool prepare_alive(MonOpRequestRef op);
  void _reply_map(MonOpRequestRef op, epoch_t e);

  bool preprocess_pgtemp(MonOpRequestRef op);
  bool prepare_pgtemp(MonOpRequestRef op);

  bool preprocess_pg_created(MonOpRequestRef op);
  bool prepare_pg_created(MonOpRequestRef op);

  int _check_remove_pool(int64_t pool_id, const pg_pool_t &pool, ostream *ss);
  bool _check_become_tier(
      int64_t tier_pool_id, const pg_pool_t *tier_pool,
      int64_t base_pool_id, const pg_pool_t *base_pool,
      int *err, ostream *ss) const;
  bool _check_remove_tier(
      int64_t base_pool_id, const pg_pool_t *base_pool,
      const pg_pool_t *tier_pool,
      int *err, ostream *ss) const;

  int _prepare_remove_pool(int64_t pool, ostream *ss, bool no_fake);
  int _prepare_rename_pool(int64_t pool, string newname);

  bool preprocess_pool_op(MonOpRequestRef op);
  bool preprocess_pool_op_create(MonOpRequestRef op);
  bool prepare_pool_op(MonOpRequestRef op);
  bool prepare_pool_op_create(MonOpRequestRef op);
  bool prepare_pool_op_delete(MonOpRequestRef op);
  int crush_rename_bucket(const string& srcname,
                          const string& dstname,
                          ostream *ss);
  void check_legacy_ec_plugin(const string& plugin,
                              const string& profile) const;
  int normalize_profile(const string& profilename,
                        ErasureCodeProfile &profile,
                        bool force,
                        ostream *ss);
  int crush_rule_create_erasure(const string &name,
                                const string &profile,
                                int *rule,
                                ostream *ss);
  int get_crush_rule(const string &rule_name,
                     int *crush_rule,
                     ostream *ss);
  int get_erasure_code(const string &erasure_code_profile,
                       ErasureCodeInterfaceRef *erasure_code,
                       ostream *ss) const;
  int prepare_pool_crush_rule(const unsigned pool_type,
                              const string &erasure_code_profile,
                              const string &rule_name,
                              int *crush_rule,
                              ostream *ss);
  bool erasure_code_profile_in_use(
      const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
      const string &profile,
      ostream *ss);
  int parse_erasure_code_profile(const vector<string> &erasure_code_profile,
                                 map<string,string> *erasure_code_profile_map,
                                 ostream *ss);
  int prepare_pool_size(const unsigned pool_type,
                        const string &erasure_code_profile,
                        unsigned *size, unsigned *min_size,
                        ostream *ss);
  int prepare_pool_stripe_width(const unsigned pool_type,
                                const string &erasure_code_profile,
                                unsigned *stripe_width,
                                ostream *ss);
  int check_pg_num(int64_t pool, int pg_num, int size, ostream* ss);
  int prepare_new_pool(string& name, uint64_t auid,
                       int crush_rule,
                       const string &crush_rule_name,
                       unsigned pg_num, unsigned pgp_num,
                       const string &erasure_code_profile,
                       const unsigned pool_type,
                       const uint64_t expected_num_objects,
                       FastReadType fast_read,
                       ostream *ss);
  int prepare_new_pool(MonOpRequestRef op);

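  // Illustrative note: prepare_new_pool() backs "ceph osd pool create" and is
  // expected to validate its inputs with the helpers above
  // (prepare_pool_crush_rule, prepare_pool_size, check_pg_num,
  // prepare_pool_stripe_width) before staging the new pool in pending_inc.
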
  void set_pool_flags(int64_t pool_id, uint64_t flags);
  void clear_pool_flags(int64_t pool_id, uint64_t flags);
  bool update_pools_status();

  bool prepare_set_flag(MonOpRequestRef op, int flag);
  bool prepare_unset_flag(MonOpRequestRef op, int flag);

  void _pool_op_reply(MonOpRequestRef op,
                      int ret, epoch_t epoch, bufferlist *blp=NULL);

  struct C_Booted : public C_MonOp {
    OSDMonitor *cmon;
    bool logit;
    C_Booted(OSDMonitor *cm, MonOpRequestRef op_, bool l=true) :
      C_MonOp(op_), cmon(cm), logit(l) {}
    void _finish(int r) override {
      if (r >= 0)
        cmon->_booted(op, logit);
      else if (r == -ECANCELED)
        return;
      else if (r == -EAGAIN)
        cmon->dispatch(op);
      else
        assert(0 == "bad C_Booted return value");
    }
  };

  struct C_ReplyMap : public C_MonOp {
    OSDMonitor *osdmon;
    epoch_t e;
    C_ReplyMap(OSDMonitor *o, MonOpRequestRef op_, epoch_t ee)
      : C_MonOp(op_), osdmon(o), e(ee) {}
    void _finish(int r) override {
      if (r >= 0)
        osdmon->_reply_map(op, e);
      else if (r == -ECANCELED)
        return;
      else if (r == -EAGAIN)
        osdmon->dispatch(op);
      else
        assert(0 == "bad C_ReplyMap return value");
    }
  };
  struct C_PoolOp : public C_MonOp {
    OSDMonitor *osdmon;
    int replyCode;
    int epoch;
    bufferlist reply_data;
    C_PoolOp(OSDMonitor *osd, MonOpRequestRef op_, int rc, int e,
             bufferlist *rd=NULL) :
      C_MonOp(op_), osdmon(osd), replyCode(rc), epoch(e) {
      if (rd)
        reply_data = *rd;
    }
    void _finish(int r) override {
      if (r >= 0)
        osdmon->_pool_op_reply(op, replyCode, epoch, &reply_data);
      else if (r == -ECANCELED)
        return;
      else if (r == -EAGAIN)
        osdmon->dispatch(op);
      else
        assert(0 == "bad C_PoolOp return value");
    }
  };
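
  // These completions share one retry contract: r >= 0 finishes the reply,
  // -ECANCELED drops it, and -EAGAIN requeues the op via dispatch(). An
  // illustrative use parks an op until the service is readable again:
  //
  //   wait_for_readable(op, new C_ReplyMap(this, op, e));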

  bool preprocess_remove_snaps(MonOpRequestRef op);
  bool prepare_remove_snaps(MonOpRequestRef op);

  OpTracker op_tracker;

  int load_metadata(int osd, map<string, string>& m, ostream *err);
  void count_metadata(const string& field, Formatter *f);
public:
  void count_metadata(const string& field, map<string,int> *out);
protected:
  int get_osd_objectstore_type(int osd, std::string *type);
  bool is_pool_currently_all_bluestore(int64_t pool_id, const pg_pool_t &pool,
                                       ostream *err);

  // when we last received PG stats from each osd
  map<int,utime_t> last_osd_report;
  // TODO: use last_osd_report to store the osd report epochs, once we don't
  // need to upgrade from pre-luminous releases.
  map<int,epoch_t> osd_epochs;
  LastEpochClean last_epoch_clean;
  bool preprocess_beacon(MonOpRequestRef op);
  bool prepare_beacon(MonOpRequestRef op);
  epoch_t get_min_last_epoch_clean() const;

  friend class C_UpdateCreatingPGs;
  std::map<int, std::map<epoch_t, std::set<pg_t>>> creating_pgs_by_osd_epoch;
  std::vector<pg_t> pending_created_pgs;
  // the epoch when the pg mapping was calculated
  epoch_t creating_pgs_epoch = 0;
  creating_pgs_t creating_pgs;
  mutable std::mutex creating_pgs_lock;

  creating_pgs_t update_pending_pgs(const OSDMap::Incremental& inc,
                                    const OSDMap& nextmap);
  void trim_creating_pgs(creating_pgs_t *creating_pgs,
                         const ceph::unordered_map<pg_t,pg_stat_t>& pgm);
  unsigned scan_for_creating_pgs(
      const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
      const mempool::osdmap::set<int64_t>& removed_pools,
      utime_t modified,
      creating_pgs_t* creating_pgs) const;
  pair<int32_t, pg_t> get_parent_pg(pg_t pgid) const;
  void update_creating_pgs();
  void check_pg_creates_subs();
  epoch_t send_pg_creates(int osd, Connection *con, epoch_t next) const;

  int32_t _allocate_osd_id(int32_t* existing_id);

public:
  OSDMonitor(CephContext *cct, Monitor *mn, Paxos *p,
             const string& service_name);

  void tick() override;  // check state, take actions

  void get_health(list<pair<health_status_t,string> >& summary,
                  list<pair<health_status_t,string> > *detail,
                  CephContext *cct) const override;
  bool preprocess_command(MonOpRequestRef op);
  bool prepare_command(MonOpRequestRef op);
  bool prepare_command_impl(MonOpRequestRef op,
                            map<string,cmd_vartype>& cmdmap);

  int validate_osd_create(
      const int32_t id,
      const uuid_d& uuid,
      const bool check_osd_exists,
      int32_t* existing_id,
      stringstream& ss);
  int prepare_command_osd_create(
      const int32_t id,
      const uuid_d& uuid,
      int32_t* existing_id,
      stringstream& ss);
  void do_osd_create(const int32_t id, const uuid_d& uuid,
                     const string& device_class,
                     int32_t* new_id);
  int prepare_command_osd_purge(int32_t id, stringstream& ss);
  int prepare_command_osd_destroy(int32_t id, stringstream& ss);
  int _prepare_command_osd_crush_remove(
      CrushWrapper &newcrush,
      int32_t id,
      int32_t ancestor,
      bool has_ancestor,
      bool unlink_only);
  void do_osd_crush_remove(CrushWrapper& newcrush);
  int prepare_command_osd_crush_remove(
      CrushWrapper &newcrush,
      int32_t id,
      int32_t ancestor,
      bool has_ancestor,
      bool unlink_only);
  int prepare_command_osd_remove(int32_t id);
  int prepare_command_osd_new(
      MonOpRequestRef op,
      const map<string,cmd_vartype>& cmdmap,
      const map<string,string>& secrets,
      stringstream &ss,
      Formatter *f);

  int prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
                               stringstream& ss);
  int prepare_command_pool_application(const string &prefix,
                                       map<string,cmd_vartype> &cmdmap,
                                       stringstream& ss);

  bool handle_osd_timeouts(const utime_t &now,
                           std::map<int,utime_t> &last_osd_report);

  void send_latest(MonOpRequestRef op, epoch_t start=0);
  void send_latest_now_nodelete(MonOpRequestRef op, epoch_t start=0) {
    op->mark_osdmon_event(__func__);
    send_incremental(op, start);
  }

  int get_version(version_t ver, bufferlist& bl) override;
  int get_version_full(version_t ver, bufferlist& bl) override;

  epoch_t blacklist(const entity_addr_t& a, utime_t until);

  void dump_info(Formatter *f);
  int dump_osd_metadata(int osd, Formatter *f, ostream *err);
  void print_nodes(Formatter *f);

  void check_osdmap_sub(Subscription *sub);
  void check_pg_creates_sub(Subscription *sub);

  void do_application_enable(int64_t pool_id, const std::string &app_name);

  void add_flag(int flag) {
    if (!(osdmap.flags & flag)) {
      if (pending_inc.new_flags < 0)
        pending_inc.new_flags = osdmap.flags;
      pending_inc.new_flags |= flag;
    }
  }

  void remove_flag(int flag) {
    if (osdmap.flags & flag) {
      if (pending_inc.new_flags < 0)
        pending_inc.new_flags = osdmap.flags;
      pending_inc.new_flags &= ~flag;
    }
  }
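
  // Illustrative sketch: a staged flag change only takes effect once the
  // pending incremental commits through Paxos, so a caller would roughly do:
  //
  //   add_flag(CEPH_OSDMAP_NOOUT);  // stage the flag in pending_inc
  //   propose_pending();            // let PaxosService commit a new epoch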
};

#endif