]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/OSDMonitor.h
update sources to 12.2.7
[ceph.git] / ceph / src / mon / OSDMonitor.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18 /* Object Store Device (OSD) Monitor
19 */
20
21 #ifndef CEPH_OSDMONITOR_H
22 #define CEPH_OSDMONITOR_H
23
24 #include <map>
25 #include <set>
26 using namespace std;
27
28 #include "include/types.h"
29 #include "common/simple_cache.hpp"
30 #include "msg/Messenger.h"
31
32 #include "osd/OSDMap.h"
33 #include "osd/OSDMapMapping.h"
34
35 #include "CreatingPGs.h"
36 #include "PaxosService.h"
37
38 class Monitor;
39 class PGMap;
40 class MonSession;
41 class MOSDMap;
42
43 #include "erasure-code/ErasureCodeInterface.h"
44 #include "mon/MonOpRequest.h"
45 #include <boost/functional/hash.hpp>
46 // re-include our assert to clobber the system one; fix dout:
47 #include "include/assert.h"
48
49 /// information about a particular peer's failure reports for one osd
50 struct failure_reporter_t {
51 utime_t failed_since; ///< when they think it failed
52 MonOpRequestRef op; ///< failure op request
53
54 failure_reporter_t() {}
55 explicit failure_reporter_t(utime_t s) : failed_since(s) {}
56 ~failure_reporter_t() { }
57 };
58
59 /// information about all failure reports for one osd
60 struct failure_info_t {
61 map<int, failure_reporter_t> reporters; ///< reporter -> failed_since etc
62 utime_t max_failed_since; ///< most recent failed_since
63
64 failure_info_t() {}
65
66 utime_t get_failed_since() {
67 if (max_failed_since == utime_t() && !reporters.empty()) {
68 // the old max must have canceled; recalculate.
69 for (map<int, failure_reporter_t>::iterator p = reporters.begin();
70 p != reporters.end();
71 ++p)
72 if (p->second.failed_since > max_failed_since)
73 max_failed_since = p->second.failed_since;
74 }
75 return max_failed_since;
76 }
77
78 // set the message for the latest report. return any old op request we had,
79 // if any, so we can discard it.
80 MonOpRequestRef add_report(int who, utime_t failed_since,
81 MonOpRequestRef op) {
82 map<int, failure_reporter_t>::iterator p = reporters.find(who);
83 if (p == reporters.end()) {
84 if (max_failed_since < failed_since)
85 max_failed_since = failed_since;
86 p = reporters.insert(map<int, failure_reporter_t>::value_type(who, failure_reporter_t(failed_since))).first;
87 }
88
89 MonOpRequestRef ret = p->second.op;
90 p->second.op = op;
91 return ret;
92 }
93
94 void take_report_messages(list<MonOpRequestRef>& ls) {
95 for (map<int, failure_reporter_t>::iterator p = reporters.begin();
96 p != reporters.end();
97 ++p) {
98 if (p->second.op) {
99 ls.push_back(p->second.op);
100 p->second.op.reset();
101 }
102 }
103 }
104
105 MonOpRequestRef cancel_report(int who) {
106 map<int, failure_reporter_t>::iterator p = reporters.find(who);
107 if (p == reporters.end())
108 return MonOpRequestRef();
109 MonOpRequestRef ret = p->second.op;
110 reporters.erase(p);
111 return ret;
112 }
113 };
114
115
/// Tracks, per pool, the last epoch in which each PG reported clean;
/// get_lower_bound() derives a single epoch from these reports
/// (presumably used as a floor for osdmap trimming — see the caller).
class LastEpochClean {
  /// per-pool bookkeeping of last-epoch-clean reports
  struct Lec {
    vector<epoch_t> epoch_by_pg;  ///< last clean epoch, indexed by PG number (ps_t)
    ps_t next_missing = 0;        ///< NOTE(review): appears to be the first PG not yet reported — confirm in report()
    epoch_t floor = std::numeric_limits<epoch_t>::max(); ///< starts at max; lowered as reports arrive
    void report(ps_t pg, epoch_t last_epoch_clean);
  };
  std::map<uint64_t, Lec> report_by_pool;  ///< pool id -> per-pool record
public:
  /// record that 'pg' was clean through 'last_epoch_clean'
  void report(const pg_t& pg, epoch_t last_epoch_clean);
  /// forget all reports for a removed pool
  void remove_pool(uint64_t pool);
  /// epoch lower bound over the pools present in 'latest'
  epoch_t get_lower_bound(const OSDMap& latest) const;
};
129
130
/**
 * OSDMonitor: the PaxosService in charge of the OSDMap.
 *
 * Holds the current map ('osdmap'), accumulates proposed changes in
 * 'pending_inc' (leader only), processes OSD failure reports, boot and
 * beacon messages, pool operations and 'osd' commands, and distributes
 * incremental/full maps to subscribers via send_incremental()/send_full().
 */
class OSDMonitor : public PaxosService {
  CephContext *cct;  ///< ceph context

public:
  OSDMap osdmap;     ///< the current map (refreshed in update_from_paxos)

  // [leader] -- state below is only meaningful on the paxos leader
  OSDMap::Incremental pending_inc;       ///< changes staged for the next epoch
  map<int, bufferlist> pending_metadata; ///< osd id -> metadata blob to store on commit
  set<int> pending_metadata_rm;          ///< osd ids whose metadata is queued for removal
  map<int, failure_info_t> failure_info; ///< outstanding failure reports, keyed by target osd
  map<int,utime_t> down_pending_out;     // osd down -> out

  map<int,double> osd_weight;            ///< osd id -> weight (consumers not visible in this header)

  // LRU caches of encoded maps.  Key is (version, features) — matching the
  // get_version(ver, feature, bl) / get_version_full(ver, feature, bl)
  // overloads below that re-encode maps for a client's feature set.
  using osdmap_key_t = std::pair<version_t, uint64_t>;
  using osdmap_cache_t = SimpleLRU<osdmap_key_t,
				   bufferlist,
				   std::less<osdmap_key_t>,
				   boost::hash<osdmap_key_t>>;
  osdmap_cache_t inc_osd_cache;   ///< cached encoded incremental maps
  osdmap_cache_t full_osd_cache;  ///< cached encoded full maps

  // failure handling entry points (definitions in OSDMonitor.cc)
  bool check_failures(utime_t now);
  bool check_failure(utime_t now, int target_osd, failure_info_t& fi);
  void force_failure(int target_osd, int by);

  // crush access helpers: stable (committed) vs pending (staged) crush map
  bool _have_pending_crush();
  CrushWrapper &_get_stable_crush();
  void _get_pending_crush(CrushWrapper& newcrush);

  /// tri-state for a pool's fast_read setting (see prepare_new_pool)
  enum FastReadType {
    FAST_READ_OFF,
    FAST_READ_ON,
    FAST_READ_DEFAULT
  };

  // svc -- PaxosService interface
public:
  void create_initial() override;
  void get_store_prefixes(std::set<string>& s) override;

private:
  void update_from_paxos(bool *need_bootstrap) override;
  void create_pending() override;  // prepare a new pending
  void encode_pending(MonitorDBStore::TransactionRef t) override;
  void on_active() override;
  void on_restart() override;
  void on_shutdown() override;
  /**
   * we haven't delegated full version stashing to paxosservice for some time
   * now, making this function useless in current context.
   */
  void encode_full(MonitorDBStore::TransactionRef t) override { }
  /**
   * do not let paxosservice periodically stash full osdmaps, or we will break our
   * locally-managed full maps.  (update_from_paxos loads the latest and writes them
   * out going forward from there, but if we just synced that may mean we skip some.)
   */
  bool should_stash_full() override {
    return false;
  }

  /**
   * hook into trim to include the oldest full map in the trim transaction
   *
   * This ensures that anyone post-sync will have enough to rebuild their
   * full osdmaps.
   */
  void encode_trim_extra(MonitorDBStore::TransactionRef tx, version_t first) override;

  void update_msgr_features();
  int check_cluster_features(uint64_t features, stringstream &ss);
  /**
   * check if the cluster supports the features required by the
   * given crush map. Outputs the daemons which don't support it
   * to the stringstream.
   *
   * @returns true if the map is passable, false otherwise
   */
  bool validate_crush_against_features(const CrushWrapper *newcrush,
				       stringstream &ss);
  void check_osdmap_subs();
  void share_map_with_random_osd();

  // -- pg_temp priming (background, parallelized via ParallelPGMapper) --
  Mutex prime_pg_temp_lock = {"OSDMonitor::prime_pg_temp_lock"};
  /// job that primes pg_temp for a range of PGs of one pool
  struct PrimeTempJob : public ParallelPGMapper::Job {
    OSDMonitor *osdmon;
    PrimeTempJob(const OSDMap& om, OSDMonitor *m)
      : ParallelPGMapper::Job(&om), osdmon(m) {}
    void process(int64_t pool, unsigned ps_begin, unsigned ps_end) override {
      // iterate PGs [ps_begin, ps_end) of 'pool'.  Note: 'osdmap' here is
      // ParallelPGMapper::Job's member (the map passed to the ctor above),
      // not OSDMonitor::osdmap.
      for (unsigned ps = ps_begin; ps < ps_end; ++ps) {
	pg_t pgid(ps, pool);
	osdmon->prime_pg_temp(*osdmap, pgid);
      }
    }
    void complete() override {}
  };
  void maybe_prime_pg_temp();
  void prime_pg_temp(const OSDMap& next, pg_t pgid);

  ParallelPGMapper mapper;                        ///< for background pg work
  OSDMapMapping mapping;                          ///< pg <-> osd mappings
  unique_ptr<ParallelPGMapper::Job> mapping_job;  ///< background mapping job
  void start_mapping();

  void update_logger();

  // -- message dispatch (PaxosService overrides) --
  void handle_query(PaxosServiceMessage *m);
  bool preprocess_query(MonOpRequestRef op) override;  // true if processed.
  bool prepare_update(MonOpRequestRef op) override;
  bool should_propose(double &delay) override;

  version_t get_trim_to() override;

  // policy checks for osd state transitions
  bool can_mark_down(int o);
  bool can_mark_up(int o);
  bool can_mark_out(int o);
  bool can_mark_in(int o);

  // -- map distribution --
  MOSDMap *build_latest_full(uint64_t features);
  MOSDMap *build_incremental(epoch_t first, epoch_t last, uint64_t features);
  void send_full(MonOpRequestRef op);
  void send_incremental(MonOpRequestRef op, epoch_t first);
public:
  // @param req an optional op request, if the osdmaps are replies to it. so
  //            @c Monitor::send_reply() can mark_event with it.
  void send_incremental(epoch_t first, MonSession *session, bool onetime,
			MonOpRequestRef req = MonOpRequestRef());

private:
  void print_utilization(ostream &out, Formatter *f, bool tree) const;

  /// sanity-check a message's fsid against ours
  bool check_source(PaxosServiceMessage *m, uuid_d fsid);

  bool preprocess_get_osdmap(MonOpRequestRef op);

  // -- failure reports and mark-me-down --
  // preprocess_* runs read-only (may answer without a proposal);
  // prepare_* stages changes into pending_inc.
  bool preprocess_mark_me_down(MonOpRequestRef op);

  friend class C_AckMarkedDown;
  bool preprocess_failure(MonOpRequestRef op);
  bool prepare_failure(MonOpRequestRef op);
  bool prepare_mark_me_down(MonOpRequestRef op);
  void process_failures();
  void take_all_failures(list<MonOpRequestRef>& ls);

  bool preprocess_full(MonOpRequestRef op);
  bool prepare_full(MonOpRequestRef op);

  // -- osd boot --
  bool preprocess_boot(MonOpRequestRef op);
  bool prepare_boot(MonOpRequestRef op);
  void _booted(MonOpRequestRef op, bool logit);

  // -- alive / up_thru --
  void update_up_thru(int from, epoch_t up_thru);
  bool preprocess_alive(MonOpRequestRef op);
  bool prepare_alive(MonOpRequestRef op);
  void _reply_map(MonOpRequestRef op, epoch_t e);

  // -- pg_temp --
  bool preprocess_pgtemp(MonOpRequestRef op);
  bool prepare_pgtemp(MonOpRequestRef op);

  // -- pg created acks --
  bool preprocess_pg_created(MonOpRequestRef op);
  bool prepare_pg_created(MonOpRequestRef op);

  // -- pool create/delete/tier checks --
  int _check_remove_pool(int64_t pool_id, const pg_pool_t &pool, ostream *ss);
  bool _check_become_tier(
      int64_t tier_pool_id, const pg_pool_t *tier_pool,
      int64_t base_pool_id, const pg_pool_t *base_pool,
      int *err, ostream *ss) const;
  bool _check_remove_tier(
      int64_t base_pool_id, const pg_pool_t *base_pool, const pg_pool_t *tier_pool,
      int *err, ostream *ss) const;

  int _prepare_remove_pool(int64_t pool, ostream *ss, bool no_fake);
  int _prepare_rename_pool(int64_t pool, string newname);

  // -- pool ops --
  bool enforce_pool_op_caps(MonOpRequestRef op);
  bool preprocess_pool_op (MonOpRequestRef op);
  bool preprocess_pool_op_create (MonOpRequestRef op);
  bool prepare_pool_op (MonOpRequestRef op);
  bool prepare_pool_op_create (MonOpRequestRef op);
  bool prepare_pool_op_delete(MonOpRequestRef op);
  // -- crush / erasure-code helpers --
  int crush_rename_bucket(const string& srcname,
			  const string& dstname,
			  ostream *ss);
  void check_legacy_ec_plugin(const string& plugin,
			      const string& profile) const;
  int normalize_profile(const string& profilename,
			ErasureCodeProfile &profile,
			bool force,
			ostream *ss);
  int crush_rule_create_erasure(const string &name,
				const string &profile,
				int *rule,
				ostream *ss);
  int get_crush_rule(const string &rule_name,
		     int *crush_rule,
		     ostream *ss);
  int get_erasure_code(const string &erasure_code_profile,
		       ErasureCodeInterfaceRef *erasure_code,
		       ostream *ss) const;
  int prepare_pool_crush_rule(const unsigned pool_type,
			      const string &erasure_code_profile,
			      const string &rule_name,
			      int *crush_rule,
			      ostream *ss);
  bool erasure_code_profile_in_use(
    const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
    const string &profile,
    ostream *ss);
  int parse_erasure_code_profile(const vector<string> &erasure_code_profile,
				 map<string,string> *erasure_code_profile_map,
				 ostream *ss);
  int prepare_pool_size(const unsigned pool_type,
			const string &erasure_code_profile,
			unsigned *size, unsigned *min_size,
			ostream *ss);
  int prepare_pool_stripe_width(const unsigned pool_type,
				const string &erasure_code_profile,
				unsigned *stripe_width,
				ostream *ss);
  int check_pg_num(int64_t pool, int pg_num, int size, ostream* ss);
  int prepare_new_pool(string& name, uint64_t auid,
		       int crush_rule,
		       const string &crush_rule_name,
		       unsigned pg_num, unsigned pgp_num,
		       const string &erasure_code_profile,
		       const unsigned pool_type,
		       const uint64_t expected_num_objects,
		       FastReadType fast_read,
		       ostream *ss);
  int prepare_new_pool(MonOpRequestRef op);

  void set_pool_flags(int64_t pool_id, uint64_t flags);
  void clear_pool_flags(int64_t pool_id, uint64_t flags);
  bool update_pools_status();

  bool prepare_set_flag(MonOpRequestRef op, int flag);
  bool prepare_unset_flag(MonOpRequestRef op, int flag);

  void _pool_op_reply(MonOpRequestRef op,
		      int ret, epoch_t epoch, bufferlist *blp=NULL);

  // -- completion contexts --
  // The C_* contexts below share one protocol: on success (r >= 0) invoke
  // the deferred handler; -ECANCELED drops the op silently; -EAGAIN
  // redispatches it; anything else is a programming error (assert).

  /// finish a deferred osd boot once the map it waited on is committed
  struct C_Booted : public C_MonOp {
    OSDMonitor *cmon;
    bool logit;
    C_Booted(OSDMonitor *cm, MonOpRequestRef op_, bool l=true) :
      C_MonOp(op_), cmon(cm), logit(l) {}
    void _finish(int r) override {
      if (r >= 0)
	cmon->_booted(op, logit);
      else if (r == -ECANCELED)
	return;
      else if (r == -EAGAIN)
	cmon->dispatch(op);
      else
	assert(0 == "bad C_Booted return value");
    }
  };

  /// reply with the map at epoch 'e' once it is available
  struct C_ReplyMap : public C_MonOp {
    OSDMonitor *osdmon;
    epoch_t e;
    C_ReplyMap(OSDMonitor *o, MonOpRequestRef op_, epoch_t ee)
      : C_MonOp(op_), osdmon(o), e(ee) {}
    void _finish(int r) override {
      if (r >= 0)
	osdmon->_reply_map(op, e);
      else if (r == -ECANCELED)
	return;
      else if (r == -EAGAIN)
	osdmon->dispatch(op);
      else
	assert(0 == "bad C_ReplyMap return value");
    }
  };
  /// send a deferred pool-op reply (return code, epoch, optional payload)
  struct C_PoolOp : public C_MonOp {
    OSDMonitor *osdmon;
    int replyCode;
    int epoch;
    bufferlist reply_data;
    C_PoolOp(OSDMonitor * osd, MonOpRequestRef op_, int rc, int e, bufferlist *rd=NULL) :
      C_MonOp(op_), osdmon(osd), replyCode(rc), epoch(e) {
      if (rd)
	reply_data = *rd;  // copy caller's payload; context may outlive it
    }
    void _finish(int r) override {
      if (r >= 0)
	osdmon->_pool_op_reply(op, replyCode, epoch, &reply_data);
      else if (r == -ECANCELED)
	return;
      else if (r == -EAGAIN)
	osdmon->dispatch(op);
      else
	assert(0 == "bad C_PoolOp return value");
    }
  };

  bool preprocess_remove_snaps(MonOpRequestRef op);
  bool prepare_remove_snaps(MonOpRequestRef op);

  OpTracker op_tracker;  ///< tracks in-flight ops for this service

  // -- osd metadata --
  int load_metadata(int osd, map<string, string>& m, ostream *err);
  void count_metadata(const string& field, Formatter *f);

  // re-encode cached map blobs for an older/different feature set
  void reencode_incremental_map(bufferlist& bl, uint64_t features);
  void reencode_full_map(bufferlist& bl, uint64_t features);
public:
  void count_metadata(const string& field, map<string,int> *out);
protected:
  int get_osd_objectstore_type(int osd, std::string *type);
  bool is_pool_currently_all_bluestore(int64_t pool_id, const pg_pool_t &pool,
				       ostream *err);

  // when we last received PG stats from each osd
  map<int,utime_t> last_osd_report;
  // TODO: use last_osd_report to store the osd report epochs, once we don't
  // need to upgrade from pre-luminous releases.
  map<int,epoch_t> osd_epochs;
  LastEpochClean last_epoch_clean;  ///< per-pool last-epoch-clean reports
  bool preprocess_beacon(MonOpRequestRef op);
  bool prepare_beacon(MonOpRequestRef op);
  epoch_t get_min_last_epoch_clean() const;

  // -- pg creation tracking --
  friend class C_UpdateCreatingPGs;
  /// osd -> (map epoch -> pgs that osd should be told to create)
  std::map<int, std::map<epoch_t, std::set<pg_t>>> creating_pgs_by_osd_epoch;
  std::vector<pg_t> pending_created_pgs;  ///< pgs acked as created, awaiting commit
  // the epoch when the pg mapping was calculated
  epoch_t creating_pgs_epoch = 0;
  creating_pgs_t creating_pgs;            ///< authoritative creating-pgs state
  mutable std::mutex creating_pgs_lock;   ///< guards creating_pgs (mutable: taken in const paths)

  creating_pgs_t update_pending_pgs(const OSDMap::Incremental& inc,
				    const OSDMap& nextmap);
  void trim_creating_pgs(creating_pgs_t *creating_pgs,
			 const ceph::unordered_map<pg_t,pg_stat_t>& pgm);
  unsigned scan_for_creating_pgs(
    const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
    const mempool::osdmap::set<int64_t>& removed_pools,
    utime_t modified,
    creating_pgs_t* creating_pgs) const;
  pair<int32_t, pg_t> get_parent_pg(pg_t pgid) const;
  void update_creating_pgs();
  void check_pg_creates_subs();
  epoch_t send_pg_creates(int osd, Connection *con, epoch_t next) const;

  int32_t _allocate_osd_id(int32_t* existing_id);

public:
  OSDMonitor(CephContext *cct, Monitor *mn, Paxos *p, const string& service_name);

  void tick() override;  // check state, take actions

  void get_health(list<pair<health_status_t,string> >& summary,
		  list<pair<health_status_t,string> > *detail,
		  CephContext *cct) const override;
  // -- command handling ('ceph osd ...') --
  bool preprocess_command(MonOpRequestRef op);
  bool prepare_command(MonOpRequestRef op);
  bool prepare_command_impl(MonOpRequestRef op, map<string,cmd_vartype>& cmdmap);

  // -- osd create / destroy / purge plumbing --
  int validate_osd_create(
      const int32_t id,
      const uuid_d& uuid,
      const bool check_osd_exists,
      int32_t* existing_id,
      stringstream& ss);
  int prepare_command_osd_create(
      const int32_t id,
      const uuid_d& uuid,
      int32_t* existing_id,
      stringstream& ss);
  void do_osd_create(const int32_t id, const uuid_d& uuid,
		     const string& device_class,
		     int32_t* new_id);
  int prepare_command_osd_purge(int32_t id, stringstream& ss);
  int prepare_command_osd_destroy(int32_t id, stringstream& ss);
  int _prepare_command_osd_crush_remove(
      CrushWrapper &newcrush,
      int32_t id,
      int32_t ancestor,
      bool has_ancestor,
      bool unlink_only);
  void do_osd_crush_remove(CrushWrapper& newcrush);
  int prepare_command_osd_crush_remove(
      CrushWrapper &newcrush,
      int32_t id,
      int32_t ancestor,
      bool has_ancestor,
      bool unlink_only);
  int prepare_command_osd_remove(int32_t id);
  int prepare_command_osd_new(
      MonOpRequestRef op,
      const map<string,cmd_vartype>& cmdmap,
      const map<string,string>& secrets,
      stringstream &ss,
      Formatter *f);

  int prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
			       stringstream& ss);
  int prepare_command_pool_application(const string &prefix,
				       map<string,cmd_vartype> &cmdmap,
				       stringstream& ss);

  bool handle_osd_timeouts(const utime_t &now,
			   std::map<int,utime_t> &last_osd_report);

  void send_latest(MonOpRequestRef op, epoch_t start=0);
  /// mark the op and immediately send incremental maps starting at 'start'
  void send_latest_now_nodelete(MonOpRequestRef op, epoch_t start=0) {
    op->mark_osdmon_event(__func__);
    send_incremental(op, start);
  }

  // feature-aware overloads re-encode for the requested feature set;
  // results are cached in inc_osd_cache / full_osd_cache
  int get_version(version_t ver, bufferlist& bl) override;
  int get_version(version_t ver, uint64_t feature, bufferlist& bl);

  int get_version_full(version_t ver, uint64_t feature, bufferlist& bl);
  int get_version_full(version_t ver, bufferlist& bl) override;

  /// blacklist 'a' until 'until'; returns the epoch carrying the change
  epoch_t blacklist(const entity_addr_t& a, utime_t until);

  void dump_info(Formatter *f);
  int dump_osd_metadata(int osd, Formatter *f, ostream *err);
  void print_nodes(Formatter *f);

  void check_osdmap_sub(Subscription *sub);
  void check_pg_creates_sub(Subscription *sub);

  void do_application_enable(int64_t pool_id, const std::string &app_name);

  /// stage setting an osdmap flag in pending_inc.  new_flags < 0 is the
  /// "unset" sentinel, so it is first seeded from the current osdmap.flags.
  void add_flag(int flag) {
    if (!(osdmap.flags & flag)) {
      if (pending_inc.new_flags < 0)
	pending_inc.new_flags = osdmap.flags;
      pending_inc.new_flags |= flag;
    }
  }

  /// stage clearing an osdmap flag in pending_inc (same sentinel seeding
  /// as add_flag)
  void remove_flag(int flag) {
    if(osdmap.flags & flag) {
      if (pending_inc.new_flags < 0)
	pending_inc.new_flags = osdmap.flags;
      pending_inc.new_flags &= ~flag;
    }
  }
};
578
579 #endif