1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 */
/* Object Store Device (OSD) Monitor */
21 #ifndef CEPH_OSDMONITOR_H
22 #define CEPH_OSDMONITOR_H
28 #include "include/types.h"
29 #include "common/simple_cache.hpp"
30 #include "msg/Messenger.h"
32 #include "osd/OSDMap.h"
33 #include "osd/OSDMapMapping.h"
35 #include "CreatingPGs.h"
36 #include "PaxosService.h"
43 #include "erasure-code/ErasureCodeInterface.h"
44 #include "mon/MonOpRequest.h"
45 #include <boost/functional/hash.hpp>
46 // re-include our assert to clobber the system one; fix dout:
47 #include "include/assert.h"
49 /// information about a particular peer's failure reports for one osd
50 struct failure_reporter_t
{
51 utime_t failed_since
; ///< when they think it failed
52 MonOpRequestRef op
; ///< failure op request
54 failure_reporter_t() {}
55 explicit failure_reporter_t(utime_t s
) : failed_since(s
) {}
56 ~failure_reporter_t() { }
59 /// information about all failure reports for one osd
60 struct failure_info_t
{
61 map
<int, failure_reporter_t
> reporters
; ///< reporter -> failed_since etc
62 utime_t max_failed_since
; ///< most recent failed_since
66 utime_t
get_failed_since() {
67 if (max_failed_since
== utime_t() && !reporters
.empty()) {
68 // the old max must have canceled; recalculate.
69 for (map
<int, failure_reporter_t
>::iterator p
= reporters
.begin();
72 if (p
->second
.failed_since
> max_failed_since
)
73 max_failed_since
= p
->second
.failed_since
;
75 return max_failed_since
;
78 // set the message for the latest report. return any old op request we had,
79 // if any, so we can discard it.
80 MonOpRequestRef
add_report(int who
, utime_t failed_since
,
82 map
<int, failure_reporter_t
>::iterator p
= reporters
.find(who
);
83 if (p
== reporters
.end()) {
84 if (max_failed_since
< failed_since
)
85 max_failed_since
= failed_since
;
86 p
= reporters
.insert(map
<int, failure_reporter_t
>::value_type(who
, failure_reporter_t(failed_since
))).first
;
89 MonOpRequestRef ret
= p
->second
.op
;
94 void take_report_messages(list
<MonOpRequestRef
>& ls
) {
95 for (map
<int, failure_reporter_t
>::iterator p
= reporters
.begin();
99 ls
.push_back(p
->second
.op
);
100 p
->second
.op
.reset();
105 MonOpRequestRef
cancel_report(int who
) {
106 map
<int, failure_reporter_t
>::iterator p
= reporters
.find(who
);
107 if (p
== reporters
.end())
108 return MonOpRequestRef();
109 MonOpRequestRef ret
= p
->second
.op
;
116 class LastEpochClean
{
118 vector
<epoch_t
> epoch_by_pg
;
119 ps_t next_missing
= 0;
120 epoch_t floor
= std::numeric_limits
<epoch_t
>::max();
121 void report(ps_t pg
, epoch_t last_epoch_clean
);
123 std::map
<uint64_t, Lec
> report_by_pool
;
125 void report(const pg_t
& pg
, epoch_t last_epoch_clean
);
126 void remove_pool(uint64_t pool
);
127 epoch_t
get_lower_bound(const OSDMap
& latest
) const;
131 class OSDMonitor
: public PaxosService
{
138 OSDMap::Incremental pending_inc
;
139 map
<int, bufferlist
> pending_metadata
;
140 set
<int> pending_metadata_rm
;
141 map
<int, failure_info_t
> failure_info
;
142 map
<int,utime_t
> down_pending_out
; // osd down -> out
144 map
<int,double> osd_weight
;
146 using osdmap_key_t
= std::pair
<version_t
, uint64_t>;
147 using osdmap_cache_t
= SimpleLRU
<osdmap_key_t
,
149 std::less
<osdmap_key_t
>,
150 boost::hash
<osdmap_key_t
>>;
151 osdmap_cache_t inc_osd_cache
;
152 osdmap_cache_t full_osd_cache
;
154 bool check_failures(utime_t now
);
155 bool check_failure(utime_t now
, int target_osd
, failure_info_t
& fi
);
156 void force_failure(int target_osd
, int by
);
158 bool _have_pending_crush();
159 CrushWrapper
&_get_stable_crush();
160 void _get_pending_crush(CrushWrapper
& newcrush
);
170 void create_initial() override
;
171 void get_store_prefixes(std::set
<string
>& s
) override
;
174 void update_from_paxos(bool *need_bootstrap
) override
;
175 void create_pending() override
; // prepare a new pending
176 void encode_pending(MonitorDBStore::TransactionRef t
) override
;
177 void on_active() override
;
178 void on_restart() override
;
179 void on_shutdown() override
;
181 * we haven't delegated full version stashing to paxosservice for some time
182 * now, making this function useless in current context.
184 void encode_full(MonitorDBStore::TransactionRef t
) override
{ }
186 * do not let paxosservice periodically stash full osdmaps, or we will break our
187 * locally-managed full maps. (update_from_paxos loads the latest and writes them
188 * out going forward from there, but if we just synced that may mean we skip some.)
190 bool should_stash_full() override
{
195 * hook into trim to include the oldest full map in the trim transaction
197 * This ensures that anyone post-sync will have enough to rebuild their
200 void encode_trim_extra(MonitorDBStore::TransactionRef tx
, version_t first
) override
;
202 void update_msgr_features();
203 int check_cluster_features(uint64_t features
, stringstream
&ss
);
205 * check if the cluster supports the features required by the
206 * given crush map. Outputs the daemons which don't support it
207 * to the stringstream.
209 * @returns true if the map is passable, false otherwise
211 bool validate_crush_against_features(const CrushWrapper
*newcrush
,
213 void check_osdmap_subs();
214 void share_map_with_random_osd();
216 Mutex prime_pg_temp_lock
= {"OSDMonitor::prime_pg_temp_lock"};
217 struct PrimeTempJob
: public ParallelPGMapper::Job
{
219 PrimeTempJob(const OSDMap
& om
, OSDMonitor
*m
)
220 : ParallelPGMapper::Job(&om
), osdmon(m
) {}
221 void process(int64_t pool
, unsigned ps_begin
, unsigned ps_end
) override
{
222 for (unsigned ps
= ps_begin
; ps
< ps_end
; ++ps
) {
224 osdmon
->prime_pg_temp(*osdmap
, pgid
);
227 void complete() override
{}
229 void maybe_prime_pg_temp();
230 void prime_pg_temp(const OSDMap
& next
, pg_t pgid
);
232 ParallelPGMapper mapper
; ///< for background pg work
233 OSDMapMapping mapping
; ///< pg <-> osd mappings
234 unique_ptr
<ParallelPGMapper::Job
> mapping_job
; ///< background mapping job
235 void start_mapping();
237 void update_logger();
239 void handle_query(PaxosServiceMessage
*m
);
240 bool preprocess_query(MonOpRequestRef op
) override
; // true if processed.
241 bool prepare_update(MonOpRequestRef op
) override
;
242 bool should_propose(double &delay
) override
;
244 version_t
get_trim_to() override
;
246 bool can_mark_down(int o
);
247 bool can_mark_up(int o
);
248 bool can_mark_out(int o
);
249 bool can_mark_in(int o
);
252 MOSDMap
*build_latest_full(uint64_t features
);
253 MOSDMap
*build_incremental(epoch_t first
, epoch_t last
, uint64_t features
);
254 void send_full(MonOpRequestRef op
);
255 void send_incremental(MonOpRequestRef op
, epoch_t first
);
257 // @param req an optional op request, if the osdmaps are replies to it. so
258 // @c Monitor::send_reply() can mark_event with it.
259 void send_incremental(epoch_t first
, MonSession
*session
, bool onetime
,
260 MonOpRequestRef req
= MonOpRequestRef());
263 void print_utilization(ostream
&out
, Formatter
*f
, bool tree
) const;
265 bool check_source(PaxosServiceMessage
*m
, uuid_d fsid
);
267 bool preprocess_get_osdmap(MonOpRequestRef op
);
269 bool preprocess_mark_me_down(MonOpRequestRef op
);
271 friend class C_AckMarkedDown
;
272 bool preprocess_failure(MonOpRequestRef op
);
273 bool prepare_failure(MonOpRequestRef op
);
274 bool prepare_mark_me_down(MonOpRequestRef op
);
275 void process_failures();
276 void take_all_failures(list
<MonOpRequestRef
>& ls
);
278 bool preprocess_full(MonOpRequestRef op
);
279 bool prepare_full(MonOpRequestRef op
);
281 bool preprocess_boot(MonOpRequestRef op
);
282 bool prepare_boot(MonOpRequestRef op
);
283 void _booted(MonOpRequestRef op
, bool logit
);
285 void update_up_thru(int from
, epoch_t up_thru
);
286 bool preprocess_alive(MonOpRequestRef op
);
287 bool prepare_alive(MonOpRequestRef op
);
288 void _reply_map(MonOpRequestRef op
, epoch_t e
);
290 bool preprocess_pgtemp(MonOpRequestRef op
);
291 bool prepare_pgtemp(MonOpRequestRef op
);
293 bool preprocess_pg_created(MonOpRequestRef op
);
294 bool prepare_pg_created(MonOpRequestRef op
);
296 int _check_remove_pool(int64_t pool_id
, const pg_pool_t
&pool
, ostream
*ss
);
297 bool _check_become_tier(
298 int64_t tier_pool_id
, const pg_pool_t
*tier_pool
,
299 int64_t base_pool_id
, const pg_pool_t
*base_pool
,
300 int *err
, ostream
*ss
) const;
301 bool _check_remove_tier(
302 int64_t base_pool_id
, const pg_pool_t
*base_pool
, const pg_pool_t
*tier_pool
,
303 int *err
, ostream
*ss
) const;
305 int _prepare_remove_pool(int64_t pool
, ostream
*ss
, bool no_fake
);
306 int _prepare_rename_pool(int64_t pool
, string newname
);
308 bool enforce_pool_op_caps(MonOpRequestRef op
);
309 bool preprocess_pool_op (MonOpRequestRef op
);
310 bool preprocess_pool_op_create (MonOpRequestRef op
);
311 bool prepare_pool_op (MonOpRequestRef op
);
312 bool prepare_pool_op_create (MonOpRequestRef op
);
313 bool prepare_pool_op_delete(MonOpRequestRef op
);
314 int crush_rename_bucket(const string
& srcname
,
315 const string
& dstname
,
317 void check_legacy_ec_plugin(const string
& plugin
,
318 const string
& profile
) const;
319 int normalize_profile(const string
& profilename
,
320 ErasureCodeProfile
&profile
,
323 int crush_rule_create_erasure(const string
&name
,
324 const string
&profile
,
327 int get_crush_rule(const string
&rule_name
,
330 int get_erasure_code(const string
&erasure_code_profile
,
331 ErasureCodeInterfaceRef
*erasure_code
,
333 int prepare_pool_crush_rule(const unsigned pool_type
,
334 const string
&erasure_code_profile
,
335 const string
&rule_name
,
338 bool erasure_code_profile_in_use(
339 const mempool::osdmap::map
<int64_t, pg_pool_t
> &pools
,
340 const string
&profile
,
342 int parse_erasure_code_profile(const vector
<string
> &erasure_code_profile
,
343 map
<string
,string
> *erasure_code_profile_map
,
345 int prepare_pool_size(const unsigned pool_type
,
346 const string
&erasure_code_profile
,
347 unsigned *size
, unsigned *min_size
,
349 int prepare_pool_stripe_width(const unsigned pool_type
,
350 const string
&erasure_code_profile
,
351 unsigned *stripe_width
,
353 int check_pg_num(int64_t pool
, int pg_num
, int size
, ostream
* ss
);
354 int prepare_new_pool(string
& name
, uint64_t auid
,
356 const string
&crush_rule_name
,
357 unsigned pg_num
, unsigned pgp_num
,
358 const string
&erasure_code_profile
,
359 const unsigned pool_type
,
360 const uint64_t expected_num_objects
,
361 FastReadType fast_read
,
363 int prepare_new_pool(MonOpRequestRef op
);
365 void set_pool_flags(int64_t pool_id
, uint64_t flags
);
366 void clear_pool_flags(int64_t pool_id
, uint64_t flags
);
367 bool update_pools_status();
369 bool prepare_set_flag(MonOpRequestRef op
, int flag
);
370 bool prepare_unset_flag(MonOpRequestRef op
, int flag
);
372 void _pool_op_reply(MonOpRequestRef op
,
373 int ret
, epoch_t epoch
, bufferlist
*blp
=NULL
);
375 struct C_Booted
: public C_MonOp
{
378 C_Booted(OSDMonitor
*cm
, MonOpRequestRef op_
, bool l
=true) :
379 C_MonOp(op_
), cmon(cm
), logit(l
) {}
380 void _finish(int r
) override
{
382 cmon
->_booted(op
, logit
);
383 else if (r
== -ECANCELED
)
385 else if (r
== -EAGAIN
)
388 assert(0 == "bad C_Booted return value");
392 struct C_ReplyMap
: public C_MonOp
{
395 C_ReplyMap(OSDMonitor
*o
, MonOpRequestRef op_
, epoch_t ee
)
396 : C_MonOp(op_
), osdmon(o
), e(ee
) {}
397 void _finish(int r
) override
{
399 osdmon
->_reply_map(op
, e
);
400 else if (r
== -ECANCELED
)
402 else if (r
== -EAGAIN
)
403 osdmon
->dispatch(op
);
405 assert(0 == "bad C_ReplyMap return value");
408 struct C_PoolOp
: public C_MonOp
{
412 bufferlist reply_data
;
413 C_PoolOp(OSDMonitor
* osd
, MonOpRequestRef op_
, int rc
, int e
, bufferlist
*rd
=NULL
) :
414 C_MonOp(op_
), osdmon(osd
), replyCode(rc
), epoch(e
) {
418 void _finish(int r
) override
{
420 osdmon
->_pool_op_reply(op
, replyCode
, epoch
, &reply_data
);
421 else if (r
== -ECANCELED
)
423 else if (r
== -EAGAIN
)
424 osdmon
->dispatch(op
);
426 assert(0 == "bad C_PoolOp return value");
430 bool preprocess_remove_snaps(MonOpRequestRef op
);
431 bool prepare_remove_snaps(MonOpRequestRef op
);
433 OpTracker op_tracker
;
435 int load_metadata(int osd
, map
<string
, string
>& m
, ostream
*err
);
436 void count_metadata(const string
& field
, Formatter
*f
);
438 void reencode_incremental_map(bufferlist
& bl
, uint64_t features
);
439 void reencode_full_map(bufferlist
& bl
, uint64_t features
);
441 void count_metadata(const string
& field
, map
<string
,int> *out
);
443 int get_osd_objectstore_type(int osd
, std::string
*type
);
444 bool is_pool_currently_all_bluestore(int64_t pool_id
, const pg_pool_t
&pool
,
447 // when we last received PG stats from each osd
448 map
<int,utime_t
> last_osd_report
;
449 // TODO: use last_osd_report to store the osd report epochs, once we don't
450 // need to upgrade from pre-luminous releases.
451 map
<int,epoch_t
> osd_epochs
;
452 LastEpochClean last_epoch_clean
;
453 bool preprocess_beacon(MonOpRequestRef op
);
454 bool prepare_beacon(MonOpRequestRef op
);
455 epoch_t
get_min_last_epoch_clean() const;
457 friend class C_UpdateCreatingPGs
;
458 std::map
<int, std::map
<epoch_t
, std::set
<pg_t
>>> creating_pgs_by_osd_epoch
;
459 std::vector
<pg_t
> pending_created_pgs
;
460 // the epoch when the pg mapping was calculated
461 epoch_t creating_pgs_epoch
= 0;
462 creating_pgs_t creating_pgs
;
463 mutable std::mutex creating_pgs_lock
;
465 creating_pgs_t
update_pending_pgs(const OSDMap::Incremental
& inc
,
466 const OSDMap
& nextmap
);
467 void trim_creating_pgs(creating_pgs_t
*creating_pgs
,
468 const ceph::unordered_map
<pg_t
,pg_stat_t
>& pgm
);
469 unsigned scan_for_creating_pgs(
470 const mempool::osdmap::map
<int64_t,pg_pool_t
>& pools
,
471 const mempool::osdmap::set
<int64_t>& removed_pools
,
473 creating_pgs_t
* creating_pgs
) const;
474 pair
<int32_t, pg_t
> get_parent_pg(pg_t pgid
) const;
475 void update_creating_pgs();
476 void check_pg_creates_subs();
477 epoch_t
send_pg_creates(int osd
, Connection
*con
, epoch_t next
) const;
479 int32_t _allocate_osd_id(int32_t* existing_id
);
482 OSDMonitor(CephContext
*cct
, Monitor
*mn
, Paxos
*p
, const string
& service_name
);
484 void tick() override
; // check state, take actions
486 void get_health(list
<pair
<health_status_t
,string
> >& summary
,
487 list
<pair
<health_status_t
,string
> > *detail
,
488 CephContext
*cct
) const override
;
489 bool preprocess_command(MonOpRequestRef op
);
490 bool prepare_command(MonOpRequestRef op
);
491 bool prepare_command_impl(MonOpRequestRef op
, map
<string
,cmd_vartype
>& cmdmap
);
493 int validate_osd_create(
496 const bool check_osd_exists
,
497 int32_t* existing_id
,
499 int prepare_command_osd_create(
502 int32_t* existing_id
,
504 void do_osd_create(const int32_t id
, const uuid_d
& uuid
,
505 const string
& device_class
,
507 int prepare_command_osd_purge(int32_t id
, stringstream
& ss
);
508 int prepare_command_osd_destroy(int32_t id
, stringstream
& ss
);
509 int _prepare_command_osd_crush_remove(
510 CrushWrapper
&newcrush
,
515 void do_osd_crush_remove(CrushWrapper
& newcrush
);
516 int prepare_command_osd_crush_remove(
517 CrushWrapper
&newcrush
,
522 int prepare_command_osd_remove(int32_t id
);
523 int prepare_command_osd_new(
525 const map
<string
,cmd_vartype
>& cmdmap
,
526 const map
<string
,string
>& secrets
,
530 int prepare_command_pool_set(map
<string
,cmd_vartype
> &cmdmap
,
532 int prepare_command_pool_application(const string
&prefix
,
533 map
<string
,cmd_vartype
> &cmdmap
,
536 bool handle_osd_timeouts(const utime_t
&now
,
537 std::map
<int,utime_t
> &last_osd_report
);
539 void send_latest(MonOpRequestRef op
, epoch_t start
=0);
540 void send_latest_now_nodelete(MonOpRequestRef op
, epoch_t start
=0) {
541 op
->mark_osdmon_event(__func__
);
542 send_incremental(op
, start
);
545 int get_version(version_t ver
, bufferlist
& bl
) override
;
546 int get_version(version_t ver
, uint64_t feature
, bufferlist
& bl
);
548 int get_version_full(version_t ver
, uint64_t feature
, bufferlist
& bl
);
549 int get_version_full(version_t ver
, bufferlist
& bl
) override
;
551 epoch_t
blacklist(const entity_addr_t
& a
, utime_t until
);
553 void dump_info(Formatter
*f
);
554 int dump_osd_metadata(int osd
, Formatter
*f
, ostream
*err
);
555 void print_nodes(Formatter
*f
);
557 void check_osdmap_sub(Subscription
*sub
);
558 void check_pg_creates_sub(Subscription
*sub
);
560 void do_application_enable(int64_t pool_id
, const std::string
&app_name
);
562 void add_flag(int flag
) {
563 if (!(osdmap
.flags
& flag
)) {
564 if (pending_inc
.new_flags
< 0)
565 pending_inc
.new_flags
= osdmap
.flags
;
566 pending_inc
.new_flags
|= flag
;
570 void remove_flag(int flag
) {
571 if(osdmap
.flags
& flag
) {
572 if (pending_inc
.new_flags
< 0)
573 pending_inc
.new_flags
= osdmap
.flags
;
574 pending_inc
.new_flags
&= ~flag
;