// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 */

/* Object Store Device (OSD) Monitor
 */
#ifndef CEPH_OSDMONITOR_H
#define CEPH_OSDMONITOR_H

#include "include/types.h"
#include "common/simple_cache.hpp"
#include "msg/Messenger.h"

#include "osd/OSDMap.h"
#include "osd/OSDMapMapping.h"

#include "CreatingPGs.h"
#include "PaxosService.h"

#include "erasure-code/ErasureCodeInterface.h"
#include "mon/MonOpRequest.h"

#define OSD_METADATA_PREFIX "osd_metadata"
48 /// information about a particular peer's failure reports for one osd
49 struct failure_reporter_t
{
50 utime_t failed_since
; ///< when they think it failed
51 MonOpRequestRef op
; ///< failure op request
53 failure_reporter_t() {}
54 explicit failure_reporter_t(utime_t s
) : failed_since(s
) {}
55 ~failure_reporter_t() { }
58 /// information about all failure reports for one osd
59 struct failure_info_t
{
60 map
<int, failure_reporter_t
> reporters
; ///< reporter -> failed_since etc
61 utime_t max_failed_since
; ///< most recent failed_since
65 utime_t
get_failed_since() {
66 if (max_failed_since
== utime_t() && !reporters
.empty()) {
67 // the old max must have canceled; recalculate.
68 for (map
<int, failure_reporter_t
>::iterator p
= reporters
.begin();
71 if (p
->second
.failed_since
> max_failed_since
)
72 max_failed_since
= p
->second
.failed_since
;
74 return max_failed_since
;
77 // set the message for the latest report. return any old op request we had,
78 // if any, so we can discard it.
79 MonOpRequestRef
add_report(int who
, utime_t failed_since
,
81 map
<int, failure_reporter_t
>::iterator p
= reporters
.find(who
);
82 if (p
== reporters
.end()) {
83 if (max_failed_since
< failed_since
)
84 max_failed_since
= failed_since
;
85 p
= reporters
.insert(map
<int, failure_reporter_t
>::value_type(who
, failure_reporter_t(failed_since
))).first
;
88 MonOpRequestRef ret
= p
->second
.op
;
93 void take_report_messages(list
<MonOpRequestRef
>& ls
) {
94 for (map
<int, failure_reporter_t
>::iterator p
= reporters
.begin();
98 ls
.push_back(p
->second
.op
);
104 MonOpRequestRef
cancel_report(int who
) {
105 map
<int, failure_reporter_t
>::iterator p
= reporters
.find(who
);
106 if (p
== reporters
.end())
107 return MonOpRequestRef();
108 MonOpRequestRef ret
= p
->second
.op
;
115 class LastEpochClean
{
117 vector
<epoch_t
> epoch_by_pg
;
118 ps_t next_missing
= 0;
119 epoch_t floor
= std::numeric_limits
<epoch_t
>::max();
120 void report(ps_t pg
, epoch_t last_epoch_clean
);
122 std::map
<uint64_t, Lec
> report_by_pool
;
124 void report(const pg_t
& pg
, epoch_t last_epoch_clean
);
125 void remove_pool(uint64_t pool
);
126 epoch_t
get_lower_bound(const OSDMap
& latest
) const;
130 class OSDMonitor
: public PaxosService
{
137 OSDMap::Incremental pending_inc
;
138 map
<int, bufferlist
> pending_metadata
;
139 set
<int> pending_metadata_rm
;
140 map
<int, failure_info_t
> failure_info
;
141 map
<int,utime_t
> down_pending_out
; // osd down -> out
143 map
<int,double> osd_weight
;
145 SimpleLRU
<version_t
, bufferlist
> inc_osd_cache
;
146 SimpleLRU
<version_t
, bufferlist
> full_osd_cache
;
148 bool check_failures(utime_t now
);
149 bool check_failure(utime_t now
, int target_osd
, failure_info_t
& fi
);
150 void force_failure(utime_t now
, int target_osd
, int by
);
152 // the time of last msg(MSG_ALIVE and MSG_PGTEMP) proposed without delay
153 utime_t last_attempted_minwait_time
;
155 bool _have_pending_crush();
156 CrushWrapper
&_get_stable_crush();
157 void _get_pending_crush(CrushWrapper
& newcrush
);
167 void create_initial() override
;
168 void get_store_prefixes(std::set
<string
>& s
) override
;
171 void update_from_paxos(bool *need_bootstrap
) override
;
172 void create_pending() override
; // prepare a new pending
173 void encode_pending(MonitorDBStore::TransactionRef t
) override
;
174 void on_active() override
;
175 void on_restart() override
;
176 void on_shutdown() override
;
178 * we haven't delegated full version stashing to paxosservice for some time
179 * now, making this function useless in current context.
181 void encode_full(MonitorDBStore::TransactionRef t
) override
{ }
183 * do not let paxosservice periodically stash full osdmaps, or we will break our
184 * locally-managed full maps. (update_from_paxos loads the latest and writes them
185 * out going forward from there, but if we just synced that may mean we skip some.)
187 bool should_stash_full() override
{
192 * hook into trim to include the oldest full map in the trim transaction
194 * This ensures that anyone post-sync will have enough to rebuild their
197 void encode_trim_extra(MonitorDBStore::TransactionRef tx
, version_t first
) override
;
199 void update_msgr_features();
200 int check_cluster_features(uint64_t features
, stringstream
&ss
);
202 * check if the cluster supports the features required by the
203 * given crush map. Outputs the daemons which don't support it
204 * to the stringstream.
206 * @returns true if the map is passable, false otherwise
208 bool validate_crush_against_features(const CrushWrapper
*newcrush
,
210 void check_osdmap_subs();
211 void share_map_with_random_osd();
213 Mutex prime_pg_temp_lock
= {"OSDMonitor::prime_pg_temp_lock"};
214 struct PrimeTempJob
: public ParallelPGMapper::Job
{
216 PrimeTempJob(const OSDMap
& om
, OSDMonitor
*m
)
217 : ParallelPGMapper::Job(&om
), osdmon(m
) {}
218 void process(int64_t pool
, unsigned ps_begin
, unsigned ps_end
) override
{
219 for (unsigned ps
= ps_begin
; ps
< ps_end
; ++ps
) {
221 osdmon
->prime_pg_temp(*osdmap
, pgid
);
224 void complete() override
{}
226 void maybe_prime_pg_temp();
227 void prime_pg_temp(const OSDMap
& next
, pg_t pgid
);
229 ParallelPGMapper mapper
; ///< for background pg work
230 OSDMapMapping mapping
; ///< pg <-> osd mappings
231 unique_ptr
<ParallelPGMapper::Job
> mapping_job
; ///< background mapping job
232 void start_mapping();
234 void update_logger();
236 void handle_query(PaxosServiceMessage
*m
);
237 bool preprocess_query(MonOpRequestRef op
) override
; // true if processed.
238 bool prepare_update(MonOpRequestRef op
) override
;
239 bool should_propose(double &delay
) override
;
241 version_t
get_trim_to() override
;
243 bool can_mark_down(int o
);
244 bool can_mark_up(int o
);
245 bool can_mark_out(int o
);
246 bool can_mark_in(int o
);
249 MOSDMap
*build_latest_full();
250 MOSDMap
*build_incremental(epoch_t first
, epoch_t last
);
251 void send_full(MonOpRequestRef op
);
252 void send_incremental(MonOpRequestRef op
, epoch_t first
);
254 // @param req an optional op request, if the osdmaps are replies to it. so
255 // @c Monitor::send_reply() can mark_event with it.
256 void send_incremental(epoch_t first
, MonSession
*session
, bool onetime
,
257 MonOpRequestRef req
= MonOpRequestRef());
260 void print_utilization(ostream
&out
, Formatter
*f
, bool tree
) const;
262 bool check_source(PaxosServiceMessage
*m
, uuid_d fsid
);
264 bool preprocess_get_osdmap(MonOpRequestRef op
);
266 bool preprocess_mark_me_down(MonOpRequestRef op
);
268 friend class C_AckMarkedDown
;
269 bool preprocess_failure(MonOpRequestRef op
);
270 bool prepare_failure(MonOpRequestRef op
);
271 bool prepare_mark_me_down(MonOpRequestRef op
);
272 void process_failures();
273 void take_all_failures(list
<MonOpRequestRef
>& ls
);
275 bool preprocess_full(MonOpRequestRef op
);
276 bool prepare_full(MonOpRequestRef op
);
278 bool preprocess_boot(MonOpRequestRef op
);
279 bool prepare_boot(MonOpRequestRef op
);
280 void _booted(MonOpRequestRef op
, bool logit
);
282 void update_up_thru(int from
, epoch_t up_thru
);
283 bool preprocess_alive(MonOpRequestRef op
);
284 bool prepare_alive(MonOpRequestRef op
);
285 void _reply_map(MonOpRequestRef op
, epoch_t e
);
287 bool preprocess_pgtemp(MonOpRequestRef op
);
288 bool prepare_pgtemp(MonOpRequestRef op
);
290 bool preprocess_pg_created(MonOpRequestRef op
);
291 bool prepare_pg_created(MonOpRequestRef op
);
293 int _check_remove_pool(int64_t pool_id
, const pg_pool_t
&pool
, ostream
*ss
);
294 bool _check_become_tier(
295 int64_t tier_pool_id
, const pg_pool_t
*tier_pool
,
296 int64_t base_pool_id
, const pg_pool_t
*base_pool
,
297 int *err
, ostream
*ss
) const;
298 bool _check_remove_tier(
299 int64_t base_pool_id
, const pg_pool_t
*base_pool
, const pg_pool_t
*tier_pool
,
300 int *err
, ostream
*ss
) const;
302 int _prepare_remove_pool(int64_t pool
, ostream
*ss
, bool no_fake
);
303 int _prepare_rename_pool(int64_t pool
, string newname
);
305 bool preprocess_pool_op (MonOpRequestRef op
);
306 bool preprocess_pool_op_create (MonOpRequestRef op
);
307 bool prepare_pool_op (MonOpRequestRef op
);
308 bool prepare_pool_op_create (MonOpRequestRef op
);
309 bool prepare_pool_op_delete(MonOpRequestRef op
);
310 int crush_rename_bucket(const string
& srcname
,
311 const string
& dstname
,
313 void check_legacy_ec_plugin(const string
& plugin
,
314 const string
& profile
) const;
315 int normalize_profile(const string
& profilename
,
316 ErasureCodeProfile
&profile
,
319 int crush_rule_create_erasure(const string
&name
,
320 const string
&profile
,
323 int get_crush_rule(const string
&rule_name
,
326 int get_erasure_code(const string
&erasure_code_profile
,
327 ErasureCodeInterfaceRef
*erasure_code
,
329 int prepare_pool_crush_rule(const unsigned pool_type
,
330 const string
&erasure_code_profile
,
331 const string
&rule_name
,
334 bool erasure_code_profile_in_use(
335 const mempool::osdmap::map
<int64_t, pg_pool_t
> &pools
,
336 const string
&profile
,
338 int parse_erasure_code_profile(const vector
<string
> &erasure_code_profile
,
339 map
<string
,string
> *erasure_code_profile_map
,
341 int prepare_pool_size(const unsigned pool_type
,
342 const string
&erasure_code_profile
,
343 unsigned *size
, unsigned *min_size
,
345 int prepare_pool_stripe_width(const unsigned pool_type
,
346 const string
&erasure_code_profile
,
347 unsigned *stripe_width
,
349 int prepare_new_pool(string
& name
, uint64_t auid
,
351 const string
&crush_rule_name
,
352 unsigned pg_num
, unsigned pgp_num
,
353 const string
&erasure_code_profile
,
354 const unsigned pool_type
,
355 const uint64_t expected_num_objects
,
356 FastReadType fast_read
,
358 int prepare_new_pool(MonOpRequestRef op
);
360 void update_pool_flags(int64_t pool_id
, uint64_t flags
);
361 bool update_pools_status();
362 void get_pools_health(list
<pair
<health_status_t
,string
> >& summary
,
363 list
<pair
<health_status_t
,string
> > *detail
) const;
365 bool prepare_set_flag(MonOpRequestRef op
, int flag
);
366 bool prepare_unset_flag(MonOpRequestRef op
, int flag
);
368 void _pool_op_reply(MonOpRequestRef op
,
369 int ret
, epoch_t epoch
, bufferlist
*blp
=NULL
);
371 struct C_Booted
: public C_MonOp
{
374 C_Booted(OSDMonitor
*cm
, MonOpRequestRef op_
, bool l
=true) :
375 C_MonOp(op_
), cmon(cm
), logit(l
) {}
376 void _finish(int r
) override
{
378 cmon
->_booted(op
, logit
);
379 else if (r
== -ECANCELED
)
381 else if (r
== -EAGAIN
)
384 assert(0 == "bad C_Booted return value");
388 struct C_ReplyMap
: public C_MonOp
{
391 C_ReplyMap(OSDMonitor
*o
, MonOpRequestRef op_
, epoch_t ee
)
392 : C_MonOp(op_
), osdmon(o
), e(ee
) {}
393 void _finish(int r
) override
{
395 osdmon
->_reply_map(op
, e
);
396 else if (r
== -ECANCELED
)
398 else if (r
== -EAGAIN
)
399 osdmon
->dispatch(op
);
401 assert(0 == "bad C_ReplyMap return value");
404 struct C_PoolOp
: public C_MonOp
{
408 bufferlist reply_data
;
409 C_PoolOp(OSDMonitor
* osd
, MonOpRequestRef op_
, int rc
, int e
, bufferlist
*rd
=NULL
) :
410 C_MonOp(op_
), osdmon(osd
), replyCode(rc
), epoch(e
) {
414 void _finish(int r
) override
{
416 osdmon
->_pool_op_reply(op
, replyCode
, epoch
, &reply_data
);
417 else if (r
== -ECANCELED
)
419 else if (r
== -EAGAIN
)
420 osdmon
->dispatch(op
);
422 assert(0 == "bad C_PoolOp return value");
426 bool preprocess_remove_snaps(MonOpRequestRef op
);
427 bool prepare_remove_snaps(MonOpRequestRef op
);
429 OpTracker op_tracker
;
431 int load_metadata(int osd
, map
<string
, string
>& m
, ostream
*err
);
432 void count_metadata(const string
& field
, Formatter
*f
);
433 int get_osd_objectstore_type(int osd
, std::string
*type
);
434 bool is_pool_currently_all_bluestore(int64_t pool_id
, const pg_pool_t
&pool
,
437 // when we last received PG stats from each osd
438 map
<int,utime_t
> last_osd_report
;
439 // TODO: use last_osd_report to store the osd report epochs, once we don't
440 // need to upgrade from pre-luminous releases.
441 map
<int,epoch_t
> osd_epochs
;
442 LastEpochClean last_epoch_clean
;
443 bool preprocess_beacon(MonOpRequestRef op
);
444 bool prepare_beacon(MonOpRequestRef op
);
445 epoch_t
get_min_last_epoch_clean() const;
447 friend class C_UpdateCreatingPGs
;
448 std::map
<int, std::map
<epoch_t
, std::set
<pg_t
>>> creating_pgs_by_osd_epoch
;
449 std::vector
<pg_t
> pending_created_pgs
;
450 // the epoch when the pg mapping was calculated
451 epoch_t creating_pgs_epoch
= 0;
452 creating_pgs_t creating_pgs
;
453 std::mutex creating_pgs_lock
;
455 creating_pgs_t
update_pending_pgs(const OSDMap::Incremental
& inc
);
456 void trim_creating_pgs(creating_pgs_t
*creating_pgs
,
457 const ceph::unordered_map
<pg_t
,pg_stat_t
>& pgm
);
458 unsigned scan_for_creating_pgs(
459 const mempool::osdmap::map
<int64_t,pg_pool_t
>& pools
,
460 const mempool::osdmap::set
<int64_t>& removed_pools
,
462 creating_pgs_t
* creating_pgs
) const;
463 pair
<int32_t, pg_t
> get_parent_pg(pg_t pgid
) const;
464 void update_creating_pgs();
465 void check_pg_creates_subs();
466 epoch_t
send_pg_creates(int osd
, Connection
*con
, epoch_t next
);
468 int32_t _allocate_osd_id(int32_t* existing_id
);
471 OSDMonitor(CephContext
*cct
, Monitor
*mn
, Paxos
*p
, const string
& service_name
);
473 void tick() override
; // check state, take actions
475 void get_health(list
<pair
<health_status_t
,string
> >& summary
,
476 list
<pair
<health_status_t
,string
> > *detail
,
477 CephContext
*cct
) const override
;
478 bool preprocess_command(MonOpRequestRef op
);
479 bool prepare_command(MonOpRequestRef op
);
480 bool prepare_command_impl(MonOpRequestRef op
, map
<string
,cmd_vartype
>& cmdmap
);
482 int validate_osd_create(
485 const bool check_osd_exists
,
486 int32_t* existing_id
,
488 int prepare_command_osd_create(
491 int32_t* existing_id
,
493 void do_osd_create(const int32_t id
, const uuid_d
& uuid
, int32_t* new_id
);
494 int prepare_command_osd_purge(int32_t id
, stringstream
& ss
);
495 int prepare_command_osd_destroy(int32_t id
, stringstream
& ss
);
496 int _prepare_command_osd_crush_remove(
497 CrushWrapper
&newcrush
,
502 void do_osd_crush_remove(CrushWrapper
& newcrush
);
503 int prepare_command_osd_crush_remove(
504 CrushWrapper
&newcrush
,
509 int prepare_command_osd_remove(int32_t id
);
510 int prepare_command_osd_new(
512 const map
<string
,cmd_vartype
>& cmdmap
,
513 const map
<string
,string
>& secrets
,
517 int prepare_command_pool_set(map
<string
,cmd_vartype
> &cmdmap
,
520 bool handle_osd_timeouts(const utime_t
&now
,
521 std::map
<int,utime_t
> &last_osd_report
);
523 void send_latest(MonOpRequestRef op
, epoch_t start
=0);
524 void send_latest_now_nodelete(MonOpRequestRef op
, epoch_t start
=0) {
525 op
->mark_osdmon_event(__func__
);
526 send_incremental(op
, start
);
529 int get_version(version_t ver
, bufferlist
& bl
) override
;
530 int get_version_full(version_t ver
, bufferlist
& bl
) override
;
532 epoch_t
blacklist(const entity_addr_t
& a
, utime_t until
);
534 void dump_info(Formatter
*f
);
535 int dump_osd_metadata(int osd
, Formatter
*f
, ostream
*err
);
536 void print_nodes(Formatter
*f
);
538 void check_osdmap_sub(Subscription
*sub
);
539 void check_pg_creates_sub(Subscription
*sub
);
541 void add_flag(int flag
) {
542 if (!(osdmap
.flags
& flag
)) {
543 if (pending_inc
.new_flags
< 0)
544 pending_inc
.new_flags
= osdmap
.flags
;
545 pending_inc
.new_flags
|= flag
;
549 void remove_flag(int flag
) {
550 if(osdmap
.flags
& flag
) {
551 if (pending_inc
.new_flags
< 0)
552 pending_inc
.new_flags
= osdmap
.flags
;
553 pending_inc
.new_flags
&= ~flag
;