ceph/src/mon/OSDMonitor.h

   1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
   2 // vim: ts=8 sw=2 smarttab
   3 /*
   4  * Ceph - scalable distributed file system
   5  *
   6  * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
   7  * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
   8  *
   9  * Author: Loic Dachary <loic@dachary.org>
  10  *
  11  * This is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public
  13  * License version 2.1, as published by the Free Software
  14  * Foundation.  See file COPYING.
  15  *
  16  */
  17
  18 /* Object Store Device (OSD) Monitor
  19  */
  20
  21 #ifndef CEPH_OSDMONITOR_H
  22 #define CEPH_OSDMONITOR_H
  23
  24 #include <map>
  25 #include <set>
  26 #include <utility>
  27 #include <sstream>
  28
  29 #include "include/types.h"
  30 #include "include/encoding.h"
  31 #include "common/simple_cache.hpp"
  32 #include "common/PriorityCache.h"
  33 #include "msg/Messenger.h"
  34
  35 #include "osd/OSDMap.h"
  36 #include "osd/OSDMapMapping.h"
  37
  38 #include "CreatingPGs.h"
  39 #include "PaxosService.h"
  40
  41 #include "erasure-code/ErasureCodeInterface.h"
  42 #include "mon/MonOpRequest.h"
  43 #include <boost/functional/hash.hpp>
  44
  45 class Monitor;
  46 class PGMap;
  47 struct MonSession;
  48 class MOSDMap;
  49
  50
  51 /// information about a particular peer's failure reports for one osd
  52 struct failure_reporter_t {
  53   utime_t failed_since;     ///< when they think it failed
  54   MonOpRequestRef op;       ///< failure op request
  55
  56   failure_reporter_t() {}
  57   failure_reporter_t(utime_t s, MonOpRequestRef op)
  58     : failed_since(s), op(op) {}
  59   ~failure_reporter_t() { }
  60 };
  61
  62 /// information about all failure reports for one osd
  63 struct failure_info_t {
  64   std::map<int, failure_reporter_t> reporters;  ///< reporter -> failed_since etc
  65   utime_t max_failed_since;                ///< most recent failed_since
  66
  67   failure_info_t() {}
  68
  69   utime_t get_failed_since() {
  70     if (max_failed_since == utime_t() && !reporters.empty()) {
  71       // the old max must have canceled; recalculate.
  72       for (auto p = reporters.begin(); p != reporters.end(); ++p)
  73         if (p->second.failed_since > max_failed_since)
  74           max_failed_since = p->second.failed_since;
  75     }
  76     return max_failed_since;
  77   }
  78
  79   // set the message for the latest report.
  80   void add_report(int who, utime_t failed_since, MonOpRequestRef op) {
  81     [[maybe_unused]] auto [it, new_reporter] =
  82       reporters.insert_or_assign(who, failure_reporter_t{failed_since, op});
  83     if (new_reporter) {
  84       if (max_failed_since != utime_t() && max_failed_since < failed_since) {
  85         max_failed_since = failed_since;
  86       }
  87     }
  88   }
  89
  90   void take_report_messages(std::list<MonOpRequestRef>& ls) {
  91     for (auto p = reporters.begin(); p != reporters.end(); ++p) {
  92       if (p->second.op) {
  93         ls.push_back(p->second.op);
  94         p->second.op.reset();
  95       }
  96     }
  97   }
  98
  99   void cancel_report(int who) {
 100     reporters.erase(who);
 101     max_failed_since = utime_t();
 102   }
 103 };
 104
 105
/// Bookkeeping for last_epoch_clean values reported for PGs, stored per
/// pool.  Only declarations are visible in this header; the method
/// bodies are defined elsewhere.
class LastEpochClean {
  /// Per-pool state (the mapped type of report_by_pool).
  struct Lec {
    // NOTE(review): presumably one reported epoch per pg of the pool -- confirm in the .cc
    std::vector<epoch_t> epoch_by_pg;
    // starts at 0; NOTE(review): looks like a cursor over ps values -- confirm in the .cc
    ps_t next_missing = 0;
    // starts at the largest epoch_t value
    epoch_t floor = std::numeric_limits<epoch_t>::max();
    void report(unsigned pg_num, ps_t pg, epoch_t last_epoch_clean);
  };
  // keyed by a 64-bit id; presumably the pool id (cf. remove_pool(uint64_t pool))
  std::map<uint64_t, Lec> report_by_pool;
public:
  /// Record @p last_epoch_clean for @p pg; @p pg_num is passed alongside
  /// (presumably the pool's pg count -- verify against the caller).
  void report(unsigned pg_num, const pg_t& pg, epoch_t last_epoch_clean);
  /// NOTE(review): by its name, forgets the state kept for @p pool -- confirm.
  void remove_pool(uint64_t pool);
  /// Compute a lower-bound epoch with respect to the given map; does not
  /// modify this object (const).
  epoch_t get_lower_bound(const OSDMap& latest) const;

  /// Emit the current state through the formatter @p f.
  void dump(Formatter *f) const;
};
 121
 122
 123 struct osdmap_manifest_t {
 124   // all the maps we have pinned -- i.e., won't be removed unless
 125   // they are inside a trim interval.
 126   std::set<version_t> pinned;
 127
 128   osdmap_manifest_t() {}
 129
 130   version_t get_last_pinned() const
 131   {
 132     auto it = pinned.crbegin();
 133     if (it == pinned.crend()) {
 134       return 0;
 135     }
 136     return *it;
 137   }
 138
 139   version_t get_first_pinned() const
 140   {
 141     auto it = pinned.cbegin();
 142     if (it == pinned.cend()) {
 143       return 0;
 144     }
 145     return *it;
 146   }
 147
 148   bool is_pinned(version_t v) const
 149   {
 150     return pinned.find(v) != pinned.end();
 151   }
 152
 153   void pin(version_t v)
 154   {
 155     pinned.insert(v);
 156   }
 157
 158   version_t get_lower_closest_pinned(version_t v) const {
 159     auto p = pinned.lower_bound(v);
 160     if (p == pinned.cend()) {
 161       return 0;
 162     } else if (*p > v) {
 163       if (p == pinned.cbegin()) {
 164         return 0;
 165       }
 166       --p;
 167     }
 168     return *p;
 169   }
 170
 171   void encode(ceph::buffer::list& bl) const
 172   {
 173     ENCODE_START(1, 1, bl);
 174     encode(pinned, bl);
 175     ENCODE_FINISH(bl);
 176   }
 177
 178   void decode(ceph::buffer::list::const_iterator& bl)
 179   {
 180     DECODE_START(1, bl);
 181     decode(pinned, bl);
 182     DECODE_FINISH(bl);
 183   }
 184
 185   void decode(ceph::buffer::list& bl) {
 186     auto p = bl.cbegin();
 187     decode(p);
 188   }
 189
 190   void dump(ceph::Formatter *f) {
 191     f->dump_unsigned("first_pinned", get_first_pinned());
 192     f->dump_unsigned("last_pinned", get_last_pinned());
 193     f->open_array_section("pinned_maps");
 194     for (auto& i : pinned) {
 195       f->dump_unsigned("epoch", i);
 196     }
 197     f->close_section();
 198  }
 199 };
 200 WRITE_CLASS_ENCODER(osdmap_manifest_t);
 201
 202 class OSDMonitor : public PaxosService,
 203                    public md_config_obs_t {
 204   CephContext *cct;
 205
 206 public:
 207   OSDMap osdmap;
 208
 209   // config observer
 210   const char** get_tracked_conf_keys() const override;
 211   void handle_conf_change(const ConfigProxy& conf,
 212     const std::set<std::string> &changed) override;
 213   // [leader]
 214   OSDMap::Incremental pending_inc;
 215   std::map<int, ceph::buffer::list> pending_metadata;
 216   std::set<int>             pending_metadata_rm;
 217   std::map<int, failure_info_t> failure_info;
 218   std::map<int,utime_t>    down_pending_out;  // osd down -> out
 219   bool priority_convert = false;
 220   std::map<int64_t,std::set<snapid_t>> pending_pseudo_purged_snaps;
 221   std::shared_ptr<PriorityCache::PriCache> rocksdb_binned_kv_cache = nullptr;
 222   std::shared_ptr<PriorityCache::Manager> pcm = nullptr;
 223   ceph::mutex balancer_lock = ceph::make_mutex("OSDMonitor::balancer_lock");
 224
 225   std::map<int,double> osd_weight;
 226   std::set<int32_t> filestore_osds;
 227
 228   using osdmap_key_t = std::pair<version_t, uint64_t>;
 229   using osdmap_cache_t = SimpleLRU<osdmap_key_t,
 230                                    ceph::buffer::list,
 231                                    std::less<osdmap_key_t>,
 232                                    boost::hash<osdmap_key_t>>;
 233   osdmap_cache_t inc_osd_cache;
 234   osdmap_cache_t full_osd_cache;
 235
 236   bool has_osdmap_manifest;
 237   osdmap_manifest_t osdmap_manifest;
 238
 239   bool check_failures(utime_t now);
 240   bool check_failure(utime_t now, int target_osd, failure_info_t& fi);
 241   utime_t get_grace_time(utime_t now, int target_osd, failure_info_t& fi) const;
 242   bool is_failure_stale(utime_t now, failure_info_t& fi) const;
 243   void force_failure(int target_osd, int by);
 244
 245   bool _have_pending_crush();
 246   CrushWrapper &_get_stable_crush();
 247   CrushWrapper _get_pending_crush();
 248
  /// Three-way selector used as the fast_read argument of
  /// prepare_new_pool().
  enum FastReadType {
    FAST_READ_OFF,
    FAST_READ_ON,
    // NOTE(review): presumably "no explicit choice" -- confirm at the use site
    FAST_READ_DEFAULT
  };
 254
 255   struct CleanUpmapJob : public ParallelPGMapper::Job {
 256     CephContext *cct;
 257     const OSDMap& osdmap;
 258     OSDMap::Incremental& pending_inc;
 259     // lock to protect pending_inc form changing
 260     // when checking is done
 261     ceph::mutex pending_inc_lock =
 262       ceph::make_mutex("CleanUpmapJob::pending_inc_lock");
 263
 264     CleanUpmapJob(CephContext *cct, const OSDMap& om, OSDMap::Incremental& pi)
 265       : ParallelPGMapper::Job(&om),
 266         cct(cct),
 267         osdmap(om),
 268         pending_inc(pi) {}
 269
 270     void process(const std::vector<pg_t>& to_check) override {
 271       std::vector<pg_t> to_cancel;
 272       std::map<pg_t, mempool::osdmap::vector<std::pair<int,int>>> to_remap;
 273       osdmap.check_pg_upmaps(cct, to_check, &to_cancel, &to_remap);
 274       // don't bother taking lock if nothing changes
 275       if (!to_cancel.empty() || !to_remap.empty()) {
 276         std::lock_guard l(pending_inc_lock);
 277         osdmap.clean_pg_upmaps(cct, &pending_inc, to_cancel, to_remap);
 278       }
 279     }
 280
 281     void process(int64_t poolid, unsigned ps_begin, unsigned ps_end) override {}
 282     void complete() override {}
 283   }; // public as this will need to be accessible from TestTestOSDMap.cc
 284
 285   // svc
 286 public:
 287   void create_initial() override;
 288   void get_store_prefixes(std::set<std::string>& s) const override;
 289
 290 private:
 291   void update_from_paxos(bool *need_bootstrap) override;
 292   void create_pending() override;  // prepare a new pending
 293   void encode_pending(MonitorDBStore::TransactionRef t) override;
 294   void on_active() override;
 295   void on_restart() override;
 296   void on_shutdown() override;
 297
 298   /* osdmap full map prune */
 299   void load_osdmap_manifest();
 300   bool should_prune() const;
 301   void _prune_update_trimmed(
 302       MonitorDBStore::TransactionRef tx,
 303       version_t first);
 304   void prune_init(osdmap_manifest_t& manifest);
 305   bool _prune_sanitize_options() const;
 306   bool is_prune_enabled() const;
 307   bool is_prune_supported() const;
 308   bool do_prune(MonitorDBStore::TransactionRef tx);
 309
 310   // Priority cache control
 311   uint32_t mon_osd_cache_size = 0;  ///< Number of cached OSDMaps
 312   uint64_t rocksdb_cache_size = 0;  ///< Cache for kv Db
 313   double cache_kv_ratio = 0;        ///< Cache ratio dedicated to kv
 314   double cache_inc_ratio = 0;       ///< Cache ratio dedicated to inc
 315   double cache_full_ratio = 0;      ///< Cache ratio dedicated to full
 316   uint64_t mon_memory_base = 0;     ///< Mon base memory for cache autotuning
 317   double mon_memory_fragmentation = 0; ///< Expected memory fragmentation
 318   uint64_t mon_memory_target = 0;   ///< Mon target memory for cache autotuning
 319   uint64_t mon_memory_min = 0;      ///< Min memory to cache osdmaps
 320   bool mon_memory_autotune = false; ///< Cache auto tune setting
 321   int register_cache_with_pcm();
 322   int _set_cache_sizes();
 323   int _set_cache_ratios();
 324   void _set_new_cache_sizes();
 325   void _set_cache_autotuning();
 326   int _update_mon_cache_settings();
 327
 328   friend struct OSDMemCache;
 329   friend struct IncCache;
 330   friend struct FullCache;
 331
 332   /**
 333    * we haven't delegated full version stashing to paxosservice for some time
 334    * now, making this function useless in current context.
 335    */
 336   void encode_full(MonitorDBStore::TransactionRef t) override { }
 337   /**
 338    * do not let paxosservice periodically stash full osdmaps, or we will break our
 339    * locally-managed full maps.  (update_from_paxos loads the latest and writes them
 340    * out going forward from there, but if we just synced that may mean we skip some.)
 341    */
 342   bool should_stash_full() override {
 343     return false;
 344   }
 345
 346   /**
 347    * hook into trim to include the oldest full map in the trim transaction
 348    *
 349    * This ensures that anyone post-sync will have enough to rebuild their
 350    * full osdmaps.
 351    */
 352   void encode_trim_extra(MonitorDBStore::TransactionRef tx, version_t first) override;
 353
 354   void update_msgr_features();
 355   /**
 356    * check if the cluster supports the features required by the
 357    * given crush map. Outputs the daemons which don't support it
 358    * to the stringstream.
 359    *
 360    * @returns true if the map is passable, false otherwise
 361    */
 362   bool validate_crush_against_features(const CrushWrapper *newcrush,
 363                                        std::stringstream &ss);
 364   void check_osdmap_subs();
 365   void share_map_with_random_osd();
 366
 367   ceph::mutex prime_pg_temp_lock =
 368     ceph::make_mutex("OSDMonitor::prime_pg_temp_lock");
 369   struct PrimeTempJob : public ParallelPGMapper::Job {
 370     OSDMonitor *osdmon;
 371     PrimeTempJob(const OSDMap& om, OSDMonitor *m)
 372       : ParallelPGMapper::Job(&om), osdmon(m) {}
 373     void process(int64_t pool, unsigned ps_begin, unsigned ps_end) override {
 374       for (unsigned ps = ps_begin; ps < ps_end; ++ps) {
 375         pg_t pgid(ps, pool);
 376         osdmon->prime_pg_temp(*osdmap, pgid);
 377       }
 378     }
 379     void process(const std::vector<pg_t>& pgs) override {}
 380     void complete() override {}
 381   };
 382   void maybe_prime_pg_temp();
 383   void prime_pg_temp(const OSDMap& next, pg_t pgid);
 384
 385   ParallelPGMapper mapper;                        ///< for background pg work
 386   OSDMapMapping mapping;                          ///< pg <-> osd mappings
 387   std::unique_ptr<ParallelPGMapper::Job> mapping_job;  ///< background mapping job
 388   void start_mapping();
 389
 390   void update_logger();
 391
 392   void handle_query(PaxosServiceMessage *m);
 393   bool preprocess_query(MonOpRequestRef op) override;  // true if processed.
 394   bool prepare_update(MonOpRequestRef op) override;
 395   bool should_propose(double &delay) override;
 396
 397   version_t get_trim_to() const override;
 398
 399   bool can_mark_down(int o);
 400   bool can_mark_up(int o);
 401   bool can_mark_out(int o);
 402   bool can_mark_in(int o);
 403
 404   // ...
 405   MOSDMap *build_latest_full(uint64_t features);
 406   MOSDMap *build_incremental(epoch_t first, epoch_t last, uint64_t features);
 407   void send_full(MonOpRequestRef op);
 408   void send_incremental(MonOpRequestRef op, epoch_t first);
 409 public:
 410   /**
 411    * Make sure the existing (up) OSDs support the given features
   * @return 0 on success, or an error code if any OSDs are missing features.
   * @param ss Filled in with an explanation of failure, if any
 414    */
 415   int check_cluster_features(uint64_t features, std::stringstream &ss);
 416   // @param req an optional op request, if the osdmaps are replies to it. so
 417   //            @c Monitor::send_reply() can mark_event with it.
 418   void send_incremental(epoch_t first, MonSession *session, bool onetime,
 419                         MonOpRequestRef req = MonOpRequestRef());
 420
 421 private:
 422   void print_utilization(std::ostream &out, ceph::Formatter *f, bool tree) const;
 423
 424   bool check_source(MonOpRequestRef op, uuid_d fsid);
 425
 426   bool preprocess_get_osdmap(MonOpRequestRef op);
 427
 428   bool preprocess_mark_me_down(MonOpRequestRef op);
 429
 430   friend class C_AckMarkedDown;
 431   bool preprocess_failure(MonOpRequestRef op);
 432   bool prepare_failure(MonOpRequestRef op);
 433   bool prepare_mark_me_down(MonOpRequestRef op);
 434   void process_failures();
 435   void take_all_failures(std::list<MonOpRequestRef>& ls);
 436
 437   bool preprocess_mark_me_dead(MonOpRequestRef op);
 438   bool prepare_mark_me_dead(MonOpRequestRef op);
 439
 440   bool preprocess_full(MonOpRequestRef op);
 441   bool prepare_full(MonOpRequestRef op);
 442
 443   bool preprocess_boot(MonOpRequestRef op);
 444   bool prepare_boot(MonOpRequestRef op);
 445   void _booted(MonOpRequestRef op, bool logit);
 446
 447   void update_up_thru(int from, epoch_t up_thru);
 448   bool preprocess_alive(MonOpRequestRef op);
 449   bool prepare_alive(MonOpRequestRef op);
 450   void _reply_map(MonOpRequestRef op, epoch_t e);
 451
 452   bool preprocess_pgtemp(MonOpRequestRef op);
 453   bool prepare_pgtemp(MonOpRequestRef op);
 454
 455   bool preprocess_pg_created(MonOpRequestRef op);
 456   bool prepare_pg_created(MonOpRequestRef op);
 457
 458   bool preprocess_pg_ready_to_merge(MonOpRequestRef op);
 459   bool prepare_pg_ready_to_merge(MonOpRequestRef op);
 460
 461   int _check_remove_pool(int64_t pool_id, const pg_pool_t &pool, std::ostream *ss);
 462   bool _check_become_tier(
 463       int64_t tier_pool_id, const pg_pool_t *tier_pool,
 464       int64_t base_pool_id, const pg_pool_t *base_pool,
 465       int *err, std::ostream *ss) const;
 466   bool _check_remove_tier(
 467       int64_t base_pool_id, const pg_pool_t *base_pool, const pg_pool_t *tier_pool,
 468       int *err, std::ostream *ss) const;
 469
 470   int _prepare_remove_pool(int64_t pool, std::ostream *ss, bool no_fake);
 471   int _prepare_rename_pool(int64_t pool, std::string newname);
 472
 473   bool enforce_pool_op_caps(MonOpRequestRef op);
 474   bool preprocess_pool_op (MonOpRequestRef op);
 475   bool preprocess_pool_op_create (MonOpRequestRef op);
 476   bool prepare_pool_op (MonOpRequestRef op);
 477   bool prepare_pool_op_create (MonOpRequestRef op);
 478   bool prepare_pool_op_delete(MonOpRequestRef op);
 479   int crush_rename_bucket(const std::string& srcname,
 480                           const std::string& dstname,
 481                           std::ostream *ss);
 482   void check_legacy_ec_plugin(const std::string& plugin,
 483                               const std::string& profile) const;
 484   int normalize_profile(const std::string& profilename,
 485                         ceph::ErasureCodeProfile &profile,
 486                         bool force,
 487                         std::ostream *ss);
 488   int crush_rule_create_erasure(const std::string &name,
 489                                 const std::string &profile,
 490                                 int *rule,
 491                                 std::ostream *ss);
 492   int get_crush_rule(const std::string &rule_name,
 493                      int *crush_rule,
 494                      std::ostream *ss);
 495   int get_erasure_code(const std::string &erasure_code_profile,
 496                        ceph::ErasureCodeInterfaceRef *erasure_code,
 497                        std::ostream *ss) const;
 498   int prepare_pool_crush_rule(const unsigned pool_type,
 499                               const std::string &erasure_code_profile,
 500                               const std::string &rule_name,
 501                               int *crush_rule,
 502                               std::ostream *ss);
 503   bool erasure_code_profile_in_use(
 504     const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
 505     const std::string &profile,
 506     std::ostream *ss);
 507   int parse_erasure_code_profile(const std::vector<std::string> &erasure_code_profile,
 508                                  std::map<std::string,std::string> *erasure_code_profile_map,
 509                                  std::ostream *ss);
 510   int prepare_pool_size(const unsigned pool_type,
 511                         const std::string &erasure_code_profile,
 512                         uint8_t repl_size,
 513                         unsigned *size, unsigned *min_size,
 514                         std::ostream *ss);
 515   int prepare_pool_stripe_width(const unsigned pool_type,
 516                                 const std::string &erasure_code_profile,
 517                                 unsigned *stripe_width,
 518                                 std::ostream *ss);
 519   uint32_t get_osd_num_by_crush(int crush_rule);
 520   int check_pg_num(int64_t pool, int pg_num, int size, int crush_rule, std::ostream* ss);
 521   int prepare_new_pool(std::string& name,
 522                        int crush_rule,
 523                        const std::string &crush_rule_name,
 524                        unsigned pg_num, unsigned pgp_num,
 525                        unsigned pg_num_min,
 526                        unsigned pg_num_max,
 527                        uint64_t repl_size,
 528                        const uint64_t target_size_bytes,
 529                        const float target_size_ratio,
 530                        const std::string &erasure_code_profile,
 531                        const unsigned pool_type,
 532                        const uint64_t expected_num_objects,
 533                        FastReadType fast_read,
 534                        std::string pg_autoscale_mode,
 535                        bool bulk,
 536                        bool crimson,
 537                        std::ostream *ss);
 538   int prepare_new_pool(MonOpRequestRef op);
 539
 540   void set_pool_flags(int64_t pool_id, uint64_t flags);
 541   void clear_pool_flags(int64_t pool_id, uint64_t flags);
 542   bool update_pools_status();
 543
 544   bool _is_removed_snap(int64_t pool_id, snapid_t snapid);
 545   bool _is_pending_removed_snap(int64_t pool_id, snapid_t snapid);
 546
 547   std::string make_purged_snap_epoch_key(epoch_t epoch);
 548   std::string make_purged_snap_key(int64_t pool, snapid_t snap);
 549   std::string make_purged_snap_key_value(int64_t pool, snapid_t snap, snapid_t num,
 550                                     epoch_t epoch, ceph::buffer::list *v);
 551
 552   bool try_prune_purged_snaps();
 553   int lookup_purged_snap(int64_t pool, snapid_t snap,
 554                          snapid_t *begin, snapid_t *end);
 555
 556   void insert_purged_snap_update(
 557     int64_t pool,
 558     snapid_t start, snapid_t end,
 559     epoch_t epoch,
 560     MonitorDBStore::TransactionRef t);
 561
 562   bool prepare_set_flag(MonOpRequestRef op, int flag);
 563   bool prepare_unset_flag(MonOpRequestRef op, int flag);
 564
 565   void _pool_op_reply(MonOpRequestRef op,
 566                       int ret, epoch_t epoch, ceph::buffer::list *blp=NULL);
 567
 568   struct C_Booted : public C_MonOp {
 569     OSDMonitor *cmon;
 570     bool logit;
 571     C_Booted(OSDMonitor *cm, MonOpRequestRef op_, bool l=true) :
 572       C_MonOp(op_), cmon(cm), logit(l) {}
 573     void _finish(int r) override {
 574       if (r >= 0)
 575         cmon->_booted(op, logit);
 576       else if (r == -ECANCELED)
 577         return;
 578       else if (r == -EAGAIN)
 579         cmon->dispatch(op);
 580       else
 581         ceph_abort_msg("bad C_Booted return value");
 582     }
 583   };
 584
 585   struct C_ReplyMap : public C_MonOp {
 586     OSDMonitor *osdmon;
 587     epoch_t e;
 588     C_ReplyMap(OSDMonitor *o, MonOpRequestRef op_, epoch_t ee)
 589       : C_MonOp(op_), osdmon(o), e(ee) {}
 590     void _finish(int r) override {
 591       if (r >= 0)
 592         osdmon->_reply_map(op, e);
 593       else if (r == -ECANCELED)
 594         return;
 595       else if (r == -EAGAIN)
 596         osdmon->dispatch(op);
 597       else
 598         ceph_abort_msg("bad C_ReplyMap return value");
 599     }
 600   };
 601   struct C_PoolOp : public C_MonOp {
 602     OSDMonitor *osdmon;
 603     int replyCode;
 604     int epoch;
 605     ceph::buffer::list reply_data;
 606     C_PoolOp(OSDMonitor * osd, MonOpRequestRef op_, int rc, int e, ceph::buffer::list *rd=NULL) :
 607       C_MonOp(op_), osdmon(osd), replyCode(rc), epoch(e) {
 608       if (rd)
 609         reply_data = *rd;
 610     }
 611     void _finish(int r) override {
 612       if (r >= 0)
 613         osdmon->_pool_op_reply(op, replyCode, epoch, &reply_data);
 614       else if (r == -ECANCELED)
 615         return;
 616       else if (r == -EAGAIN)
 617         osdmon->dispatch(op);
 618       else
 619         ceph_abort_msg("bad C_PoolOp return value");
 620     }
 621   };
 622
 623   bool preprocess_remove_snaps(MonOpRequestRef op);
 624   bool prepare_remove_snaps(MonOpRequestRef op);
 625
 626   bool preprocess_get_purged_snaps(MonOpRequestRef op);
 627
 628   int load_metadata(int osd, std::map<std::string, std::string>& m,
 629                     std::ostream *err);
 630   void count_metadata(const std::string& field, ceph::Formatter *f);
 631
 632   void reencode_incremental_map(ceph::buffer::list& bl, uint64_t features);
 633   void reencode_full_map(ceph::buffer::list& bl, uint64_t features);
 634 public:
 635   void count_metadata(const std::string& field, std::map<std::string,int> *out);
 636   void get_versions(std::map<std::string, std::list<std::string>> &versions);
 637   void get_filestore_osd_list();
 638   void check_for_filestore_osds(health_check_map_t *checks);
 639 protected:
 640   int get_osd_objectstore_type(int osd, std::string *type);
 641   bool is_pool_currently_all_bluestore(int64_t pool_id, const pg_pool_t &pool,
 642                                        std::ostream *err);
 643
 644   // when we last received PG stats from each osd and the osd's osd_beacon_report_interval
 645   std::map<int, std::pair<utime_t, int>> last_osd_report;
 646   // TODO: use last_osd_report to store the osd report epochs, once we don't
 647   //       need to upgrade from pre-luminous releases.
 648   std::map<int,epoch_t> osd_epochs;
 649   LastEpochClean last_epoch_clean;
 650   bool preprocess_beacon(MonOpRequestRef op);
 651   bool prepare_beacon(MonOpRequestRef op);
 652   epoch_t get_min_last_epoch_clean() const;
 653
 654   friend class C_UpdateCreatingPGs;
 655   std::map<int, std::map<epoch_t, std::set<spg_t>>> creating_pgs_by_osd_epoch;
 656   std::vector<pg_t> pending_created_pgs;
 657   // the epoch when the pg mapping was calculated
 658   epoch_t creating_pgs_epoch = 0;
 659   creating_pgs_t creating_pgs;
 660   mutable std::mutex creating_pgs_lock;
 661
 662   creating_pgs_t update_pending_pgs(const OSDMap::Incremental& inc,
 663                                     const OSDMap& nextmap);
 664   unsigned scan_for_creating_pgs(
 665     const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
 666     const mempool::osdmap::set<int64_t>& removed_pools,
 667     utime_t modified,
 668     creating_pgs_t* creating_pgs) const;
 669   std::pair<int32_t, pg_t> get_parent_pg(pg_t pgid) const;
 670   void update_creating_pgs();
 671   void check_pg_creates_subs();
 672   epoch_t send_pg_creates(int osd, Connection *con, epoch_t next) const;
 673
 674   int32_t _allocate_osd_id(int32_t* existing_id);
 675
 676   int get_grace_interval_threshold();
 677   bool grace_interval_threshold_exceeded(int last_failed);
 678   void set_default_laggy_params(int target_osd);
 679
 680   int parse_pgid(const cmdmap_t& cmdmap, std::stringstream &ss,
 681                  pg_t &pgid, std::optional<std::string> pgidstr = std::nullopt);
 682 public:
 683   OSDMonitor(CephContext *cct, Monitor &mn, Paxos &p, const std::string& service_name);
 684
 685   void tick() override;  // check state, take actions
 686
 687   bool preprocess_command(MonOpRequestRef op);
 688   bool prepare_command(MonOpRequestRef op);
 689   bool prepare_command_impl(MonOpRequestRef op, const cmdmap_t& cmdmap);
 690
 691   int validate_osd_create(
 692       const int32_t id,
 693       const uuid_d& uuid,
 694       const bool check_osd_exists,
 695       int32_t* existing_id,
 696       std::stringstream& ss);
 697   int prepare_command_osd_create(
 698       const int32_t id,
 699       const uuid_d& uuid,
 700       int32_t* existing_id,
 701       std::stringstream& ss);
 702   void do_osd_create(const int32_t id, const uuid_d& uuid,
 703                      const std::string& device_class,
 704                      int32_t* new_id);
 705   int prepare_command_osd_purge(int32_t id, std::stringstream& ss);
 706   int prepare_command_osd_destroy(int32_t id, std::stringstream& ss);
 707   int _prepare_command_osd_crush_remove(
 708       CrushWrapper &newcrush,
 709       int32_t id,
 710       int32_t ancestor,
 711       bool has_ancestor,
 712       bool unlink_only);
 713   void do_osd_crush_remove(CrushWrapper& newcrush);
 714   int prepare_command_osd_crush_remove(
 715       CrushWrapper &newcrush,
 716       int32_t id,
 717       int32_t ancestor,
 718       bool has_ancestor,
 719       bool unlink_only);
 720   int prepare_command_osd_remove(int32_t id);
 721   int prepare_command_osd_new(
 722       MonOpRequestRef op,
 723       const cmdmap_t& cmdmap,
 724       const std::map<std::string,std::string>& secrets,
 725       std::stringstream &ss,
 726       ceph::Formatter *f);
 727
 728   int prepare_command_pool_set(const cmdmap_t& cmdmap,
 729                                std::stringstream& ss);
 730
 731   int prepare_command_pool_application(const std::string &prefix,
 732                                        const cmdmap_t& cmdmap,
 733                                        std::stringstream& ss);
 734   int preprocess_command_pool_application(const std::string &prefix,
 735                                           const cmdmap_t& cmdmap,
 736                                           std::stringstream& ss,
 737                                           bool *modified);
 738   int _command_pool_application(const std::string &prefix,
 739                                 const cmdmap_t& cmdmap,
 740                                 std::stringstream& ss,
 741                                 bool *modified,
 742                                 bool preparing);
 743
 744   bool handle_osd_timeouts(const utime_t &now,
 745                            std::map<int, std::pair<utime_t, int>> &last_osd_report);
 746
 747   void send_latest(MonOpRequestRef op, epoch_t start=0);
 748   void send_latest_now_nodelete(MonOpRequestRef op, epoch_t start=0) {
 749     op->mark_osdmon_event(__func__);
 750     send_incremental(op, start);
 751   }
 752
 753   int get_version(version_t ver, ceph::buffer::list& bl) override;
 754   int get_version(version_t ver, uint64_t feature, ceph::buffer::list& bl);
 755
 756   int get_version_full(version_t ver, uint64_t feature, ceph::buffer::list& bl);
 757   int get_version_full(version_t ver, ceph::buffer::list& bl) override;
 758   int get_inc(version_t ver, OSDMap::Incremental& inc);
 759   int get_full_from_pinned_map(version_t ver, ceph::buffer::list& bl);
 760
 761   epoch_t blocklist(const entity_addrvec_t& av, utime_t until);
 762   epoch_t blocklist(entity_addr_t a, utime_t until);
 763
 764   void dump_info(ceph::Formatter *f);
 765   int dump_osd_metadata(int osd, ceph::Formatter *f, std::ostream *err);
 766   void print_nodes(ceph::Formatter *f);
 767
 768   void check_osdmap_sub(Subscription *sub);
 769   void check_pg_creates_sub(Subscription *sub);
 770
 771   void do_application_enable(int64_t pool_id, const std::string &app_name,
 772                              const std::string &app_key="",
 773                              const std::string &app_value="",
 774                              bool force=false);
 775   void do_set_pool_opt(int64_t pool_id, pool_opts_t::key_t opt,
 776                        pool_opts_t::value_t);
 777
 778   void add_flag(int flag) {
 779     if (!(osdmap.flags & flag)) {
 780       if (pending_inc.new_flags < 0)
 781         pending_inc.new_flags = osdmap.flags;
 782       pending_inc.new_flags |= flag;
 783     }
 784   }
 785
 786   void remove_flag(int flag) {
 787     if(osdmap.flags & flag) {
 788       if (pending_inc.new_flags < 0)
 789         pending_inc.new_flags = osdmap.flags;
 790       pending_inc.new_flags &= ~flag;
 791     }
 792   }
 793   void convert_pool_priorities(void);
 794   /**
 795    * Find the pools which are requested to be put into stretch mode,
 796    * validate that they are allowed to be in stretch mode (eg, are replicated)
 797    * and place copies of them in the pools set.
 798    * This does not make any changes to the pools or state; it's just
 799    * a safety-check-and-collect function.
        *
        * No value is returned. Like try_enable_stretch_mode() this takes the
        * ss / okay / errcode trio, presumably with the same meaning (error
        * text, validation verdict, -errno) -- TODO confirm.
        * @param pools: output set, holding pg_pool_t pointers for the pools
        *   that were collected
        * @param new_crush_rule: name of a crush rule; presumably the rule the
        *   pools are meant to be switched to -- TODO confirm
 800    */
 801   void try_enable_stretch_mode_pools(std::stringstream& ss, bool *okay,
 802                                      int *errcode,
 803                                      std::set<pg_pool_t*>* pools,
 804                                      const std::string& new_crush_rule);
 805   /**
 806    * Check validity of inputs and OSD/CRUSH state to
 807    * engage stretch mode. Designed to be used with
 808    * MonmapMonitor::try_enable_stretch_mode() where we call both twice,
 809    * first with commit=false to validate.
        * Nothing is returned directly: the verdict comes back through
        * okay and errcode, and any error text through ss.
 810    * @param ss: a stringstream to write errors into
 811    * @param okay: Filled to true if okay, false if validation fails
 812    * @param errcode: filled with -errno if there's a problem
 813    * @param commit: true if we should commit the change, false if just testing
 814    * @param dividing_bucket: the bucket type (eg 'dc') that divides the cluster
 815    * @param bucket_count: The number of buckets required in peering.
 816    *  Currently must be 2.
 817    * @param pools: The pg_pool_ts which are being set to stretch mode (obtained
 818    *   from try_enable_stretch_mode_pools()).
 819    * @param new_crush_rule: The crush rule to set the pools to.
 820    */
 821   void try_enable_stretch_mode(std::stringstream& ss, bool *okay,
 822                                int *errcode, bool commit,
 823                                const std::string& dividing_bucket,
 824                                uint32_t bucket_count,
 825                                const std::set<pg_pool_t*>& pools,
 826                                const std::string& new_crush_rule);
 827   /**
 828    * Check the input dead_buckets mapping (buckets->dead monitors) to see
 829    * if the OSDs are also down. If so, fill in really_down_buckets and
 830    * really_down_mons and return true; else return false.
        * @param dead_buckets: map keyed by string (the bucket) whose value is
        *   the set of dead monitors, as strings, for that bucket
        * @param really_down_buckets: output; note it is a set of ints while
        *   dead_buckets is keyed by strings (presumably crush bucket ids
        *   versus names -- TODO confirm)
        * @param really_down_mons: output set of monitor strings
        * @return true if the OSDs are down as well (outputs filled in),
        *   false otherwise
 831    */
 832   bool check_for_dead_crush_zones(const std::map<std::string,std::set<std::string>>& dead_buckets,
 833                                   std::set<int> *really_down_buckets,
 834                                   std::set<std::string> *really_down_mons);
 835   /**
 836    * Set degraded mode in the OSDMap, adding the given dead buckets to the dead set
 837    * and using the live_zones (should presently be size 1).
        * @param dead_buckets: the buckets to add to the dead set, as ints
        *   (presumably the really_down_buckets produced by
        *   check_for_dead_crush_zones() -- TODO confirm)
        * @param live_zones: the zones still alive; expected to hold exactly
        *   one entry for now
 838    */
 839   void trigger_degraded_stretch_mode(const std::set<int>& dead_buckets,
 840                                      const std::set<std::string>& live_zones);
 841   /**
 842    * This exists only to maintain stretch_recovery_triggered (the private
        * member declared at the bottom of this class).
 843    */
 844   void set_degraded_stretch_mode();
 845   /**
 846    * Set recovery stretch mode in the OSDMap, resetting pool size back to normal.
        * Takes no arguments and returns nothing.
 847    */
 848   void trigger_recovery_stretch_mode();
 849   /**
 850    * This exists only to maintain stretch_recovery_triggered (the private
        * member declared at the bottom of this class).
 851    */
 852   void set_recovery_stretch_mode();
 853   /**
 854    * This exists only to maintain stretch_recovery_triggered (the private
        * member declared at the bottom of this class).
 855    */
 856   void set_healthy_stretch_mode();
 857   /**
 858    * Tells the OSD there's a new pg digest, in case it's interested.
 859    * (It's interested when in recovering stretch mode.)
        * NOTE(review): "the OSD" presumably means this OSD monitor rather than
        * an OSD daemon, since this is a member of the monitor class -- confirm.
 860    */
 861   void notify_new_pg_digest();
 862   /**
 863    * Check if we can exit recovery stretch mode and go back to normal.
        * Returns nothing, so the outcome is not reported to the caller here.
 864    * @param force If true, we will force the exit through once it is legal,
 865    *   without regard to the reported PG status.
 866    */
 867   void try_end_recovery_stretch_mode(bool force);
 868   /**
 869    * Sets the osdmap and pg_pool_t values back to healthy stretch mode status.
        * Takes no arguments and returns nothing.
 870    */
 871   void trigger_healthy_stretch_mode();
 872   /**
 873    * Obtain the crush rule being used for stretch pools.
 874    * Note that right now this is heuristic and simply selects the
 875    * most-used rule on replicated stretch pools.
 876    * @return the crush rule ID, or a negative errno; callers must check
        *   for a negative result before using the value as a rule ID
 877    */
 878   int get_replicated_stretch_crush_rule();
 879 private:
 880   utime_t stretch_recovery_triggered; ///< time at which we committed a switch to recovery mode
 881 };
 882
 883 #endif