import 15.2.5

[ceph.git] / ceph / src / os / bluestore / BlueStore.h
diff --git a/ceph/src/os/bluestore/BlueStore.h b/ceph/src/os/bluestore/BlueStore.h

index 55787e1b2986bd61dcc129b3205ba38e690cb190..cac64a08d5df482c6c00ee07b5847d22ca839a1c 100644 (file)
--- a/ceph/src/os/bluestore/BlueStore.h
+++ b/ceph/src/os/bluestore/BlueStore.h
@@ -1050,13 +1050,9 @@ public:
    };
  
    struct OnodeSpace;
-  struct OnodeCacheShard;
    /// an in-memory object
    struct Onode {
      MEMPOOL_CLASS_HELPERS();
-    // Not persisted and updated on cache insertion/removal
-    OnodeCacheShard *s;
-    bool pinned = false; // Only to be used by the onode cache shard
  
      std::atomic_int nref;  ///< reference count
      Collection *c;
@@ -1065,11 +1061,15 @@ public:
      /// key under PREFIX_OBJ where we are stored
      mempool::bluestore_cache_other::string key;
  
-    boost::intrusive::list_member_hook<> lru_item, pin_item;
+    boost::intrusive::list_member_hook<> lru_item;
  
      bluestore_onode_t onode;  ///< metadata stored as value in kv store
      bool exists;              ///< true if object logically exists
-
+    bool cached;              ///< Onode is logically in the cache
+                              /// (it can be pinned and hence physically out
+                              /// of it at the moment though)
+    bool pinned;              ///< Onode is pinned
+                              /// (or should be pinned when cached)
      ExtentMap extent_map;
  
      // track txc's that have not been committed to kv store (and whose
@@ -1082,32 +1082,35 @@ public:
  
      Onode(Collection *c, const ghobject_t& o,
           const mempool::bluestore_cache_other::string& k)
-      : s(nullptr),
-        nref(0),
+      : nref(0),
         c(c),
         oid(o),
         key(k),
         exists(false),
+        cached(false),
+        pinned(false),
         extent_map(this) {
      }
      Onode(Collection* c, const ghobject_t& o,
-      const string& k)
-      : s(nullptr),
-      nref(0),
+      const std::string& k)
+      : nref(0),
        c(c),
        oid(o),
        key(k),
        exists(false),
+      cached(false),
+      pinned(false),
        extent_map(this) {
      }
      Onode(Collection* c, const ghobject_t& o,
        const char* k)
-      : s(nullptr),
-      nref(0),
+      : nref(0),
        c(c),
        oid(o),
        key(k),
        exists(false),
+      cached(false),
+      pinned(false),
        extent_map(this) {
      }
  
@@ -1120,19 +1123,18 @@ public:
      void dump(Formatter* f) const;
  
      void flush();
-    void get() {
-      if (++nref == 2 && s != nullptr) {
-        s->pin(*this);
-      }
+    void get();
+    void put();
+
+    inline bool put_cache() {
+      ceph_assert(!cached);
+      cached = true;
+      return !pinned;
      }
-    void put() {
-      int n = --nref;
-      if (n == 1 && s != nullptr) {
-        s->unpin(*this);
-      }
-      if (n == 0) {
-       delete this;
-      }
+    inline bool pop_cache() {
+      ceph_assert(cached);
+      cached = false;
+      return !pinned;
      }
  
      const string& get_omap_prefix();
@@ -1199,26 +1201,32 @@ public:
      std::atomic<uint64_t> num_pinned = {0};
  
      std::array<std::pair<ghobject_t, mono_clock::time_point>, 64> dumped_onodes;
+
+    virtual void _pin(Onode* o) = 0;
+    virtual void _unpin(Onode* o) = 0;
+
    public:
      OnodeCacheShard(CephContext* cct) : CacheShard(cct) {}
      static OnodeCacheShard *create(CephContext* cct, string type,
                                     PerfCounters *logger);
-    virtual void _add(OnodeRef& o, int level) = 0;
-    virtual void _rm(OnodeRef& o) = 0;
-    virtual void _touch(OnodeRef& o) = 0;
-    virtual void _pin(Onode& o) = 0;
-    virtual void _unpin(Onode& o) = 0;
+    virtual void _add(Onode* o, int level) = 0;
+    virtual void _rm(Onode* o) = 0;
  
-    void pin(Onode& o) {
+    void pin(Onode* o, std::function<bool ()> validator) {
        std::lock_guard l(lock);
-      _pin(o);
+      if (validator()) {
+        _pin(o);
+      }
      }
  
-    void unpin(Onode& o) {
+    void unpin(Onode* o, std::function<bool()> validator) {
        std::lock_guard l(lock);
-      _unpin(o);
+      if (validator()) {
+        _unpin(o);
+      }
      }
  
+    virtual void move_pinned(OnodeCacheShard *to, Onode *o) = 0;
      virtual void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) = 0;
      bool empty() {
        return _get_num() == 0;
@@ -1279,17 +1287,16 @@ public:
  
      friend class Collection; // for split_cache()
  
+    friend struct LruOnodeCacheShard;
+    void _remove(const ghobject_t& oid);
    public:
      OnodeSpace(OnodeCacheShard *c) : cache(c) {}
      ~OnodeSpace() {
        clear();
      }
  
-    OnodeRef add(const ghobject_t& oid, OnodeRef o);
+    OnodeRef add(const ghobject_t& oid, OnodeRef& o);
      OnodeRef lookup(const ghobject_t& o);
-    void remove(const ghobject_t& oid) {
-      onode_map.erase(oid);
-    }
      void rename(OnodeRef& o, const ghobject_t& old_oid,
                 const ghobject_t& new_oid,
                 const mempool::bluestore_cache_other::string& new_okey);
@@ -1326,6 +1333,9 @@ public:
      pool_opts_t pool_opts;
      ContextQueue *commit_queue;
  
+    OnodeCacheShard* get_onode_cache() const {
+      return onode_map.cache;
+    }
      OnodeRef get_onode(const ghobject_t& oid, bool create, bool is_createop=false);
  
      // the terminology is confusing here, sorry!
@@ -2558,7 +2568,7 @@ public:
  
    void get_db_statistics(Formatter *f) override;
    void generate_db_histogram(Formatter *f) override;
-  void _flush_cache();
+  void _shutdown_cache();
    int flush_cache(ostream *os = NULL) override;
    void dump_perf_counters(Formatter *f) override {
      f->open_object_section("perf_counters");
@@ -3582,7 +3592,8 @@ class RocksDBBlueFSVolumeSelector : public BlueFSVolumeSelector
    enum {
      // use 0/nullptr as unset indication
      LEVEL_FIRST = 1,
-    LEVEL_WAL = LEVEL_FIRST,
+    LEVEL_LOG = LEVEL_FIRST, // BlueFS log
+    LEVEL_WAL,
      LEVEL_DB,
      LEVEL_SLOW,
      LEVEL_MAX
@@ -3592,6 +3603,8 @@ class RocksDBBlueFSVolumeSelector : public BlueFSVolumeSelector
    typedef matrix_2d<uint64_t, BlueFS::MAX_BDEV + 1, LEVEL_MAX - LEVEL_FIRST + 1> per_level_per_dev_usage_t;
  
    per_level_per_dev_usage_t per_level_per_dev_usage;
+  // file count per level, add +1 to keep total file count
+  uint64_t per_level_files[LEVEL_MAX - LEVEL_FIRST + 1] = { 0 };
  
    // Note: maximum per-device totals below might be smaller than corresponding
    // perf counters by up to a single alloc unit (1M) due to superblock extent.
@@ -3617,6 +3630,7 @@ public:
      uint64_t reserved,
      bool new_pol)
    {
+    l_totals[LEVEL_LOG - LEVEL_FIRST] = 0; // not used at the moment
      l_totals[LEVEL_WAL - LEVEL_FIRST] = _wal_total;
      l_totals[LEVEL_DB - LEVEL_FIRST] = _db_total;
      l_totals[LEVEL_SLOW - LEVEL_FIRST] = _slow_total;
@@ -3651,9 +3665,8 @@ public:
      }
    }
  
-  void* get_hint_by_device(uint8_t dev) const override {
-    ceph_assert(dev == BlueFS::BDEV_WAL); // others aren't used atm
-    return  reinterpret_cast<void*>(LEVEL_WAL);
+  void* get_hint_for_log() const override {
+    return  reinterpret_cast<void*>(LEVEL_LOG);
    }
    void* get_hint_by_dir(const string& dirname) const override;
  
@@ -3687,6 +3700,8 @@ public:
          max = cur;
        }
      }
+    ++per_level_files[pos];
+    ++per_level_files[LEVEL_MAX - LEVEL_FIRST];
    }
    void sub_usage(void* hint, const bluefs_fnode_t& fnode) override {
      if (hint == nullptr)
@@ -3706,6 +3721,10 @@ public:
      auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
      ceph_assert(cur >= fnode.size);
      cur -= fnode.size;
+    ceph_assert(per_level_files[pos] > 0);
+    --per_level_files[pos];
+    ceph_assert(per_level_files[LEVEL_MAX - LEVEL_FIRST] > 0);
+    --per_level_files[LEVEL_MAX - LEVEL_FIRST];
    }
    void add_usage(void* hint, uint64_t fsize) override {
      if (hint == nullptr)