X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=ceph%2Fsrc%2Fos%2Fbluestore%2FBlueStore.h;h=d7a1980320ebb16151784804f71124919f802977;hb=d2e6a577eb19928d58b31d1b6e096ca0f03c4052;hp=1114e7b4f47cbfc2e4bf6a74eb6719cc971bf019;hpb=31f18b776d001752a193a7cec8bb49033c1a904c;p=ceph.git

diff --git a/ceph/src/os/bluestore/BlueStore.h b/ceph/src/os/bluestore/BlueStore.h
index 1114e7b4f..d7a198032 100644
--- a/ceph/src/os/bluestore/BlueStore.h
+++ b/ceph/src/os/bluestore/BlueStore.h
@@ -263,7 +263,19 @@ public:
       buffer_map[b->offset].reset(b);
       if (b->is_writing()) {
         b->data.reassign_to_mempool(mempool::mempool_bluestore_writing);
-        writing.push_back(*b);
+        if (writing.empty() || writing.rbegin()->seq <= b->seq) {
+          writing.push_back(*b);
+        } else {
+          auto it = writing.begin();
+          while (it->seq < b->seq) {
+            ++it;
+          }
+
+          assert(it->seq >= b->seq);
+          // note that this will insert b before it
+          // hence the order is maintained
+          writing.insert(it, *b);
+        }
       } else {
         b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
         cache->_add_buffer(b, level, near);
@@ -499,7 +511,7 @@ public:
              get_blob().can_split_at(blob_offset);
     }

-    bool try_reuse_blob(uint32_t min_alloc_size,
+    bool can_reuse_blob(uint32_t min_alloc_size,
                         uint32_t target_blob_size,
                         uint32_t b_offset,
                         uint32_t *length0);
@@ -512,10 +524,10 @@ public:
 #endif
     }

-    const bluestore_blob_t& get_blob() const {
+    inline const bluestore_blob_t& get_blob() const {
       return blob;
     }
-    bluestore_blob_t& dirty_blob() {
+    inline bluestore_blob_t& dirty_blob() {
 #ifdef CACHE_BLOB_BL
       blob_bl.clear();
 #endif
@@ -1544,8 +1556,6 @@ public:
     IOContext ioc;
     bool had_ios = false;  ///< true if we submitted IOs before our kv txn

-    CollectionRef first_collection;  ///< first referenced collection
-
     uint64_t seq = 0;
     utime_t start;
     utime_t last_stamp;
@@ -1831,7 +1841,7 @@ private:
   interval_set<uint64_t> bluefs_extents;  ///< block extents owned by bluefs
   interval_set<uint64_t> bluefs_extents_reclaiming; ///< currently reclaiming

-  std::mutex deferred_lock;
+  std::mutex deferred_lock, deferred_submit_lock;
   std::atomic<uint64_t> deferred_seq = {0};
   deferred_osr_queue_t deferred_queue;     ///< osr's with deferred io pending
   int deferred_queue_size = 0;             ///< num txc's queued across all osrs
@@ -1875,24 +1885,26 @@ private:
   size_t block_size_order = 0; ///< bits to shift to get block size

   uint64_t min_alloc_size = 0; ///< minimum allocation unit (power of 2)
-  std::atomic<uint64_t> deferred_batch_ops = {0}; ///< deferred batch size
-
   ///< bits for min_alloc_size
-  std::atomic<size_t> min_alloc_size_order = {0};
+  uint8_t min_alloc_size_order = 0;
   static_assert(std::numeric_limits<uint8_t>::max() >
                 std::numeric_limits<decltype(min_alloc_size)>::digits,
                 "not enough bits for min_alloc_size");

-  ///< size threshold for forced deferred writes
-  std::atomic<uint64_t> prefer_deferred_size = {0};
-
   ///< maximum allocation unit (power of 2)
   std::atomic<uint64_t> max_alloc_size = {0};

+  ///< number threshold for forced deferred writes
+  std::atomic<int> deferred_batch_ops = {0};
+
+  ///< size threshold for forced deferred writes
+  std::atomic<uint64_t> prefer_deferred_size = {0};
+
   ///< approx cost per io, in bytes
   std::atomic<uint64_t> throttle_cost_per_io = {0};

-  std::atomic<Compressor::CompressionMode> comp_mode = {Compressor::COMP_NONE}; ///< compression mode
+  std::atomic<Compressor::CompressionMode> comp_mode =
+    {Compressor::COMP_NONE}; ///< compression mode
   CompressorRef compressor;
   std::atomic<uint64_t> comp_min_blob_size = {0};
   std::atomic<uint64_t> comp_max_blob_size = {0};
@@ -1903,6 +1915,7 @@ private:
   uint64_t kv_throttle_costs = 0;

   // cache trim control
+  uint64_t cache_size = 0;      ///< total cache size
   float cache_meta_ratio = 0;   ///< cache ratio dedicated to metadata
   float cache_kv_ratio = 0;     ///< cache ratio dedicated to kv (e.g., rocksdb)
   float cache_data_ratio = 0;   ///< cache ratio dedicated to object data
@@ -1974,7 +1987,7 @@ private:

   int _open_super_meta();

-  void open_statfs();
+  void _open_statfs();

   int _reconcile_bluefs_freespace();
   int _balance_bluefs_freespace(PExtentVector *extents);
@@ -2022,12 +2035,8 @@ private:

   bluestore_deferred_op_t *_get_deferred_op(TransContext *txc, OnodeRef o);
   void _deferred_queue(TransContext *txc);
-  void deferred_try_submit() {
-    std::lock_guard<std::mutex> l(deferred_lock);
-    _deferred_try_submit();
-  }
-  void _deferred_try_submit();
-  void _deferred_submit(OpSequencer *osr);
+  void deferred_try_submit();
+  void _deferred_submit_unlock(OpSequencer *osr);
   void _deferred_aio_finish(OpSequencer *osr);
   int _deferred_replay();

@@ -2071,7 +2080,6 @@ private:

   void _apply_padding(uint64_t head_pad,
                       uint64_t tail_pad,
-                      bufferlist& bl,
                       bufferlist& padded);

   // -- ondisk version ---
@@ -2101,6 +2109,18 @@ public:
   bool allows_journal() override { return false; };

   bool is_rotational() override;
+  bool is_journal_rotational() override;
+
+  string get_default_device_class() override {
+    string device_class;
+    map<string, string> metadata;
+    collect_metadata(&metadata);
+    auto it = metadata.find("bluestore_bdev_type");
+    if (it != metadata.end()) {
+      device_class = it->second;
+    }
+    return device_class;
+  }

   static int get_block_device_fsid(CephContext* cct, const string& path,
                                    uuid_d *fsid);
@@ -2184,16 +2204,14 @@ public:
           uint64_t offset,
           size_t len,
           bufferlist& bl,
-          uint32_t op_flags = 0,
-          bool allow_eio = false) override;
+          uint32_t op_flags = 0) override;
   int read(
     CollectionHandle &c,
     const ghobject_t& oid,
     uint64_t offset,
     size_t len,
     bufferlist& bl,
-    uint32_t op_flags = 0,
-    bool allow_eio = false) override;
+    uint32_t op_flags = 0) override;
   int _do_read(
     Collection *c,
     OnodeRef o,
@@ -2338,8 +2356,8 @@ public:

   objectstore_perf_stat_t get_cur_stats() const {
     objectstore_perf_stat_t ret;
-    ret.os_commit_latency = os_commit_latency.avg();
-    ret.os_apply_latency = os_apply_latency.avg();
+    ret.os_commit_latency = os_commit_latency.current_avg();
+    ret.os_apply_latency = os_apply_latency.current_avg();
     return ret;
   }

@@ -2369,6 +2387,11 @@ public:
     RWLock::WLocker l(debug_read_error_lock);
     debug_mdata_error_objects.insert(o);
   }
+  void compact() override {
+    assert(db);
+    db->compact();
+  }
+
 private:
   bool _debug_data_eio(const ghobject_t& o) {
     if (!cct->_conf->bluestore_debug_inject_read_err) {
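
Note on the first hunk: BufferSpace::_add_buffer() no longer blindly appends to the writing list; it keeps the list sorted by buffer seq, appending in the common in-order case and otherwise scanning forward to the first entry with a larger seq. What follows is a minimal, self-contained sketch of that insertion discipline; a plain std::list<Buffer> stands in for BlueStore's intrusive writing list, and Buffer and add_writing() are illustrative stand-ins, not BlueStore types.

// Sketch of the seq-ordered insert added in BufferSpace::_add_buffer().
// Assumptions: Buffer and add_writing() are invented for illustration;
// the real code inserts intrusively and takes the cache lock.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <list>

struct Buffer {
  uint64_t seq;  // flush sequence number
  explicit Buffer(uint64_t s) : seq(s) {}
};

static void add_writing(std::list<Buffer>& writing, const Buffer& b) {
  // Fast path: list empty or b belongs at the tail (the common case,
  // since sequence numbers are normally issued in order).
  if (writing.empty() || writing.rbegin()->seq <= b.seq) {
    writing.push_back(b);
    return;
  }
  // Slow path: walk to the first element with a larger seq. The loop
  // must terminate because the tail's seq is known to exceed b.seq.
  auto it = writing.begin();
  while (it->seq < b.seq) {
    ++it;
  }
  assert(it->seq >= b.seq);
  writing.insert(it, b);  // inserts b before it, preserving seq order
}

int main() {
  std::list<Buffer> writing;
  for (uint64_t s : {1, 3, 2, 5, 4}) {
    add_writing(writing, Buffer(s));
  }
  for (const auto& b : writing) {
    std::cout << b.seq << ' ';  // prints: 1 2 3 4 5
  }
  std::cout << '\n';
}

The fast path keeps in-order inserts O(1); the linear scan is paid only when a buffer arrives with an out-of-order seq, and it is what preserves the invariant that the writing list can be walked and flushed strictly in seq order.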