buffer_map[b->offset].reset(b);
if (b->is_writing()) {
b->data.reassign_to_mempool(mempool::mempool_bluestore_writing);
- writing.push_back(*b);
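+ // keep the writing list sorted by buffer seq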
+ if (writing.empty() || writing.rbegin()->seq <= b->seq) {
+ writing.push_back(*b);
+ } else {
+ auto it = writing.begin();
+ while (it->seq < b->seq) {
+ ++it;
+ }
+
+ assert(it->seq >= b->seq);
+ // note that this inserts b before it,
+ // so the seq order is maintained
+ writing.insert(it, *b);
+ }
} else {
b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
cache->_add_buffer(b, level, near);
get_blob().can_split_at(blob_offset);
}
- bool try_reuse_blob(uint32_t min_alloc_size,
+ bool can_reuse_blob(uint32_t min_alloc_size,
uint32_t target_blob_size,
uint32_t b_offset,
uint32_t *length0);
#endif
}
- const bluestore_blob_t& get_blob() const {
+ inline const bluestore_blob_t& get_blob() const {
return blob;
}
- bluestore_blob_t& dirty_blob() {
+ inline bluestore_blob_t& dirty_blob() {
#ifdef CACHE_BLOB_BL
blob_bl.clear();
#endif
IOContext ioc;
bool had_ios = false; ///< true if we submitted IOs before our kv txn
- CollectionRef first_collection; ///< first referenced collection
-
uint64_t seq = 0;
utime_t start;
utime_t last_stamp;
interval_set<uint64_t> bluefs_extents; ///< block extents owned by bluefs
interval_set<uint64_t> bluefs_extents_reclaiming; ///< currently reclaiming
- std::mutex deferred_lock;
+ std::mutex deferred_lock, deferred_submit_lock;
std::atomic<uint64_t> deferred_seq = {0};
deferred_osr_queue_t deferred_queue; ///< osr's with deferred io pending
int deferred_queue_size = 0; ///< num txc's queued across all osrs
size_t block_size_order = 0; ///< bits to shift to get block size
uint64_t min_alloc_size = 0; ///< minimum allocation unit (power of 2)
- std::atomic<int> deferred_batch_ops = {0}; ///< deferred batch size
-
///< bits for min_alloc_size
- std::atomic<uint8_t> min_alloc_size_order = {0};
+ uint8_t min_alloc_size_order = 0;
static_assert(std::numeric_limits<uint8_t>::max() >
std::numeric_limits<decltype(min_alloc_size)>::digits,
"not enough bits for min_alloc_size");
- ///< size threshold for forced deferred writes
- std::atomic<uint64_t> prefer_deferred_size = {0};
-
///< maximum allocation unit (power of 2)
std::atomic<uint64_t> max_alloc_size = {0};
+ ///< number threshold for forced deferred writes
+ std::atomic<int> deferred_batch_ops = {0};
+
+ ///< size threshold for forced deferred writes
+ std::atomic<uint64_t> prefer_deferred_size = {0};
+
///< approx cost per io, in bytes
std::atomic<uint64_t> throttle_cost_per_io = {0};
- std::atomic<Compressor::CompressionMode> comp_mode = {Compressor::COMP_NONE}; ///< compression mode
+ std::atomic<Compressor::CompressionMode> comp_mode =
+ {Compressor::COMP_NONE}; ///< compression mode
CompressorRef compressor;
std::atomic<uint64_t> comp_min_blob_size = {0};
std::atomic<uint64_t> comp_max_blob_size = {0};
uint64_t kv_throttle_costs = 0;
// cache trim control
+ uint64_t cache_size = 0; ///< total cache size
float cache_meta_ratio = 0; ///< cache ratio dedicated to metadata
float cache_kv_ratio = 0; ///< cache ratio dedicated to kv (e.g., rocksdb)
float cache_data_ratio = 0; ///< cache ratio dedicated to object data
int _open_super_meta();
- void open_statfs();
+ void _open_statfs();
int _reconcile_bluefs_freespace();
int _balance_bluefs_freespace(PExtentVector *extents);
bluestore_deferred_op_t *_get_deferred_op(TransContext *txc, OnodeRef o);
void _deferred_queue(TransContext *txc);
- void deferred_try_submit() {
- std::lock_guard<std::mutex> l(deferred_lock);
- _deferred_try_submit();
- }
- void _deferred_try_submit();
- void _deferred_submit(OpSequencer *osr);
+ void deferred_try_submit();
+ void _deferred_submit_unlock(OpSequencer *osr);
void _deferred_aio_finish(OpSequencer *osr);
int _deferred_replay();
void _apply_padding(uint64_t head_pad,
uint64_t tail_pad,
- bufferlist& bl,
bufferlist& padded);
// --- ondisk version ---
bool allows_journal() override { return false; };
bool is_rotational() override;
+ bool is_journal_rotational() override;
+
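+ // derive the default device class from the bdev type recorded by collect_metadata()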
+ string get_default_device_class() override {
+ string device_class;
+ map<string, string> metadata;
+ collect_metadata(&metadata);
+ auto it = metadata.find("bluestore_bdev_type");
+ if (it != metadata.end()) {
+ device_class = it->second;
+ }
+ return device_class;
+ }
static int get_block_device_fsid(CephContext* cct, const string& path,
uuid_d *fsid);
uint64_t offset,
size_t len,
bufferlist& bl,
- uint32_t op_flags = 0,
- bool allow_eio = false) override;
+ uint32_t op_flags = 0) override;
int read(
CollectionHandle &c,
const ghobject_t& oid,
uint64_t offset,
size_t len,
bufferlist& bl,
- uint32_t op_flags = 0,
- bool allow_eio = false) override;
+ uint32_t op_flags = 0) override;
int _do_read(
Collection *c,
OnodeRef o,
objectstore_perf_stat_t get_cur_stats() const {
objectstore_perf_stat_t ret;
- ret.os_commit_latency = os_commit_latency.avg();
- ret.os_apply_latency = os_apply_latency.avg();
+ ret.os_commit_latency = os_commit_latency.current_avg();
+ ret.os_apply_latency = os_apply_latency.current_avg();
return ret;
}
RWLock::WLocker l(debug_read_error_lock);
debug_mdata_error_objects.insert(o);
}
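+ // ObjectStore override: compact the underlying kv store on demand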
+ void compact() override {
+ assert(db);
+ db->compact();
+ }
+
private:
bool _debug_data_eio(const ghobject_t& o) {
if (!cct->_conf->bluestore_debug_inject_read_err) {
OnodeRef o,
uint64_t offset,
set<SharedBlob*> *maybe_unshared_blobs=0);
- void _truncate(TransContext *txc,
+ int _truncate(TransContext *txc,
CollectionRef& c,
OnodeRef& o,
uint64_t offset);