return false;
}
+// Debug helper: log every cached onode in this OnodeSpace, one line per
+// entry (key -> Onode), at debug level 'lvl'.
+void BlueStore::OnodeSpace::dump(CephContext *cct, int lvl)
+{
+ // NOTE(review): onode_map is iterated without taking any lock here;
+ // presumably the caller holds the cache lock -- confirm at call sites.
+ for (auto& i : onode_map) {
+ ldout(cct, lvl) << i.first << " : " << i.second << dendl;
+ }
+}
// SharedBlob
<< " removing self from set " << get_parent()
<< dendl;
if (get_parent()) {
- if (get_parent()->remove(this)) {
- delete this;
- } else {
- ldout(coll->store->cct, 20)
- << __func__ << " " << this << " lost race to remove myself from set"
- << dendl;
- }
- } else {
- delete this;
+ get_parent()->remove(this);
}
+ delete this;
}
}
}
}
+// SharedBlobSet
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "
+
+// Debug helper: log every registered shared blob (sbid -> SharedBlob),
+// one line per entry, at debug level 'lvl'.
+void BlueStore::SharedBlobSet::dump(CephContext *cct, int lvl)
+{
+ // hold the set's own lock while iterating; sb_map is mutated by
+ // add()/remove() from other contexts
+ std::lock_guard<std::mutex> l(lock);
+ for (auto& i : sb_map) {
+ ldout(cct, lvl) << i.first << " : " << *i.second << dendl;
+ }
+}
+
// Blob
#undef dout_prefix
unsigned n;
// we need to encode inline_bl to measure encoded length
bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
+ inline_bl.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
assert(!never_happen);
size_t len = inline_bl.length();
dout(20) << __func__ << " inline shard " << len << " bytes from " << n
on->exists = true;
bufferptr::iterator p = v.front().begin_deep();
on->onode.decode(p);
+ for (auto& i : on->onode.attrs) {
+ i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
+ }
// initialize extent_map
on->extent_map.decode_spanning_blobs(p);
if (on->onode.extent_map_shards.empty()) {
denc(on->extent_map.inline_bl, p);
on->extent_map.decode_some(on->extent_map.inline_bl);
+ on->extent_map.inline_bl.reassign_to_mempool(
+ mempool::mempool_bluestore_cache_other);
} else {
on->extent_map.init_shards(false, false);
}
continue;
}
ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
- sb->coll = dest;
if (sb->get_sbid()) {
ldout(store->cct, 20) << __func__
<< " moving registration " << *sb << dendl;
shared_blob_set.remove(sb);
dest->shared_blob_set.add(dest, sb);
}
+ sb->coll = dest;
if (dest->cache != cache) {
for (auto& i : sb->bc.buffer_map) {
if (!i.second->is_writing()) {
return;
}
- if (cct->_conf->bluestore_compression_max_blob_size) {
- comp_min_blob_size = cct->_conf->bluestore_compression_max_blob_size;
+ if (cct->_conf->bluestore_compression_min_blob_size) {
+ comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
} else {
assert(bdev);
if (bdev->is_rotational()) {
return 0;
}
+// Store a metadata key/value pair.  The value is written both into the
+// bdev label's meta map on the block device (when a label is present and
+// readable) and into the file-based store via ObjectStore::write_meta(),
+// so older code paths that only read the files keep working.
+// Returns the result of the file-based write.
+int BlueStore::write_meta(const std::string& key, const std::string& value)
+{
+ bluestore_bdev_label_t label;
+ string p = path + "/block";
+ int r = _read_bdev_label(cct, p, &label);
+ if (r < 0) {
+ // no usable label (e.g. not created yet); fall back to file-only
+ return ObjectStore::write_meta(key, value);
+ }
+ label.meta[key] = value;
+ r = _write_bdev_label(cct, p, label);
+ // we just read the label successfully, so rewriting it must not fail
+ assert(r == 0);
+ return ObjectStore::write_meta(key, value);
+}
+
+// Fetch a metadata value for 'key'.  Prefer the copy embedded in the
+// bdev label's meta map; fall back to the file-based
+// ObjectStore::read_meta() when the label is unreadable or does not
+// contain the key (e.g. stores created before labels carried metadata).
+// Returns 0 and fills *value on success, or the fallback's result.
+int BlueStore::read_meta(const std::string& key, std::string *value)
+{
+ bluestore_bdev_label_t label;
+ string p = path + "/block";
+ int r = _read_bdev_label(cct, p, &label);
+ if (r < 0) {
+ return ObjectStore::read_meta(key, value);
+ }
+ auto i = label.meta.find(key);
+ if (i == label.meta.end()) {
+ return ObjectStore::read_meta(key, value);
+ }
+ *value = i->second;
+ return 0;
+}
+
void BlueStore::_init_logger()
{
PerfCountersBuilder b(cct, "bluestore",
b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged",
"Sum for extents that have been merged due to garbage "
"collection");
+ b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio",
+ "Read EIO errors propagated to high level callers");
logger = b.create_perf_counters();
cct->get_perfcounters_collection()->add(logger);
}
int BlueStore::_open_path()
{
+ // sanity check(s)
+ if (cct->_conf->get_val<uint64_t>("osd_max_object_size") >=
+ 4*1024*1024*1024ull) {
+ derr << __func__ << " osd_max_object_size >= 4GB; BlueStore has hard limit of 4GB." << dendl;
+ return -EINVAL;
+ }
assert(path_fd < 0);
path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY));
if (path_fd < 0) {
path_fd = -1;
}
-int BlueStore::_write_bdev_label(string path, bluestore_bdev_label_t label)
+int BlueStore::_write_bdev_label(CephContext *cct,
+ string path, bluestore_bdev_label_t label)
{
dout(10) << __func__ << " path " << path << " label " << label << dendl;
bufferlist bl;
derr << __func__ << " failed to write to " << path
<< ": " << cpp_strerror(r) << dendl;
}
+ // make the label durable before reporting success.  ::fsync() returns
+ // -1 and sets errno on failure, so translate via errno instead of
+ // storing the raw -1 (which cpp_strerror would misreport and the
+ // caller would see as -EPERM).  Also do not clobber an earlier write
+ // error in r with a successful fsync.
+ if (::fsync(fd) < 0) {
+ int fsync_err = -errno;
+ derr << __func__ << " failed to fsync " << path
+ << ": " << cpp_strerror(fsync_err) << dendl;
+ if (r == 0)
+ r = fsync_err;
+ }
VOID_TEMP_FAILURE_RETRY(::close(fd));
return r;
}
::decode(expected_crc, p);
}
catch (buffer::error& e) {
- derr << __func__ << " unable to decode label at offset " << p.get_off()
+ dout(2) << __func__ << " unable to decode label at offset " << p.get_off()
<< ": " << e.what()
<< dendl;
- return -EINVAL;
+ return -ENOENT;
}
if (crc != expected_crc) {
derr << __func__ << " bad crc on label, expected " << expected_crc
label.size = size;
label.btime = ceph_clock_now();
label.description = desc;
- int r = _write_bdev_label(path, label);
+ int r = _write_bdev_label(cct, path, label);
if (r < 0)
return r;
} else {
bl.append(freelist_type);
t->set(PREFIX_SUPER, "freelist_type", bl);
}
- fm->create(bdev->get_size(), t);
+ // being able to allocate in units less than bdev block size
+ // seems to be a bad idea.
+ assert( cct->_conf->bdev_block_size <= (int64_t)min_alloc_size);
+ fm->create(bdev->get_size(), (int64_t)min_alloc_size, t);
// allocate superblock reserved space. note that we do not mark
// bluefs space as allocated in the freelist; we instead rely on
// bluefs_extents.
- fm->allocate(0, SUPER_RESERVED, t);
+ uint64_t reserved = ROUND_UP_TO(MAX(SUPER_RESERVED, min_alloc_size),
+ min_alloc_size);
+ fm->allocate(0, reserved, t);
- uint64_t reserved = 0;
if (cct->_conf->bluestore_bluefs) {
assert(bluefs_extents.num_intervals() == 1);
interval_set<uint64_t>::iterator p = bluefs_extents.begin();
- reserved = p.get_start() + p.get_len();
+ reserved = ROUND_UP_TO(p.get_start() + p.get_len(), min_alloc_size);
dout(20) << __func__ << " reserved 0x" << std::hex << reserved << std::dec
<< " for bluefs" << dendl;
bufferlist bl;
t->set(PREFIX_SUPER, "bluefs_extents", bl);
dout(20) << __func__ << " bluefs_extents 0x" << std::hex << bluefs_extents
<< std::dec << dendl;
- } else {
- reserved = SUPER_RESERVED;
}
if (cct->_conf->bluestore_debug_prefill > 0) {
db->submit_transaction_sync(t);
}
- int r = fm->init();
+ int r = fm->init(bdev->get_size());
if (r < 0) {
derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
delete fm;
}
bluefs_shared_bdev = BlueFS::BDEV_SLOW;
bluefs_single_shared_device = false;
- } else if (::lstat(bfn.c_str(), &st) == -1) {
- bluefs_shared_bdev = BlueFS::BDEV_DB;
} else {
- //symlink exist is bug
- derr << __func__ << " " << bfn << " link target doesn't exist" << dendl;
r = -errno;
- goto free_bluefs;
+ if (::lstat(bfn.c_str(), &st) == -1) {
+ r = 0;
+ bluefs_shared_bdev = BlueFS::BDEV_DB;
+ } else {
+ derr << __func__ << " " << bfn << " symlink exists but target unusable: "
+ << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
}
// shared device
bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio +
cct->_conf->bluestore_bluefs_gift_ratio);
initial = MAX(initial, cct->_conf->bluestore_bluefs_min);
+ if (cct->_conf->bluefs_alloc_size % min_alloc_size) {
+ derr << __func__ << " bluefs_alloc_size 0x" << std::hex
+ << cct->_conf->bluefs_alloc_size << " is not a multiple of "
+ << "min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
+ r = -EINVAL;
+ goto free_bluefs;
+ }
// align to bluefs's alloc_size
initial = P2ROUNDUP(initial, cct->_conf->bluefs_alloc_size);
// put bluefs in the middle of the device in case it is an HDD
}
cct->_conf->set_val("rocksdb_separate_wal_dir", "true");
bluefs_single_shared_device = false;
- } else if (::lstat(bfn.c_str(), &st) == -1) {
- cct->_conf->set_val("rocksdb_separate_wal_dir", "false");
} else {
- //symlink exist is bug
- derr << __func__ << " " << bfn << " link target doesn't exist" << dendl;
r = -errno;
- goto free_bluefs;
+ if (::lstat(bfn.c_str(), &st) == -1) {
+ r = 0;
+ cct->_conf->set_val("rocksdb_separate_wal_dir", "false");
+ } else {
+ derr << __func__ << " " << bfn << " symlink exists but target unusable: "
+ << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
}
if (create) {
<< " > max_ratio " << cct->_conf->bluestore_bluefs_max_ratio
<< ", should reclaim " << pretty_si_t(reclaim) << dendl;
}
+
+ // don't take over too much of the freespace
+ uint64_t free_cap = cct->_conf->bluestore_bluefs_max_ratio * total_free;
if (bluefs_total < cct->_conf->bluestore_bluefs_min &&
- cct->_conf->bluestore_bluefs_min <
- (uint64_t)(cct->_conf->bluestore_bluefs_max_ratio * total_free)) {
+ cct->_conf->bluestore_bluefs_min < free_cap) {
uint64_t g = cct->_conf->bluestore_bluefs_min - bluefs_total;
dout(10) << __func__ << " bluefs_total " << bluefs_total
<< " < min " << cct->_conf->bluestore_bluefs_min
gift = g;
reclaim = 0;
}
+ // enforce a floor on bluefs free space: if below bluestore_bluefs_min_free
+ // (and the floor fits under the max-ratio cap), gift enough to reach it
+ uint64_t min_free = cct->_conf->get_val<uint64_t>("bluestore_bluefs_min_free");
+ if (bluefs_free < min_free &&
+ min_free < free_cap) {
+ uint64_t g = min_free - bluefs_free;
+ // log bluefs_free here (was mistakenly logging bluefs_total)
+ dout(10) << __func__ << " bluefs_free " << bluefs_free
+ << " < min " << min_free
+ << ", should gift " << pretty_si_t(g) << dendl;
+ if (g > gift)
+ gift = g;
+ reclaim = 0;
+ }
if (gift) {
// round up to alloc size
int64_t alloc_len = alloc->allocate(gift, cct->_conf->bluefs_alloc_size,
0, 0, &exts);
- if (alloc_len < (int64_t)gift) {
- derr << __func__ << " allocate failed on 0x" << std::hex << gift
- << " min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
+ if (alloc_len <= 0) {
+ dout(1) << __func__ << " no allocate on 0x" << std::hex << gift
+ << " min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
+ alloc->unreserve(gift);
+ alloc->dump();
+ return 0;
+ } else if (alloc_len < (int64_t)gift) {
+ dout(1) << __func__ << " insufficient allocate on 0x" << std::hex << gift
+ << " min_alloc_size 0x" << min_alloc_size
+ << " allocated 0x" << alloc_len
+ << std::dec << dendl;
+ alloc->unreserve(gift - alloc_len);
alloc->dump();
- assert(0 == "allocate failed, wtf");
- return -ENOSPC;
}
for (auto& p : exts) {
bluestore_pextent_t e = bluestore_pextent_t(p);
int BlueStore::_open_collections(int *errors)
{
+ dout(10) << __func__ << dendl;
assert(coll_map.empty());
KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
for (it->upper_bound(string());
<< pretty_binary_string(it->key()) << dendl;
return -EIO;
}
- dout(20) << __func__ << " opened " << cid << " " << c << dendl;
+ dout(20) << __func__ << " opened " << cid << " " << c
+ << " " << c->cnode << dendl;
coll_map[cid] = c;
} else {
derr << __func__ << " unrecognized collection " << it->key() << dendl;
}
if (cct->_conf->bluestore_block_preallocate_file) {
-#ifdef HAVE_POSIX_FALLOCATE
- r = ::posix_fallocate(fd, 0, size);
- if (r) {
+ r = ::ceph_posix_fallocate(fd, 0, size);
+ if (r > 0) {
derr << __func__ << " failed to prefallocate " << name << " file to "
<< size << ": " << cpp_strerror(r) << dendl;
VOID_TEMP_FAILURE_RETRY(::close(fd));
return -r;
}
-#else
- char data[1024*128];
- for (uint64_t off = 0; off < size; off += sizeof(data)) {
- if (off + sizeof(data) > size)
- r = ::write(fd, data, size - off);
- else
- r = ::write(fd, data, sizeof(data));
- if (r < 0) {
- r = -errno;
- derr << __func__ << " failed to prefallocate w/ write " << name << " file to "
- << size << ": " << cpp_strerror(r) << dendl;
- VOID_TEMP_FAILURE_RETRY(::close(fd));
- return r;
- }
- }
-#endif
}
dout(1) << __func__ << " resized " << name << " file to "
<< pretty_si_t(size) << "B" << dendl;
if (r < 0)
goto out_close_fsid;
+ // choose min_alloc_size
+ if (cct->_conf->bluestore_min_alloc_size) {
+ min_alloc_size = cct->_conf->bluestore_min_alloc_size;
+ } else {
+ assert(bdev);
+ if (bdev->is_rotational()) {
+ min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
+ } else {
+ min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
+ }
+ }
+
+ // make sure min_alloc_size is power of 2 aligned.
+ if (!ISP2(min_alloc_size)) {
+ derr << __func__ << " min_alloc_size 0x"
+ << std::hex << min_alloc_size << std::dec
+ << " is not power of 2 aligned!"
+ << dendl;
+ r = -EINVAL;
+ goto out_close_bdev;
+ }
+
r = _open_db(true);
if (r < 0)
goto out_close_bdev;
t->set(PREFIX_SUPER, "blobid_max", bl);
}
- // choose min_alloc_size
- if (cct->_conf->bluestore_min_alloc_size) {
- min_alloc_size = cct->_conf->bluestore_min_alloc_size;
- } else {
- assert(bdev);
- if (bdev->is_rotational()) {
- min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
- } else {
- min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
- }
- }
-
- // make sure min_alloc_size is power of 2 aligned.
- if (!ISP2(min_alloc_size)) {
- derr << __func__ << " min_alloc_size 0x"
- << std::hex << min_alloc_size << std::dec
- << " is not power of 2 aligned!"
- << dendl;
- r = -EINVAL;
- goto out_close_fm;
- }
-
{
bufferlist bl;
::encode((uint64_t)min_alloc_size, bl);
if (r < 0)
goto out_close_fm;
- r = write_meta("bluefs", stringify((int)cct->_conf->bluestore_bluefs));
+ r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
if (r < 0)
goto out_close_fm;
{
dout(1) << __func__ << " path " << path << dendl;
+ _kv_only = kv_only;
+
{
string type;
int r = read_meta("type", &type);
mempool_thread.init();
-
mounted = true;
return 0;
int BlueStore::umount()
{
- assert(mounted);
+ assert(_kv_only || mounted);
dout(1) << __func__ << dendl;
_osr_drain_all();
_osr_unregister_all();
- mempool_thread.shutdown();
-
- dout(20) << __func__ << " stopping kv thread" << dendl;
- _kv_stop();
- _reap_collections();
- _flush_cache();
- dout(20) << __func__ << " closing" << dendl;
-
mounted = false;
- _close_alloc();
- _close_fm();
+ if (!_kv_only) {
+ mempool_thread.shutdown();
+ dout(20) << __func__ << " stopping kv thread" << dendl;
+ _kv_stop();
+ _flush_cache();
+ dout(20) << __func__ << " closing" << dendl;
+
+ _close_alloc();
+ _close_fm();
+ }
_close_db();
_close_bdev();
_close_fsid();
uint64_t len,
uint64_t granularity,
BlueStore::mempool_dynamic_bitset &bitset,
- const char *what,
std::function<void(uint64_t,
BlueStore::mempool_dynamic_bitset &)> f) {
auto end = ROUND_UP_TO(off + len, granularity);
const PExtentVector& extents,
bool compressed,
mempool_dynamic_bitset &used_blocks,
+ uint64_t granularity,
store_statfs_t& expected_statfs)
{
dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
}
bool already = false;
apply(
- e.offset, e.length, block_size, used_blocks, __func__,
+ e.offset, e.length, granularity, used_blocks,
[&](uint64_t pos, mempool_dynamic_bitset &bs) {
+ assert(pos < bs.size());
if (bs.test(pos))
already = true;
else
return errors;
}
-int BlueStore::fsck(bool deep)
+// Check (and, when 'repair' is set, repair) on-disk consistency.
+// Returns the count of errors that remain un-repaired.
+int BlueStore::_fsck(bool deep, bool repair)
{
- dout(1) << __func__ << (deep ? " (deep)" : " (shallow)") << " start" << dendl;
+ // ternary was inverted: say "repair" when repairing, "fsck" otherwise
+ dout(1) << __func__
+ << (repair ? " repair" : " fsck")
+ << (deep ? " (deep)" : " (shallow)") << " start" << dendl;
int errors = 0;
+ int repaired = 0;
typedef btree::btree_set<
uint64_t,std::less<uint64_t>,
if (r < 0)
goto out_scan;
- used_blocks.resize(bdev->get_size() / block_size);
+ used_blocks.resize(fm->get_alloc_units());
apply(
- 0, SUPER_RESERVED, block_size, used_blocks, "0~SUPER_RESERVED",
+ 0, MAX(min_alloc_size, SUPER_RESERVED), fm->get_alloc_size(), used_blocks,
[&](uint64_t pos, mempool_dynamic_bitset &bs) {
+ assert(pos < bs.size());
bs.set(pos);
}
);
if (bluefs) {
for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
apply(
- e.get_start(), e.get_len(), block_size, used_blocks, "bluefs",
+ e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
[&](uint64_t pos, mempool_dynamic_bitset &bs) {
+ assert(pos < bs.size());
bs.set(pos);
}
);
if (is_extent_shard_key(it->key())) {
while (!expecting_shards.empty() &&
expecting_shards.front() < it->key()) {
- derr << __func__ << " error: missing shard key "
+ derr << "fsck error: missing shard key "
<< pretty_binary_string(expecting_shards.front())
<< dendl;
++errors;
uint32_t offset;
string okey;
get_key_extent_shard(it->key(), &okey, &offset);
- derr << __func__ << " error: stray shard 0x" << std::hex << offset
+ derr << "fsck error: stray shard 0x" << std::hex << offset
<< std::dec << dendl;
if (expecting_shards.empty()) {
- derr << __func__ << " error: " << pretty_binary_string(it->key())
+ derr << "fsck error: " << pretty_binary_string(it->key())
<< " is unexpected" << dendl;
++errors;
continue;
}
while (expecting_shards.front() > it->key()) {
- derr << __func__ << " error: saw " << pretty_binary_string(it->key())
+ derr << "fsck error: saw " << pretty_binary_string(it->key())
<< dendl;
- derr << __func__ << " error: exp "
+ derr << "fsck error: exp "
<< pretty_binary_string(expecting_shards.front()) << dendl;
++errors;
expecting_shards.pop_front();
ghobject_t oid;
int r = get_key_object(it->key(), &oid);
if (r < 0) {
- derr << __func__ << " error: bad object key "
+ derr << "fsck error: bad object key "
<< pretty_binary_string(it->key()) << dendl;
++errors;
continue;
}
}
if (!c) {
- derr << __func__ << " error: stray object " << oid
+ derr << "fsck error: stray object " << oid
<< " not owned by any collection" << dendl;
++errors;
continue;
}
c->cid.is_pg(&pgid);
- dout(20) << __func__ << " collection " << c->cid << dendl;
+ dout(20) << __func__ << " collection " << c->cid << " " << c->cnode
+ << dendl;
}
if (!expecting_shards.empty()) {
for (auto &k : expecting_shards) {
- derr << __func__ << " error: missing shard key "
+ derr << "fsck error: missing shard key "
<< pretty_binary_string(k) << dendl;
}
++errors;
OnodeRef o = c->get_onode(oid, false);
if (o->onode.nid) {
if (o->onode.nid > nid_max) {
- derr << __func__ << " error: " << oid << " nid " << o->onode.nid
+ derr << "fsck error: " << oid << " nid " << o->onode.nid
<< " > nid_max " << nid_max << dendl;
++errors;
}
if (used_nids.count(o->onode.nid)) {
- derr << __func__ << " error: " << oid << " nid " << o->onode.nid
+ derr << "fsck error: " << oid << " nid " << o->onode.nid
<< " already in use" << dendl;
++errors;
continue; // go for next object
get_extent_shard_key(o->key, s.shard_info->offset,
&expecting_shards.back());
if (s.shard_info->offset >= o->onode.size) {
- derr << __func__ << " error: " << oid << " shard 0x" << std::hex
+ derr << "fsck error: " << oid << " shard 0x" << std::hex
<< s.shard_info->offset << " past EOF at 0x" << o->onode.size
<< std::dec << dendl;
++errors;
for (auto& l : o->extent_map.extent_map) {
dout(20) << __func__ << " " << l << dendl;
if (l.logical_offset < pos) {
- derr << __func__ << " error: " << oid << " lextent at 0x"
+ derr << "fsck error: " << oid << " lextent at 0x"
<< std::hex << l.logical_offset
<< " overlaps with the previous, which ends at 0x" << pos
<< std::dec << dendl;
++errors;
}
if (o->extent_map.spans_shard(l.logical_offset, l.length)) {
- derr << __func__ << " error: " << oid << " lextent at 0x"
+ derr << "fsck error: " << oid << " lextent at 0x"
<< std::hex << l.logical_offset << "~" << l.length
<< " spans a shard boundary"
<< std::dec << dendl;
<< std::dec << " for " << *i.first << dendl;
const bluestore_blob_t& blob = i.first->get_blob();
if (i.second & blob.unused) {
- derr << __func__ << " error: " << oid << " blob claims unused 0x"
+ derr << "fsck error: " << oid << " blob claims unused 0x"
<< std::hex << blob.unused
<< " but extents reference 0x" << i.second
<< " on blob " << *i.first << dendl;
if ((blob.unused & mask) == mask) {
// this csum chunk region is marked unused
if (blob.get_csum_item(p) != 0) {
- derr << __func__ << " error: " << oid
+ derr << "fsck error: " << oid
<< " blob claims csum chunk 0x" << std::hex << pos
<< "~" << csum_chunk_size
<< " is unused (mask 0x" << mask << " of unused 0x"
const bluestore_blob_t& blob = i.first->get_blob();
bool equal = i.first->get_blob_use_tracker().equal(i.second);
if (!equal) {
- derr << __func__ << " error: " << oid << " blob " << *i.first
+ derr << "fsck error: " << oid << " blob " << *i.first
<< " doesn't match expected ref_map " << i.second << dendl;
++errors;
}
}
if (blob.is_shared()) {
if (i.first->shared_blob->get_sbid() > blobid_max) {
- derr << __func__ << " error: " << oid << " blob " << blob
+ derr << "fsck error: " << oid << " blob " << blob
<< " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
<< blobid_max << dendl;
++errors;
} else if (i.first->shared_blob->get_sbid() == 0) {
- derr << __func__ << " error: " << oid << " blob " << blob
+ derr << "fsck error: " << oid << " blob " << blob
<< " marked as shared but has uninitialized sbid"
<< dendl;
++errors;
errors += _fsck_check_extents(oid, blob.get_extents(),
blob.is_compressed(),
used_blocks,
+ fm->get_alloc_size(),
expected_statfs);
}
}
int r = _do_read(c.get(), o, 0, o->onode.size, bl, 0);
if (r < 0) {
++errors;
- derr << __func__ << " error: " << oid << " error during read: "
+ derr << "fsck error: " << oid << " error during read: "
<< cpp_strerror(r) << dendl;
}
}
// omap
if (o->onode.has_omap()) {
if (used_omap_head.count(o->onode.nid)) {
- derr << __func__ << " error: " << oid << " omap_head " << o->onode.nid
+ derr << "fsck error: " << oid << " omap_head " << o->onode.nid
<< " already in use" << dendl;
++errors;
} else {
string key = it->key();
uint64_t sbid;
if (get_key_shared_blob(key, &sbid)) {
- derr << __func__ << " error: bad key '" << key
+ derr << "fsck error: bad key '" << key
<< "' in shared blob namespace" << dendl;
++errors;
continue;
}
auto p = sb_info.find(sbid);
if (p == sb_info.end()) {
- derr << __func__ << " error: found stray shared blob data for sbid 0x"
+ derr << "fsck error: found stray shared blob data for sbid 0x"
<< std::hex << sbid << std::dec << dendl;
++errors;
} else {
::decode(shared_blob, blp);
dout(20) << __func__ << " " << *sbi.sb << " " << shared_blob << dendl;
if (shared_blob.ref_map != sbi.ref_map) {
- derr << __func__ << " error: shared blob 0x" << std::hex << sbid
+ derr << "fsck error: shared blob 0x" << std::hex << sbid
<< std::dec << " ref_map " << shared_blob.ref_map
<< " != expected " << sbi.ref_map << dendl;
++errors;
errors += _fsck_check_extents(p->second.oids.front(),
extents,
p->second.compressed,
- used_blocks, expected_statfs);
+ used_blocks,
+ fm->get_alloc_size(),
+ expected_statfs);
sb_info.erase(p);
}
}
}
for (auto &p : sb_info) {
- derr << __func__ << " error: shared_blob 0x" << p.first
+ derr << "fsck error: shared_blob 0x" << p.first
<< " key is missing (" << *p.second.sb << ")" << dendl;
++errors;
}
if (!(actual_statfs == expected_statfs)) {
- derr << __func__ << " error: actual " << actual_statfs
+ derr << "fsck error: actual " << actual_statfs
<< " != expected " << expected_statfs << dendl;
++errors;
}
uint64_t omap_head;
_key_decode_u64(it->key().c_str(), &omap_head);
if (used_omap_head.count(omap_head) == 0) {
- derr << __func__ << " error: found stray omap data on omap_head "
+ derr << "fsck error: found stray omap data on omap_head "
<< omap_head << dendl;
++errors;
}
try {
::decode(wt, p);
} catch (buffer::error& e) {
- derr << __func__ << " error: failed to decode deferred txn "
+ derr << "fsck error: failed to decode deferred txn "
<< pretty_binary_string(it->key()) << dendl;
r = -EIO;
goto out_scan;
<< " released 0x" << std::hex << wt.released << std::dec << dendl;
for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
apply(
- e.get_start(), e.get_len(), block_size, used_blocks, "deferred",
+ e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
[&](uint64_t pos, mempool_dynamic_bitset &bs) {
+ assert(pos < bs.size());
bs.set(pos);
}
);
// know they are allocated.
for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
apply(
- e.get_start(), e.get_len(), block_size, used_blocks, "bluefs_extents",
+ e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
[&](uint64_t pos, mempool_dynamic_bitset &bs) {
+ assert(pos < bs.size());
bs.reset(pos);
}
);
while (fm->enumerate_next(&offset, &length)) {
bool intersects = false;
apply(
- offset, length, block_size, used_blocks, "free",
+ offset, length, fm->get_alloc_size(), used_blocks,
[&](uint64_t pos, mempool_dynamic_bitset &bs) {
+ assert(pos < bs.size());
if (bs.test(pos)) {
intersects = true;
} else {
}
);
if (intersects) {
- derr << __func__ << " error: free extent 0x" << std::hex << offset
- << "~" << length << std::dec
- << " intersects allocated blocks" << dendl;
- ++errors;
- }
- }
- fm->enumerate_reset();
- size_t count = used_blocks.count();
- if (used_blocks.size() == count + 1) {
- // this due to http://tracker.ceph.com/issues/21089
- bufferlist fm_bpb_bl, fm_blocks_bl, fm_bpk_bl;
- db->get(PREFIX_ALLOC, "bytes_per_block", &fm_bpb_bl);
- db->get(PREFIX_ALLOC, "blocks", &fm_blocks_bl);
- db->get(PREFIX_ALLOC, "blocks_per_key", &fm_bpk_bl);
- uint64_t fm_blocks = 0;
- uint64_t fm_bsize = 1;
- uint64_t fm_blocks_per_key = 1;
- try {
- auto p = fm_blocks_bl.begin();
- ::decode(fm_blocks, p);
- auto q = fm_bpb_bl.begin();
- ::decode(fm_bsize, q);
- auto r = fm_bpk_bl.begin();
- ::decode(fm_blocks_per_key, r);
- } catch (buffer::error& e) {
- }
- uint64_t dev_bsize = bdev->get_block_size();
- uint64_t bad_size = bdev->get_size() & ~fm_bsize;
- if (used_blocks.test(bad_size / dev_bsize) == 0) {
- // this is the last block of the device that we previously
- // (incorrectly) truncated off of the effective device size. this
- // prevented BitmapFreelistManager from marking it as used along with
- // the other "past-eof" blocks in the last key slot. mark it used
- // now.
- derr << __func__ << " warning: fixing leaked block 0x" << std::hex
- << bad_size << "~" << fm_bsize << std::dec << " due to old bug"
- << dendl;
- KeyValueDB::Transaction t = db->get_transaction();
- // fix freelistmanager metadata (the internal 'blocks' count is
- // rounded up to include the trailing key, past eof)
- uint64_t new_blocks = bdev->get_size() / fm_bsize;
- if (new_blocks / fm_blocks_per_key * fm_blocks_per_key != new_blocks) {
- new_blocks = (new_blocks / fm_blocks_per_key + 1) *
- fm_blocks_per_key;
- }
- if (new_blocks != fm_blocks) {
- // the fm block count increased
- derr << __func__ << " freelist block and key count changed, fixing 0x"
- << std::hex << bdev->get_size() << "~"
- << ((new_blocks * fm_bsize) - bdev->get_size()) << std::dec
- << dendl;
- bufferlist bl;
- ::encode(new_blocks, bl);
- t->set(PREFIX_ALLOC, "blocks", bl);
- fm->allocate(bdev->get_size(),
- (new_blocks * fm_bsize) - bdev->get_size(),
- t);
+ if (offset == SUPER_RESERVED &&
+ length == min_alloc_size - SUPER_RESERVED) {
+ // this is due to the change just after luminous to min_alloc_size
+ // granularity allocations, and our baked in assumption at the top
+ // of _fsck that 0~ROUND_UP_TO(SUPER_RESERVED,min_alloc_size) is used
+ // (vs luminous's ROUND_UP_TO(SUPER_RESERVED,block_size)). harmless,
+ // since we will never allocate this region below min_alloc_size.
+ dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
+ << " and min_alloc_size, 0x" << std::hex << offset << "~"
+ << length << dendl;
} else {
- // block count is the same, but size changed; fix just the size
- derr << __func__ << " fixing just the stray block at 0x"
- << std::hex << bad_size << "~" << fm_bsize << std::dec << dendl;
- fm->allocate(bad_size, fm_bsize, t);
+ derr << "fsck error: free extent 0x" << std::hex << offset
+ << "~" << length << std::dec
+ << " intersects allocated blocks" << dendl;
+ ++errors;
}
- bufferlist sizebl;
- ::encode(bdev->get_size(), sizebl);
- t->set(PREFIX_ALLOC, "size", sizebl);
- int r = db->submit_transaction_sync(t);
- assert(r == 0);
-
- used_blocks.set(bad_size / dev_bsize);
- ++count;
}
}
+ fm->enumerate_reset();
+ size_t count = used_blocks.count();
if (used_blocks.size() != count) {
assert(used_blocks.size() > count);
++errors;
while (true) {
size_t next = used_blocks.find_next(cur);
if (next != cur + 1) {
- derr << __func__ << " error: leaked extent 0x" << std::hex
- << ((uint64_t)start * block_size) << "~"
- << ((cur + 1 - start) * block_size) << std::dec
+ derr << "fsck error: leaked extent 0x" << std::hex
+ << ((uint64_t)start * fm->get_alloc_size()) << "~"
+ << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
<< dendl;
start = next;
break;
<< dendl;
utime_t duration = ceph_clock_now() - start;
- dout(1) << __func__ << " finish with " << errors << " errors in "
+ dout(1) << __func__ << " finish with " << errors << " errors, " << repaired
+ << " repaired, " << (errors - repaired) << " remaining in "
<< duration << " seconds" << dendl;
- return errors;
+ return errors - repaired;
}
void BlueStore::collect_metadata(map<string,string> *pm)
buf->available = alloc->get_free();
if (bluefs) {
- // part of our shared device is "free" according to BlueFS
- // Don't include bluestore_bluefs_min because that space can't
- // be used for any other purpose.
- buf->available += bluefs->get_free(bluefs_shared_bdev) - cct->_conf->bluestore_bluefs_min;
-
- // include dedicated db, too, if that isn't the shared device.
- if (bluefs_shared_bdev != BlueFS::BDEV_DB) {
- buf->total += bluefs->get_total(BlueFS::BDEV_DB);
+ // part of our shared device is "free" according to BlueFS, but we
+ // can't touch bluestore_bluefs_min of it.
+ int64_t shared_available = std::min(
+ bluefs->get_free(bluefs_shared_bdev),
+ bluefs->get_total(bluefs_shared_bdev) - cct->_conf->bluestore_bluefs_min);
+ if (shared_available > 0) {
+ buf->available += shared_available;
}
}
void BlueStore::_queue_reap_collection(CollectionRef& c)
{
dout(10) << __func__ << " " << c << " " << c->cid << dendl;
- std::lock_guard<std::mutex> l(reap_lock);
+ // _reap_collections and this in the same thread,
+ // so no need a lock.
removed_collections.push_back(c);
}
void BlueStore::_reap_collections()
{
+
list<CollectionRef> removed_colls;
{
- std::lock_guard<std::mutex> l(reap_lock);
- removed_colls.swap(removed_collections);
+ // _queue_reap_collection and this in the same thread.
+ // So no need a lock.
+ if (!removed_collections.empty())
+ removed_colls.swap(removed_collections);
+ else
+ return;
}
- bool all_reaped = true;
-
- for (list<CollectionRef>::iterator p = removed_colls.begin();
- p != removed_colls.end();
- ++p) {
+ list<CollectionRef>::iterator p = removed_colls.begin();
+ while (p != removed_colls.end()) {
CollectionRef c = *p;
dout(10) << __func__ << " " << c << " " << c->cid << dendl;
if (c->onode_map.map_any([&](OnodeRef o) {
if (o->flushing_count.load()) {
dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
<< " flush_txns " << o->flushing_count << dendl;
- return false;
+ return true;
}
- return true;
+ return false;
})) {
- all_reaped = false;
+ ++p;
continue;
}
c->onode_map.clear();
+ p = removed_colls.erase(p);
dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
}
-
- if (all_reaped) {
+ if (removed_colls.empty()) {
dout(10) << __func__ << " all reaped" << dendl;
+ } else {
+ removed_collections.splice(removed_collections.begin(), removed_colls);
}
}
length = o->onode.size;
r = _do_read(c, o, offset, length, bl, op_flags);
+ if (r == -EIO) {
+ logger->inc(l_bluestore_read_eio);
+ }
}
out:
- if (r == 0 && _debug_data_eio(oid)) {
+ if (r >= 0 && _debug_data_eio(oid)) {
r = -EIO;
derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
} else if (cct->_conf->bluestore_debug_random_read_err &&
pos += hole;
left -= hole;
}
- BlobRef bptr = lp->blob;
+ BlobRef& bptr = lp->blob;
unsigned l_off = pos - lp->logical_offset;
unsigned b_off = l_off + lp->blob_offset;
unsigned b_len = std::min(left, lp->length - l_off);
// measure the whole block below.
// The error isn't that much...
vector<bufferlist> compressed_blob_bls;
- IOContext ioc(cct, NULL);
+ IOContext ioc(cct, NULL, true); // allow EIO
for (auto& p : blobs2read) {
- BlobRef bptr = p.first;
+ const BlobRef& bptr = p.first;
dout(20) << __func__ << " blob " << *bptr << std::hex
<< " need " << p.second << std::dec << dendl;
if (bptr->get_blob().is_compressed()) {
return r;
return 0;
});
+ if (r < 0) {
+ derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
+ if (r == -EIO) {
+ // propagate EIO to caller
+ return r;
+ }
assert(r == 0);
+ }
} else {
// read the pieces
for (auto& reg : p.second) {
return r;
return 0;
});
- assert(r == 0);
+ if (r < 0) {
+ derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
+ << dendl;
+ if (r == -EIO) {
+ // propagate EIO to caller
+ return r;
+ }
+ assert(r == 0);
+ }
assert(reg.bl.length() == r_len);
}
}
bdev->aio_submit(&ioc);
dout(20) << __func__ << " waiting for aio" << dendl;
ioc.aio_wait();
+ r = ioc.get_return_value();
+ if (r < 0) {
+ assert(r == -EIO); // no other errors allowed
+ return -EIO;
+ }
}
logger->tinc(l_bluestore_read_wait_aio_lat, ceph_clock_now() - start);
auto p = compressed_blob_bls.begin();
blobs2read_t::iterator b2r_it = blobs2read.begin();
while (b2r_it != blobs2read.end()) {
- BlobRef bptr = b2r_it->first;
+ const BlobRef& bptr = b2r_it->first;
dout(20) << __func__ << " blob " << *bptr << std::hex
<< " need 0x" << b2r_it->second << std::dec << dendl;
if (bptr->get_blob().is_compressed()) {
{
// update allocator with full released set
if (!cct->_conf->bluestore_debug_no_reuse_blocks) {
- dout(10) << __func__ << " " << txc << " " << txc->released << dendl;
+ dout(10) << __func__ << " " << txc << " " << std::hex
+ << txc->released << std::dec << dendl;
for (interval_set<uint64_t>::iterator p = txc->released.begin();
p != txc->released.end();
++p) {
}
kv_sync_thread.join();
kv_finalize_thread.join();
+ assert(removed_collections.empty());
{
std::lock_guard<std::mutex> l(kv_lock);
kv_stop = false;
bdev->aio_submit(&b->ioc);
}
+// Completion context that re-invokes BlueStore::deferred_try_submit();
+// queued on deferred_finisher from _deferred_aio_finish so the retry
+// happens on a finisher thread instead of the aio completion path.
+struct C_DeferredTrySubmit : public Context {
+ BlueStore *store;  // back-pointer to the owning store (not owned)
+ C_DeferredTrySubmit(BlueStore *s) : store(s) {}
+ void finish(int r) {
+ store->deferred_try_submit();
+ }
+};
+
void BlueStore::_deferred_aio_finish(OpSequencer *osr)
{
dout(10) << __func__ << " osr " << osr << dendl;
deferred_queue.erase(q);
} else if (deferred_aggressive) {
dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
- deferred_finisher.queue(new FunctionContext([&](int) {
- deferred_try_submit();
- }));
+ deferred_finisher.queue(new C_DeferredTrySubmit(this));
} else {
dout(20) << __func__ << " leaving queued, more pending" << dendl;
}
<< dendl;
++deferred_aggressive;
deferred_try_submit();
+ {
+ // wake up any previously finished deferred events
+ std::lock_guard<std::mutex> l(kv_lock);
+ kv_cond.notify_one();
+ }
throttle_deferred_bytes.get(txc->cost);
--deferred_aggressive;
}
return r;
}
-void BlueStore::_dump_onode(OnodeRef o, int log_level)
+void BlueStore::_dump_onode(const OnodeRef& o, int log_level)
{
if (!cct->_conf->subsys.should_gather(ceph_subsys_bluestore, log_level))
return;
dout(20) << __func__ << " txc " << txc
<< " " << wctx->writes.size() << " blobs"
<< dendl;
-
- uint64_t need = 0;
- auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
- for (auto &wi : wctx->writes) {
- need += wi.blob_length;
- }
- int r = alloc->reserve(need);
- if (r < 0) {
- derr << __func__ << " failed to reserve 0x" << std::hex << need << std::dec
- << dendl;
- return r;
+ if (wctx->writes.empty()) {
+ return 0;
}
- uint64_t hint = 0;
CompressorRef c;
double crr = 0;
if (wctx->compress) {
cct->_conf->bluestore_compression_required_ratio,
[&]() {
double val;
- if(coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
+ if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
return boost::optional<double>(val);
}
return boost::optional<double>();
csum,
[&]() {
int val;
- if(coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
+ if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
return boost::optional<int>(val);
}
return boost::optional<int>();
}
);
+ // compress (as needed) and calc needed space
+ uint64_t need = 0;
+ auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
for (auto& wi : wctx->writes) {
- BlobRef b = wi.b;
- bluestore_blob_t& dblob = b->dirty_blob();
- uint64_t b_off = wi.b_off;
- bufferlist *l = &wi.bl;
- uint64_t final_length = wi.blob_length;
- uint64_t csum_length = wi.blob_length;
- unsigned csum_order = block_size_order;
- bufferlist compressed_bl;
- bool compressed = false;
- if(c && wi.blob_length > min_alloc_size) {
-
+ if (c && wi.blob_length > min_alloc_size) {
utime_t start = ceph_clock_now();
// compress
- assert(b_off == 0);
- assert(wi.blob_length == l->length());
- bluestore_compression_header_t chdr;
- chdr.type = c->get_type();
+ assert(wi.b_off == 0);
+ assert(wi.blob_length == wi.bl.length());
+
// FIXME: memory alignment here is bad
bufferlist t;
-
- r = c->compress(*l, t);
+ int r = c->compress(wi.bl, t);
assert(r == 0);
+ bluestore_compression_header_t chdr;
+ chdr.type = c->get_type();
chdr.length = t.length();
- ::encode(chdr, compressed_bl);
- compressed_bl.claim_append(t);
- uint64_t rawlen = compressed_bl.length();
- uint64_t newlen = P2ROUNDUP(rawlen, min_alloc_size);
- uint64_t want_len_raw = final_length * crr;
+ ::encode(chdr, wi.compressed_bl);
+ wi.compressed_bl.claim_append(t);
+
+ wi.compressed_len = wi.compressed_bl.length();
+ uint64_t newlen = P2ROUNDUP(wi.compressed_len, min_alloc_size);
+ uint64_t want_len_raw = wi.blob_length * crr;
uint64_t want_len = P2ROUNDUP(want_len_raw, min_alloc_size);
- if (newlen <= want_len && newlen < final_length) {
- // Cool. We compressed at least as much as we were hoping to.
- // pad out to min_alloc_size
- compressed_bl.append_zero(newlen - rawlen);
- logger->inc(l_bluestore_write_pad_bytes, newlen - rawlen);
+ if (newlen <= want_len && newlen < wi.blob_length) {
+ // Cool. We compressed at least as much as we were hoping to.
+ // pad out to min_alloc_size
+ wi.compressed_bl.append_zero(newlen - wi.compressed_len);
+ logger->inc(l_bluestore_write_pad_bytes, newlen - wi.compressed_len);
dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
- << " -> 0x" << rawlen << " => 0x" << newlen
+ << " -> 0x" << wi.compressed_len << " => 0x" << newlen
<< " with " << c->get_type()
<< std::dec << dendl;
- txc->statfs_delta.compressed() += rawlen;
- txc->statfs_delta.compressed_original() += l->length();
+ txc->statfs_delta.compressed() += wi.compressed_len;
+ txc->statfs_delta.compressed_original() += wi.blob_length;
txc->statfs_delta.compressed_allocated() += newlen;
- l = &compressed_bl;
- final_length = newlen;
- csum_length = newlen;
- csum_order = ctz(newlen);
- dblob.set_compressed(wi.blob_length, rawlen);
- compressed = true;
- logger->inc(l_bluestore_compress_success_count);
+ logger->inc(l_bluestore_compress_success_count);
+ wi.compressed = true;
+ need += newlen;
} else {
- dout(20) << __func__ << std::hex << " 0x" << l->length()
- << " compressed to 0x" << rawlen << " -> 0x" << newlen
- << " with " << c->get_type()
- << ", which is more than required 0x" << want_len_raw
+ dout(20) << __func__ << std::hex << " 0x" << wi.blob_length
+ << " compressed to 0x" << wi.compressed_len << " -> 0x" << newlen
+ << " with " << c->get_type()
+ << ", which is more than required 0x" << want_len_raw
<< " -> 0x" << want_len
- << ", leaving uncompressed"
- << std::dec << dendl;
- logger->inc(l_bluestore_compress_rejected_count);
+ << ", leaving uncompressed"
+ << std::dec << dendl;
+ logger->inc(l_bluestore_compress_rejected_count);
+ need += wi.blob_length;
}
logger->tinc(l_bluestore_compress_lat,
ceph_clock_now() - start);
+ } else {
+ need += wi.blob_length;
}
- if (!compressed && wi.new_blob) {
+ }
+ int r = alloc->reserve(need);
+ if (r < 0) {
+ derr << __func__ << " failed to reserve 0x" << std::hex << need << std::dec
+ << dendl;
+ return r;
+ }
+ AllocExtentVector prealloc;
+ prealloc.reserve(2 * wctx->writes.size());
+ int64_t prealloc_left = alloc->allocate(
+ need, min_alloc_size, need,
+ 0, &prealloc);
+ assert(prealloc_left == (int64_t)need);
+ dout(20) << __func__ << " prealloc " << prealloc << dendl;
+ auto prealloc_pos = prealloc.begin();
+
+ for (auto& wi : wctx->writes) {
+ BlobRef b = wi.b;
+ bluestore_blob_t& dblob = b->dirty_blob();
+ uint64_t b_off = wi.b_off;
+ bufferlist *l = &wi.bl;
+ uint64_t final_length = wi.blob_length;
+ uint64_t csum_length = wi.blob_length;
+ unsigned csum_order = block_size_order;
+ if (wi.compressed) {
+ final_length = wi.compressed_bl.length();
+ csum_length = final_length;
+ csum_order = ctz(csum_length);
+ l = &wi.compressed_bl;
+ dblob.set_compressed(wi.blob_length, wi.compressed_len);
+ } else if (wi.new_blob) {
// initialize newly created blob only
assert(dblob.is_mutable());
if (l->length() != wi.blob_length) {
}
AllocExtentVector extents;
- extents.reserve(4); // 4 should be (more than) enough for most allocations
- int64_t got = alloc->allocate(final_length, min_alloc_size,
- max_alloc_size.load(),
- hint, &extents);
- assert(got == (int64_t)final_length);
- need -= got;
- txc->statfs_delta.allocated() += got;
+ int64_t left = final_length;
+ while (left > 0) {
+ assert(prealloc_left > 0);
+ if (prealloc_pos->length <= left) {
+ prealloc_left -= prealloc_pos->length;
+ left -= prealloc_pos->length;
+ txc->statfs_delta.allocated() += prealloc_pos->length;
+ extents.push_back(*prealloc_pos);
+ ++prealloc_pos;
+ } else {
+ extents.emplace_back(prealloc_pos->offset, left);
+ prealloc_pos->offset += left;
+ prealloc_pos->length -= left;
+ prealloc_left -= left;
+ txc->statfs_delta.allocated() += left;
+ left = 0;
+ break;
+ }
+ }
for (auto& p : extents) {
- bluestore_pextent_t e = bluestore_pextent_t(p);
- txc->allocated.insert(e.offset, e.length);
- hint = p.end();
+ txc->allocated.insert(p.offset, p.length);
}
dblob.allocated(P2ALIGN(b_off, min_alloc_size), final_length, extents);
}
}
}
- if (need > 0) {
- alloc->unreserve(need);
- }
+ assert(prealloc_pos == prealloc.end());
+ assert(prealloc_left == 0);
return 0;
}
o->extent_map.dirty_range(offset, length);
_wctx_finish(txc, c, o, &wctx);
- if (offset + length > o->onode.size) {
+ if (length > 0 && offset + length > o->onode.size) {
o->onode.size = offset + length;
dout(20) << __func__ << " extending size to " << offset + length
<< dendl;
if (b.is_shared() &&
sb->loaded &&
maybe_unshared_blobs.count(sb)) {
- b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
- expect[sb].get(off, len);
- return 0;
- });
+ if (b.is_compressed()) {
+ expect[sb].get(0, b.get_ondisk_length());
+ } else {
+ b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
+ expect[sb].get(off, len);
+ return 0;
+ });
+ }
}
}
<< " " << name << " (" << val.length() << " bytes)"
<< dendl;
int r = 0;
- if (val.is_partial())
- o->onode.attrs[name.c_str()] = bufferptr(val.c_str(), val.length());
- else
- o->onode.attrs[name.c_str()] = val;
+ if (val.is_partial()) {
+ auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
+ val.length());
+ b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
+ } else {
+ auto& b = o->onode.attrs[name.c_str()] = val;
+ b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
+ }
txc->write_onode(o);
dout(10) << __func__ << " " << c->cid << " " << o->oid
<< " " << name << " (" << val.length() << " bytes)"
int r = 0;
for (map<string,bufferptr>::const_iterator p = aset.begin();
p != aset.end(); ++p) {
- if (p->second.is_partial())
- o->onode.attrs[p->first.c_str()] =
+ if (p->second.is_partial()) {
+ auto& b = o->onode.attrs[p->first.c_str()] =
bufferptr(p->second.c_str(), p->second.length());
- else
- o->onode.attrs[p->first.c_str()] = p->second;
+ b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
+ } else {
+ auto& b = o->onode.attrs[p->first.c_str()] = p->second;
+ b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
+ }
}
txc->write_onode(o);
dout(10) << __func__ << " " << c->cid << " " << o->oid
assert(i->empty());
}
for (auto& p : coll_map) {
+ if (!p.second->onode_map.empty()) {
+ derr << __func__ << " stray onodes on " << p.first << dendl;
+ p.second->onode_map.dump(cct, 0);
+ }
+ if (!p.second->shared_blob_set.empty()) {
+ derr << __func__ << " stray shared blobs on " << p.first << dendl;
+ p.second->shared_blob_set.dump(cct, 0);
+ }
assert(p.second->onode_map.empty());
assert(p.second->shared_blob_set.empty());
}