<< " removing self from set " << get_parent()
<< dendl;
if (get_parent()) {
- if (get_parent()->try_remove(this)) {
- delete this;
- } else {
- ldout(coll->store->cct, 20)
- << __func__ << " " << this << " lost race to remove myself from set"
- << dendl;
- }
- } else {
- delete this;
+ get_parent()->remove(this);
}
+ delete this;
}
}
b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged",
"Sum for extents that have been merged due to garbage "
"collection");
+ b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio",
+ "Read EIO errors propagated to high level callers");
logger = b.create_perf_counters();
cct->get_perfcounters_collection()->add(logger);
}
int BlueStore::_open_path()
{
+ // sanity check(s)
+ if (cct->_conf->get_val<uint64_t>("osd_max_object_size") >=
+ 4*1024*1024*1024ull) {
+ derr << __func__ << " osd_max_object_size >= 4GB; BlueStore has hard limit of 4GB." << dendl;
+ return -EINVAL;
+ }
assert(path_fd < 0);
path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY));
if (path_fd < 0) {
::decode(expected_crc, p);
}
catch (buffer::error& e) {
- derr << __func__ << " unable to decode label at offset " << p.get_off()
+ dout(2) << __func__ << " unable to decode label at offset " << p.get_off()
<< ": " << e.what()
<< dendl;
- return -EINVAL;
+ return -ENOENT;
}
if (crc != expected_crc) {
derr << __func__ << " bad crc on label, expected " << expected_crc
bl.append(freelist_type);
t->set(PREFIX_SUPER, "freelist_type", bl);
}
- fm->create(bdev->get_size(), min_alloc_size, t);
+ // being able to allocate in units less than bdev block size
+ // seems to be a bad idea.
+ assert( cct->_conf->bdev_block_size <= (int64_t)min_alloc_size);
+ fm->create(bdev->get_size(), (int64_t)min_alloc_size, t);
// allocate superblock reserved space. note that we do not mark
// bluefs space as allocated in the freelist; we instead rely on
string bfn;
struct stat st;
- if (read_meta("path_block.db", &bfn) < 0) {
- bfn = path + "/block.db";
- }
+ bfn = path + "/block.db";
if (::stat(bfn.c_str(), &st) == 0) {
r = bluefs->add_block_device(BlueFS::BDEV_DB, bfn);
if (r < 0) {
}
bluefs_shared_bdev = BlueFS::BDEV_SLOW;
bluefs_single_shared_device = false;
- } else if (::lstat(bfn.c_str(), &st) == -1) {
- bluefs_shared_bdev = BlueFS::BDEV_DB;
} else {
- //symlink exist is bug
- derr << __func__ << " " << bfn << " link target doesn't exist" << dendl;
r = -errno;
- goto free_bluefs;
+ if (::lstat(bfn.c_str(), &st) == -1) {
+ r = 0;
+ bluefs_shared_bdev = BlueFS::BDEV_DB;
+ } else {
+ derr << __func__ << " " << bfn << " symlink exists but target unusable: "
+ << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
}
// shared device
- if (read_meta("path_block", &bfn) < 0) {
- bfn = path + "/block";
- }
+ bfn = path + "/block";
r = bluefs->add_block_device(bluefs_shared_bdev, bfn);
if (r < 0) {
derr << __func__ << " add block device(" << bfn << ") returned: "
bluefs_extents.insert(start, initial);
}
- if (read_meta("path_block.wal", &bfn) < 0) {
- bfn = path + "/block.wal";
- }
+ bfn = path + "/block.wal";
if (::stat(bfn.c_str(), &st) == 0) {
r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn);
if (r < 0) {
}
cct->_conf->set_val("rocksdb_separate_wal_dir", "true");
bluefs_single_shared_device = false;
- } else if (::lstat(bfn.c_str(), &st) == -1) {
- cct->_conf->set_val("rocksdb_separate_wal_dir", "false");
} else {
- //symlink exist is bug
- derr << __func__ << " " << bfn << " link target doesn't exist" << dendl;
r = -errno;
- goto free_bluefs;
+ if (::lstat(bfn.c_str(), &st) == -1) {
+ r = 0;
+ cct->_conf->set_val("rocksdb_separate_wal_dir", "false");
+ } else {
+ derr << __func__ << " " << bfn << " symlink exists but target unusable: "
+ << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
}
if (create) {
int64_t alloc_len = alloc->allocate(gift, cct->_conf->bluefs_alloc_size,
0, 0, &exts);
- if (alloc_len < (int64_t)gift) {
- derr << __func__ << " allocate failed on 0x" << std::hex << gift
- << " min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
+ if (alloc_len <= 0) {
+ dout(1) << __func__ << " no allocate on 0x" << std::hex << gift
+ << " min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
+ alloc->unreserve(gift);
+ alloc->dump();
+ return 0;
+ } else if (alloc_len < (int64_t)gift) {
+ dout(1) << __func__ << " insufficient allocate on 0x" << std::hex << gift
+ << " min_alloc_size 0x" << min_alloc_size
+ << " allocated 0x" << alloc_len
+ << std::dec << dendl;
+ alloc->unreserve(gift - alloc_len);
alloc->dump();
- assert(0 == "allocate failed, wtf");
- return -ENOSPC;
}
for (auto& p : exts) {
bluestore_pextent_t e = bluestore_pextent_t(p);
int BlueStore::_open_collections(int *errors)
{
+ dout(10) << __func__ << dendl;
assert(coll_map.empty());
KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
for (it->upper_bound(string());
<< pretty_binary_string(it->key()) << dendl;
return -EIO;
}
- dout(20) << __func__ << " opened " << cid << " " << c << dendl;
+ dout(20) << __func__ << " opened " << cid << " " << c
+ << " " << c->cnode << dendl;
coll_map[cid] = c;
} else {
derr << __func__ << " unrecognized collection " << it->key() << dendl;
}
if (cct->_conf->bluestore_block_preallocate_file) {
-#ifdef HAVE_POSIX_FALLOCATE
- r = ::posix_fallocate(fd, 0, size);
- if (r) {
+ r = ::ceph_posix_fallocate(fd, 0, size);
+ if (r > 0) {
derr << __func__ << " failed to prefallocate " << name << " file to "
<< size << ": " << cpp_strerror(r) << dendl;
VOID_TEMP_FAILURE_RETRY(::close(fd));
return -r;
}
-#else
- char data[1024*128];
- for (uint64_t off = 0; off < size; off += sizeof(data)) {
- if (off + sizeof(data) > size)
- r = ::write(fd, data, size - off);
- else
- r = ::write(fd, data, sizeof(data));
- if (r < 0) {
- r = -errno;
- derr << __func__ << " failed to prefallocate w/ write " << name << " file to "
- << size << ": " << cpp_strerror(r) << dendl;
- VOID_TEMP_FAILURE_RETRY(::close(fd));
- return r;
- }
- }
-#endif
}
dout(1) << __func__ << " resized " << name << " file to "
<< pretty_si_t(size) << "B" << dendl;
if (r < 0)
goto out_close_fsid;
- {
- string wal_path = cct->_conf->get_val<string>("bluestore_block_wal_path");
- if (wal_path.size()) {
- write_meta("path_block.wal", wal_path);
- }
- string db_path = cct->_conf->get_val<string>("bluestore_block_db_path");
- if (db_path.size()) {
- write_meta("path_block.db", db_path);
- }
- }
-
// choose min_alloc_size
if (cct->_conf->bluestore_min_alloc_size) {
min_alloc_size = cct->_conf->bluestore_min_alloc_size;
mempool_thread.init();
-
mounted = true;
return 0;
mempool_thread.shutdown();
dout(20) << __func__ << " stopping kv thread" << dendl;
_kv_stop();
- _reap_collections();
_flush_cache();
dout(20) << __func__ << " closing" << dendl;
uint64_t len,
uint64_t granularity,
BlueStore::mempool_dynamic_bitset &bitset,
- const char *what,
std::function<void(uint64_t,
BlueStore::mempool_dynamic_bitset &)> f) {
auto end = ROUND_UP_TO(off + len, granularity);
const PExtentVector& extents,
bool compressed,
mempool_dynamic_bitset &used_blocks,
+ uint64_t granularity,
store_statfs_t& expected_statfs)
{
dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
}
bool already = false;
apply(
- e.offset, e.length, min_alloc_size, used_blocks, __func__,
+ e.offset, e.length, granularity, used_blocks,
[&](uint64_t pos, mempool_dynamic_bitset &bs) {
+ assert(pos < bs.size());
if (bs.test(pos))
already = true;
else
if (r < 0)
goto out_scan;
- used_blocks.resize(bdev->get_size() / min_alloc_size);
+ used_blocks.resize(fm->get_alloc_units());
apply(
- 0, MAX(min_alloc_size, SUPER_RESERVED), min_alloc_size, used_blocks,
- "0~SUPER_RESERVED",
+ 0, MAX(min_alloc_size, SUPER_RESERVED), fm->get_alloc_size(), used_blocks,
[&](uint64_t pos, mempool_dynamic_bitset &bs) {
+ assert(pos < bs.size());
bs.set(pos);
}
);
if (bluefs) {
for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
apply(
- e.get_start(), e.get_len(), min_alloc_size, used_blocks, "bluefs",
+ e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
[&](uint64_t pos, mempool_dynamic_bitset &bs) {
+ assert(pos < bs.size());
bs.set(pos);
}
);
continue;
}
c->cid.is_pg(&pgid);
- dout(20) << __func__ << " collection " << c->cid << dendl;
+ dout(20) << __func__ << " collection " << c->cid << " " << c->cnode
+ << dendl;
}
if (!expecting_shards.empty()) {
errors += _fsck_check_extents(oid, blob.get_extents(),
blob.is_compressed(),
used_blocks,
+ fm->get_alloc_size(),
expected_statfs);
}
}
errors += _fsck_check_extents(p->second.oids.front(),
extents,
p->second.compressed,
- used_blocks, expected_statfs);
+ used_blocks,
+ fm->get_alloc_size(),
+ expected_statfs);
sb_info.erase(p);
}
}
<< " released 0x" << std::hex << wt.released << std::dec << dendl;
for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
apply(
- e.get_start(), e.get_len(), min_alloc_size, used_blocks, "deferred",
+ e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
[&](uint64_t pos, mempool_dynamic_bitset &bs) {
+ assert(pos < bs.size());
bs.set(pos);
}
);
// know they are allocated.
for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
apply(
- e.get_start(), e.get_len(), min_alloc_size, used_blocks,
- "bluefs_extents",
+ e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
[&](uint64_t pos, mempool_dynamic_bitset &bs) {
+ assert(pos < bs.size());
bs.reset(pos);
}
);
while (fm->enumerate_next(&offset, &length)) {
bool intersects = false;
apply(
- offset, length, min_alloc_size, used_blocks, "free",
+ offset, length, fm->get_alloc_size(), used_blocks,
[&](uint64_t pos, mempool_dynamic_bitset &bs) {
+ assert(pos < bs.size());
if (bs.test(pos)) {
intersects = true;
} else {
size_t next = used_blocks.find_next(cur);
if (next != cur + 1) {
derr << "fsck error: leaked extent 0x" << std::hex
- << ((uint64_t)start * min_alloc_size) << "~"
- << ((cur + 1 - start) * min_alloc_size) << std::dec
+ << ((uint64_t)start * fm->get_alloc_size()) << "~"
+ << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
<< dendl;
start = next;
break;
buf->available = alloc->get_free();
if (bluefs) {
- // part of our shared device is "free" according to BlueFS
- // Don't include bluestore_bluefs_min because that space can't
- // be used for any other purpose.
- buf->available += bluefs->get_free(bluefs_shared_bdev) - cct->_conf->bluestore_bluefs_min;
-
- // include dedicated db, too, if that isn't the shared device.
- if (bluefs_shared_bdev != BlueFS::BDEV_DB) {
- buf->total += bluefs->get_total(BlueFS::BDEV_DB);
+ // part of our shared device is "free" according to BlueFS, but we
+ // can't touch bluestore_bluefs_min of it.
+ int64_t shared_available = std::min(
+ bluefs->get_free(bluefs_shared_bdev),
+ bluefs->get_total(bluefs_shared_bdev) - cct->_conf->bluestore_bluefs_min);
+ if (shared_available > 0) {
+ buf->available += shared_available;
}
}
void BlueStore::_queue_reap_collection(CollectionRef& c)
{
dout(10) << __func__ << " " << c << " " << c->cid << dendl;
- std::lock_guard<std::mutex> l(reap_lock);
+ // _reap_collections() and this function run in the same thread,
+ // so no lock is needed.
removed_collections.push_back(c);
}
void BlueStore::_reap_collections()
{
+
list<CollectionRef> removed_colls;
{
- std::lock_guard<std::mutex> l(reap_lock);
- removed_colls.swap(removed_collections);
+ // _queue_reap_collection() and this function run in the same thread,
+ // so no lock is needed.
+ if (!removed_collections.empty())
+ removed_colls.swap(removed_collections);
+ else
+ return;
}
- bool all_reaped = true;
-
- for (list<CollectionRef>::iterator p = removed_colls.begin();
- p != removed_colls.end();
- ++p) {
+ list<CollectionRef>::iterator p = removed_colls.begin();
+ while (p != removed_colls.end()) {
CollectionRef c = *p;
dout(10) << __func__ << " " << c << " " << c->cid << dendl;
if (c->onode_map.map_any([&](OnodeRef o) {
if (o->flushing_count.load()) {
dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
<< " flush_txns " << o->flushing_count << dendl;
- return false;
+ return true;
}
- return true;
+ return false;
})) {
- all_reaped = false;
+ ++p;
continue;
}
c->onode_map.clear();
+ p = removed_colls.erase(p);
dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
}
-
- if (all_reaped) {
+ if (removed_colls.empty()) {
dout(10) << __func__ << " all reaped" << dendl;
+ } else {
+ removed_collections.splice(removed_collections.begin(), removed_colls);
}
}
length = o->onode.size;
r = _do_read(c, o, offset, length, bl, op_flags);
+ if (r == -EIO) {
+ logger->inc(l_bluestore_read_eio);
+ }
}
out:
- if (r == 0 && _debug_data_eio(oid)) {
+ if (r >= 0 && _debug_data_eio(oid)) {
r = -EIO;
derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
} else if (cct->_conf->bluestore_debug_random_read_err &&
pos += hole;
left -= hole;
}
- BlobRef bptr = lp->blob;
+ BlobRef& bptr = lp->blob;
unsigned l_off = pos - lp->logical_offset;
unsigned b_off = l_off + lp->blob_offset;
unsigned b_len = std::min(left, lp->length - l_off);
// measure the whole block below.
// The error isn't that much...
vector<bufferlist> compressed_blob_bls;
- IOContext ioc(cct, NULL);
+ IOContext ioc(cct, NULL, true); // allow EIO
for (auto& p : blobs2read) {
- BlobRef bptr = p.first;
+ const BlobRef& bptr = p.first;
dout(20) << __func__ << " blob " << *bptr << std::hex
<< " need " << p.second << std::dec << dendl;
if (bptr->get_blob().is_compressed()) {
return r;
return 0;
});
+ if (r < 0) {
+ derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
+ if (r == -EIO) {
+ // propagate EIO to caller
+ return r;
+ }
assert(r == 0);
+ }
} else {
// read the pieces
for (auto& reg : p.second) {
return r;
return 0;
});
- assert(r == 0);
+ if (r < 0) {
+ derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
+ << dendl;
+ if (r == -EIO) {
+ // propagate EIO to caller
+ return r;
+ }
+ assert(r == 0);
+ }
assert(reg.bl.length() == r_len);
}
}
bdev->aio_submit(&ioc);
dout(20) << __func__ << " waiting for aio" << dendl;
ioc.aio_wait();
+ r = ioc.get_return_value();
+ if (r < 0) {
+ assert(r == -EIO); // no other errors allowed
+ return -EIO;
+ }
}
logger->tinc(l_bluestore_read_wait_aio_lat, ceph_clock_now() - start);
auto p = compressed_blob_bls.begin();
blobs2read_t::iterator b2r_it = blobs2read.begin();
while (b2r_it != blobs2read.end()) {
- BlobRef bptr = b2r_it->first;
+ const BlobRef& bptr = b2r_it->first;
dout(20) << __func__ << " blob " << *bptr << std::hex
<< " need 0x" << b2r_it->second << std::dec << dendl;
if (bptr->get_blob().is_compressed()) {
{
// update allocator with full released set
if (!cct->_conf->bluestore_debug_no_reuse_blocks) {
- dout(10) << __func__ << " " << txc << " " << txc->released << dendl;
+ dout(10) << __func__ << " " << txc << " " << std::hex
+ << txc->released << std::dec << dendl;
for (interval_set<uint64_t>::iterator p = txc->released.begin();
p != txc->released.end();
++p) {
}
kv_sync_thread.join();
kv_finalize_thread.join();
+ assert(removed_collections.empty());
{
std::lock_guard<std::mutex> l(kv_lock);
kv_stop = false;
return r;
}
-void BlueStore::_dump_onode(OnodeRef o, int log_level)
+void BlueStore::_dump_onode(const OnodeRef& o, int log_level)
{
if (!cct->_conf->subsys.should_gather(ceph_subsys_bluestore, log_level))
return;
o->extent_map.dirty_range(offset, length);
_wctx_finish(txc, c, o, &wctx);
- if (offset + length > o->onode.size) {
+ if (length > 0 && offset + length > o->onode.size) {
o->onode.size = offset + length;
dout(20) << __func__ << " extending size to " << offset + length
<< dendl;