if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
<< " and fsid " << fsid << " check bypassed" << dendl;
- }
- else if (label.osd_uuid != fsid) {
+ } else if (label.osd_uuid != fsid) {
derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
<< " does not match our fsid " << fsid << dendl;
return -EIO;
bdev = NULL;
}
-int BlueStore::_open_fm(KeyValueDB::Transaction t)
+int BlueStore::_open_fm(KeyValueDB::Transaction t, bool read_only)
{
+ int r;
+ bluestore_bdev_label_t label;
+
ceph_assert(fm == NULL);
fm = FreelistManager::create(cct, freelist_type, PREFIX_ALLOC);
ceph_assert(fm);
start += l + u;
}
}
+ r = _write_out_fm_meta(0, false, &label);
+ ceph_assert(r == 0);
+ } else {
+ string p = path + "/block";
+ r = _read_bdev_label(cct, p, &label);
+ if (r < 0) {
+ derr << __func__ << " freelist init failed, error reading bdev label: " << cpp_strerror(r) << dendl;
+ delete fm;
+ fm = NULL;
+ return r;
+ }
}
-
- int r = fm->init(db);
+ r = fm->init(label, db, read_only);
if (r < 0) {
derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
delete fm;
fm = NULL;
}
+int BlueStore::_write_out_fm_meta(uint64_t target_size,
+ bool update_root_size,
+ bluestore_bdev_label_t* res_label)
+{
+ string p = path + "/block";
+
+ std::vector<std::pair<string, string>> fm_meta;
+ fm->get_meta(target_size, &fm_meta);
+
+ bluestore_bdev_label_t label;
+ int r = _read_bdev_label(cct, p, &label);
+ if (r < 0)
+ return r;
+
+ for (auto& m : fm_meta) {
+ label.meta[m.first] = m.second;
+ }
+ if (update_root_size) {
+ label.size = target_size;
+ }
+ r = _write_bdev_label(cct, p, label);
+ if (res_label) {
+ *res_label = label;
+ }
+
+ return r;
+}
+
int BlueStore::_open_alloc()
{
ceph_assert(alloc == NULL);
bytes += length;
}
fm->enumerate_reset();
- dout(1) << __func__ << " loaded " << byte_u_t(bytes)
- << " in " << num << " extents"
- << dendl;
// also mark bluefs space as allocated
for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
alloc->init_rm_free(e.get_start(), e.get_len());
}
+ dout(1) << __func__ << " loaded " << byte_u_t(bytes)
+ << " in " << num << " extents"
+ << " available " << byte_u_t(alloc->get_free())
+ << dendl;
+
return 0;
}
return r;
}
-void BlueStore::_close_bluefs()
+void BlueStore::_close_bluefs(bool cold_close)
{
- bluefs->umount();
+ bluefs->umount(cold_close);
_minimal_close_bluefs();
}
goto out_db;
}
- r = _open_fm(nullptr);
+ r = _open_fm(nullptr, true);
if (r < 0)
goto out_db;
// now open in R/W mode
if (!read_only) {
- _close_db();
+ _close_db(true);
r = _open_db(false, false, false);
if (r < 0) {
_close_fm();
return r;
}
+ fm->sync(db);
}
} else {
r = _open_db(false, false);
goto out_db;
}
- r = _open_fm(nullptr);
+ r = _open_fm(nullptr, false);
if (r < 0)
goto out_db;
out_fm:
_close_fm();
out_db:
- _close_db();
+ _close_db(read_only);
return r;
}
-void BlueStore::_close_db_and_around()
+void BlueStore::_close_db_and_around(bool read_only)
{
if (bluefs) {
- if (out_of_sync_fm.fetch_and(0)) {
+ if (!read_only && out_of_sync_fm.fetch_and(0)) {
_sync_bluefs_and_fm();
}
- _close_db();
- while(out_of_sync_fm.fetch_and(0)) {
+ _close_db(read_only);
+ while(!read_only && out_of_sync_fm.fetch_and(0)) {
// if seen some allocations during close - repeat open_db, sync fm, close
dout(0) << __func__ << " syncing FreelistManager" << dendl;
int r = _open_db(false, false, false);
break;
}
_sync_bluefs_and_fm();
- _close_db();
+ _close_db(false);
}
if (!_kv_only) {
_close_alloc();
} else {
_close_alloc();
_close_fm();
- _close_db();
+ _close_db(read_only);
}
}
(void)r;
}
env = new rocksdb::EnvMirror(b, a, false, true);
- }
- else {
+ } else {
env = new BlueRocksEnv(bluefs);
// simplify the dir names, too, as "seen" by rocksdb
if (!db) {
derr << __func__ << " error creating db" << dendl;
if (bluefs) {
- _close_bluefs();
+ _close_bluefs(read_only);
}
// delete env manually here since we can't depend on db to do this
// under this case
}
if (r) {
derr << __func__ << " erroring opening db: " << err.str() << dendl;
- _close_db();
+ _close_db(read_only);
return -EIO;
}
dout(1) << __func__ << " opened " << kv_backend
return 0;
}
-void BlueStore::_close_db()
+void BlueStore::_close_db(bool cold_close)
{
ceph_assert(db);
delete db;
db = NULL;
if (bluefs) {
- _close_bluefs();
+ _close_bluefs(cold_close);
}
}
{
KeyValueDB::Transaction t = db->get_transaction();
- r = _open_fm(t);
+ r = _open_fm(t, true);
if (r < 0)
goto out_close_db;
{
out_close_fm:
_close_fm();
out_close_db:
- _close_db();
+ _close_db(false);
out_close_bdev:
_close_bdev();
out_close_fsid:
void BlueStore::_umount_for_bluefs()
{
- _close_bluefs();
+ _close_bluefs(false);
_close_fsid();
_close_path();
}
bluefs->add_block_extent(
id,
reserved,
- bluefs->get_block_device_size(id) - reserved);
+ bluefs->get_block_device_size(id) - reserved, true);
r = bluefs->prepare_new_device(id, bluefs_layout);
ceph_assert(r == 0);
int BlueStore::expand_devices(ostream& out)
{
- int r = _mount(false);
+ int r = cold_open();
ceph_assert(r == 0);
bluefs->dump_block_extents(out);
- out << "Expanding..." << std::endl;
+ out << "Expanding DB/WAL..." << std::endl;
for (auto devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB}) {
if (devid == bluefs_layout.shared_bdev ) {
continue;
uint64_t size = bdev->get_size();
if (size0 < size) {
out << bluefs_layout.shared_bdev
- <<" : expanding " << " from 0x" << std::hex
- << size0 << " to 0x" << size << std::dec << std::endl;
- KeyValueDB::Transaction txn;
- txn = db->get_transaction();
- int r = fm->expand(size, txn);
+ << " : expanding " << " from 0x" << std::hex
+ << size0 << " to 0x" << size << std::dec << std::endl;
+ _write_out_fm_meta(size, true);
+ cold_close();
+
+ // mount in read/write to sync expansion changes
+ r = _mount(false);
ceph_assert(r == 0);
- db->submit_transaction_sync(txn);
-
- // always reference to slow device here
- string p = get_device_path(BlueFS::BDEV_SLOW);
- ceph_assert(!p.empty());
- const char* path = p.c_str();
- bluestore_bdev_label_t label;
- r = _read_bdev_label(cct, path, &label);
- if (r < 0) {
- derr << "unable to read label for " << path << ": "
- << cpp_strerror(r) << dendl;
- } else {
- label.size = size;
- r = _write_bdev_label(cct, path, label);
- if (r < 0) {
- derr << "unable to write label for " << path << ": "
- << cpp_strerror(r) << dendl;
- } else {
- out << bluefs_layout.shared_bdev
- <<" : size label updated to " << size
- << std::endl;
- }
- }
+ umount();
+ } else {
+ cold_close();
}
- umount();
+ return r;
+}
+
+int BlueStore::dump_bluefs_sizes(ostream& out)
+{
+ int r = cold_open();
+ ceph_assert(r == 0);
+ bluefs->dump_block_extents(out);
+ cold_close();
return r;
}
out_coll:
_flush_cache();
out_db:
- _close_db_and_around();
+ _close_db_and_around(false);
out_bdev:
_close_bdev();
out_fsid:
dout(20) << __func__ << " closing" << dendl;
}
- _close_db_and_around();
+ _close_db_and_around(false);
_close_bdev();
_close_fsid();
_close_path();
}
int BlueStore::cold_close()
{
- _close_db_and_around();
+ _close_db_and_around(true);
_close_bdev();
_close_fsid();
_close_path();
mempool_thread.shutdown();
_flush_cache();
out_db:
- _close_db_and_around();
+ _close_db_and_around(false);
out_bdev:
_close_bdev();
out_fsid:
e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
[&](uint64_t pos, mempool_dynamic_bitset &bs) {
bs.set(pos);
- }
- );
+ });
}
int r = bluefs->fsck();
if (r < 0) {
ceph_assert(ondisk_format > 0);
ceph_assert(ondisk_format < latest_ondisk_format);
+ KeyValueDB::Transaction t = db->get_transaction();
if (ondisk_format == 1) {
// changes:
// - super: added ondisk_format
// - super: added min_compat_ondisk_format
// - super: added min_alloc_size
// - super: removed min_min_alloc_size
- KeyValueDB::Transaction t = db->get_transaction();
{
bufferlist bl;
db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
}
ondisk_format = 2;
- _prepare_ondisk_format_super(t);
- int r = db->submit_transaction_sync(t);
- ceph_assert(r == 0);
}
if (ondisk_format == 2) {
// changes:
// - super: added per_pool_omap key, which indicates that *all* objects
// are using the new prefix and key format
ondisk_format = 3;
- KeyValueDB::Transaction t = db->get_transaction();
- _prepare_ondisk_format_super(t);
- int r = db->submit_transaction_sync(t);
+ }
+ if (ondisk_format == 3) {
+ // changes:
+ // - FreelistManager keeps meta within bdev label
+ int r = _write_out_fm_meta(0);
ceph_assert(r == 0);
+ ondisk_format = 4;
}
+ // This to be the last operation
+ _prepare_ondisk_format_super(t);
+ int r = db->submit_transaction_sync(t);
+ ceph_assert(r == 0);
}
// done
dout(1) << __func__ << " done" << dendl;
uint64_t b_off0 = b_off;
_pad_zeros(&bl, &b_off0, block_size);
o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
- wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, true, true);
+ wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
+ min_alloc_size != block_size, // use 'unused' bitmap when alloc granularity
+ // doesn't match disk one only
+ true);
return;
}
}
if (wi.mark_unused) {
+ ceph_assert(!dblob.is_compressed());
auto b_end = b_off + wi.bl.length();
if (b_off) {
dblob.add_unused(0, b_off);
}
- if (b_end < wi.blob_length) {
- dblob.add_unused(b_end, wi.blob_length - b_end);
+ uint64_t llen = dblob.get_logical_length();
+ if (b_end < llen) {
+ dblob.add_unused(b_end, llen - b_end);
}
}