X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=ceph%2Fsrc%2Fos%2Fbluestore%2FBlueStore.cc;h=6fdd11b170b2403af505c2cf68717d686314893f;hb=28e407b858acd3bddc89f68583571f771bb42e46;hp=5fe5b98c5dd074980957be21127afc53bb07d029;hpb=3efd99882e8c73385040d3f5c48fd014e4247be7;p=ceph.git

diff --git a/ceph/src/os/bluestore/BlueStore.cc b/ceph/src/os/bluestore/BlueStore.cc
index 5fe5b98c5..6fdd11b17 100644
--- a/ceph/src/os/bluestore/BlueStore.cc
+++ b/ceph/src/os/bluestore/BlueStore.cc
@@ -1667,16 +1667,9 @@ void BlueStore::SharedBlob::put()
 	     << " removing self from set " << get_parent()
 	     << dendl;
     if (get_parent()) {
-      if (get_parent()->try_remove(this)) {
-	delete this;
-      } else {
-	ldout(coll->store->cct, 20)
-	  << __func__ << " " << this << " lost race to remove myself from set"
-	  << dendl;
-      }
-    } else {
-      delete this;
+      get_parent()->remove(this);
     }
+    delete this;
   }
 }
 
@@ -3966,6 +3959,8 @@ void BlueStore::_init_logger()
   b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged",
 		    "Sum for extents that have been merged due to garbage "
 		    "collection");
+  b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio",
+		    "Read EIO errors propagated to high level callers");
   logger = b.create_perf_counters();
   cct->get_perfcounters_collection()->add(logger);
 }
@@ -4004,6 +3999,12 @@ int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
 
 int BlueStore::_open_path()
 {
+  // sanity check(s)
+  if (cct->_conf->get_val<uint64_t>("osd_max_object_size") >=
+      4*1024*1024*1024ull) {
+    derr << __func__ << " osd_max_object_size >= 4GB; BlueStore has hard limit of 4GB." << dendl;
+    return -EINVAL;
+  }
   assert(path_fd < 0);
   path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY));
   if (path_fd < 0) {
@@ -4085,10 +4086,10 @@ int BlueStore::_read_bdev_label(CephContext* cct, string path,
     ::decode(expected_crc, p);
   }
   catch (buffer::error& e) {
-    derr << __func__ << " unable to decode label at offset " << p.get_off()
+    dout(2) << __func__ << " unable to decode label at offset " << p.get_off()
 	    << ": " << e.what()
 	    << dendl;
-    return -EINVAL;
+    return -ENOENT;
   }
   if (crc != expected_crc) {
     derr << __func__ << " bad crc on label, expected " << expected_crc
@@ -4220,7 +4221,10 @@ int BlueStore::_open_fm(bool create)
       bl.append(freelist_type);
       t->set(PREFIX_SUPER, "freelist_type", bl);
     }
-    fm->create(bdev->get_size(), min_alloc_size, t);
+    // being able to allocate in units less than bdev block size
+    // seems to be a bad idea.
+    assert( cct->_conf->bdev_block_size <= (int64_t)min_alloc_size);
+    fm->create(bdev->get_size(), (int64_t)min_alloc_size, t);
 
     // allocate superblock reserved space.  note that we do not mark
     // bluefs space as allocated in the freelist; we instead rely on
@@ -4553,9 +4557,7 @@ int BlueStore::_open_db(bool create)
     string bfn;
     struct stat st;
 
-    if (read_meta("path_block.db", &bfn) < 0) {
-      bfn = path + "/block.db";
-    }
+    bfn = path + "/block.db";
     if (::stat(bfn.c_str(), &st) == 0) {
       r = bluefs->add_block_device(BlueFS::BDEV_DB, bfn);
       if (r < 0) {
@@ -4584,19 +4586,20 @@ int BlueStore::_open_db(bool create)
       }
       bluefs_shared_bdev = BlueFS::BDEV_SLOW;
       bluefs_single_shared_device = false;
-    } else if (::lstat(bfn.c_str(), &st) == -1) {
-      bluefs_shared_bdev = BlueFS::BDEV_DB;
     } else {
-      //symlink exist is bug
-      derr << __func__ << " " << bfn << " link target doesn't exist" << dendl;
       r = -errno;
-      goto free_bluefs;
+      if (::lstat(bfn.c_str(), &st) == -1) {
+	r = 0;
+	bluefs_shared_bdev = BlueFS::BDEV_DB;
+      } else {
+	derr << __func__ << " " << bfn << " symlink exists but target unusable: "
+	     << cpp_strerror(r) << dendl;
+	goto free_bluefs;
+      }
     }
 
     // shared device
-    if (read_meta("path_block", &bfn) < 0) {
-      bfn = path + "/block";
-    }
+    bfn = path + "/block";
     r = bluefs->add_block_device(bluefs_shared_bdev, bfn);
     if (r < 0) {
       derr << __func__ << " add block device(" << bfn << ") returned: "
@@ -4625,9 +4628,7 @@
       bluefs_extents.insert(start, initial);
     }
 
-    if (read_meta("path_block.wal", &bfn) < 0) {
-      bfn = path + "/block.wal";
-    }
+    bfn = path + "/block.wal";
     if (::stat(bfn.c_str(), &st) == 0) {
       r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn);
       if (r < 0) {
@@ -4656,13 +4657,16 @@
       }
       cct->_conf->set_val("rocksdb_separate_wal_dir", "true");
       bluefs_single_shared_device = false;
-    } else if (::lstat(bfn.c_str(), &st) == -1) {
-      cct->_conf->set_val("rocksdb_separate_wal_dir", "false");
     } else {
-      //symlink exist is bug
-      derr << __func__ << " " << bfn << " link target doesn't exist" << dendl;
       r = -errno;
-      goto free_bluefs;
+      if (::lstat(bfn.c_str(), &st) == -1) {
+	r = 0;
+	cct->_conf->set_val("rocksdb_separate_wal_dir", "false");
+      } else {
+	derr << __func__ << " " << bfn << " symlink exists but target unusable: "
+	     << cpp_strerror(r) << dendl;
+	goto free_bluefs;
+      }
     }
 
     if (create) {
@@ -4932,12 +4936,19 @@ int BlueStore::_balance_bluefs_freespace(PExtentVector *extents)
     int64_t alloc_len = alloc->allocate(gift, cct->_conf->bluefs_alloc_size,
 					0, 0, &exts);
 
-    if (alloc_len < (int64_t)gift) {
-      derr << __func__ << " allocate failed on 0x" << std::hex << gift
-           << " min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
+    if (alloc_len <= 0) {
+      dout(1) << __func__ << " no allocate on 0x" << std::hex << gift
+	      << " min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
+      alloc->unreserve(gift);
+      alloc->dump();
+      return 0;
+    } else if (alloc_len < (int64_t)gift) {
+      dout(1) << __func__ << " insufficient allocate on 0x" << std::hex << gift
+	      << " min_alloc_size 0x" << min_alloc_size
+	      << " allocated 0x" << alloc_len
+	      << std::dec << dendl;
+      alloc->unreserve(gift - alloc_len);
       alloc->dump();
-      assert(0 == "allocate failed, wtf");
-      return -ENOSPC;
     }
     for (auto& p : exts) {
       bluestore_pextent_t e = bluestore_pextent_t(p);
@@ -4993,6 +5004,7 @@ void BlueStore::_commit_bluefs_freespace(
 
 int BlueStore::_open_collections(int *errors)
 {
+  dout(10) << __func__ << dendl;
   assert(coll_map.empty());
   KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
   for (it->upper_bound(string());
@@ -5014,7 +5026,8 @@ int BlueStore::_open_collections(int *errors)
              << pretty_binary_string(it->key()) << dendl;
         return -EIO;
       }
-      dout(20) << __func__ << " opened " << cid << " " << c << dendl;
+      dout(20) << __func__ << " opened " << cid << " " << c
+	       << " " << c->cnode << dendl;
       coll_map[cid] = c;
     } else {
      derr << __func__ << " unrecognized collection " << it->key() << dendl;
@@ -5098,30 +5111,13 @@ int BlueStore::_setup_block_symlink_or_file(
 	}
 
 	if (cct->_conf->bluestore_block_preallocate_file) {
-#ifdef HAVE_POSIX_FALLOCATE
-	  r = ::posix_fallocate(fd, 0, size);
-	  if (r) {
+	  r = ::ceph_posix_fallocate(fd, 0, size);
+	  if (r > 0) {
 	    derr << __func__ << " failed to prefallocate " << name << " file to "
 	      << size << ": " << cpp_strerror(r) << dendl;
 	    VOID_TEMP_FAILURE_RETRY(::close(fd));
 	    return -r;
 	  }
-#else
-	  char data[1024*128];
-	  for (uint64_t off = 0; off < size; off += sizeof(data)) {
-	    if (off + sizeof(data) > size)
-	      r = ::write(fd, data, size - off);
-	    else
-	      r = ::write(fd, data, sizeof(data));
-	    if (r < 0) {
-	      r = -errno;
-	      derr << __func__ << " failed to prefallocate w/ write " << name << " file to "
-		<< size << ": " << cpp_strerror(r) << dendl;
-	      VOID_TEMP_FAILURE_RETRY(::close(fd));
-	      return r;
-	    }
-	  }
-#endif
 	}
 	dout(1) << __func__ << " resized " << name << " file to "
 		<< pretty_si_t(size) << "B" << dendl;
@@ -5236,17 +5232,6 @@ int BlueStore::mkfs()
   if (r < 0)
     goto out_close_fsid;
 
-  {
-    string wal_path = cct->_conf->get_val<std::string>("bluestore_block_wal_path");
-    if (wal_path.size()) {
-      write_meta("path_block.wal", wal_path);
-    }
-    string db_path = cct->_conf->get_val<std::string>("bluestore_block_db_path");
-    if (db_path.size()) {
-      write_meta("path_block.db", db_path);
-    }
-  }
-
   // choose min_alloc_size
   if (cct->_conf->bluestore_min_alloc_size) {
     min_alloc_size = cct->_conf->bluestore_min_alloc_size;
@@ -5452,7 +5437,6 @@ int BlueStore::_mount(bool kv_only)
 
   mempool_thread.init();
 
-
   mounted = true;
   return 0;
 
@@ -5488,7 +5472,6 @@ int BlueStore::umount()
   mempool_thread.shutdown();
   dout(20) << __func__ << " stopping kv thread" << dendl;
   _kv_stop();
-  _reap_collections();
   _flush_cache();
   dout(20) << __func__ << " closing" << dendl;
 
@@ -5516,7 +5499,6 @@
 static void apply(uint64_t off,
                   uint64_t len,
                   uint64_t granularity,
                   BlueStore::mempool_dynamic_bitset &bitset,
-                  const char *what,
                   std::function<void(uint64_t, BlueStore::mempool_dynamic_bitset &)> f) {
   auto end = ROUND_UP_TO(off + len, granularity);
@@ -5532,6 +5514,7 @@ int BlueStore::_fsck_check_extents(
   const PExtentVector& extents,
   bool compressed,
   mempool_dynamic_bitset &used_blocks,
+  uint64_t granularity,
   store_statfs_t& expected_statfs)
 {
   dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
@@ -5545,8 +5528,9 @@ int BlueStore::_fsck_check_extents(
     }
     bool already = false;
     apply(
-      e.offset, e.length, min_alloc_size, used_blocks, __func__,
+      e.offset, e.length, granularity, used_blocks,
       [&](uint64_t pos, mempool_dynamic_bitset &bs) {
+	assert(pos < bs.size());
 	if (bs.test(pos))
 	  already = true;
 	else
@@ -5650,11 +5634,11 @@ int BlueStore::_fsck(bool deep, bool repair)
   if (r < 0)
     goto out_scan;
 
-  used_blocks.resize(bdev->get_size() / min_alloc_size);
+  used_blocks.resize(fm->get_alloc_units());
   apply(
-    0, MAX(min_alloc_size, SUPER_RESERVED), min_alloc_size, used_blocks,
-    "0~SUPER_RESERVED",
+    0, MAX(min_alloc_size, SUPER_RESERVED), fm->get_alloc_size(), used_blocks,
     [&](uint64_t pos, mempool_dynamic_bitset &bs) {
+      assert(pos < bs.size());
       bs.set(pos);
     }
   );
@@ -5662,8 +5646,9 @@
   if (bluefs) {
     for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
       apply(
-        e.get_start(), e.get_len(), min_alloc_size, used_blocks, "bluefs",
+        e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
         [&](uint64_t pos, mempool_dynamic_bitset &bs) {
+	  assert(pos < bs.size());
           bs.set(pos);
         }
       );
@@ -5764,7 +5749,8 @@ int BlueStore::_fsck(bool deep, bool repair)
 	continue;
       }
       c->cid.is_pg(&pgid);
-      dout(20) << __func__ << " collection " << c->cid << dendl;
+      dout(20) << __func__ << " collection " << c->cid << " " << c->cnode
+	       << dendl;
     }
 
     if (!expecting_shards.empty()) {
@@ -5949,6 +5935,7 @@ int BlueStore::_fsck(bool deep, bool repair)
 	  errors += _fsck_check_extents(oid, blob.get_extents(),
 					blob.is_compressed(),
 					used_blocks,
+					fm->get_alloc_size(),
 					expected_statfs);
 	}
       }
@@ -6011,7 +5998,9 @@ int BlueStore::_fsck(bool deep, bool repair)
       errors += _fsck_check_extents(p->second.oids.front(),
 				    extents,
 				    p->second.compressed,
-				    used_blocks, expected_statfs);
+				    used_blocks,
+				    fm->get_alloc_size(),
+				    expected_statfs);
       sb_info.erase(p);
     }
   }
@@ -6061,8 +6050,9 @@ int BlueStore::_fsck(bool deep, bool repair)
 	     << " released 0x" << std::hex << wt.released << std::dec << dendl;
     for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
      apply(
-        e.get_start(), e.get_len(), min_alloc_size, used_blocks, "deferred",
+        e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
        [&](uint64_t pos, mempool_dynamic_bitset &bs) {
+	  assert(pos < bs.size());
          bs.set(pos);
        }
      );
@@ -6076,9 +6066,9 @@ int BlueStore::_fsck(bool deep, bool repair)
     // know they are allocated.
     for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
       apply(
-        e.get_start(), e.get_len(), min_alloc_size, used_blocks,
-        "bluefs_extents",
+        e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
        [&](uint64_t pos, mempool_dynamic_bitset &bs) {
+	  assert(pos < bs.size());
	  bs.reset(pos);
        }
      );
@@ -6088,8 +6078,9 @@ int BlueStore::_fsck(bool deep, bool repair)
   while (fm->enumerate_next(&offset, &length)) {
     bool intersects = false;
     apply(
-      offset, length, min_alloc_size, used_blocks, "free",
+      offset, length, fm->get_alloc_size(), used_blocks,
       [&](uint64_t pos, mempool_dynamic_bitset &bs) {
+	assert(pos < bs.size());
	if (bs.test(pos)) {
	  intersects = true;
	} else {
@@ -6129,8 +6120,8 @@ int BlueStore::_fsck(bool deep, bool repair)
       size_t next = used_blocks.find_next(cur);
       if (next != cur + 1) {
	derr << "fsck error: leaked extent 0x" << std::hex
-	     << ((uint64_t)start * min_alloc_size) << "~"
-	     << ((cur + 1 - start) * min_alloc_size) << std::dec
+	     << ((uint64_t)start * fm->get_alloc_size()) << "~"
+	     << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
	     << dendl;
	start = next;
	break;
@@ -6199,14 +6190,13 @@ int BlueStore::statfs(struct store_statfs_t *buf)
   buf->available = alloc->get_free();
 
   if (bluefs) {
-    // part of our shared device is "free" according to BlueFS
-    // Don't include bluestore_bluefs_min because that space can't
-    // be used for any other purpose.
-    buf->available += bluefs->get_free(bluefs_shared_bdev) - cct->_conf->bluestore_bluefs_min;
-
-    // include dedicated db, too, if that isn't the shared device.
-    if (bluefs_shared_bdev != BlueFS::BDEV_DB) {
-      buf->total += bluefs->get_total(BlueFS::BDEV_DB);
+    // part of our shared device is "free" according to BlueFS, but we
+    // can't touch bluestore_bluefs_min of it.
+    int64_t shared_available = std::min(
+      bluefs->get_free(bluefs_shared_bdev),
+      bluefs->get_total(bluefs_shared_bdev) - cct->_conf->bluestore_bluefs_min);
+    if (shared_available > 0) {
+      buf->available += shared_available;
     }
   }
 
@@ -6239,23 +6229,26 @@ BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
 void BlueStore::_queue_reap_collection(CollectionRef& c)
 {
   dout(10) << __func__ << " " << c << " " << c->cid << dendl;
-  std::lock_guard<std::mutex> l(reap_lock);
+  // _reap_collections and this in the same thread,
+  // so no need a lock.
   removed_collections.push_back(c);
 }
 
 void BlueStore::_reap_collections()
 {
+
   list<CollectionRef> removed_colls;
   {
-    std::lock_guard<std::mutex> l(reap_lock);
-    removed_colls.swap(removed_collections);
+    // _queue_reap_collection and this in the same thread.
+    // So no need a lock.
+    if (!removed_collections.empty())
+      removed_colls.swap(removed_collections);
+    else
+      return;
   }
 
-  bool all_reaped = true;
-
-  for (list<CollectionRef>::iterator p = removed_colls.begin();
-       p != removed_colls.end();
-       ++p) {
+  list<CollectionRef>::iterator p = removed_colls.begin();
+  while (p != removed_colls.end()) {
     CollectionRef c = *p;
     dout(10) << __func__ << " " << c << " " << c->cid << dendl;
     if (c->onode_map.map_any([&](OnodeRef o) {
@@ -6263,19 +6256,21 @@ void BlueStore::_reap_collections()
	  if (o->flushing_count.load()) {
	    dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
		     << " flush_txns " << o->flushing_count << dendl;
-	    return false;
+	    return true;
	  }
-	  return true;
+	  return false;
	})) {
-      all_reaped = false;
+      ++p;
       continue;
     }
     c->onode_map.clear();
+    p = removed_colls.erase(p);
     dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
   }
-
-  if (all_reaped) {
+  if (removed_colls.empty()) {
     dout(10) << __func__ << " all reaped" << dendl;
+  } else {
+    removed_collections.splice(removed_collections.begin(), removed_colls);
   }
 }
 
@@ -6436,10 +6431,13 @@ int BlueStore::read(
       length = o->onode.size;
 
     r = _do_read(c, o, offset, length, bl, op_flags);
+    if (r == -EIO) {
+      logger->inc(l_bluestore_read_eio);
+    }
   }
 
  out:
-  if (r == 0 && _debug_data_eio(oid)) {
+  if (r >= 0 && _debug_data_eio(oid)) {
     r = -EIO;
     derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
   } else if (cct->_conf->bluestore_debug_random_read_err &&
@@ -6545,7 +6543,7 @@ int BlueStore::_do_read(
       pos += hole;
       left -= hole;
     }
-    BlobRef bptr = lp->blob;
+    BlobRef& bptr = lp->blob;
     unsigned l_off = pos - lp->logical_offset;
     unsigned b_off = l_off + lp->blob_offset;
     unsigned b_len = std::min(left, lp->length - l_off);
@@ -6593,9 +6591,9 @@
   //  measure the whole block below.
   //  The error isn't that much...
   vector<bufferlist> compressed_blob_bls;
-  IOContext ioc(cct, NULL);
+  IOContext ioc(cct, NULL, true); // allow EIO
   for (auto& p : blobs2read) {
-    BlobRef bptr = p.first;
+    const BlobRef& bptr = p.first;
     dout(20) << __func__ << " blob " << *bptr << std::hex
	     << " need " << p.second << std::dec << dendl;
     if (bptr->get_blob().is_compressed()) {
@@ -6620,7 +6618,14 @@
	    return r;
	  return 0;
	});
+      if (r < 0) {
+        derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
+        if (r == -EIO) {
+          // propagate EIO to caller
+          return r;
+        }
       assert(r == 0);
+      }
     } else {
       // read the pieces
       for (auto& reg : p.second) {
@@ -6658,7 +6663,15 @@
	      return r;
	    return 0;
	  });
-	assert(r == 0);
+        if (r < 0) {
+          derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
+               << dendl;
+          if (r == -EIO) {
+            // propagate EIO to caller
+            return r;
+          }
+          assert(r == 0);
+        }
	assert(reg.bl.length() == r_len);
       }
     }
@@ -6667,6 +6680,11 @@
     bdev->aio_submit(&ioc);
     dout(20) << __func__ << " waiting for aio" << dendl;
     ioc.aio_wait();
+    r = ioc.get_return_value();
+    if (r < 0) {
+      assert(r == -EIO); // no other errors allowed
+      return -EIO;
+    }
   }
   logger->tinc(l_bluestore_read_wait_aio_lat, ceph_clock_now() - start);
 
@@ -6674,7 +6692,7 @@
   auto p = compressed_blob_bls.begin();
   blobs2read_t::iterator b2r_it = blobs2read.begin();
   while (b2r_it != blobs2read.end()) {
-    BlobRef bptr = b2r_it->first;
+    const BlobRef& bptr = b2r_it->first;
     dout(20) << __func__ << " blob " << *bptr << std::hex
	     << " need 0x" << b2r_it->second << std::dec << dendl;
     if (bptr->get_blob().is_compressed()) {
@@ -8229,7 +8247,8 @@ void BlueStore::_txc_release_alloc(TransContext *txc)
 {
   // update allocator with full released set
   if (!cct->_conf->bluestore_debug_no_reuse_blocks) {
-    dout(10) << __func__ << " " << txc << " " << txc->released << dendl;
+    dout(10) << __func__ << " " << txc << " " << std::hex
+	     << txc->released << std::dec << dendl;
     for (interval_set<uint64_t>::iterator p = txc->released.begin();
	 p != txc->released.end();
	 ++p) {
@@ -8380,6 +8399,7 @@ void BlueStore::_kv_stop()
   }
   kv_sync_thread.join();
   kv_finalize_thread.join();
+  assert(removed_collections.empty());
   {
     std::lock_guard<std::mutex> l(kv_lock);
     kv_stop = false;
@@ -9383,7 +9403,7 @@ int BlueStore::_touch(TransContext *txc,
   return r;
 }
 
-void BlueStore::_dump_onode(OnodeRef o, int log_level)
+void BlueStore::_dump_onode(const OnodeRef& o, int log_level)
 {
   if (!cct->_conf->subsys.should_gather(ceph_subsys_bluestore, log_level))
     return;
@@ -10571,7 +10591,7 @@ int BlueStore::_do_zero(TransContext *txc,
   o->extent_map.dirty_range(offset, length);
   _wctx_finish(txc, c, o, &wctx);
 
-  if (offset + length > o->onode.size) {
+  if (length > 0 && offset + length > o->onode.size) {
     o->onode.size = offset + length;
     dout(20) << __func__ << " extending size to " << offset + length