#include <boost/container/flat_set.hpp>
#include <boost/algorithm/string.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real.hpp>
#include "include/cpp-btree/btree_set.h"
return p;
}
+
#define ENCODED_KEY_PREFIX_LEN (1 + 8 + 4)
static int _get_key_object(const char *p, ghobject_t *oid)
// Registers onode `o` with this cache shard.
// New behavior (+ lines): the onode is flagged through set_cached() and is
// linked into `lru` only when pin_nref == 1; `level > 0` inserts it at the
// front of the LRU, any other level at the back. A linked onode is assigned
// age_bins.front() as its age bin and that bin's counter is incremented.
// Old behavior (- lines): linkage was decided by put_cache() and entries that
// were not linked were tallied in num_pinned.
// NOTE: the final dout statement continues on a line that is not part of this
// excerpt.
void _add(BlueStore::Onode* o, int level) override
{
- if (o->put_cache()) {
+ o->set_cached();
+ if (o->pin_nref == 1) {
(level > 0) ? lru.push_front(*o) : lru.push_back(*o);
o->cache_age_bin = age_bins.front();
*(o->cache_age_bin) += 1;
- } else {
- ++num_pinned;
}
++num; // we count both pinned and unpinned entries
dout(20) << __func__ << " " << this << " " << o->oid << " added, num="
}
// Unregisters onode `o` from this cache shard.
// New behavior (+ lines): the cached flag is cleared via clear_cached(); if the
// onode is currently linked into the LRU (lru_item.is_linked()) its age-bin
// counter is decremented and it is erased from `lru`.
// Old behavior (- lines): pop_cache() decided between LRU removal and a
// num_pinned decrement.
// In both versions `num` must be non-zero on entry and is decremented.
void _rm(BlueStore::Onode* o) override
{
- if (o->pop_cache()) {
+ o->clear_cached();
+ if (o->lru_item.is_linked()) {
*(o->cache_age_bin) -= 1;
lru.erase(lru.iterator_to(*o));
- } else {
- ceph_assert(num_pinned);
- --num_pinned;
}
ceph_assert(num);
--num;
dout(20) << __func__ << " " << this << " " << " " << o->oid << " removed, num=" << num << dendl;
}
- void _pin(BlueStore::Onode* o) override
- {
- *(o->cache_age_bin) -= 1;
- lru.erase(lru.iterator_to(*o));
- ++num_pinned;
- dout(20) << __func__ << " " << this << " " << " " << " " << o->oid << " pinned" << dendl;
- }
- void _unpin(BlueStore::Onode* o) override
- {
- lru.push_front(*o);
- o->cache_age_bin = age_bins.front();
- *(o->cache_age_bin) += 1;
- ceph_assert(num_pinned);
- --num_pinned;
- dout(20) << __func__ << " " << this << " " << " " << " " << o->oid << " unpinned" << dendl;
- }
- void _unpin_and_rm(BlueStore::Onode* o) override
+
// maybe_unpin() replaces the removed _pin()/_unpin()/_unpin_and_rm() trio.
// It takes the lock of the shard returned by o->c->get_onode_cache(),
// re-checking after each acquisition because that shard can change while
// waiting, and acts only when `o` is cached and pin_nref == 1:
//   - not linked in the LRU, o->exists      -> pushed to the LRU front and
//                                              accounted in age_bins.front()
//   - not linked in the LRU, !o->exists     -> uncounted from `num`, cached
//                                              flag cleared, removed from
//                                              o->c->onode_space
//   - already linked, o->exists             -> moved to the LRU front and
//                                              re-binned if its bin is stale
//   - already linked, !o->exists            -> nothing is done
// The lock that was finally acquired is released before returning.
// NOTE(review): after the relock loop `ocs` may differ from `this`, yet the
// body below uses `lru`, `age_bins` and `num` unqualified (i.e. this shard's
// members) -- confirm this is intended.
+ void maybe_unpin(BlueStore::Onode* o) override
{
- o->pop_cache();
- ceph_assert(num_pinned);
- --num_pinned;
- ceph_assert(num);
- --num;
+ OnodeCacheShard* ocs = this;
+ ocs->lock.lock();
+ // It is possible that during waiting split_cache moved us to different OnodeCacheShard.
+ while (ocs != o->c->get_onode_cache()) {
+ ocs->lock.unlock();
+ ocs = o->c->get_onode_cache();
+ ocs->lock.lock();
+ }
+ if (o->is_cached() && o->pin_nref == 1) {
+ if(!o->lru_item.is_linked()) {
+ if (o->exists) {
+ lru.push_front(*o);
+ o->cache_age_bin = age_bins.front();
+ *(o->cache_age_bin) += 1;
+ dout(20) << __func__ << " " << this << " " << o->oid << " unpinned"
+ << dendl;
+ } else {
+ ceph_assert(num);
+ --num;
+ o->clear_cached();
+ dout(20) << __func__ << " " << this << " " << o->oid << " removed"
+ << dendl;
+ // remove will also decrement nref
+ o->c->onode_space._remove(o->oid);
+ }
+ } else if (o->exists) {
+ // move onode within LRU
+ lru.erase(lru.iterator_to(*o));
+ lru.push_front(*o);
+ if (o->cache_age_bin != age_bins.front()) {
+ *(o->cache_age_bin) -= 1;
+ o->cache_age_bin = age_bins.front();
+ *(o->cache_age_bin) += 1;
+ }
+ dout(20) << __func__ << " " << this << " " << o->oid << " touched"
+ << dendl;
+ }
+ }
+ ocs->lock.unlock();
}
+
// Evicts onodes from the tail of the LRU, aiming at `new_size` entries.
// Returns immediately when new_size >= lru.size().
// New behavior (+ lines): the eviction budget is n = num - new_size and the
// loop stops early if the LRU runs empty. Every popped onode leaves its age
// bin. An onode whose pin_nref > 1 is only unlinked from the LRU (it stays
// counted in `num` and keeps its cached flag); any other onode is uncounted,
// has its cached flag cleared and is removed from its collection's
// onode_space.
// Old behavior (- lines): walked the list with an iterator, pre-subtracted n
// from `num`, and asserted that no evicted onode was pinned.
void _trim_to(uint64_t new_size) override
{
if (new_size >= lru.size()) {
return; // don't even try
}
- uint64_t n = lru.size() - new_size;
- auto p = lru.end();
- ceph_assert(p != lru.begin());
- --p;
- ceph_assert(num >= n);
- num -= n;
- while (n-- > 0) {
- BlueStore::Onode *o = &*p;
+ uint64_t n = num - new_size; // note: we might get empty LRU
+ // before n == 0 due to pinned
+ // entries. And hence being unable
+ // to reach new_size target.
+ while (n-- > 0 && lru.size() > 0) {
+ BlueStore::Onode *o = &lru.back();
+ lru.pop_back();
+
dout(20) << __func__ << " rm " << o->oid << " "
- << o->nref << " " << o->cached << " " << o->pinned << dendl;
- if (p != lru.begin()) {
- lru.erase(p--);
+ << o->nref << " " << o->cached << dendl;
+
+ *(o->cache_age_bin) -= 1;
+ if (o->pin_nref > 1) {
+ dout(20) << __func__ << " " << this << " " << " " << " " << o->oid << dendl;
} else {
- ceph_assert(n == 0);
- lru.erase(p);
+ ceph_assert(num);
+ --num;
+ o->clear_cached();
+ o->c->onode_space._remove(o->oid);
}
- *(o->cache_age_bin) -= 1;
- auto pinned = !o->pop_cache();
- ceph_assert(!pinned);
- o->c->onode_map._remove(o->oid);
}
}
// Transfers onode `o` from this shard to shard `to`; a no-op when `to` is this
// shard. The method is renamed from move_pinned() to _move_pinned().
// New behavior (+ lines): the onode is removed here with _rm(), asserted to
// have nref > 1, and re-added to `to` with _add(o, 0).
// Old behavior (- lines): only the num / num_pinned counters of both shards
// were adjusted, after asserting the onode was cached and pinned.
- void move_pinned(OnodeCacheShard *to, BlueStore::Onode *o) override
+ void _move_pinned(OnodeCacheShard *to, BlueStore::Onode *o) override
{
if (to == this) {
return;
}
- ceph_assert(o->cached);
- ceph_assert(o->pinned);
- ceph_assert(num);
- ceph_assert(num_pinned);
- --num_pinned;
- --num;
- ++to->num_pinned;
- ++to->num;
+ _rm(o);
+ ceph_assert(o->nref > 1);
+ to->_add(o, 0);
}
// Accumulates this shard's counters into the caller-provided totals.
//   onodes         += num
//   pinned_onodes  += num - lru.size()   (new; previously the removed
//                                         num_pinned counter)
// The new version also takes `lock` for the duration of the read.
void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) override
{
+ std::lock_guard l(lock);
*onodes += num;
- *pinned_onodes += num_pinned;
+ *pinned_onodes += num - lru.size();
}
};
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "
-BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid,
+BlueStore::OnodeRef BlueStore::OnodeSpace::add_onode(const ghobject_t& oid,
OnodeRef& o)
{
std::lock_guard l(cache->lock);
- auto p = onode_map.find(oid);
- if (p != onode_map.end()) {
+ // add entry or return existing one
+ auto p = onode_map.emplace(oid, o);
+ if (!p.second) {
ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
- << " raced, returning existing " << p->second
+ << " raced, returning existing " << p.first->second
<< dendl;
- return p->second;
+ return p.first->second;
}
ldout(cache->cct, 20) << __func__ << " " << oid << " " << o << dendl;
- onode_map[oid] = o;
cache->_add(o.get(), 1);
cache->_trim();
return o;
std::lock_guard l(cache->lock);
ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
if (p == onode_map.end()) {
- cache->logger->inc(l_bluestore_onode_misses);
ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
+ cache->logger->inc(l_bluestore_onode_misses);
} else {
ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
<< " " << p->second->nref
<< " " << p->second->cached
- << " " << p->second->pinned
<< dendl;
// This pins the onode; the cache is implicitly touched when the
// onode eventually becomes unpinned
o = p->second;
- ceph_assert(!o->cached || o->pinned);
cache->logger->inc(l_bluestore_onode_hits);
}
// This will pin 'o' and implicitly touch cache
// when it will eventually become unpinned
onode_map.insert(make_pair(new_oid, o));
- ceph_assert(o->pinned);
o->oid = new_oid;
o->key = new_okey;
ldout(cct, LogLevelV) << i.first << " : " << i.second
<< " " << i.second->nref
<< " " << i.second->cached
- << " " << i.second->pinned
<< dendl;
}
}
#ifndef CACHE_BLOB_BL
void BlueStore::Blob::decode(
- Collection *coll,
bufferptr::const_iterator& p,
uint64_t struct_v,
uint64_t* sbid,
- bool include_ref_map)
+ bool include_ref_map,
+ Collection *coll)
{
denc(blob, p, struct_v);
if (blob.is_shared()) {
used_in_blob.clear();
bluestore_extent_ref_map_t legacy_ref_map;
legacy_ref_map.decode(p);
- for (auto r : legacy_ref_map.ref_map) {
- get_ref(
- coll,
- r.first,
- r.second.refs * r.second.length);
+ if (coll) {
+ for (auto r : legacy_ref_map.ref_map) {
+ get_ref(
+ coll,
+ r.first,
+ r.second.refs * r.second.length);
+ }
}
}
}
#undef dout_context
#define dout_context onode->c->store->cct
// ExtentMap constructor: stores the owning onode and constructs inline_bl
// from a preallocation size.
// New signature (+ lines): the size is passed in explicitly as
// inline_shard_prealloc_size.
// Old signature (- lines): the size was read from
// o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size.
-BlueStore::ExtentMap::ExtentMap(Onode *o)
+BlueStore::ExtentMap::ExtentMap(Onode *o, size_t inline_shard_prealloc_size)
: onode(o),
- inline_bl(
- o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
+ inline_bl(inline_shard_prealloc_size) {
}
void BlueStore::ExtentMap::dump(Formatter* f) const
bool was_too_many_blobs_check = false;
auto too_many_blobs_threshold =
g_conf()->bluestore_debug_too_many_blobs_threshold;
- auto& dumped_onodes = onode->c->onode_map.cache->dumped_onodes;
- decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oid_slot = nullptr;
- decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oldest_slot = nullptr;
+ auto& dumped_onodes = onode->c->onode_space.cache->dumped_onodes;
+ decltype(onode->c->onode_space.cache->dumped_onodes)::value_type* oid_slot = nullptr;
+ decltype(onode->c->onode_space.cache->dumped_onodes)::value_type* oldest_slot = nullptr;
for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
if (e->logical_offset >= needs_reshard_end) {
return false;
}
-unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
+/////////////////// BlueStore::ExtentMap::ExtentDecoder ///////////
// Decodes a single extent record from `p` into `le`, using and updating the
// decoder's running state (pos, prev_len, extent_pos).
//   le        extent object to fill in
//   struct_v  encoding version, forwarded to Blob::decode for inline blobs
//   p         read cursor, advanced past the record
//   c         collection forwarded to Blob::decode
// The leading varint `blobid` carries flag bits:
//   BLOBID_FLAG_CONTIGUOUS clear -> a gap varint follows and is added to pos
//   BLOBID_FLAG_ZEROOFFSET clear -> blob_offset follows, otherwise it is 0
//   BLOBID_FLAG_SAMELENGTH clear -> a new length follows and replaces prev_len
//   BLOBID_FLAG_SPANNING set     -> the shifted id is passed to
//                                   consume_blobid(le, true, id)
// For non-spanning extents a non-zero shifted id is passed (minus one) to
// consume_blobid(le, false, ...); a zero id means a blob is encoded inline,
// which is decoded into a new Blob and handed to consume_blob().
+void BlueStore::ExtentMap::ExtentDecoder::decode_extent(
+ Extent* le,
+ __u8 struct_v,
+ bptr_c_it_t& p,
+ Collection* c)
{
- /*
- derr << __func__ << ":";
- bl.hexdump(*_dout);
- *_dout << dendl;
- */
+ uint64_t blobid;
+ denc_varint(blobid, p);
+ if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
+ uint64_t gap;
+ denc_varint_lowz(gap, p);
+ pos += gap;
+ }
+ le->logical_offset = pos;
+ if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
+ denc_varint_lowz(le->blob_offset, p);
+ } else {
+ le->blob_offset = 0;
+ }
+ if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
+ denc_varint_lowz(prev_len, p);
+ }
+ le->length = prev_len;
+ if (blobid & BLOBID_FLAG_SPANNING) {
+ consume_blobid(le, true, blobid >> BLOBID_SHIFT_BITS);
+ } else {
+ blobid >>= BLOBID_SHIFT_BITS;
+ if (blobid) {
+ consume_blobid(le, false, blobid - 1);
+ } else {
+ Blob *b = new Blob();
+ uint64_t sbid = 0;
+ b->decode(p, struct_v, &sbid, false, c);
+ consume_blob(le, extent_pos, sbid, b);
+ }
+ }
+ pos += prev_len; // next extent starts right after this one unless a gap is encoded
+ ++extent_pos;
+}
+
// Decodes every extent record contained in `bl`.
//   bl  encoded extents; must consist of at most one buffer
//   c   collection forwarded to decode_extent()
// Reads the struct version (only 1 and 2 are accepted) and the declared
// extent count, then loops until the buffer is exhausted: an Extent is
// obtained from get_next_extent(), filled by decode_extent() and passed to
// add_extent(). Asserts that the number of decoded extents equals the declared
// count and returns that count.
// The - lines are the previous ExtentMap::decode_some body, whose per-extent
// logic now lives in decode_extent() and the consume_* hooks.
// NOTE(review): only extent_pos is reset here; pos and prev_len are not
// touched by this function -- confirm callers start from a fresh or reset
// decoder.
+unsigned BlueStore::ExtentMap::ExtentDecoder::decode_some(
+ const bufferlist& bl, Collection* c)
+{
+ __u8 struct_v;
+ uint32_t num;
ceph_assert(bl.get_num_buffers() <= 1);
auto p = bl.front().begin_deep();
- __u8 struct_v;
denc(struct_v, p);
// Version 2 differs from v1 in blob's ref_map
// serialization only. Hence there is no specific
// handling at ExtentMap level below.
ceph_assert(struct_v == 1 || struct_v == 2);
-
- uint32_t num;
denc_varint(num, p);
- vector<BlobRef> blobs(num);
- uint64_t pos = 0;
- uint64_t prev_len = 0;
- unsigned n = 0;
+ extent_pos = 0;
while (!p.end()) {
- Extent *le = new Extent();
- uint64_t blobid;
- denc_varint(blobid, p);
- if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
- uint64_t gap;
- denc_varint_lowz(gap, p);
- pos += gap;
- }
- le->logical_offset = pos;
- if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
- denc_varint_lowz(le->blob_offset, p);
- } else {
- le->blob_offset = 0;
- }
- if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
- denc_varint_lowz(prev_len, p);
- }
- le->length = prev_len;
+ Extent* le = get_next_extent();
+ decode_extent(le, struct_v, p, c);
+ add_extent(le);
+ }
+ ceph_assert(extent_pos == num);
+ return num;
+}
- if (blobid & BLOBID_FLAG_SPANNING) {
- dout(30) << __func__ << " getting spanning blob "
- << (blobid >> BLOBID_SHIFT_BITS) << dendl;
- le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
- } else {
- blobid >>= BLOBID_SHIFT_BITS;
- if (blobid) {
- le->assign_blob(blobs[blobid - 1]);
- ceph_assert(le->blob);
- } else {
- Blob *b = new Blob();
- uint64_t sbid = 0;
- b->decode(onode->c, p, struct_v, &sbid, false);
- blobs[n] = b;
- onode->c->open_shared_blob(sbid, b);
- le->assign_blob(b);
- }
- // we build ref_map dynamically for non-spanning blobs
- le->blob->get_ref(
- onode->c,
- le->blob_offset,
- le->length);
- }
- pos += prev_len;
- ++n;
- extent_map.insert(*le);
// Decodes the spanning-blob section at `p`.
//   p  read cursor, advanced past the section
//   c  collection forwarded to Blob::decode
// Reads the struct version (only 1 and 2 are accepted) and a blob count, then
// for each blob reads its id, decodes it with include_ref_map = true and hands
// it, together with the decoded sbid, to consume_spanning_blob().
// The two trailing - lines are the end of the previous
// ExtentMap::decode_some body.
+void BlueStore::ExtentMap::ExtentDecoder::decode_spanning_blobs(
+ bptr_c_it_t& p, Collection* c)
+{
+ __u8 struct_v;
+ denc(struct_v, p);
+ // Version 2 differs from v1 in blob's ref_map
+ // serialization only. Hence there is no specific
+ // handling at ExtentMap level.
+ ceph_assert(struct_v == 1 || struct_v == 2);
+
+ unsigned n;
+ denc_varint(n, p);
+ while (n--) {
+ BlueStore::BlobRef b(new Blob());
+ denc_varint(b->id, p);
+ uint64_t sbid = 0;
+ b->decode(p, struct_v, &sbid, true, c);
+ consume_spanning_blob(sbid, b);
}
+}
- ceph_assert(n == num);
- return num;
+/////////////////// BlueStore::ExtentMap::ExtentDecoderFull ///////////
// Attaches an already-known blob to extent `le`.
//   spanning == true  -> blob is looked up in the extent map's spanning blobs
//                        by `blobid`
//   spanning == false -> `blobid` indexes the decoder-local `blobs` vector
//                        (asserted to be in range); a reference covering
//                        [blob_offset, blob_offset + length) is then taken on
//                        the blob
// `le` must be non-null.
+void BlueStore::ExtentMap::ExtentDecoderFull::consume_blobid(
+ BlueStore::Extent* le, bool spanning, uint64_t blobid) {
+ ceph_assert(le);
+ if (spanning) {
+ le->assign_blob(extent_map.get_spanning_blob(blobid));
+ } else {
+ ceph_assert(blobid < blobs.size());
+ le->assign_blob(blobs[blobid]);
+ // we build ref_map dynamically for non-spanning blobs
+ le->blob->get_ref(
+ extent_map.onode->c,
+ le->blob_offset,
+ le->length);
+ }
+}
+
// Registers a freshly decoded inline blob `b` for extent `le`.
// The blob is stored in `blobs` at index `extent_no` (the vector is resized to
// extent_no + 1 first), opened as a shared blob with `sbid` on the onode's
// collection, assigned to the extent, and referenced for the extent's
// [blob_offset, blob_offset + length) range. `le` must be non-null.
+void BlueStore::ExtentMap::ExtentDecoderFull::consume_blob(
+ BlueStore::Extent* le, uint64_t extent_no, uint64_t sbid, BlobRef b) {
+ ceph_assert(le);
+ blobs.resize(extent_no + 1);
+ blobs[extent_no] = b;
+ extent_map.onode->c->open_shared_blob(sbid, b);
+ le->assign_blob(b);
+ le->blob->get_ref(
+ extent_map.onode->c,
+ le->blob_offset,
+ le->length);
+}
+
// Stores spanning blob `b` in the extent map's spanning_blob_map under b->id
// and opens it as a shared blob with `sbid` on the onode's collection.
+void BlueStore::ExtentMap::ExtentDecoderFull::consume_spanning_blob(
+ uint64_t sbid, BlueStore::BlobRef b) {
+ extent_map.spanning_blob_map[b->id] = b;
+ extent_map.onode->c->open_shared_blob(sbid, b);
+}
+
// Supplies the Extent object the next record is decoded into; this decoder
// heap-allocates a new one on every call.
+BlueStore::Extent* BlueStore::ExtentMap::ExtentDecoderFull::get_next_extent()
+{
+ return new Extent();
+}
+
// Inserts the decoded extent `le` into the target ExtentMap's extent_map
// container.
+void BlueStore::ExtentMap::ExtentDecoderFull::add_extent(BlueStore::Extent* le)
+{
+ extent_map.extent_map.insert(*le);
+}
+
// Decodes the extents in `bl` into this ExtentMap by delegating to a
// temporary ExtentDecoderFull bound to *this; returns the extent count
// reported by the decoder.
+unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
+{
+ ExtentDecoderFull edecoder(*this);
+ unsigned n = edecoder.decode_some(bl, onode->c);
+ return n;
}
void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
}
}
-void BlueStore::ExtentMap::decode_spanning_blobs(
- bufferptr::const_iterator& p)
-{
- __u8 struct_v;
- denc(struct_v, p);
- // Version 2 differs from v1 in blob's ref_map
- // serialization only. Hence there is no specific
- // handling at ExtentMap level.
- ceph_assert(struct_v == 1 || struct_v == 2);
-
- unsigned n;
- denc_varint(n, p);
- while (n--) {
- BlobRef b(new Blob());
- denc_varint(b->id, p);
- spanning_blob_map[b->id] = b;
- uint64_t sbid = 0;
- b->decode(onode->c, p, struct_v, &sbid, true);
- onode->c->open_shared_blob(sbid, b);
- }
-}
-
void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
{
shards.resize(onode->onode.extent_map_shards.size());
out->push_back('~');
}
// Takes a reference on the onode.
// New behavior (+ lines): increments both nref and pin_nref; no cache-shard
// lock is taken here.
// Old behavior (- lines): incremented nref and, on reaching 2 while not yet
// pinned, locked the owning cache shard and called _pin().
-void BlueStore::Onode::get() {
- if (++nref >= 2 && !pinned) {
- OnodeCacheShard* ocs = c->get_onode_cache();
- ocs->lock.lock();
- // It is possible that during waiting split_cache moved us to different OnodeCacheShard.
- while (ocs != c->get_onode_cache()) {
- ocs->lock.unlock();
- ocs = c->get_onode_cache();
- ocs->lock.lock();
- }
- bool was_pinned = pinned;
- pinned = nref >= 2;
- bool r = !was_pinned && pinned;
- if (cached && r) {
- ocs->_pin(this);
- }
- ocs->lock.unlock();
- }
+void BlueStore::Onode::get()
+{
+ ++nref;
+ ++pin_nref;
}
// Drops a reference on the onode.
// New behavior (+ lines): pin_nref is decremented first; when it reaches 1 the
// collection's onode cache shard is asked to maybe_unpin() this onode. nref is
// decremented afterwards and the onode deletes itself when it reaches 0.
// Old behavior (- lines): the unpin/removal logic was done inline under the
// shard lock, guarded by the put_nref in-flight counter.
-void BlueStore::Onode::put() {
- ++put_nref;
- int n = --nref;
- if (n == 1) {
- OnodeCacheShard* ocs = c->get_onode_cache();
- ocs->lock.lock();
- // It is possible that during waiting split_cache moved us to different OnodeCacheShard.
- while (ocs != c->get_onode_cache()) {
- ocs->lock.unlock();
- ocs = c->get_onode_cache();
- ocs->lock.lock();
- }
- bool need_unpin = pinned;
- pinned = pinned && nref >= 2;
- need_unpin = need_unpin && !pinned;
- if (cached && need_unpin) {
- if (exists) {
- ocs->_unpin(this);
- } else {
- ocs->_unpin_and_rm(this);
- // remove will also decrement nref
- c->onode_map._remove(oid);
- }
- }
- ocs->lock.unlock();
+void BlueStore::Onode::put()
+{
+ if (--pin_nref == 1) {
+ c->get_onode_cache()->maybe_unpin(this);
}
- auto pn = --put_nref;
- if (nref == 0 && pn == 0) {
+ if (--nref == 0) {
delete this;
}
}
// decode_raw() replaces the former Onode::decode() factory: it no longer
// allocates the Onode and no longer performs mempool reassignment or shard
// initialisation.
//   on        onode to populate; marked as existing
//   v         serialized onode value; decoding starts at v.front()
//   edecoder  decoder that receives the spanning blobs and, when the onode has
//             no extent_map_shards, the inline extents read into
//             on->extent_map.inline_bl
-BlueStore::Onode* BlueStore::Onode::decode(
- CollectionRef c,
- const ghobject_t& oid,
- const string& key,
- const bufferlist& v)
+void BlueStore::Onode::decode_raw(
+ BlueStore::Onode* on,
+ const bufferlist& v,
+ BlueStore::ExtentMap::ExtentDecoder& edecoder)
{
- Onode* on = new Onode(c.get(), oid, key);
on->exists = true;
auto p = v.front().begin_deep();
on->onode.decode(p);
- for (auto& i : on->onode.attrs) {
- i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
- }
// initialize extent_map
- on->extent_map.decode_spanning_blobs(p);
+ edecoder.decode_spanning_blobs(p, on->c);
if (on->onode.extent_map_shards.empty()) {
denc(on->extent_map.inline_bl, p);
- on->extent_map.decode_some(on->extent_map.inline_bl);
- on->extent_map.inline_bl.reassign_to_mempool(
- mempool::mempool_bluestore_cache_data);
+ edecoder.decode_some(on->extent_map.inline_bl, on->c);
}
- else {
- on->extent_map.init_shards(false, false);
+}
+
// Allocates a new Onode for (c, oid, key) and, when `v` is non-empty, fills it
// from the serialized value.
//   v            serialized onode; may be empty only if allow_empty is true
//                (asserted)
//   allow_empty  permits creating a blank onode from an empty `v`
// For a non-empty `v` the onode is decoded through decode_raw() with an
// ExtentDecoderFull bound to the new onode's extent map; attrs are then
// reassigned to the bluestore_cache_meta mempool, and either inline_bl is
// reassigned to the bluestore_cache_data mempool (no shards) or
// init_shards(false, false) is called (sharded).
// Returns the new onode as a raw pointer.
+BlueStore::Onode* BlueStore::Onode::create_decode(
+ CollectionRef c,
+ const ghobject_t& oid,
+ const string& key,
+ const bufferlist& v,
+ bool allow_empty)
+{
+ ceph_assert(v.length() || allow_empty);
+ Onode* on = new Onode(c.get(), oid, key);
+
+ if (v.length()) {
+ ExtentMap::ExtentDecoderFull edecoder(on->extent_map);
+ decode_raw(on, v, edecoder);
+
+ for (auto& i : on->onode.attrs) {
+ i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
+ }
+
+ // initialize extent_map
+ if (on->onode.extent_map_shards.empty()) {
+ on->extent_map.inline_bl.reassign_to_mempool(
+ mempool::mempool_bluestore_cache_data);
+ } else {
+ on->extent_map.init_shards(false, false);
+ }
}
return on;
}
store(store_),
cache(bc),
exists(true),
- onode_map(oc),
+ onode_space(oc),
commit_queue(nullptr)
{
}
}
}
- OnodeRef o = onode_map.lookup(oid);
+ OnodeRef o = onode_space.lookup(oid);
if (o)
return o;
ceph_assert(r == -ENOENT);
if (!create)
return OnodeRef();
-
- // new object, new onode
- on = new Onode(this, oid, key);
} else {
- // loaded
ceph_assert(r >= 0);
- on = Onode::decode(this, oid, key, v);
}
+
+ // new object, load onode if available
+ on = Onode::create_decode(this, oid, key, v, true);
o.reset(on);
- return onode_map.add(oid, o);
+ return onode_space.add_onode(oid, o);
}
void BlueStore::Collection::split_cache(
bool is_pg = dest->cid.is_pg(&destpg);
ceph_assert(is_pg);
- auto p = onode_map.onode_map.begin();
- while (p != onode_map.onode_map.end()) {
+ auto p = onode_space.onode_map.begin();
+ while (p != onode_space.onode_map.end()) {
OnodeRef o = p->second;
if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
// onode does not belong to this child
ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
<< dendl;
- // ensuring that nref is always >= 2 and hence onode is pinned and
- // physically out of cache during the transition
+ // ensuring that nref is always >= 2 and hence onode is pinned
OnodeRef o_pin = o;
- ceph_assert(o->pinned);
- p = onode_map.onode_map.erase(p);
- dest->onode_map.onode_map[o->oid] = o;
+ p = onode_space.onode_map.erase(p);
+ dest->onode_space.onode_map[o->oid] = o;
if (o->cached) {
- get_onode_cache()->move_pinned(dest->get_onode_cache(), o.get());
+ get_onode_cache()->_move_pinned(dest->get_onode_cache(), o.get());
}
o->c = dest;
#define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "
BlueStore::OmapIteratorImpl::OmapIteratorImpl(
- CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
+ CollectionRef c, OnodeRef& o, KeyValueDB::Iterator it)
: c(c), o(o), it(it)
{
std::shared_lock l(c->lock);
bdev = NULL;
}
-int BlueStore::_open_fm(KeyValueDB::Transaction t, bool read_only, bool fm_restore)
+int BlueStore::_open_fm(KeyValueDB::Transaction t,
+ bool read_only,
+ bool db_avail,
+ bool fm_restore)
{
int r;
// fm restore must pass in a valid transaction
ceph_assert(!fm_restore || (t != nullptr));
+ // when function is called in repair mode (to_repair=true) we skip db->open()/create()
+ bool can_have_null_fm = !is_db_rotational() &&
+ !read_only &&
+ db_avail &&
+ cct->_conf->bluestore_allocation_from_file &&
+ !bdev->is_smr();
+
// When allocation-info is stored in a single file we set freelist_type to "null"
- bool set_null_freemap = false;
- if (freelist_type == "null") {
- // use BitmapFreelistManager with the null option to stop allocations from going to RocksDB
- // we will store the allocation info in a single file during umount()
- freelist_type = "bitmap";
- set_null_freemap = true;
+ if (can_have_null_fm) {
+ freelist_type = "null";
+ need_to_destage_allocation_file = true;
}
fm = FreelistManager::create(cct, freelist_type, PREFIX_ALLOC);
ceph_assert(fm);
- if (set_null_freemap) {
- fm->set_null_manager();
- }
if (t) {
// create mode. initialize freespace
dout(20) << __func__ << " initializing freespace" << dendl;
ceph_assert(cct->_conf->bdev_block_size <= min_alloc_size);
uint64_t alloc_size = min_alloc_size;
-#ifdef HAVE_LIBZBD
- if (bdev->is_smr()) {
- if (freelist_type != "zoned") {
- derr << "SMR device but freelist_type = " << freelist_type << " (not zoned)"
- << dendl;
- return -EINVAL;
- }
- } else
-#endif
- if (freelist_type == "zoned") {
+ if (bdev->is_smr() && freelist_type != "zoned") {
+ derr << "SMR device but freelist_type = " << freelist_type << " (not zoned)"
+ << dendl;
+ return -EINVAL;
+ }
+ if (!bdev->is_smr() && freelist_type == "zoned") {
derr << "non-SMR device (or SMR support not built-in) but freelist_type = zoned"
<< dendl;
return -EINVAL;
r = _write_out_fm_meta(0);
ceph_assert(r == 0);
} else {
+ if (can_have_null_fm) {
+ commit_to_null_manager();
+ }
r = fm->init(db, read_only,
[&](const std::string& key, std::string* result) {
return read_meta(key, result);
});
if (r < 0) {
- derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
+ derr << __func__ << " failed: " << cpp_strerror(r) << dendl;
delete fm;
fm = NULL;
return r;
delete alloc;
return -EINVAL;
}
- shared_alloc.set(a);
+ shared_alloc.set(a, alloc_size);
} else
#endif
{
// BlueFS will share the same allocator
- shared_alloc.set(alloc);
+ shared_alloc.set(alloc, alloc_size);
}
return 0;
derr << __func__ << "::NCB::Please change the value of bluestore_allocation_from_file to TRUE in your ceph.conf file" << dendl;
return -ENOTSUP; // Operation not supported
}
-
if (restore_allocator(alloc, &num, &bytes) == 0) {
dout(5) << __func__ << "::NCB::restore_allocator() completed successfully alloc=" << alloc << dendl;
} else {
// Post-processing step run after the allocator has been initialised.
//   zone_adjustments  ranges to mark as dead space; used only in the SMR
//                     branch
// New behavior (+ lines):
//   - HAVE_LIBZBD build and bdev->is_smr(): returns early when there are no
//     adjustments; otherwise each range is allocated and released on the
//     ZonedFreelistManager in one synchronously submitted transaction.
//   - otherwise, when fm is a null manager: the on-BlueFS allocation file is
//     invalidated via invalidate_allocation_file_on_bluefs().
//   The result `r` of whichever branch ran is asserted to be >= 0.
// Old behavior (- lines): the body was SMR-only and asserted
// bdev->is_smr() unconditionally.
void BlueStore::_post_init_alloc(const std::map<uint64_t, uint64_t>& zone_adjustments)
{
+ int r = 0;
#ifdef HAVE_LIBZBD
- assert(bdev->is_smr());
- dout(1) << __func__ << " adjusting freelist based on device write pointers" << dendl;
- auto f = dynamic_cast<ZonedFreelistManager*>(fm);
- ceph_assert(f);
- KeyValueDB::Transaction t = db->get_transaction();
- for (auto& i : zone_adjustments) {
- // allocate AND release since this gap is now dead space
- // note that the offset is imprecise, but only need to select the zone
- f->allocate(i.first, i.second, t);
- f->release(i.first, i.second, t);
- }
- int r = db->submit_transaction_sync(t);
- ceph_assert(r == 0);
+ if (bdev->is_smr()) {
+ if (zone_adjustments.empty()) {
+ return;
+ }
+ dout(1) << __func__ << " adjusting freelist based on device write pointers" << dendl;
+ auto f = dynamic_cast<ZonedFreelistManager*>(fm);
+ ceph_assert(f);
+ KeyValueDB::Transaction t = db->get_transaction();
+ for (auto& i : zone_adjustments) {
+ // allocate AND release since this gap is now dead space
+ // note that the offset is imprecise, but only need to select the zone
+ f->allocate(i.first, i.second, t);
+ f->release(i.first, i.second, t);
+ }
+ r = db->submit_transaction_sync(t);
+ } else
#endif
+ if (fm->is_null_manager()) {
+ // Now that we load the allocation map we need to invalidate the file as new allocation won't be reflected
+ // Changes to the allocation map (alloc/release) are not updated inline and will only be stored on umount()
+ // This means that we should not use the existing file on failure case (unplanned shutdown) and must resort
+ // to recovery from RocksDB::ONodes
+ r = invalidate_allocation_file_on_bluefs();
+ }
+ ceph_assert(r >= 0);
}
void BlueStore::_close_alloc()
return bdev->is_rotational();
}
// Reports whether statfs is treated as recoverable; currently this simply
// mirrors has_null_manager().
+bool BlueStore::is_statfs_recoverable() const
+{
+ // abuse fm for now
+ return has_null_manager();
+}
+
bool BlueStore::test_mount_in_use()
{
// most error conditions mean the mount is not in use (e.g., because
goto out_db;
}
- r = _open_fm(nullptr, true);
+ r = _open_fm(nullptr, true, false);
if (r < 0)
goto out_db;
goto out_alloc;
}
- if (!read_only && !zone_adjustments.empty()) {
- // for SMR devices that have freelist mismatch with device write pointers
+ if (!read_only) {
_post_init_alloc(zone_adjustments);
}
if (db) {
_close_db();
}
+ _close_around_db();
+}
+
+void BlueStore::_close_around_db()
+{
if (bluefs) {
_close_bluefs();
}
// Tears down the DB environment and always returns 0.
// New behavior (+ lines): `db`, if set, is deleted directly and reset to
// nullptr, after which _close_around_db() is called.
// Old behavior (- line): delegated to _close_db_and_around().
int BlueStore::close_db_environment()
{
- _close_db_and_around();
+ if (db) {
+ delete db;
+ db = nullptr;
+ }
+ _close_around_db();
return 0;
}
return 0;
}
-void BlueStore::_close_db_leave_bluefs()
+void BlueStore::_close_db()
{
+ dout(10) << __func__ << ":read_only=" << db_was_opened_read_only
+ << " fm=" << fm
+ << " destage_alloc_file=" << need_to_destage_allocation_file
+ << " per_pool=" << per_pool_stat_collection
+ << " pool stats=" << osd_pools.size()
+ << dendl;
+ bool do_destage = !db_was_opened_read_only && need_to_destage_allocation_file;
+ if (do_destage && is_statfs_recoverable()) {
+ auto t = db->get_transaction();
+ store_statfs_t s;
+ if (per_pool_stat_collection) {
+ KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT, KeyValueDB::ITERATOR_NOCACHE);
+ uint64_t pool_id;
+ for (it->upper_bound(string()); it->valid(); it->next()) {
+ int r = get_key_pool_stat(it->key(), &pool_id);
+ if (r >= 0) {
+ dout(10) << __func__ << " wiping statfs for: " << pool_id << dendl;
+ } else {
+ derr << __func__ << " wiping invalid statfs key: " << it->key() << dendl;
+ }
+ t->rmkey(PREFIX_STAT, it->key());
+ }
+
+ std::lock_guard l(vstatfs_lock);
+ for(auto &p : osd_pools) {
+ string key;
+ get_pool_stat_key(p.first, &key);
+ bufferlist bl;
+ if (!p.second.is_empty()) {
+ p.second.encode(bl);
+ p.second.publish(&s);
+ t->set(PREFIX_STAT, key, bl);
+ dout(10) << __func__ << " persisting: "
+ << p.first << "->" << s
+ << dendl;
+ }
+ }
+ } else {
+ bufferlist bl;
+ {
+ std::lock_guard l(vstatfs_lock);
+ vstatfs.encode(bl);
+ vstatfs.publish(&s);
+ }
+ t->set(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
+ dout(10) << __func__ << "persisting: " << s << dendl;
+ }
+ int r = db->submit_transaction_sync(t);
+ dout(10) << __func__ << " statfs persisted." << dendl;
+ ceph_assert(r >= 0);
+ }
ceph_assert(db);
delete db;
db = nullptr;
-}
-void BlueStore::_close_db()
-{
- dout(10) << __func__ << ":read_only=" << db_was_opened_read_only << " fm=" << fm << " destage_alloc_file=" << need_to_destage_allocation_file << dendl;
- _close_db_leave_bluefs();
-
- if (need_to_destage_allocation_file) {
- ceph_assert(fm && fm->is_null_manager());
+ if (do_destage && fm && fm->is_null_manager()) {
int ret = store_allocator(alloc);
if (ret != 0) {
derr << __func__ << "::NCB::store_allocator() failed (continue with bitmapFreelistManager)" << dendl;
st.decode(p);
vstatfs += st;
- dout(30) << __func__ << " pool " << pool_id
- << " statfs " << st << dendl;
+ dout(10) << __func__ << " pool " << std::hex << pool_id
+ << " statfs(hex) " << st
+ << std::dec << dendl;
} catch (ceph::buffer::error& e) {
derr << __func__ << " failed to decode pool stats, key:"
<< pretty_binary_string(it->key()) << dendl;
}
}
}
- dout(30) << __func__ << " statfs " << vstatfs << dendl;
+ dout(10) << __func__ << " statfs " << std::hex
+ << vstatfs << std::dec << dendl;
}
{
KeyValueDB::Transaction t = db->get_transaction();
- r = _open_fm(t, true);
+ r = _open_fm(t, false, true);
if (r < 0)
goto out_close_db;
{
}
//---------------------------------------------
// True when a freelist manager is present and it reports itself as a null
// manager. The change only adds the const qualifier.
-bool BlueStore::has_null_manager()
+bool BlueStore::has_null_manager() const
{
return (fm && fm->is_null_manager());
}
dout(1) << __func__ << " quick-fix on mount" << dendl;
_fsck_on_open(FSCK_SHALLOW, true);
- //reread statfs
- //FIXME minor: replace with actual open/close?
- _open_statfs();
- _check_legacy_statfs_alert();
-
//set again as hopefully it has been fixed
if (was_per_pool_omap != OMAP_PER_PG) {
_set_per_pool_omap();
return errors;
}
-void BlueStore::_fsck_check_pool_statfs(
- BlueStore::per_pool_statfs& expected_pool_statfs,
+void BlueStore::_fsck_check_statfs( // fsck step: compare in-memory per-pool (osd_pools) and global (vstatfs) stats with the expected values; bump errors on mismatch and fix when repairer is non-null
+ const store_statfs_t& expected_statfs, // expected store-wide totals; only compared in the non-repair, non-per-pool branch at the end
+ const per_pool_statfs& expected_pool_statfs, // expected stats keyed by pool id; not modified, a local copy is consumed instead
 int64_t& errors,
 int64_t& warnings,
 BlueStoreRepairer* repairer)
 {
- auto it = db->get_iterator(PREFIX_STAT, KeyValueDB::ITERATOR_NOCACHE);
- if (it) {
- for (it->lower_bound(string()); it->valid(); it->next()) {
- string key = it->key();
- if (key == BLUESTORE_GLOBAL_STATFS_KEY) {
- if (repairer) {
- ++errors;
- repairer->remove_key(db, PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY);
- derr << "fsck error: " << "legacy statfs record found, removing"
- << dendl;
- }
- continue;
- }
- uint64_t pool_id;
- if (get_key_pool_stat(key, &pool_id) < 0) {
- derr << "fsck error: bad key " << key
- << "in statfs namespece" << dendl;
- if (repairer) {
- repairer->remove_key(db, PREFIX_STAT, key);
- }
- ++errors;
- continue;
- }
-
- volatile_statfs vstatfs;
- bufferlist bl = it->value();
- auto blp = bl.cbegin();
- try {
- vstatfs.decode(blp);
- } catch (ceph::buffer::error& e) {
- derr << "fsck error: failed to decode Pool StatFS record"
- << pretty_binary_string(key) << dendl;
+ string key;
+ store_statfs_t actual_statfs; // running sum of the per-pool stats that were matched or repaired below
+ store_statfs_t s; // scratch: published form of the stats currently being examined
+ {
+ // make a copy: matched entries are erased from it, so what remains has no osd_pools entry
+ per_pool_statfs my_expected_pool_statfs(expected_pool_statfs);
+ auto op = osd_pools.begin();
+ while (op != osd_pools.end()) {
+ get_pool_stat_key(op->first, &key);
+ op->second.publish(&s);
+ auto it_expected = my_expected_pool_statfs.find(op->first);
+ if (it_expected == my_expected_pool_statfs.end()) {
+ auto op0 = op++; // step op past this entry first, so op is not the iterator erased below
+ if (op0->second.is_empty()) {
+ // It's OK to lack relevant empty statfs record
+ continue;
+ }
+ derr << __func__ << "::fsck error: " << std::hex
+ << "pool " << op0->first << " has got no statfs to match against: "
+ << s
+ << std::dec << dendl;
+ ++errors;
 if (repairer) {
- dout(20) << __func__ << " undecodable Pool StatFS record, key:'"
- << pretty_binary_string(key)
- << "', removing" << dendl;
+ osd_pools.erase(op0); // drop the in-memory entry together with the persisted key removed on the next line
 repairer->remove_key(db, PREFIX_STAT, key);
 }
- ++errors;
- vstatfs.reset();
- }
- auto stat_it = expected_pool_statfs.find(pool_id);
- if (stat_it == expected_pool_statfs.end()) {
- if (vstatfs.is_empty()) {
- // we don't consider that as an error since empty pool statfs
- // are left in DB for now
- dout(20) << "fsck inf: found empty stray Pool StatFS record for pool id 0x"
- << std::hex << pool_id << std::dec << dendl;
+ } else {
+ if (!(s == it_expected->second)) {
+ derr << "fsck error: actual " << s
+ << " != expected " << it_expected->second
+ << " for pool "
+ << std::hex << op->first << std::dec << dendl;
+ ++errors;
 if (repairer) {
- // but we need to increment error count in case of repair
- // to have proper counters at the end
- // (as repairer increments recovery counter anyway).
- ++errors;
+ // repair in-memory in a hope this would be flushed properly on shutdown
+ s = it_expected->second;
+ op->second = it_expected->second;
+ repairer->fix_statfs(db, key, it_expected->second);
 }
- } else {
- derr << "fsck error: found stray Pool StatFS record for pool id 0x"
- << std::hex << pool_id << std::dec << dendl;
- ++errors;
 }
- if (repairer) {
- repairer->remove_key(db, PREFIX_STAT, key);
- }
- continue;
- }
- store_statfs_t statfs;
- vstatfs.publish(&statfs);
- if (!(stat_it->second == statfs)) {
- derr << "fsck error: actual " << statfs
- << " != expected " << stat_it->second
- << " for pool "
- << std::hex << pool_id << std::dec << dendl;
- if (repairer) {
- repairer->fix_statfs(db, key, stat_it->second);
- }
- ++errors;
+ actual_statfs.add(s); // s holds the expected value here if a repair just happened, otherwise the actual one
+ my_expected_pool_statfs.erase(it_expected); // consumed: this pool has been matched
+ ++op;
 }
- expected_pool_statfs.erase(stat_it);
- }
- } // if (it)
- for (auto& s : expected_pool_statfs) {
- if (s.second.is_zero()) {
- // we might lack empty statfs recs in DB
- continue;
 }
- derr << "fsck error: missing Pool StatFS record for pool "
- << std::hex << s.first << std::dec << dendl;
- if (repairer) {
- string key;
- get_pool_stat_key(s.first, &key);
- repairer->fix_statfs(db, key, s.second);
+ // check stats that lack matching entities in osd_pools
+ for (auto &p : my_expected_pool_statfs) {
+ if (p.second.is_zero()) {
+ // It's OK to lack relevant empty statfs record
+ continue;
+ }
+ get_pool_stat_key(p.first, &key);
+ derr << __func__ << "::fsck error: " << std::hex
+ << "pool " << p.first << " has got no actual statfs: "
+ << std::dec << p.second
+ << dendl;
+ ++errors;
+ if (repairer) {
+ osd_pools[p.first] = p.second; // adopt the expected stats in memory as well as in the DB
+ repairer->fix_statfs(db, key, p.second);
+ actual_statfs.add(p.second);
+ }
+ }
+ }
+ // process global statfs
+ if (repairer) {
+ if (!per_pool_stat_collection) {
+ // by virtue of running this method, we correct the top-level
+ // error of having global stats
+ repairer->remove_key(db, PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY);
+ per_pool_stat_collection = true;
+ }
+ vstatfs = actual_statfs; // global in-memory stats become the sum gathered from the per-pool pass above
+ dout(20) << __func__ << " setting vstatfs to " << actual_statfs << dendl;
+ } else if (!per_pool_stat_collection) {
+ // check global stats only if fscking (not repairing) w/o per-pool stats
+ vstatfs.publish(&s);
+ if (!(s == expected_statfs)) {
+ derr << "fsck error: actual " << s
+ << " != expected " << expected_statfs << dendl;
+ ++errors;
 }
- ++errors;
- }
- if (!per_pool_stat_collection &&
- repairer) {
- // by virtue of running this method, we correct the top-level
- // error of having global stats
- repairer->inc_repaired();
 }
 }
<< dendl;
OnodeRef o;
- o.reset(Onode::decode(c, oid, it->key(), it->value()));
+ o.reset(Onode::create_decode(c, oid, it->key(), it->value()));
o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
_dump_onode<30>(cct, *o);
dout(10) << __func__ << " " << oid << dendl;
OnodeRef o;
- o.reset(Onode::decode(c, oid, key, value));
+ o.reset(Onode::create_decode(c, oid, key, value));
++num_objects;
num_spanning_blobs += o->extent_map.spanning_blob_map.size();
if (!broken) {
first_broken = it1->second;
++errors;
+ derr << "fsck error:" << " stray spanning blob found:" << it1->first
+ << dendl;
}
broken++;
if (repairer) {
mempool_dynamic_bitset used_blocks, bluefs_used_blocks;
KeyValueDB::Iterator it;
- store_statfs_t expected_store_statfs, actual_statfs;
+ store_statfs_t expected_store_statfs;
per_pool_statfs expected_pool_statfs;
sb_info_space_efficient_map_t sb_info;
<< dendl;
}
- // get expected statfs; reset unaffected fields to be able to compare
- // structs
- statfs(&actual_statfs);
- actual_statfs.total = 0;
- actual_statfs.internally_reserved = 0;
- actual_statfs.available = 0;
- actual_statfs.internal_metadata = 0;
- actual_statfs.omap_allocated = 0;
-
if (g_conf()->bluestore_debug_fsck_abort) {
dout(1) << __func__ << " debug abort" << dendl;
goto out_scan;
sb_ref_mismatches = sb_ref_counts.count_non_zero();
if (sb_ref_mismatches != 0) {
- derr << "fsck error: shared blob references aren't matching, at least "
- << sb_ref_mismatches << " found" << dendl;
+ derr << "fsck error:" << "*" << sb_ref_mismatches
+ << " shared blob references aren't matching, at least "
+ << sb_ref_mismatches << " found" << dendl;
errors += sb_ref_mismatches;
}
<< " obj:" << oid << dendl;
OnodeRef o;
- o.reset(Onode::decode(c, oid, it->key(), it->value()));
+ o.reset(Onode::create_decode(c, oid, it->key(), it->value()));
o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
mempool::bluestore_fsck::set<BlobRef> blobs;
sb_info.clear();
sb_ref_counts.reset();
- // check global stats only if fscking (not repairing) w/o per-pool stats
- if (!per_pool_stat_collection &&
- !repair &&
- !(actual_statfs == expected_store_statfs)) {
- derr << "fsck error: actual " << actual_statfs
- << " != expected " << expected_store_statfs << dendl;
- if (repair) {
- repairer.fix_statfs(db, BLUESTORE_GLOBAL_STATFS_KEY,
- expected_store_statfs);
- }
- ++errors;
- }
-
dout(1) << __func__ << " checking pool_statfs" << dendl;
- _fsck_check_pool_statfs(expected_pool_statfs,
- errors, warnings, repair ? &repairer : nullptr);
-
+ _fsck_check_statfs(expected_store_statfs, expected_pool_statfs,
+ errors, warnings, repair ? &repairer : nullptr);
if (depth != FSCK_SHALLOW) {
dout(1) << __func__ << " checking for stray omap data " << dendl;
it = db->get_iterator(PREFIX_OMAP, KeyValueDB::ITERATOR_NOCACHE);
string key;
get_shared_blob_key(sbid, &key);
bluestore_shared_blob_t persistent(sbid);
- persistent.ref_map.get(0xdead0000, 0x1000);
+ persistent.ref_map.get(0xdead0000, min_alloc_size);
bufferlist bl;
encode(persistent, bl);
dout(20) << __func__ << " sbid " << sbid
while (p != removed_colls.end()) {
CollectionRef c = *p;
dout(10) << __func__ << " " << c << " " << c->cid << dendl;
- if (c->onode_map.map_any([&](Onode* o) {
+ if (c->onode_space.map_any([&](Onode* o) {
ceph_assert(!o->exists);
if (o->flushing_count.load()) {
dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
++p;
continue;
}
- c->onode_map.clear();
+ c->onode_space.clear();
p = removed_colls.erase(p);
dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
}
}
void BlueStore::_read_cache(
- OnodeRef o,
+ OnodeRef& o,
uint64_t offset,
size_t length,
int read_cache_policy,
}
int BlueStore::_generate_read_result_bl(
- OnodeRef o,
+ OnodeRef& o,
uint64_t offset,
size_t length,
ready_regions_t& ready_regions,
int BlueStore::_do_read(
Collection *c,
- OnodeRef o,
+ OnodeRef& o,
uint64_t offset,
size_t length,
bufferlist& bl,
int BlueStore::_do_readv(
Collection *c,
- OnodeRef o,
+ OnodeRef& o,
const interval_set<uint64_t>& m,
bufferlist& bl,
uint32_t op_flags,
return 0;
}
-void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
+void BlueStore::_assign_nid(TransContext *txc, OnodeRef& o)
{
if (o->onode.nid) {
ceph_assert(o->exists);
logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());
- bufferlist bl;
- txc->statfs_delta.encode(bl);
if (per_pool_stat_collection) {
- string key;
- get_pool_stat_key(txc->osd_pool_id, &key);
- txc->t->merge(PREFIX_STAT, key, bl);
+ if (!is_statfs_recoverable()) {
+ bufferlist bl;
+ txc->statfs_delta.encode(bl);
+ string key;
+ get_pool_stat_key(txc->osd_pool_id, &key);
+ txc->t->merge(PREFIX_STAT, key, bl);
+ }
std::lock_guard l(vstatfs_lock);
auto& stats = osd_pools[txc->osd_pool_id];
vstatfs += txc->statfs_delta; //non-persistent in this mode
} else {
- txc->t->merge(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
+ if (!is_statfs_recoverable()) {
+ bufferlist bl;
+ txc->statfs_delta.encode(bl);
+ txc->t->merge(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
+ }
std::lock_guard l(vstatfs_lock);
vstatfs += txc->statfs_delta;
int BlueStore::_touch(TransContext *txc,
CollectionRef& c,
- OnodeRef &o)
+ OnodeRef& o)
{
dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
int r = 0;
void BlueStore::_do_write_small(
TransContext *txc,
CollectionRef &c,
- OnodeRef o,
+ OnodeRef& o,
uint64_t offset, uint64_t length,
bufferlist::iterator& blp,
WriteContext *wctx)
return;
}
-bool BlueStore::has_null_fm()
-{
- return fm->is_null_manager();
-}
-
bool BlueStore::BigDeferredWriteContext::can_defer(
BlueStore::extent_map_t::iterator ep,
uint64_t prefer_deferred_size,
void BlueStore::_do_write_big_apply_deferred(
TransContext* txc,
CollectionRef& c,
- OnodeRef o,
+ OnodeRef& o,
BlueStore::BigDeferredWriteContext& dctx,
bufferlist::iterator& blp,
WriteContext* wctx)
void BlueStore::_do_write_big(
TransContext *txc,
CollectionRef &c,
- OnodeRef o,
+ OnodeRef& o,
uint64_t offset, uint64_t length,
bufferlist::iterator& blp,
WriteContext *wctx)
int BlueStore::_do_alloc_write(
TransContext *txc,
CollectionRef coll,
- OnodeRef o,
+ OnodeRef& o,
WriteContext *wctx)
{
dout(20) << __func__ << " txc " << txc
// compress (as needed) and calc needed space
uint64_t need = 0;
+ uint64_t data_size = 0;
+ // 'need' is amount of space that must be provided by allocator.
+ // 'data_size' is a size of data that will be transferred to disk.
+ // Note that data_size is always <= need. This comes from:
+ // - write to blob was unaligned, and there is free space
+ // - data has been compressed
+ //
+ // We make one decision and apply it to all blobs.
+ // All blobs will be deferred or none will.
+ // We assume that allocator does its best to provide contiguous space,
+ // and the condition is : (data_size < deferred).
+
auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
for (auto& wi : wctx->writes) {
if (c && wi.blob_length > min_alloc_size) {
txc->statfs_delta.compressed_allocated() += result_len;
logger->inc(l_bluestore_compress_success_count);
need += result_len;
+ data_size += result_len;
} else {
rejected = true;
}
<< dendl;
logger->inc(l_bluestore_compress_rejected_count);
need += wi.blob_length;
+ data_size += wi.bl.length();
} else {
rejected = true;
}
<< std::dec << dendl;
logger->inc(l_bluestore_compress_rejected_count);
need += wi.blob_length;
+ data_size += wi.bl.length();
}
log_latency("compress@_do_alloc_write",
l_bluestore_compress_lat,
cct->_conf->bluestore_log_op_age );
} else {
need += wi.blob_length;
+ data_size += wi.bl.length();
}
}
PExtentVector prealloc;
- prealloc.reserve(2 * wctx->writes.size());;
+ prealloc.reserve(2 * wctx->writes.size());
int64_t prealloc_left = 0;
prealloc_left = alloc->allocate(
need, min_alloc_size, need,
}
_collect_allocation_stats(need, min_alloc_size, prealloc);
- dout(20) << __func__ << " prealloc " << prealloc << dendl;
+ dout(20) << __func__ << std::hex << " need=0x" << need << " data=0x" << data_size
+ << " prealloc " << prealloc << dendl;
auto prealloc_pos = prealloc.begin();
ceph_assert(prealloc_pos != prealloc.end());
- uint64_t prealloc_pos_length = prealloc_pos->length;
for (auto& wi : wctx->writes) {
bluestore_blob_t& dblob = wi.b->dirty_blob();
PExtentVector extents;
int64_t left = final_length;
- bool has_chunk2defer = false;
auto prefer_deferred_size_snapshot = prefer_deferred_size.load();
while (left > 0) {
ceph_assert(prealloc_left > 0);
- has_chunk2defer |= (prealloc_pos_length < prefer_deferred_size_snapshot);
if (prealloc_pos->length <= left) {
prealloc_left -= prealloc_pos->length;
left -= prealloc_pos->length;
txc->statfs_delta.allocated() += prealloc_pos->length;
extents.push_back(*prealloc_pos);
++prealloc_pos;
- if (prealloc_pos != prealloc.end()) {
- prealloc_pos_length = prealloc_pos->length;
- }
} else {
extents.emplace_back(prealloc_pos->offset, left);
prealloc_pos->offset += left;
// queue io
if (!g_conf()->bluestore_debug_omit_block_device_write) {
- if (has_chunk2defer && l->length() < prefer_deferred_size_snapshot) {
+ if (data_size < prefer_deferred_size_snapshot) {
dout(20) << __func__ << " deferring 0x" << std::hex
<< l->length() << std::dec << " write via deferred" << dendl;
bluestore_deferred_op_t *op = _get_deferred_op(txc, l->length());
void BlueStore::_wctx_finish(
TransContext *txc,
CollectionRef& c,
- OnodeRef o,
+ OnodeRef& o,
WriteContext *wctx,
set<SharedBlob*> *maybe_unshared_blobs)
{
void BlueStore::_do_write_data(
TransContext *txc,
CollectionRef& c,
- OnodeRef o,
+ OnodeRef& o,
uint64_t offset,
uint64_t length,
bufferlist& bl,
void BlueStore::_choose_write_options(
CollectionRef& c,
- OnodeRef o,
+ OnodeRef& o,
uint32_t fadvise_flags,
WriteContext *wctx)
{
int BlueStore::_do_gc(
TransContext *txc,
CollectionRef& c,
- OnodeRef o,
+ OnodeRef& o,
const WriteContext& wctx,
uint64_t *dirty_start,
uint64_t *dirty_end)
int BlueStore::_do_write(
TransContext *txc,
CollectionRef& c,
- OnodeRef o,
+ OnodeRef& o,
uint64_t offset,
uint64_t length,
bufferlist& bl,
}
void BlueStore::_do_truncate(
- TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset,
+ TransContext *txc, CollectionRef& c, OnodeRef& o, uint64_t offset,
set<SharedBlob*> *maybe_unshared_blobs)
{
dout(15) << __func__ << " " << c->cid << " " << o->oid
int BlueStore::_do_remove(
TransContext *txc,
CollectionRef& c,
- OnodeRef o)
+ OnodeRef& o)
{
set<SharedBlob*> maybe_unshared_blobs;
bool is_gen = !o->oid.is_no_gen();
int BlueStore::_remove(TransContext *txc,
CollectionRef& c,
- OnodeRef &o)
+ OnodeRef& o)
{
dout(15) << __func__ << " " << c->cid << " " << o->oid
<< " onode " << o.get()
int BlueStore::_omap_setheader(TransContext *txc,
CollectionRef& c,
- OnodeRef &o,
+ OnodeRef& o,
bufferlist& bl)
{
dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
// this adjusts oldo->{oid,key}, and reset oldo to a fresh empty
// Onode in the old slot
- c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
+ c->onode_space.rename(oldo, old_oid, new_oid, new_okey);
r = 0;
// hold a ref to new Onode in old name position, to ensure we don't drop
}
size_t nonexistent_count = 0;
ceph_assert((*c)->exists);
- if ((*c)->onode_map.map_any([&](Onode* o) {
+ if ((*c)->onode_space.map_any([&](Onode* o) {
if (o->exists) {
dout(1) << __func__ << " " << o->oid << " " << o
<< " exists in onode_map" << dendl;
bool exists = (!next.is_max());
for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
dout(10) << __func__ << " oid " << *it << dendl;
- auto onode = (*c)->onode_map.lookup(*it);
+ auto onode = (*c)->onode_space.lookup(*it);
exists = !onode || onode->exists;
if (exists) {
dout(1) << __func__ << " " << *it
ceph_assert(is_pg);
// the destination should initially be empty.
- ceph_assert(d->onode_map.empty());
+ ceph_assert(d->onode_space.empty());
ceph_assert(d->shared_blob_set.empty());
ceph_assert(d->cnode.bits == bits);
ceph_assert(i->empty());
}
for (auto& p : coll_map) {
- p.second->onode_map.clear();
+ p.second->onode_space.clear();
if (!p.second->shared_blob_set.empty()) {
derr << __func__ << " stray shared blobs on " << p.first << dendl;
p.second->shared_blob_set.dump<0>(cct);
}
- ceph_assert(p.second->onode_map.empty());
+ ceph_assert(p.second->onode_space.empty());
ceph_assert(p.second->shared_blob_set.empty());
}
coll_map.clear();
}
}
-void BlueStore::_record_onode(OnodeRef &o, KeyValueDB::Transaction &txn)
+void BlueStore::_record_onode(OnodeRef& o, KeyValueDB::Transaction &txn)
{
// finalize extent_map shards
o->extent_map.update(txn, false);
{
// mark that allocation-file was invalidated and we should destage a new copy whne closing db
need_to_destage_allocation_file = true;
- dout(10) << "need_to_destage_allocation_file was set" << dendl;
+ dout(10) << __func__ << " need_to_destage_allocation_file was set" << dendl;
BlueFS::FileWriter *p_handle = nullptr;
if (!bluefs->dir_exists(allocator_dir)) {
int ret = bluefs->stat(allocator_dir, allocator_file, nullptr, nullptr);
if (ret != 0) {
- dout(5) << "allocator_file(" << allocator_file << ") doesn't exist" << dendl;
+ dout(5) << __func__ << " allocator_file(" << allocator_file << ") doesn't exist" << dendl;
// nothing to do -> return
return 0;
}
ret = bluefs->open_for_write(allocator_dir, allocator_file, &p_handle, true);
if (ret != 0) {
- derr << "Failed open_for_write with error-code " << ret << dendl;
+ derr << __func__ << "::NCB:: Failed open_for_write with error-code "
+ << ret << dendl;
return -1;
}
dout(5) << "invalidate using bluefs->truncate(p_handle, 0)" << dendl;
ret = bluefs->truncate(p_handle, 0);
if (ret != 0) {
- derr << "Failed truncate with error-code " << ret << dendl;
+ derr << __func__ << "::NCB:: Failed truncaste with error-code "
+ << ret << dendl;
bluefs->close_writer(p_handle);
return -1;
}
//-----------------------------------------------------------------------------------
int BlueStore::__restore_allocator(Allocator* allocator, uint64_t *num, uint64_t *bytes)
{
+ if (cct->_conf->bluestore_debug_inject_allocation_from_file_failure > 0) {
+ boost::mt11213b rng(time(NULL));
+ boost::uniform_real<> ur(0, 1);
+ if (ur(rng) < cct->_conf->bluestore_debug_inject_allocation_from_file_failure) {
+ derr << __func__ << " failure injected." << dendl;
+ return -1;
+ }
+ }
utime_t start_time = ceph_clock_now();
BlueFS::FileReader *p_temp_handle = nullptr;
int ret = bluefs->open_for_read(allocator_dir, allocator_file, &p_temp_handle, false);
return ret;
}
-//-------------------------------------------------------------------------
-void BlueStore::ExtentMap::provide_shard_info_to_onode(bufferlist v, uint32_t shard_id)
-{
- [[maybe_unused]] auto cct = onode->c->store->cct;
- auto path = onode->c->store->path;
- if (shard_id < shards.size()) {
- auto p = &shards[shard_id];
- if (!p->loaded) {
- dout(30) << "opening shard 0x" << std::hex << p->shard_info->offset << std::dec << dendl;
- p->extents = decode_some(v);
- p->loaded = true;
- dout(20) << "open shard 0x" << std::hex << p->shard_info->offset << std::dec << dendl;
- ceph_assert(p->dirty == false);
- ceph_assert(v.length() == p->shard_info->bytes);
- }
- } else {
- derr << "illegal shard-id=" << shard_id << " shards.size()=" << shards.size() << dendl;
- ceph_assert(shard_id < shards.size());
- }
-}
-
//-----------------------------------------------------------------------------------
+// Mark the byte range offset~length as allocated in sbmap.
+// Both offset and length must have no bits set in min_alloc_size_mask
+// (asserted below); they are converted to bitmap units by shifting right
+// by min_alloc_size_order before sbmap->set() is called.
 void BlueStore::set_allocation_in_simple_bmap(SimpleBitmap* sbmap, uint64_t offset, uint64_t length)
 {
+ // Trace the range in hex; switch back to decimal before ending the entry,
+ // matching the other hex log statements in this file.
+ dout(30) << __func__ << " 0x" << std::hex
+ << offset << "~" << length
+ << " " << min_alloc_size_mask
+ << std::dec << dendl;
 ceph_assert((offset & min_alloc_size_mask) == 0);
 ceph_assert((length & min_alloc_size_mask) == 0);
 sbmap->set(offset >> min_alloc_size_order, length >> min_alloc_size_order);
 }
-//---------------------------------------------------------
-// Process all physical extents from a given Onode (including all its shards)
-void BlueStore::read_allocation_from_single_onode(
- SimpleBitmap* sbmap,
- BlueStore::OnodeRef& onode_ref,
- read_alloc_stats_t& stats)
-{
- // create a map holding all physical-extents of this Onode to prevent duplication from being added twice and more
- std::unordered_map<uint64_t, uint32_t> lcl_extnt_map;
- unsigned blobs_count = 0;
- uint64_t pos = 0;
+// Record a newly decoded blob and account for it.
+//
+// spanning  - true: the blob is stored in spanning_blobs under b->id;
+//             false: it is stored in blobs under extent_no.
+// extent_no - key used for non-spanning blobs; unused when spanning is true.
+// sbid      - shared blob id, only looked up when the blob is shared.
+// b         - the decoded blob.
+//
+// Non-shared blobs: every valid physical extent is marked in sbmap and added
+// to the per-pool allocated (and, if compressed, compressed_allocated) stats.
+// Shared blobs: nothing is marked in sbmap here; the allocation recorded in
+// sb_info for sbid is charged to the pool of the first object that uses it.
+// Requires reset() to have provided a non-null per_pool_statfs and a
+// non-default oid (both asserted).
+void BlueStore::ExtentDecoderPartial::_consume_new_blob(bool spanning,
+ uint64_t extent_no,
+ uint64_t sbid,
+ BlobRef b)
+{
+ [[maybe_unused]] auto cct = store.cct;
+ ceph_assert(per_pool_statfs);
+ ceph_assert(oid != ghobject_t());
- stats.spanning_blob_count += onode_ref->extent_map.spanning_blob_map.size();
- // first iterate over all logical-extents
- for (struct Extent& l_extent : onode_ref->extent_map.extent_map) {
- ceph_assert(l_extent.logical_offset >= pos);
+ auto &blob = b->get_blob();
+ if(spanning) {
+ dout(20) << __func__ << " " << spanning << " " << b->id << dendl;
+ ceph_assert(b->id >= 0);
+ spanning_blobs[b->id] = b;
+ ++stats.spanning_blob_count;
+ } else {
+ dout(20) << __func__ << " " << spanning << " " << extent_no << dendl;
+ blobs[extent_no] = b;
+ }
+ bool compressed = blob.is_compressed();
+ if (!blob.is_shared()) {
+ for (auto& pe : blob.get_extents()) {
+ if (pe.offset == bluestore_pextent_t::INVALID_OFFSET) {
+ ++stats.skipped_illegal_extent;
+ continue;
+ }
+ store.set_allocation_in_simple_bmap(&sbmap, pe.offset, pe.length);
- pos = l_extent.logical_offset + l_extent.length;
- ceph_assert(l_extent.blob);
- const bluestore_blob_t& blob = l_extent.blob->get_blob();
- const PExtentVector& p_extent_vec = blob.get_extents();
- blobs_count++;
- if (blob.is_compressed()) {
- stats.compressed_blob_count++;
+ per_pool_statfs->allocated() += pe.length;
+ if (compressed) {
+ per_pool_statfs->compressed_allocated() += pe.length;
+ }
 }
-
- if (blob.is_shared()) {
- stats.shared_blobs_count++;
+ if (compressed) {
+ per_pool_statfs->compressed() +=
+ blob.get_compressed_payload_length();
+ ++stats.compressed_blob_count;
 }
-
- // process all physical extent in this blob
- for (auto p_extent = p_extent_vec.begin(); p_extent != p_extent_vec.end(); p_extent++) {
- auto offset = p_extent->offset;
- auto length = p_extent->length;
-
- // Offset of -1 means that the extent was removed (and it is only a place holder) and can be safely skipped
- if (offset == (uint64_t)-1) {
- stats.skipped_illegal_extent++;
- continue;
+ } else {
+ auto it = sb_info.find(sbid);
+ if (it == sb_info.end()) {
+ derr << __func__ << " shared blob not found:" << sbid
+ << dendl;
+ // There is no record to account against, and dereferencing end()
+ // would be undefined behavior. The blob is already registered in
+ // the blob maps above, so later extent lookups still succeed.
+ return;
+ }
+ auto &sbi = *it;
+ auto pool_id = oid.hobj.get_logical_pool();
+ // First user of this shared blob: bind it to this object's pool and
+ // charge the whole recorded allocation to that pool exactly once.
+ if (sbi.pool_id == sb_info_t::INVALID_POOL_ID) {
+ sbi.pool_id = pool_id;
+ size_t alloc_delta = sbi.allocated_chunks << min_alloc_size_order;
+ per_pool_statfs->allocated() += alloc_delta;
+ if (compressed) {
+ per_pool_statfs->compressed_allocated() += alloc_delta;
+ ++stats.compressed_blob_count;
 }
+ }
+ if (compressed) {
+ per_pool_statfs->compressed() +=
+ blob.get_compressed_payload_length();
+ }
+ }
+}
- if (!blob.is_shared()) {
- // skip repeating extents
- auto lcl_itr = lcl_extnt_map.find(offset);
- // extents using shared blobs might have differnt length
- if (lcl_itr != lcl_extnt_map.end() ) {
- // repeated extents must have the same length!
- ceph_assert(lcl_extnt_map[offset] == length);
- stats.skipped_repeated_extent++;
- } else {
- lcl_extnt_map[offset] = length;
- set_allocation_in_simple_bmap(sbmap, offset, length);
- stats.extent_count++;
- }
- } else {
- // extents using shared blobs might have differnt length
- set_allocation_in_simple_bmap(sbmap, offset, length);
- stats.extent_count++;
- }
+// Account a logical extent that refers, by id, to a blob registered earlier.
+//
+// le       - the logical extent; its length is added to the stored bytes.
+// spanning - selects the map to search: spanning_blobs when true, blobs
+//            otherwise.
+// blobid   - key of the blob in the selected map; it must be present
+//            (asserted).
+void BlueStore::ExtentDecoderPartial::consume_blobid(Extent* le,
+                                                     bool spanning,
+                                                     uint64_t blobid)
+{
+  [[maybe_unused]] auto cct = store.cct;
+  dout(20) << __func__ << " " << spanning << " " << blobid << dendl;
+
+  auto& known_blobs = spanning ? spanning_blobs : blobs;
+  auto found = known_blobs.find(blobid);
+  ceph_assert(found != known_blobs.end());
+
+  per_pool_statfs->stored() += le->length;
+  // Extents backed by a compressed blob also count towards the original
+  // (pre-compression) size.
+  const bool blob_is_compressed = found->second->get_blob().is_compressed();
+  if (blob_is_compressed) {
+    per_pool_statfs->compressed_original() += le->length;
+  }
+}
- } // physical-extents loop
+// Account a logical extent that arrives together with its (non-spanning)
+// blob: the blob is registered under extent_no first, then the extent length
+// is added to the stored bytes and, for a compressed blob, to the
+// compressed-original bytes.
+void BlueStore::ExtentDecoderPartial::consume_blob(Extent* le,
+                                                   uint64_t extent_no,
+                                                   uint64_t sbid,
+                                                   BlobRef b)
+{
+  constexpr bool is_spanning = false;
+  _consume_new_blob(is_spanning, extent_no, sbid, b);
+
+  per_pool_statfs->stored() += le->length;
+  if (!b->get_blob().is_compressed()) {
+    return;
+  }
+  per_pool_statfs->compressed_original() += le->length;
+}
- } // logical-extents loop
+// Register a spanning blob by forwarding to _consume_new_blob() with
+// spanning == true. No extent length is involved, so stored-byte counters
+// are not touched here.
+void BlueStore::ExtentDecoderPartial::consume_spanning_blob(uint64_t sbid,
+                                                            BlobRef b)
+{
+  // The extent number is a placeholder: its value doesn't matter for a
+  // spanning blob.
+  constexpr uint64_t placeholder_extent_no = 0;
+  constexpr bool is_spanning = true;
+  _consume_new_blob(is_spanning, placeholder_extent_no, sbid, b);
+}
- if (blobs_count < MAX_BLOBS_IN_ONODE) {
- stats.blobs_in_onode[blobs_count]++;
- } else {
- // store all counts higher than MAX_BLOBS_IN_ONODE in a single bucket at offset zero
- stats.blobs_in_onode[MAX_BLOBS_IN_ONODE]++;
- }
+// Prepare the decoder for another object: remember its id and the statfs
+// accumulator to update, and replace both blob maps with empty ones.
+void BlueStore::ExtentDecoderPartial::reset(const ghobject_t _oid,
+                                            volatile_statfs* _per_pool_statfs)
+{
+  per_pool_statfs = _per_pool_statfs;
+  oid = _oid;
+
+  // Swap each map with a fresh local; the previous contents are destroyed
+  // when the locals go out of scope at the end of the function.
+  blob_map_t old_blobs;
+  blob_map_t old_spanning_blobs;
+  std::swap(blobs, old_blobs);
+  std::swap(spanning_blobs, old_spanning_blobs);
 }
-//-------------------------------------------------------------------------
 int BlueStore::read_allocation_from_onodes(SimpleBitmap *sbmap, read_alloc_stats_t& stats)
 {
- // finally add all space take by user data
- auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
+ sb_info_space_efficient_map_t sb_info; // per-shared-blob records: filled by the first pass, handed to the extent decoder for the second
+ // Pass 1 of 2 (pass 2 walks the onodes further down): iterate over all shared blobs and mark their ranges in sbmap
+ auto it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE);
 if (!it) {
- // TBD - find a better error code
- derr << "failed db->get_iterator(PREFIX_OBJ)" << dendl;
- return -1;
+ derr << "failed getting shared blob's iterator" << dendl;
+ return -ENOENT;
+ }
+ if (it) { // always true at this point: the !it case returned just above
+ for (it->lower_bound(string()); it->valid(); it->next()) {
+ const auto& key = it->key();
+ dout(20) << __func__ << " decode sb " << pretty_binary_string(key) << dendl;
+ uint64_t sbid = 0;
+ if (get_key_shared_blob(key, &sbid) != 0) { // NOTE(review): a bad key is only logged; processing continues with sbid still 0 - confirm this is intended
+ derr << __func__ << " bad shared blob key '" << pretty_binary_string(key)
+ << "'" << dendl;
+ }
+ bluestore_shared_blob_t shared_blob(sbid);
+ bufferlist bl = it->value();
+ auto blp = bl.cbegin();
+ try {
+ decode(shared_blob, blp);
+ }
+ catch (ceph::buffer::error& e) {
+ derr << __func__ << " failed to decode Shared Blob"
+ << pretty_binary_string(key) << dendl;
+ continue; // undecodable record: nothing from it is marked or counted
+ }
+ dout(20) << __func__ << " " << shared_blob << dendl;
+ uint64_t allocated = 0;
+ for (auto& r : shared_blob.ref_map.ref_map) {
+ ceph_assert(r.first != bluestore_pextent_t::INVALID_OFFSET);
+ set_allocation_in_simple_bmap(sbmap, r.first, r.second.length); // every referenced range of the shared blob is marked as allocated
+ allocated += r.second.length;
+ }
+ auto &sbi = sb_info.add_or_adopt(sbid);
+ ceph_assert(p2phase(allocated, min_alloc_size) == 0);
+ sbi.allocated_chunks += (allocated >> min_alloc_size_order); // kept in units of (1 << min_alloc_size_order) bytes
+ ++stats.shared_blob_count;
+ }
+ }
+
+ it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
+ if (!it) {
+ derr << "failed getting onode's iterator" << dendl;
+ return -ENOENT;
 }
- CollectionRef collection_ref;
- spg_t pgid;
- BlueStore::OnodeRef onode_ref;
- bool has_open_onode = false;
- uint32_t shard_id = 0;
 uint64_t kv_count = 0;
 uint64_t count_interval = 1'000'000;
+ ExtentDecoderPartial edecoder(*this, // decoder callbacks update sbmap, stats and the per-pool statfs selected via reset()
+ stats,
+ *sbmap,
+ sb_info,
+ min_alloc_size_order);
+
 // iterate over all ONodes stored in RocksDB
 for (it->lower_bound(string()); it->valid(); it->next(), kv_count++) {
 // trace an even after every million processed objects (typically every 5-10 seconds)
 if (kv_count && (kv_count % count_interval == 0) ) {
- dout(5) << "processed objects count = " << kv_count << dendl;
- }
-
- // Shards - Code
- // add the extents from the shards to the main Obj
- if (is_extent_shard_key(it->key())) {
- // shards must follow a valid main object
- if (has_open_onode) {
- // shards keys must start with the main object key
- if (it->key().find(onode_ref->key) == 0) {
- // shards count can't exceed declared shard-count in the main-object
- if (shard_id < onode_ref->extent_map.shards.size()) {
- onode_ref->extent_map.provide_shard_info_to_onode(it->value(), shard_id);
- stats.shard_count++;
- shard_id++;
- } else {
- derr << "illegal shard_id=" << shard_id << ", shards.size()=" << onode_ref->extent_map.shards.size() << dendl;
- derr << "shard->key=" << pretty_binary_string(it->key()) << dendl;
- ceph_assert(shard_id < onode_ref->extent_map.shards.size());
- }
- } else {
- derr << "illegal shard-key::onode->key=" << pretty_binary_string(onode_ref->key) << " shard->key=" << pretty_binary_string(it->key()) << dendl;
- ceph_assert(it->key().find(onode_ref->key) == 0);
- }
- } else {
- derr << "error::shard without main objects for key=" << pretty_binary_string(it->key()) << dendl;
- ceph_assert(has_open_onode);
- }
-
- } else {
- // Main Object Code
+ dout(5) << __func__ << " processed objects count = " << kv_count << dendl;
+ }
- if (has_open_onode) {
- // make sure we got all shards of this object
- if (shard_id == onode_ref->extent_map.shards.size()) {
- // We completed an Onode Object -> pass it to be processed
- read_allocation_from_single_onode(sbmap, onode_ref, stats);
- } else {
- derr << "Missing shards! shard_id=" << shard_id << ", shards.size()=" << onode_ref->extent_map.shards.size() << dendl;
- ceph_assert(shard_id == onode_ref->extent_map.shards.size());
- }
- } else {
- // We opened a new Object
- has_open_onode = true;
+ auto key = it->key();
+ auto okey = key; // object key: equals key for an onode record, overwritten from the shard key otherwise
+ dout(20) << __func__ << " decode onode " << pretty_binary_string(key) << dendl;
+ ghobject_t oid;
+ if (!is_extent_shard_key(it->key())) { // onode record, as opposed to one of its extent-shard records
+ int r = get_key_object(okey, &oid);
+ if (r != 0) {
+ derr << __func__ << " failed to decode onode key = "
+ << pretty_binary_string(okey) << dendl;
+ return -EIO;
 }
-
- // The main Obj is always first in RocksDB so we can start with shard_id set to zero
- shard_id = 0;
- stats.onode_count++;
- ghobject_t oid;
- int ret = get_key_object(it->key(), &oid);
- if (ret < 0) {
- derr << "bad object key " << pretty_binary_string(it->key()) << dendl;
- ceph_assert(ret == 0);
- continue;
+ edecoder.reset(oid, // new object: clears the decoder's blob maps and points it at this pool's statfs entry
+ &stats.actual_pool_vstatfs[oid.hobj.get_logical_pool()]);
+ Onode dummy_on(cct); // throwaway onode used only as the decode target
+ Onode::decode_raw(&dummy_on,
+ it->value(),
+ edecoder);
+ ++stats.onode_count;
+ } else {
+ uint32_t offset;
+ int r = get_key_extent_shard(key, &okey, &offset);
+ if (r != 0) {
+ derr << __func__ << " failed to decode onode extent key = "
+ << pretty_binary_string(key) << dendl;
+ return -EIO;
 }
-
- // fill collection_ref if doesn't exist yet
- // We process all the obejcts in a given collection and then move to the next collection
- // This means we only search once for every given collection
- if (!collection_ref ||
- oid.shard_id != pgid.shard ||
- oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
- !collection_ref->contains(oid)) {
- stats.collection_search++;
- collection_ref = nullptr;
-
- for (auto& p : coll_map) {
- if (p.second->contains(oid)) {
- collection_ref = p.second;
- break;
- }
- }
-
- if (!collection_ref) {
- derr << "stray object " << oid << " not owned by any collection" << dendl;
- ceph_assert(collection_ref);
- continue;
- }
-
- collection_ref->cid.is_pg(&pgid);
+ r = get_key_object(okey, &oid);
+ if (r != 0) {
+ derr << __func__
+ << " failed to decode onode key= " << pretty_binary_string(okey)
+ << " from extent key= " << pretty_binary_string(key)
+ << dendl;
+ return -EIO;
 }
- onode_ref.reset(BlueStore::Onode::decode(collection_ref, oid, it->key(), it->value()));
+ ceph_assert(oid == edecoder.get_oid()); // a shard must belong to the object the decoder was last reset() for
+ edecoder.decode_some(it->value(), nullptr);
+ ++stats.shard_count;
 }
 }
- // process the last object
- if (has_open_onode) {
- // make sure we got all shards of this object
- if (shard_id == onode_ref->extent_map.shards.size()) {
- // We completed an Onode Object -> pass it to be processed
- read_allocation_from_single_onode(sbmap, onode_ref, stats);
- } else {
- derr << "Last Object is missing shards! shard_id=" << shard_id << ", shards.size()=" << onode_ref->extent_map.shards.size() << dendl;
- ceph_assert(shard_id == onode_ref->extent_map.shards.size());
- }
+ std::lock_guard l(vstatfs_lock); // held until return while osd_pools and vstatfs are rebuilt from the recovered stats
+ store_statfs_t s;
+ osd_pools.clear();
+ for (auto& p : stats.actual_pool_vstatfs) {
+ if (per_pool_stat_collection) { // per-pool entries are only kept when per-pool stats are in use
+ osd_pools[p.first] = p.second;
+ }
+ stats.actual_store_vstatfs += p.second;
+ p.second.publish(&s);
+ dout(5) << __func__ << " recovered pool "
+ << std::hex
+ << p.first << "->" << s
+ << std::dec
+ << " per-pool:" << per_pool_stat_collection
+ << dendl;
 }
- dout(5) << "onode_count=" << stats.onode_count << " ,shard_count=" << stats.shard_count << dendl;
-
+ vstatfs = stats.actual_store_vstatfs; // global stats are the sum over all recovered pools
+ vstatfs.publish(&s);
+ dout(5) << __func__ << " recovered " << s
+ << dendl;
 return 0;
 }
KeyValueDB::Transaction t = db->get_transaction();
// call _open_fm() with fm_restore set to TRUE
// this will mark the full device space as allocated (and not just the reserved space)
- _open_fm(t, true, true);
+ _open_fm(t, true, true, true);
if (fm == nullptr) {
derr << "Failed _open_fm()" << dendl;
return -1;
#endif // CEPH_BLUESTORE_TOOL_RESTORE_ALLOCATION
//-------------------------------------------------------------------------------------
-static int commit_freelist_type(KeyValueDB *db, const std::string& freelist_type, CephContext *cct, const std::string &path)
+int BlueStore::commit_freelist_type()
{
// When freelist_type to "bitmap" we will store allocation in RocksDB
// When allocation-info is stored in a single file we set freelist_type to "null"
//-------------------------------------------------------------------------------------
int BlueStore::commit_to_null_manager()
{
- dout(5) << "Set FreelistManager to NULL FM..." << dendl;
+ dout(5) << __func__ << " Set FreelistManager to NULL FM..." << dendl;
fm->set_null_manager();
freelist_type = "null";
#if 1
- return commit_freelist_type(db, freelist_type, cct, path);
+ return commit_freelist_type();
#else
// should check how long this step take on a big configuration as deletes are expensive
- if (commit_freelist_type(db, freelist_type, cct, path) == 0) {
+ if (commit_freelist_type() == 0) {
// remove all objects of PREFIX_ALLOC_BITMAP from RocksDB to guarantee a clean start
clear_allocation_objects_from_rocksdb(db, cct, path);
}
dout(5) << "Set FreelistManager to Real FM..." << dendl;
ceph_assert(!fm->is_null_manager());
freelist_type = "bitmap";
- int ret = commit_freelist_type(db, freelist_type, cct, path);
+ int ret = commit_freelist_type();
if (ret == 0) {
//remove the allocation_file
invalidate_allocation_file_on_bluefs();