BlueStore::Onode,
boost::intrusive::list_member_hook<>,
&BlueStore::Onode::lru_item> > list_t;
- typedef boost::intrusive::list<
- BlueStore::Onode,
- boost::intrusive::member_hook<
- BlueStore::Onode,
- boost::intrusive::list_member_hook<>,
- &BlueStore::Onode::pin_item> > pin_list_t;
list_t lru;
- pin_list_t pin_list;
explicit LruOnodeCacheShard(CephContext *cct) : BlueStore::OnodeCacheShard(cct) {}
- void _add(BlueStore::OnodeRef& o, int level) override
+ void _add(BlueStore::Onode* o, int level) override // register o with this shard; level > 0 selects the LRU front, otherwise the back
{
- ceph_assert(o->s == nullptr);
- o->s = this;
- if (o->nref > 1) {
- pin_list.push_front(*o);
- o->pinned = true;
- num_pinned = pin_list.size();
- } else {
+ if (o->put_cache()) { // true: o gets linked into lru (presumably put_cache() reports "not pinned" - confirm against Onode)
(level > 0) ? lru.push_front(*o) : lru.push_back(*o);
+ } else { // false: o is not linked into lru, it is only counted as pinned
+ ++num_pinned;
}
- num = lru.size();
+ ++num; // we count both pinned and unpinned entries
+ dout(20) << __func__ << " " << this << " " << o->oid << " added, num=" << num << dendl;
}
- void _rm(BlueStore::OnodeRef& o) override
+ void _rm(BlueStore::Onode* o) override // unregister o from this shard and fix up the counters
{
- o->s = nullptr;
- if (o->pinned) {
- o->pinned = false;
- pin_list.erase(pin_list.iterator_to(*o));
- } else {
+ if (o->pop_cache()) { // true: o is unlinked from lru (presumably pop_cache() reports "was not pinned" - confirm against Onode)
lru.erase(lru.iterator_to(*o));
+ } else { // false: o was only counted as pinned, so drop that count instead
+ ceph_assert(num_pinned);
+ --num_pinned;
}
- num = lru.size();
- num_pinned = pin_list.size();
+ ceph_assert(num); // num covers pinned and unpinned entries, so it must still be non-zero here
+ --num;
+ dout(20) << __func__ << " " << this << " " << " " << o->oid << " removed, num=" << num << dendl;
}
- void _touch(BlueStore::OnodeRef& o) override
+ // Take o out of the LRU list and account for it as pinned.
+ // The unlink is unconditional, so o must currently be linked into lru
+ // (presumably guaranteed by the caller - confirm against pin()).
+ // num is left untouched because it counts pinned and unpinned entries alike.
+ void _pin(BlueStore::Onode* o) override
{
- if (o->pinned) {
- return;
- }
lru.erase(lru.iterator_to(*o));
- lru.push_front(*o);
- num = lru.size();
+ ++num_pinned;
+ // Separate the function name from the shard pointer, matching the
+ // "<func> <shard> <oid>" layout that _add logs.
+ dout(20) << __func__ << " " << this << " " << o->oid << " pinned" << dendl;
}
- void _pin(BlueStore::Onode& o) override
+ // Put a previously pinned o back at the front of the LRU list and drop
+ // it from the pinned count. o must not be linked into lru on entry
+ // (presumably guaranteed by the caller - confirm against unpin()).
+ // num is left untouched because it counts pinned and unpinned entries alike.
+ void _unpin(BlueStore::Onode* o) override
{
- if (o.pinned == true) {
- return;
- }
- lru.erase(lru.iterator_to(o));
- pin_list.push_front(o);
- o.pinned = true;
- num = lru.size();
- num_pinned = pin_list.size();
- dout(30) << __func__ << " " << o.oid << " pinned" << dendl;
-
- }
- void _unpin(BlueStore::Onode& o) override
- {
- if (o.pinned == false) {
- return;
- }
- pin_list.erase(pin_list.iterator_to(o));
- lru.push_front(o);
- o.pinned = false;
- num = lru.size();
- num_pinned = pin_list.size();
- dout(30) << __func__ << " " << o.oid << " unpinned" << dendl;
+ lru.push_front(*o);
+ ceph_assert(num_pinned);
+ --num_pinned;
+ // Separate the function name from the shard pointer, matching the
+ // "<func> <shard> <oid>" layout that _add logs.
+ dout(20) << __func__ << " " << this << " " << o->oid << " unpinned" << dendl;
}
+
void _trim_to(uint64_t new_size) override
{
if (new_size >= lru.size()) {
auto p = lru.end();
ceph_assert(p != lru.begin());
--p;
- while (n > 0) {
+ ceph_assert(num >= n);
+ num -= n;
+ while (n-- > 0) {
BlueStore::Onode *o = &*p;
- dout(30) << __func__ << " rm " << o->oid << dendl;
+ dout(20) << __func__ << " rm " << o->oid << " "
+ << o->nref << " " << o->cached << " " << o->pinned << dendl;
if (p != lru.begin()) {
lru.erase(p--);
} else {
+ ceph_assert(n == 0);
lru.erase(p);
- ceph_assert(n == 1);
}
- o->s = nullptr;
- o->get(); // paranoia
- o->c->onode_map.remove(o->oid);
- o->put();
- --n;
+ auto pinned = !o->pop_cache();
+ ceph_assert(!pinned);
+ o->c->onode_map._remove(o->oid);
}
- num = lru.size();
+ }
+ void move_pinned(OnodeCacheShard *to, BlueStore::Onode *o) override // transfer the accounting of a pinned, cached onode from this shard to 'to'
+ {
+ if (to == this) { // same shard: nothing to transfer
+ return;
+ }
+ ceph_assert(o->cached);
+ ceph_assert(o->pinned); // _pin unlinks pinned onodes from lru, so only the counters need to move
+ ceph_assert(num);
+ ceph_assert(num_pinned);
+ --num_pinned;
+ --num;
+ ++to->num_pinned; // NOTE(review): no lock is taken in this function; confirm how updates to the other shard's counters are serialized
+ ++to->num;
}
void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) override // accumulate this shard's counters into the caller's totals
{
- *onodes += num + num_pinned;
+ *onodes += num; // num already includes pinned entries (see _add), so num_pinned is not added again
*pinned_onodes += num_pinned;
}
};
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "
-BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid, OnodeRef o)
+BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid,
+ OnodeRef& o)
{
std::lock_guard l(cache->lock);
auto p = onode_map.find(oid);
<< dendl;
return p->second;
}
- ldout(cache->cct, 30) << __func__ << " " << oid << " " << o << dendl;
+ ldout(cache->cct, 20) << __func__ << " " << oid << " " << o << dendl;
onode_map[oid] = o;
- cache->_add(o, 1);
+ cache->_add(o.get(), 1);
cache->_trim();
return o;
}
+void BlueStore::OnodeSpace::_remove(const ghobject_t& oid) // drop the map entry for oid; takes no lock and does not touch the cache shard itself
+{
+ ldout(cache->cct, 20) << __func__ << " " << oid << " " << dendl;
+ onode_map.erase(oid); // the visible caller (_trim_to) has already unlinked the onode from the shard before calling this
+}
+
BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
{
ldout(cache->cct, 30) << __func__ << dendl;
ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
} else {
ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
+ << " " << p->second->nref
+ << " " << p->second->cached
+ << " " << p->second->pinned
<< dendl;
- cache->_touch(p->second);
- hit = true;
+ // This will pin onode and implicitly touch the cache when Onode
+ // eventually will become unpinned
o = p->second;
+ ceph_assert(!o->cached || o->pinned);
+
+ hit = true;
}
}
void BlueStore::OnodeSpace::clear() // unregister every onode of this space from the cache shard, then empty the map
{
std::lock_guard l(cache->lock); // held for the whole walk and the final map clear
- ldout(cache->cct, 10) << __func__ << dendl;
+ ldout(cache->cct, 10) << __func__ << " " << onode_map.size()<< dendl;
for (auto &p : onode_map) {
- cache->_rm(p.second);
+ cache->_rm(p.second.get()); // _rm now takes a raw Onode*, so pass the pointer held by the OnodeRef
}
onode_map.clear();
}
if (pn != onode_map.end()) {
ldout(cache->cct, 30) << __func__ << " removing target " << pn->second
<< dendl;
- cache->_rm(pn->second);
+ cache->_rm(pn->second.get());
onode_map.erase(pn);
}
OnodeRef o = po->second;
// install a non-existent onode at old location
oldo.reset(new Onode(o->c, old_oid, o->key));
po->second = oldo;
- cache->_add(po->second, 1);
- // add at new position and fix oid, key
+ cache->_add(oldo.get(), 1);
+ // add at new position and fix oid, key.
+ // This will pin 'o' and implicitly touch cache
+ // when it will eventually become unpinned
onode_map.insert(make_pair(new_oid, o));
- cache->_touch(o);
+ ceph_assert(o->pinned);
+
o->oid = new_oid;
o->key = new_okey;
cache->_trim();
void BlueStore::OnodeSpace::dump(CephContext *cct) // log every entry of onode_map at level LogLevelV
{
for (auto& i : onode_map) {
- ldout(cct, LogLevelV) << i.first << " : " << i.second << dendl;
+ ldout(cct, LogLevelV) << i.first << " : " << i.second // key, then the onode reference
+ << " " << i.second->nref // followed by the reference count,
+ << " " << i.second->cached // the cached flag
+ << " " << i.second->pinned // and the pinned flag
+ << dendl;
}
}
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "
+//
+// A tricky thing about Onode's ref counter is that we take one additional
+// reference when an instance becomes newly pinned, and drop it on unpin.
+// This prevents a conflict with the delete call (made when nref == 0).
+// Such a conflict could otherwise happen while a thread is inside unpin()
+// (e.g. waiting to acquire the lock) with nref already decremented:
+// another thread calling put() on the same instance could then release it.
+//
+void BlueStore::Onode::get() { // take a reference; may pin the onode in its cache shard
+ if (++nref == 2) { // count went 1 -> 2: ask the cache shard to pin this onode
+ c->get_onode_cache()->pin(this, [&]() { // callback decides whether this call newly pins the onode (presumably run under the shard lock - confirm)
+ bool was_pinned = pinned;
+ pinned = nref >= 2;
+ // additional increment for newly pinned instance; put() drops it again on unpin
+ bool r = !was_pinned && pinned;
+ if (r) {
+ ++nref;
+ }
+ return cached && r; // true only for a cached onode that has just become pinned
+ });
+ }
+}
+void BlueStore::Onode::put() { // drop a reference; may unpin the onode and deletes it when the count reaches zero
+ if (--nref == 2) { // count dropped to 2: ask the cache shard to unpin this onode
+ c->get_onode_cache()->unpin(this, [&]() { // callback decides whether this call newly unpins the onode (presumably run under the shard lock - confirm)
+ bool was_pinned = pinned;
+ pinned = pinned && nref > 2; // intentionally use > not >= as we have
+ // +1 due to pinned state
+ bool r = was_pinned && !pinned;
+ // additional decrement for newly unpinned instance; releases the extra reference taken in get()
+ if (r) {
+ --nref;
+ }
+ return cached && r; // true only for a cached onode that has just become unpinned
+ });
+ }
+ if (nref == 0) { // NOTE(review): re-reads nref instead of using the result of the decrement above; confirm two racing put() calls cannot both observe 0
+ delete this;
+ }
+}
+
BlueStore::Onode* BlueStore::Onode::decode(
CollectionRef c,
const ghobject_t& oid,
ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
<< dendl;
- // move the onode to the new map before futzing with the cache
- // shard, ensuring that nref is always >= 2, and no racing
- // thread can trigger a pin or unpin (which does *not* behave
- // well when we are clearing and resetting the 's' shard
- // pointer!).
+ // ensuring that nref is always >= 2 and hence onode is pinned and
+ // physically out of cache during the transition
+ OnodeRef o_pin = o;
+ ceph_assert(o->pinned);
+
p = onode_map.onode_map.erase(p);
dest->onode_map.onode_map[o->oid] = o;
-
- if (onode_map.cache != dest->onode_map.cache) {
- // move onode to a different cache shard
- onode_map.cache->_rm(o);
- o->c = dest;
- dest->onode_map.cache->_add(o, 1);
- } else {
- // the onode is in the same cache shard, making our move simpler.
- o->c = dest;
+ if (get_onode_cache() != dest->get_onode_cache()) {
+ get_onode_cache()->move_pinned(dest->get_onode_cache(), o.get());
}
+ o->c = dest;
// move over shared blobs and buffers. cover shared blobs from
// both extent map and spanning blob map (the full extent map
out_stop:
_kv_stop();
out_coll:
- _flush_cache();
+ _shutdown_cache();
out_db:
_close_db_and_around(false);
out_bdev:
mempool_thread.shutdown();
dout(20) << __func__ << " stopping kv thread" << dendl;
_kv_stop();
- _flush_cache();
+ _shutdown_cache();
dout(20) << __func__ << " closing" << dendl;
}
out_scan:
mempool_thread.shutdown();
- _flush_cache();
+ _shutdown_cache();
out_db:
_close_db_and_around(false);
out_bdev:
}
-void BlueStore::_flush_cache()
+void BlueStore::_shutdown_cache()
{
dout(10) << __func__ << dendl;
- for (auto i : onode_cache_shards) {
- i->flush();
- ceph_assert(i->empty());
- }
for (auto i : buffer_cache_shards) {
i->flush();
ceph_assert(i->empty());
}
for (auto& p : coll_map) {
- if (!p.second->onode_map.empty()) {
- derr << __func__ << " stray onodes on " << p.first << dendl;
- p.second->onode_map.dump<0>(cct);
- }
+ p.second->onode_map.clear();
if (!p.second->shared_blob_set.empty()) {
derr << __func__ << " stray shared blobs on " << p.first << dendl;
p.second->shared_blob_set.dump<0>(cct);
ceph_assert(p.second->shared_blob_set.empty());
}
coll_map.clear();
+ for (auto i : onode_cache_shards) {
+ ceph_assert(i->empty());
+ }
}
// For external caller.
// - observed maximums on DB dev for DB/WAL/UNSORTED data
// - observed maximum spillovers
uint64_t max_db_use = 0; // max db usage we potentially observed
+ max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_LOG - LEVEL_FIRST);
max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST);
max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST);
// this could go to db hence using it in the estimation
}
}
break;
+ case LEVEL_LOG:
case LEVEL_WAL:
res = BlueFS::BDEV_WAL;
break;
<< ", slow_total:" << l_totals[LEVEL_SLOW - LEVEL_FIRST]
<< ", db_avail:" << db_avail4slow << std::endl
<< "Usage matrix:" << std::endl;
- constexpr std::array<const char*, 7> names{ {
+ constexpr std::array<const char*, 8> names{ {
"DEV/LEV",
"WAL",
"DB",
"SLOW",
"*",
"*",
- "REAL"
+ "REAL",
+ "FILES",
} };
const size_t width = 12;
for (size_t i = 0; i < names.size(); ++i) {
sout.setf(std::ios::left, std::ios::adjustfield);
sout.width(width);
switch (l + LEVEL_FIRST) {
+ case LEVEL_LOG:
+ sout << "LOG"; break;
case LEVEL_WAL:
sout << "WAL"; break;
case LEVEL_DB:
case LEVEL_MAX:
sout << "TOTALS"; break;
}
- for (size_t d = 0; d < max_x - 1; d++) {
+ for (size_t d = 0; d < max_x; d++) {
sout.setf(std::ios::left, std::ios::adjustfield);
sout.width(width);
sout << stringify(byte_u_t(per_level_per_dev_usage.at(d, l)));
}
sout.setf(std::ios::left, std::ios::adjustfield);
sout.width(width);
- sout << stringify(byte_u_t(per_level_per_dev_usage.at(max_x - 1, l)))
- << std::endl;
+ sout << stringify(per_level_files[l]) << std::endl;
}
ceph_assert(max_x == per_level_per_dev_max.get_max_x());
ceph_assert(max_y == per_level_per_dev_max.get_max_y());
sout.setf(std::ios::left, std::ios::adjustfield);
sout.width(width);
switch (l + LEVEL_FIRST) {
+ case LEVEL_LOG:
+ sout << "LOG"; break;
case LEVEL_WAL:
sout << "WAL"; break;
case LEVEL_DB: