// bluestore_cache_other
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
- bluestore_cache_other);
+ bluestore_Buffer);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
- bluestore_cache_other);
+ bluestore_Extent);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
- bluestore_cache_other);
+ bluestore_Blob);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
- bluestore_cache_other);
+ bluestore_SharedBlob);
// bluestore_txc
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
* We use ! as a terminator for strings; this works because it is < #
* and will get escaped if it is present in the string.
*
+ * NOTE: There is a bug in this implementation: the comparisons below
+ * are done on plain char values, so where char is signed a byte above
+ * 0x7f compares as negative and takes the '#' escape branch, which may
+ * produce unexpected ordering. Unfortunately, fixing the bug would mean
+ * invalidating the keys in existing deployments. Instead, we do
+ * additional sorting where it is needed.
*/
template<typename S>
static void append_escaped(const string &in, S *out)
char hexbyte[in.length() * 3 + 1];
char* ptr = &hexbyte[0];
for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
- if (*i <= '#') {
+ if (*i <= '#') { // bug: unexpected result for *i > 0x7f
*ptr++ = '#';
*ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
*ptr++ = "0123456789abcdef"[*i & 0x0f];
- } else if (*i >= '~') {
+ } else if (*i >= '~') { // bug: unexpected result for *i > 0x7f
*ptr++ = '~';
*ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
*ptr++ = "0123456789abcdef"[*i & 0x0f];
return key + 1;
}
-static void get_coll_key_range(const coll_t& cid, int bits,
- string *temp_start, string *temp_end,
- string *start, string *end)
+static void get_coll_range(const coll_t& cid, int bits, // computes the object range bounds of collection cid as ghobject_t values
+ ghobject_t *temp_start, ghobject_t *temp_end, // out: bounds of the temp-object section
+ ghobject_t *start, ghobject_t *end) // out: bounds of the regular-object section
{
- temp_start->clear();
- temp_end->clear();
- start->clear();
- end->clear();
-
spg_t pgid;
if (cid.is_pg(&pgid)) {
- _key_encode_shard(pgid.shard, start);
+ start->shard_id = pgid.shard;
*temp_start = *start;
- _key_encode_u64(pgid.pool() + 0x8000000000000000ull, start);
- _key_encode_u64((-2ll - pgid.pool()) + 0x8000000000000000ull, temp_start);
+ start->hobj.pool = pgid.pool();
+ temp_start->hobj.pool = -2ll - pgid.pool(); // the temp section uses pool id (-2 - pool)
*end = *start;
*temp_end = *temp_start;
uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
- _key_encode_u32(reverse_hash, start);
- _key_encode_u32(reverse_hash, temp_start);
+ start->hobj.set_bitwise_key_u32(reverse_hash);
+ temp_start->hobj.set_bitwise_key_u32(reverse_hash);
uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
if (end_hash > 0xffffffffull)
end_hash = 0xffffffffull;
- _key_encode_u32(end_hash, end);
- _key_encode_u32(end_hash, temp_end);
+ end->hobj.set_bitwise_key_u32(end_hash); // end_hash has been clamped to 0xffffffff above
+ temp_end->hobj.set_bitwise_key_u32(end_hash);
} else {
- _key_encode_shard(shard_id_t::NO_SHARD, start);
- _key_encode_u64(-1ull + 0x8000000000000000ull, start);
+ start->shard_id = shard_id_t::NO_SHARD; // cid is not a PG collection: NO_SHARD and pool -1ull
+ start->hobj.pool = -1ull;
+
*end = *start;
- _key_encode_u32(0, start);
- _key_encode_u32(0xffffffff, end);
+ start->hobj.set_bitwise_key_u32(0);
+ end->hobj.set_bitwise_key_u32(0xffffffff);
// no separate temp section
*temp_start = *end;
*temp_end = *end;
}
+
+ start->generation = 0; // all four bounds get generation 0, in both branches
+ end->generation = 0;
+ temp_start->generation = 0;
+ temp_end->generation = 0;
}
static void get_shared_blob_key(uint64_t sbid, string *key)
}
template<typename S>
-static int get_key_object(const S& key, ghobject_t *oid)
+static void _key_encode_prefix(const ghobject_t& oid, S *key) // encodes oid's shard id, pool and bitwise hash key into *key, in that order
{
- int r;
- const char *p = key.c_str();
+ _key_encode_shard(oid.shard_id, key);
+ _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key); // pool is offset by 2^63 before encoding; presumably so signed pool ids sort correctly as unsigned -- TODO confirm
+ _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
+}
- if (key.length() < 1 + 8 + 4)
- return -1;
+static const char *_key_decode_prefix(const char *p, ghobject_t *oid)
+{
p = _key_decode_shard(p, &oid->shard_id);
uint64_t pool;
oid->hobj.set_bitwise_key_u32(hash);
+ return p;
+}
+
+// Length in bytes of the fixed-size prefix written by _key_encode_prefix()
+// (presumably 1 for the shard, 8 for the pool and 4 for the hash).
+static constexpr size_t ENCODED_KEY_PREFIX_LEN = 1 + 8 + 4;
+
+template<typename S>
+static int get_key_object(const S& key, ghobject_t *oid)
+{
+ int r;
+ const char *p = key.c_str();
+
+ if (key.length() < ENCODED_KEY_PREFIX_LEN)
+ return -1;
+
+ p = _key_decode_prefix(p, oid);
+
+ if (key.length() == ENCODED_KEY_PREFIX_LEN)
+ return -2;
+
r = decode_escaped(p, &oid->hobj.nspace);
if (r < 0)
return -2;
{
key->clear();
- size_t max_len = 1 + 8 + 4 +
+ size_t max_len = ENCODED_KEY_PREFIX_LEN +
(oid.hobj.nspace.length() * 3 + 1) +
(oid.hobj.get_key().length() * 3 + 1) +
1 + // for '<', '=', or '>'
8 + 8 + 1;
key->reserve(max_len);
- _key_encode_shard(oid.shard_id, key);
- _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
- _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
+ _key_encode_prefix(oid, key);
append_escaped(oid.hobj.nspace, key);
return out << ")";
}
+namespace {
+
+/*
+ * Due to a bug in key string encoding (see the comment on append_escaped),
+ * the KeyValueDB iterator does not sort keys lexicographically in the same
+ * order as ghobject_t comparison does: objects with the same hash may come
+ * back in the wrong order.
+ *
+ * This is the iterator wrapper that fixes the key order.
+ */
+
+/**
+ * Abstract cursor over the objects of a collection.
+ *
+ * Wraps a KeyValueDB::Iterator and exposes the current position as a
+ * ghobject_t instead of a raw key. How keys are decoded, and in which
+ * order objects are presented, is up to the implementation.
+ */
+class CollectionListIterator {
+public:
+  /// @param it underlying key/value iterator; a copy of it is stored.
+  explicit CollectionListIterator(const KeyValueDB::Iterator &it)
+    : m_it(it) {
+  }
+  virtual ~CollectionListIterator() = default;
+
+  /// True while the cursor points at an object.
+  virtual bool valid() const = 0;
+  /// Object at the current position; only meaningful while valid().
+  virtual const ghobject_t &oid() const = 0;
+  /// Position the cursor at the first object not before @p oid.
+  virtual void lower_bound(const ghobject_t &oid) = 0;
+  /// Position the cursor at the first object after @p oid.
+  virtual void upper_bound(const ghobject_t &oid) = 0;
+  /// Advance to the following object.
+  virtual void next() = 0;
+
+protected:
+  KeyValueDB::Iterator m_it;  ///< underlying key/value iterator
+};
+
+/**
+ * Collection cursor that follows the underlying KeyValueDB iterator
+ * directly, i.e. in raw key order, decoding each object key into a
+ * ghobject_t. Extent shard keys are skipped.
+ */
+class SimpleCollectionListIterator : public CollectionListIterator {
+public:
+  /// @param cct context handed to get_object_key() when encoding bounds
+  /// @param it  underlying key/value iterator
+  SimpleCollectionListIterator(CephContext *cct, const KeyValueDB::Iterator &it)
+    : CollectionListIterator(it), m_cct(cct) {
+  }
+
+  bool valid() const override {
+    return m_it->valid();
+  }
+
+  const ghobject_t &oid() const override {
+    ceph_assert(valid());
+
+    return m_oid;
+  }
+
+  /// Seek the underlying iterator to the encoded key of @p oid (lower bound).
+  void lower_bound(const ghobject_t &oid) override {
+    std::string key;
+    get_object_key(m_cct, oid, &key);
+
+    m_it->lower_bound(key);
+    get_oid();
+  }
+
+  /// Seek the underlying iterator past the encoded key of @p oid.
+  void upper_bound(const ghobject_t &oid) override {
+    std::string key;
+    get_object_key(m_cct, oid, &key);
+
+    m_it->upper_bound(key);
+    get_oid();
+  }
+
+  void next() override {
+    ceph_assert(valid());
+
+    m_it->next();
+    get_oid();
+  }
+
+private:
+  CephContext *m_cct;
+  ghobject_t m_oid;   ///< decoded object for the current key
+
+  /// Move past any extent shard keys, then decode the current key into
+  /// m_oid. Leaves m_oid untouched when the iterator runs off the end.
+  void get_oid() {
+    // Skip shard keys with a loop rather than by recursing through next(),
+    // so stack depth does not grow with the number of consecutive shards.
+    while (m_it->valid() && is_extent_shard_key(m_it->key())) {
+      m_it->next();
+    }
+
+    if (!m_it->valid()) {
+      return;
+    }
+
+    m_oid = ghobject_t();
+    const int r = get_key_object(m_it->key(), &m_oid);
+    ceph_assert(r == 0);
+  }
+};
+
+/**
+ * Collection cursor that buffers keys in runs ("chunks") sharing the same
+ * shard id, pool and bitwise hash key. Each run is loaded into a std::map
+ * keyed by ghobject_t and walked from there, so objects within a run come
+ * out in ghobject_t comparison order. Extent shard keys are skipped.
+ */
+class SortedCollectionListIterator : public CollectionListIterator {
+public:
+  SortedCollectionListIterator(const KeyValueDB::Iterator &it)
+    : CollectionListIterator(it), m_chunk_iter(m_chunk.end()) {
+  }
+
+  bool valid() const override {
+    return m_chunk_iter != m_chunk.end();
+  }
+
+  const ghobject_t &oid() const override {
+    ceph_assert(valid());
+
+    return m_chunk_iter->first;
+  }
+
+  void lower_bound(const ghobject_t &oid) override {
+    // Seek by prefix only, so the whole run containing oid gets loaded.
+    std::string prefix;
+    _key_encode_prefix(oid, &prefix);
+    m_it->lower_bound(prefix);
+
+    m_chunk_iter = m_chunk.end();
+    if (get_next_chunk() && same_prefix(this->oid(), oid)) {
+      // The loaded run is oid's own run: search inside it, and fall
+      // through to the following run if nothing in it qualifies.
+      m_chunk_iter = m_chunk.lower_bound(oid);
+      if (m_chunk_iter == m_chunk.end()) {
+        get_next_chunk();
+      }
+    }
+  }
+
+  void upper_bound(const ghobject_t &oid) override {
+    lower_bound(oid);
+
+    if (!valid()) {
+      return;
+    }
+    if (this->oid() == oid) {
+      next();
+    }
+  }
+
+  void next() override {
+    ceph_assert(valid());
+
+    if (++m_chunk_iter == m_chunk.end()) {
+      get_next_chunk();
+    }
+  }
+
+private:
+  std::map<ghobject_t, std::string> m_chunk;            ///< current run: oid -> key
+  std::map<ghobject_t, std::string>::iterator m_chunk_iter;  ///< position in m_chunk
+
+  /// True when a and b agree on shard id, pool and bitwise hash key.
+  static bool same_prefix(const ghobject_t &a, const ghobject_t &b) {
+    return !(a.shard_id != b.shard_id ||
+             a.hobj.pool != b.hobj.pool ||
+             a.hobj.get_bitwise_key_u32() != b.hobj.get_bitwise_key_u32());
+  }
+
+  /// Advance the underlying iterator past any extent shard keys.
+  void skip_extent_shards() {
+    while (m_it->valid() && is_extent_shard_key(m_it->key())) {
+      m_it->next();
+    }
+  }
+
+  /// Load the next run of same-prefix objects into m_chunk and point
+  /// m_chunk_iter at its first entry. Returns false, leaving m_chunk and
+  /// m_chunk_iter as they were, when the underlying iterator is exhausted.
+  bool get_next_chunk() {
+    skip_extent_shards();
+    if (!m_it->valid()) {
+      return false;
+    }
+
+    ghobject_t current;
+    int r = get_key_object(m_it->key(), &current);
+    ceph_assert(r == 0);
+
+    m_chunk.clear();
+    for (;;) {
+      m_chunk.emplace(current, m_it->key());
+
+      m_it->next();
+      skip_extent_shards();
+      if (!m_it->valid()) {
+        break;
+      }
+
+      ghobject_t candidate;
+      r = get_key_object(m_it->key(), &candidate);
+      ceph_assert(r == 0);
+      if (!same_prefix(candidate, current)) {
+        break;  // candidate starts the following run; leave m_it on it
+      }
+      current = candidate;
+    }
+
+    m_chunk_iter = m_chunk.begin();
+    return true;
+  }
+};
+
+} // anonymous namespace
+
// Garbage Collector
void BlueStore::GarbageCollector::process_protrusive_extents(
OnodeRef& oldo,
const ghobject_t& old_oid,
const ghobject_t& new_oid,
- const mempool::bluestore_cache_other::string& new_okey)
+ const mempool::bluestore_cache_meta::string& new_okey)
{
std::lock_guard l(cache->lock);
ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
unsigned n;
// we need to encode inline_bl to measure encoded length
bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
- inline_bl.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
+ inline_bl.reassign_to_mempool(mempool::mempool_bluestore_inline_bl);
ceph_assert(!never_happen);
size_t len = inline_bl.length();
dout(20) << __func__ << " inline shard " << len << " bytes from " << n
auto p = v.front().begin_deep();
on->onode.decode(p);
for (auto& i : on->onode.attrs) {
- i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
+ i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
}
// initialize extent_map
denc(on->extent_map.inline_bl, p);
on->extent_map.decode_some(on->extent_map.inline_bl);
on->extent_map.inline_bl.reassign_to_mempool(
- mempool::mempool_bluestore_cache_other);
+ mempool::mempool_bluestore_cache_data);
}
else {
on->extent_map.init_shards(false, false);
int r;
{
std::shared_lock l(c->lock);
- mempool::bluestore_cache_other::string k(name);
+ mempool::bluestore_cache_meta::string k(name);
OnodeRef o = c->get_onode(oid, false);
if (!o || !o->exists) {
int r;
{
std::shared_lock l(c->lock);
- r = _collection_list(c, start, end, max, ls, pnext);
+ r = _collection_list(c, start, end, max, false, ls, pnext);
+ }
+
+ dout(10) << __func__ << " " << c->cid
+ << " start " << start << " end " << end << " max " << max
+ << " = " << r << ", ls.size() = " << ls->size()
+ << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
+ return r;
+}
+
+int BlueStore::collection_list_legacy(
+ CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
+ vector<ghobject_t> *ls, ghobject_t *pnext)
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ c->flush();
+ dout(15) << __func__ << " " << c->cid
+ << " start " << start << " end " << end << " max " << max << dendl;
+ int r;
+ {
+ std::shared_lock l(c->lock);
+ r = _collection_list(c, start, end, max, true, ls, pnext);
}
dout(10) << __func__ << " " << c->cid
int BlueStore::_collection_list(
Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
- vector<ghobject_t> *ls, ghobject_t *pnext)
+ bool legacy, vector<ghobject_t> *ls, ghobject_t *pnext)
{
if (!c->exists)
auto start_time = mono_clock::now();
int r = 0;
ghobject_t static_next;
- KeyValueDB::Iterator it;
- string temp_start_key, temp_end_key;
- string start_key, end_key;
+ std::unique_ptr<CollectionListIterator> it;
+ ghobject_t coll_range_temp_start, coll_range_temp_end;
+ ghobject_t coll_range_start, coll_range_end;
bool set_next = false;
- string pend;
+ ghobject_t pend;
bool temp;
if (!pnext)
if (start.is_max() || start.hobj.is_max()) {
goto out;
}
- get_coll_key_range(c->cid, c->cnode.bits, &temp_start_key, &temp_end_key,
- &start_key, &end_key);
+ get_coll_range(c->cid, c->cnode.bits, &coll_range_temp_start,
+ &coll_range_temp_end, &coll_range_start, &coll_range_end);
dout(20) << __func__
- << " range " << pretty_binary_string(temp_start_key)
- << " to " << pretty_binary_string(temp_end_key)
- << " and " << pretty_binary_string(start_key)
- << " to " << pretty_binary_string(end_key)
+ << " range " << coll_range_temp_start
+ << " to " << coll_range_temp_end
+ << " and " << coll_range_start
+ << " to " << coll_range_end
<< " start " << start << dendl;
- it = db->get_iterator(PREFIX_OBJ);
+ if (legacy) {
+ it = std::make_unique<SimpleCollectionListIterator>(
+ cct, db->get_iterator(PREFIX_OBJ));
+ } else {
+ it = std::make_unique<SortedCollectionListIterator>(
+ db->get_iterator(PREFIX_OBJ));
+ }
if (start == ghobject_t() ||
start.hobj == hobject_t() ||
start == c->cid.get_min_hobj()) {
- it->upper_bound(temp_start_key);
+ it->upper_bound(coll_range_temp_start);
temp = true;
} else {
- string k;
- get_object_key(cct, start, &k);
if (start.hobj.is_temp()) {
temp = true;
- ceph_assert(k >= temp_start_key && k < temp_end_key);
+ ceph_assert(start >= coll_range_temp_start && start < coll_range_temp_end);
} else {
temp = false;
- ceph_assert(k >= start_key && k < end_key);
+ ceph_assert(start >= coll_range_start && start < coll_range_end);
}
- dout(20) << __func__ << " start from " << pretty_binary_string(k)
- << " temp=" << (int)temp << dendl;
- it->lower_bound(k);
+ dout(20) << __func__ << " temp=" << (int)temp << dendl;
+ it->lower_bound(start);
}
if (end.hobj.is_max()) {
- pend = temp ? temp_end_key : end_key;
+ pend = temp ? coll_range_temp_end : coll_range_end;
} else {
- get_object_key(cct, end, &end_key);
if (end.hobj.is_temp()) {
if (temp)
- pend = end_key;
+ pend = end;
else
- goto out;
+ goto out;
} else {
- pend = temp ? temp_end_key : end_key;
+ pend = temp ? coll_range_temp_end : end;
}
}
- dout(20) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
+ dout(20) << __func__ << " pend " << pend << dendl;
while (true) {
- if (!it->valid() || it->key() >= pend) {
+ if (!it->valid() || it->oid() >= pend) {
if (!it->valid())
dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
else
- dout(20) << __func__ << " key " << pretty_binary_string(it->key())
- << " >= " << end << dendl;
+ dout(20) << __func__ << " oid " << it->oid() << " >= " << pend << dendl;
if (temp) {
if (end.hobj.is_temp()) {
+ if (it->valid() && it->oid() < coll_range_temp_end) {
+ *pnext = it->oid();
+ set_next = true;
+ }
break;
}
dout(30) << __func__ << " switch to non-temp namespace" << dendl;
temp = false;
- it->upper_bound(start_key);
- pend = end_key;
- dout(30) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
+ it->upper_bound(coll_range_start);
+ if (end.hobj.is_max())
+ pend = coll_range_end;
+ else
+ pend = end;
+ dout(30) << __func__ << " pend " << pend << dendl;
continue;
}
+ if (it->valid() && it->oid() < coll_range_end) {
+ *pnext = it->oid();
+ set_next = true;
+ }
break;
}
- dout(30) << __func__ << " key " << pretty_binary_string(it->key()) << dendl;
- if (is_extent_shard_key(it->key())) {
- it->next();
- continue;
- }
- ghobject_t oid;
- int r = get_key_object(it->key(), &oid);
- ceph_assert(r == 0);
- dout(20) << __func__ << " oid " << oid << " end " << end << dendl;
+ dout(20) << __func__ << " oid " << it->oid() << " end " << end << dendl;
if (ls->size() >= (unsigned)max) {
dout(20) << __func__ << " reached max " << max << dendl;
- *pnext = oid;
+ *pnext = it->oid();
set_next = true;
break;
}
- ls->push_back(oid);
+ ls->push_back(it->oid());
it->next();
}
out:
if (val.is_partial()) {
auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
val.length());
- b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
+ b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
} else {
auto& b = o->onode.attrs[name.c_str()] = val;
- b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
+ b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
}
txc->write_onode(o);
dout(10) << __func__ << " " << c->cid << " " << o->oid
if (p->second.is_partial()) {
auto& b = o->onode.attrs[p->first.c_str()] =
bufferptr(p->second.c_str(), p->second.length());
- b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
+ b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
} else {
auto& b = o->onode.attrs[p->first.c_str()] = p->second;
- b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
+ b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
}
}
txc->write_onode(o);
<< new_oid << dendl;
int r;
ghobject_t old_oid = oldo->oid;
- mempool::bluestore_cache_other::string new_okey;
+ mempool::bluestore_cache_meta::string new_okey;
if (newo) {
if (newo->exists) {
// then check if all of them are marked as non-existent.
// Bypass the check if (next != ghobject_t::get_max())
r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
- nonexistent_count + 1, &ls, &next);
+ nonexistent_count + 1, false, &ls, &next);
if (r >= 0) {
// If true mean collecton has more objects than nonexistent_count,
// so bypass check.