1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 #ifndef DBOBJECTMAP_DB_H
3 #define DBOBJECTMAP_DB_H
5 #include "include/buffer_fwd.h"
11 #include <boost/scoped_ptr.hpp>
13 #include "os/ObjectMap.h"
14 #include "kv/KeyValueDB.h"
15 #include "osd/osd_types.h"
16 #include "common/Mutex.h"
17 #include "common/Cond.h"
18 #include "common/simple_cache.hpp"
19 #include <boost/optional/optional_io.hpp>
21 #include "SequencerPosition.h"
24 * DBObjectMap: Implements ObjectMap in terms of KeyValueDB
26 * Prefix space structure:
28 * @see complete_prefix
32 * - HOBJECT_TO_SEQ: Contains leaf mapping from ghobject_t->header.seq and
33 * corresponding omap header
34 * - SYS_PREFIX: GLOBAL_STATE_KEY - contains next seq number
38 * @see generate_new_header
39 * - USER_PREFIX + header_key(header->seq) + USER_PREFIX
40 * : key->value for header->seq
41 * - USER_PREFIX + header_key(header->seq) + COMPLETE_PREFIX: see below
42 * - USER_PREFIX + header_key(header->seq) + XATTR_PREFIX: xattrs
43 * - USER_PREFIX + header_key(header->seq) + SYS_PREFIX
44 * : USER_HEADER_KEY - omap header for header->seq
45 * : HEADER_KEY - encoding of header for header->seq
47 * For each node (represented by a header), we
48 * store three mappings: the key mapping, the complete mapping, and the parent.
49 * The complete mapping (COMPLETE_PREFIX space) is key->key. Each x->y entry in
50 * this mapping indicates that the key mapping contains all entries on [x,y).
51 * Note, max string is represented by "", so ""->"" indicates that the parent
52 * is unnecessary (@see rm_keys). When looking up a key not contained in the
53 * complete set, we have to check the parent if we don't find it in the
54 * key set. During rm_keys, we copy keys from the parent and update the
55 * complete set to reflect the change @see rm_keys.
57 class DBObjectMap
: public ObjectMap
{
/// Accessor for the backing key/value store: returns the raw pointer
/// obtained from db.get(). Overrides the ObjectMap interface.
60 KeyValueDB
*get_db() override
{ return db
.get(); }
63 * Serializes access to next_seq as well as the in_use set
70 * Set of headers currently in use
73 set
<ghobject_t
> map_header_in_use
;
76 * Takes the map_header_in_use entry in constructor, releases in
81 boost::optional
<ghobject_t
> locked
;
83 MapHeaderLock(const MapHeaderLock
&);
84 MapHeaderLock
&operator=(const MapHeaderLock
&);
/// Construct a lock object that does not take any map_header_in_use entry:
/// only db is initialised, and the optional `locked` is left empty.
86 explicit MapHeaderLock(DBObjectMap
*db
) : db(db
) {}
87 MapHeaderLock(DBObjectMap
*db
, const ghobject_t
&oid
) : db(db
), locked(oid
) {
88 Mutex::Locker
l(db
->header_lock
);
89 while (db
->map_header_in_use
.count(*locked
))
90 db
->map_header_cond
.Wait(db
->header_lock
);
91 db
->map_header_in_use
.insert(*locked
);
94 const ghobject_t
&get_locked() const {
99 void swap(MapHeaderLock
&o
) {
100 ceph_assert(db
== o
.db
);
102 // centos6's boost optional doesn't seem to have swap :(
103 boost::optional
<ghobject_t
> _locked
= o
.locked
;
110 Mutex::Locker
l(db
->header_lock
);
111 ceph_assert(db
->map_header_in_use
.count(*locked
));
112 db
->map_header_cond
.Signal();
113 db
->map_header_in_use
.erase(*locked
);
118 DBObjectMap(CephContext
* cct
, KeyValueDB
*db
)
119 : ObjectMap(cct
, db
), header_lock("DBOBjectMap"),
120 cache_lock("DBObjectMap::CacheLock"),
121 caches(cct
->_conf
->filestore_omap_header_cache_size
)
125 const ghobject_t
&oid
,
126 const map
<string
, bufferlist
> &set
,
127 const SequencerPosition
*spos
=0
131 const ghobject_t
&oid
,
132 const bufferlist
&bl
,
133 const SequencerPosition
*spos
=0
137 const ghobject_t
&oid
,
142 const ghobject_t
&oid
,
143 const SequencerPosition
*spos
=0
146 int clear_keys_header(
147 const ghobject_t
&oid
,
148 const SequencerPosition
*spos
=0
152 const ghobject_t
&oid
,
153 const set
<string
> &to_clear
,
154 const SequencerPosition
*spos
=0
158 const ghobject_t
&oid
,
160 map
<string
, bufferlist
> *out
164 const ghobject_t
&oid
,
169 const ghobject_t
&oid
,
170 const set
<string
> &keys
,
171 map
<string
, bufferlist
> *out
175 const ghobject_t
&oid
,
176 const set
<string
> &keys
,
181 const ghobject_t
&oid
,
182 const set
<string
> &to_get
,
183 map
<string
, bufferlist
> *out
187 const ghobject_t
&oid
,
192 const ghobject_t
&oid
,
193 const map
<string
, bufferlist
> &to_set
,
194 const SequencerPosition
*spos
=0
198 const ghobject_t
&oid
,
199 const set
<string
> &to_remove
,
200 const SequencerPosition
*spos
=0
204 const ghobject_t
&oid
,
205 const ghobject_t
&target
,
206 const SequencerPosition
*spos
=0
210 const ghobject_t
&from
,
211 const ghobject_t
&to
,
212 const SequencerPosition
*spos
=0
216 const ghobject_t
&oid
,
217 const ghobject_t
&target
,
218 const SequencerPosition
*spos
=0
221 /// Read initial state from backing store
223 /// Write current state settings to DB
225 /// Read initial state and upgrade or initialize state
/// @param upgrade defaults to false.
///        NOTE(review): presumably selects whether an older on-disk state is
///        upgraded rather than rejected -- confirm against the definition.
226 int init(bool upgrade
= false);
228 /// Upgrade store to current version
231 /// Consistency check, debug, there must be no parallel writes
/// Overrides the ObjectMap interface; declared here, defined out of line.
/// @param out    output stream, taken by reference
///               (NOTE(review): presumably receives the report -- confirm)
/// @param repair defaults to false
/// @param force  defaults to false
232 int check(std::ostream
&out
, bool repair
= false, bool force
= false) override
;
234 /// Ensure that all previous operations are durable
235 int sync(const ghobject_t
*oid
=0, const SequencerPosition
*spos
=0) override
;
237 void compact() override
{
242 /// Util, get all objects, there must be no other concurrent access
243 int list_objects(vector
<ghobject_t
> *objs
///< [out] objects
247 /// Util, get all object headers, there must be no other concurrent access
248 int list_object_headers(vector
<_Header
> *out
///< [out] headers
/// Returns an ObjectMapIterator for oid. Overrides the ObjectMap interface;
/// declared here, defined out of line.
251 ObjectMapIterator
get_iterator(const ghobject_t
&oid
) override
;
/// Prefix and key-name constants for the KeyValueDB key space. Their roles,
/// as described in the "Prefix space structure" notes of the class comment:
///  - USER_PREFIX + header_key(seq) + USER_PREFIX     : key->value for seq
///  - USER_PREFIX + header_key(seq) + XATTR_PREFIX    : xattrs
///  - USER_PREFIX + header_key(seq) + SYS_PREFIX      : per-header system keys
///  - USER_PREFIX + header_key(seq) + COMPLETE_PREFIX : complete mapping
253 static const string USER_PREFIX
;
254 static const string XATTR_PREFIX
;
255 static const string SYS_PREFIX
;
256 static const string COMPLETE_PREFIX
;
/// Key under the per-header SYS_PREFIX space: encoding of the header.
257 static const string HEADER_KEY
;
/// Key under the per-header SYS_PREFIX space: omap header for that seq.
258 static const string USER_HEADER_KEY
;
/// Key under SYS_PREFIX: contains the next seq number.
259 static const string GLOBAL_STATE_KEY
;
/// Prefix holding the leaf mapping ghobject_t -> header.seq and the
/// corresponding omap header.
260 static const string HOBJECT_TO_SEQ
;
// NOTE(review): LEAF_PREFIX / REVERSE_LEAF_PREFIX are not covered by the
// prefix-space notes in the class comment; presumably kept for an older
// on-disk layout -- confirm.
263 static const string LEAF_PREFIX
;
264 static const string REVERSE_LEAF_PREFIX
;
266 /// persistent state for store @see generate_new_header
/// Version constant (3); the same number is passed as the struct version to
/// ENCODE_START in encode().
/// NOTE(review): presumably the value `v` is brought up to on upgrade -- confirm.
268 static const __u8 CUR_VERSION
= 3;
271 // legacy is false when complete regions never used
/// Default state: v = 0, seq = 1, legacy = false.
273 State() : v(0), seq(1), legacy(false) {}
/// State with a caller-supplied seq; v = 0 and legacy = false as in the
/// default constructor. Marked explicit, so a uint64_t does not convert
/// implicitly to State.
274 explicit State(uint64_t seq
) : v(0), seq(seq
), legacy(false) {}
276 void encode(bufferlist
&bl
) const {
277 ENCODE_START(3, 1, bl
);
284 void decode(bufferlist::const_iterator
&bl
) {
298 void dump(Formatter
*f
) const {
299 f
->dump_unsigned("v", v
);
300 f
->dump_unsigned("seq", seq
);
301 f
->dump_bool("legacy", legacy
);
304 static void generate_test_instances(list
<State
*> &o
) {
305 o
.push_back(new State(0));
306 o
.push_back(new State(20));
313 uint64_t num_children
;
317 SequencerPosition spos
;
319 void encode(bufferlist
&bl
) const {
321 ENCODE_START(2, 1, bl
);
324 encode(num_children
, bl
);
331 void decode(bufferlist::const_iterator
&bl
) {
336 decode(num_children
, bl
);
344 void dump(Formatter
*f
) const {
345 f
->dump_unsigned("seq", seq
);
346 f
->dump_unsigned("parent", parent
);
347 f
->dump_unsigned("num_children", num_children
);
348 f
->dump_stream("oid") << oid
;
351 static void generate_test_instances(list
<_Header
*> &o
) {
352 o
.push_back(new _Header
);
353 o
.push_back(new _Header
);
354 o
.back()->parent
= 20;
/// Default header: seq = 0, parent = 0, and num_children starts at 1 (not 0).
358 _Header() : seq(0), parent(0), num_children(1) {}
361 /// String munging (public for testing)
/// Static helper returning a string derived from oid; defined out of line.
/// NOTE(review): presumably the current key encoding for an object -- confirm.
362 static string
ghobject_key(const ghobject_t
&oid
);
/// Variant that additionally takes a collection `c`.
/// NOTE(review): the `_v0` suffix suggests an older key format -- confirm.
363 static string
ghobject_key_v0(coll_t c
, const ghobject_t
&oid
);
364 static int is_buggy_ghobject_key_v1(CephContext
* cct
,
367 /// Implicit lock on Header->seq
/// Shared-ownership handle (std::shared_ptr) to a _Header.
368 typedef std::shared_ptr
<_Header
> Header
;
/// SimpleLRU keyed by ghobject_t holding _Header values. The constructor
/// initialises it from cct->_conf->filestore_omap_header_cache_size.
370 SimpleLRU
<ghobject_t
, _Header
> caches
;
/// String-building helpers for the key space; all are declared here and
/// defined out of line.
/// NOTE(review): the descriptions below are inferred from the names and the
/// "Prefix space structure" notes in the class comment -- confirm each one.

/// Presumably the key for oid's leaf entry (HOBJECT_TO_SEQ space).
372 string
map_header_key(const ghobject_t
&oid
);
/// Presumably the string form of seq used inside the per-header prefixes.
373 string
header_key(uint64_t seq
);
/// Presumably USER_PREFIX + header_key(header->seq) + COMPLETE_PREFIX.
374 string
complete_prefix(Header header
);
/// Presumably USER_PREFIX + header_key(header->seq) + USER_PREFIX.
375 string
user_prefix(Header header
);
/// Presumably USER_PREFIX + header_key(header->seq) + SYS_PREFIX.
376 string
sys_prefix(Header header
);
/// Presumably USER_PREFIX + header_key(header->seq) + XATTR_PREFIX.
377 string
xattr_prefix(Header header
);
/// Takes the header by value (a _Header, not the shared Header handle).
/// Presumably the system prefix of the header's parent.
378 string
sys_parent_prefix(_Header header
);
379 string
sys_parent_prefix(Header header
) {
380 return sys_parent_prefix(*header
);
383 class EmptyIteratorImpl
: public ObjectMapIteratorImpl
{
// Iterator over nothing: every positioning call reports success (0) without
// doing any work, valid() is always false, and the element accessors abort.

/// No-op; returns 0.
385 int seek_to_first() override
{ return 0; }
/// No-op; returns 0. Unlike its siblings this is not marked override.
/// NOTE(review): presumably the base class has no seek_to_last -- confirm.
386 int seek_to_last() { return 0; }
/// No-op; `after` is ignored. Returns 0.
387 int upper_bound(const string
&after
) override
{ return 0; }
/// No-op; `to` is ignored. Returns 0.
388 int lower_bound(const string
&to
) override
{ return 0; }
/// Always false: there is never a current element.
389 bool valid() override
{ return false; }
/// Calls ceph_abort(); the return statement follows the abort call.
390 int next() override
{ ceph_abort(); return 0; }
/// Calls ceph_abort(); the return statement follows the abort call.
391 string
key() override
{ ceph_abort(); return ""; }
/// Calls ceph_abort(); the return statement follows the abort call.
392 bufferlist
value() override
{ ceph_abort(); return bufferlist(); }
/// Always 0.
393 int status() override
{ return 0; }
398 class DBObjectMapIteratorImpl
: public ObjectMapIteratorImpl
{
402 /// NOTE: implicit lock hlock->get_locked() when returned out of the class
404 /// NOTE: implicit lock on header->seq AND for all ancestors
407 /// parent_iter == NULL iff no parent
408 std::shared_ptr
<DBObjectMapIteratorImpl
> parent_iter
;
409 KeyValueDB::Iterator key_iter
;
410 KeyValueDB::Iterator complete_iter
;
412 /// cur_iter points to currently valid iterator
413 std::shared_ptr
<ObjectMapIteratorImpl
> cur_iter
;
416 /// init() called, key_iter, complete_iter, parent_iter filled in
/// Stores map and header, constructs hlock from map, and starts with
/// r = 0, ready = false, invalid = true. No iterators are opened here
/// (key_iter, complete_iter, parent_iter and cur_iter are not in the
/// initialiser list).
421 DBObjectMapIteratorImpl(DBObjectMap
*map
, Header header
) :
422 map(map
), hlock(map
), header(header
), r(0), ready(false), invalid(true) {}
// Overrides of the ObjectMapIteratorImpl interface. They are only declared
// here; the definitions live outside this header.
423 int seek_to_first() override
;
425 int upper_bound(const string
&after
) override
;
426 int lower_bound(const string
&to
) override
;
427 bool valid() override
;
429 string
key() override
;
430 bufferlist
value() override
;
431 int status() override
;
434 return cur_iter
== parent_iter
;
437 /// skips to next valid parent entry
440 /// first parent() >= to
441 int lower_bound_parent(const string
&to
);
444 * Tests whether to_test is in complete region
446 * postcondition: complete_iter will be max s.t. complete_iter->value > to_test
448 int in_complete_region(const string
&to_test
, ///< [in] key to test
449 string
*begin
, ///< [out] beginning of region
450 string
*end
///< [out] end of region
451 ); ///< @returns true if to_test is in the complete region, else false
/// Shared-ownership handle (std::shared_ptr) to a DBObjectMapIteratorImpl.
459 typedef std::shared_ptr
<DBObjectMapIteratorImpl
> DBObjectMapIterator
;
460 DBObjectMapIterator
_get_iterator(Header header
) {
461 return std::make_shared
<DBObjectMapIteratorImpl
>(this, header
);
466 /// Removes node corresponding to header
467 void clear_header(Header header
, KeyValueDB::Transaction t
);
469 /// Set node containing input to new contents
470 void set_header(Header input
, KeyValueDB::Transaction t
);
472 /// Remove leaf node corresponding to oid in c
473 void remove_map_header(
474 const MapHeaderLock
&l
,
475 const ghobject_t
&oid
,
477 KeyValueDB::Transaction t
);
479 /// Set leaf node for c and oid to the value of header
481 const MapHeaderLock
&l
,
482 const ghobject_t
&oid
, _Header header
,
483 KeyValueDB::Transaction t
);
485 /// Checks spos for oid and returns a bool. NOTE(review): the previous text repeated the comment of the preceding declaration; confirm exact semantics.
486 bool check_spos(const ghobject_t
&oid
,
488 const SequencerPosition
*spos
);
490 /// Lookup or create header for oid
491 Header
lookup_create_map_header(
492 const MapHeaderLock
&l
,
493 const ghobject_t
&oid
,
494 KeyValueDB::Transaction t
);
497 * Generate new header for oid with new seq number
499 * Has the side effect of synchronously saving the new DBObjectMap state
501 Header
_generate_new_header(const ghobject_t
&oid
, Header parent
);
502 Header
generate_new_header(const ghobject_t
&oid
, Header parent
) {
503 Mutex::Locker
l(header_lock
);
504 return _generate_new_header(oid
, parent
);
507 /// Lookup leaf header for oid
508 Header
_lookup_map_header(
509 const MapHeaderLock
&l
,
510 const ghobject_t
&oid
);
511 Header
lookup_map_header(
512 const MapHeaderLock
&l2
,
513 const ghobject_t
&oid
) {
514 Mutex::Locker
l(header_lock
);
515 return _lookup_map_header(l2
, oid
);
518 /// Lookup header node for input
519 Header
lookup_parent(Header input
);
523 int _get_header(Header header
, bufferlist
*bl
);
525 /// Scan keys in header into out_keys and out_values (if nonnull)
526 int scan(Header header
,
527 const set
<string
> &in_keys
,
528 set
<string
> *out_keys
,
529 map
<string
, bufferlist
> *out_values
);
531 /// Remove header and all related prefixes
532 int _clear(Header header
,
533 KeyValueDB::Transaction t
);
535 /* Scan complete region bumping *begin to the beginning of any
536 * containing region and adding all complete region keys between
537 * the updated begin and end to the complete_keys_to_remove set */
538 int merge_new_complete(DBObjectMapIterator
&iter
,
541 set
<string
> *complete_keys_to_remove
);
543 /// Writes out State (mainly next_seq)
544 int write_state(KeyValueDB::Transaction _t
=
545 KeyValueDB::Transaction());
547 /// Copies header entry from parent @see rm_keys
548 int copy_up_header(Header header
,
549 KeyValueDB::Transaction t
);
551 /// Sets header @see set_header
552 void _set_header(Header header
, const bufferlist
&bl
,
553 KeyValueDB::Transaction t
);
556 * Removes header seq lock and possibly object lock
557 * once Header is out of scope
559 * @see generate_new_header
561 class RemoveOnDelete
{
564 explicit RemoveOnDelete(DBObjectMap
*db
) :
566 void operator() (_Header
*header
) {
567 Mutex::Locker
l(db
->header_lock
);
568 ceph_assert(db
->in_use
.count(header
->seq
));
569 db
->in_use
.erase(header
->seq
);
570 db
->header_cond
.Signal();
/// RemoveOnDelete::operator() reaches into db->header_lock, db->in_use and
/// db->header_cond, hence the friendship.
574 friend class RemoveOnDelete
;
// NOTE(review): presumably these generate the free encode()/decode()
// overloads that forward to the member encode()/decode() of each type --
// confirm against the macro definition.
576 WRITE_CLASS_ENCODER(DBObjectMap::_Header
)
577 WRITE_CLASS_ENCODER(DBObjectMap::State
)
/// Stream-insertion operator for DBObjectMap::_Header. Takes the header by
/// const reference and returns the stream; declared here, defined elsewhere.
579 ostream
& operator<<(ostream
& out
, const DBObjectMap::_Header
& h
);