1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 #ifndef DBOBJECTMAP_DB_H
3 #define DBOBJECTMAP_DB_H
5 #include "include/buffer_fwd.h"
11 #include <boost/scoped_ptr.hpp>
13 #include "os/ObjectMap.h"
14 #include "kv/KeyValueDB.h"
15 #include "osd/osd_types.h"
16 #include "common/ceph_mutex.h"
17 #include "common/simple_cache.hpp"
18 #include <boost/optional/optional_io.hpp>
20 #include "SequencerPosition.h"
23 * DBObjectMap: Implements ObjectMap in terms of KeyValueDB
25 * Prefix space structure:
27 * @see complete_prefix
31 * - HOBJECT_TO_SEQ: Contains leaf mapping from ghobject_t->header.seq and
32 * corresponding omap header
33 * - SYS_PREFIX: GLOBAL_STATE_KEY - contains next seq number
37 * @see generate_new_header
38 * - USER_PREFIX + header_key(header->seq) + USER_PREFIX
39 * : key->value for header->seq
40 * - USER_PREFIX + header_key(header->seq) + COMPLETE_PREFIX: see below
41 * - USER_PREFIX + header_key(header->seq) + XATTR_PREFIX: xattrs
42 * - USER_PREFIX + header_key(header->seq) + SYS_PREFIX
43 * : USER_HEADER_KEY - omap header for header->seq
44 * : HEADER_KEY - encoding of header for header->seq
46 * For each node (represented by a header), we
47 * store three mappings: the key mapping, the complete mapping, and the parent.
48 * The complete mapping (COMPLETE_PREFIX space) is key->key. Each x->y entry in
49 * this mapping indicates that the key mapping contains all entries on [x,y).
50 * Note, max string is represented by "", so ""->"" indicates that the parent
51 * is unnecessary (@see rm_keys). When looking up a key not contained in
52 * the complete set, we have to check the parent if we don't find it in the
53 * key set. During rm_keys, we copy keys from the parent and update the
54 * complete set to reflect the change (@see rm_keys).
56 class DBObjectMap
: public ObjectMap
{
59 KeyValueDB
*get_db() override
{ return db
.get(); }
62 * Serializes access to next_seq as well as the in_use set
64 ceph::mutex header_lock
= ceph::make_mutex("DBOBjectMap");
65 ceph::condition_variable header_cond
;
66 ceph::condition_variable map_header_cond
;
69 * Set of headers currently in use
71 std::set
<uint64_t> in_use
;
72 std::set
<ghobject_t
> map_header_in_use
;
75 * Takes the map_header_in_use entry in constructor, releases in
80 boost::optional
<ghobject_t
> locked
;
82 MapHeaderLock(const MapHeaderLock
&);
83 MapHeaderLock
&operator=(const MapHeaderLock
&);
85 explicit MapHeaderLock(DBObjectMap
*db
) : db(db
) {}
86 MapHeaderLock(DBObjectMap
*db
, const ghobject_t
&oid
) : db(db
), locked(oid
) {
87 std::unique_lock l
{db
->header_lock
};
88 db
->map_header_cond
.wait(l
, [db
, this] {
89 return !db
->map_header_in_use
.count(*locked
);
91 db
->map_header_in_use
.insert(*locked
);
94 const ghobject_t
&get_locked() const {
99 void swap(MapHeaderLock
&o
) {
100 ceph_assert(db
== o
.db
);
102 // centos6's boost optional doesn't seem to have swap :(
103 boost::optional
<ghobject_t
> _locked
= o
.locked
;
110 std::lock_guard l
{db
->header_lock
};
111 ceph_assert(db
->map_header_in_use
.count(*locked
));
112 db
->map_header_cond
.notify_all();
113 db
->map_header_in_use
.erase(*locked
);
118 DBObjectMap(CephContext
* cct
, KeyValueDB
*db
)
119 : ObjectMap(cct
, db
),
120 caches(cct
->_conf
->filestore_omap_header_cache_size
)
124 const ghobject_t
&oid
,
125 const std::map
<std::string
, ceph::buffer::list
> &set
,
126 const SequencerPosition
*spos
=0
130 const ghobject_t
&oid
,
131 const ceph::buffer::list
&bl
,
132 const SequencerPosition
*spos
=0
136 const ghobject_t
&oid
,
137 ceph::buffer::list
*bl
141 const ghobject_t
&oid
,
142 const SequencerPosition
*spos
=0
145 int clear_keys_header(
146 const ghobject_t
&oid
,
147 const SequencerPosition
*spos
=0
151 const ghobject_t
&oid
,
152 const std::set
<std::string
> &to_clear
,
153 const SequencerPosition
*spos
=0
157 const ghobject_t
&oid
,
158 ceph::buffer::list
*header
,
159 std::map
<std::string
, ceph::buffer::list
> *out
163 const ghobject_t
&oid
,
164 std::set
<std::string
> *keys
168 const ghobject_t
&oid
,
169 const std::set
<std::string
> &keys
,
170 std::map
<std::string
, ceph::buffer::list
> *out
174 const ghobject_t
&oid
,
175 const std::set
<std::string
> &keys
,
176 std::set
<std::string
> *out
180 const ghobject_t
&oid
,
181 const std::set
<std::string
> &to_get
,
182 std::map
<std::string
, ceph::buffer::list
> *out
186 const ghobject_t
&oid
,
187 std::set
<std::string
> *out
191 const ghobject_t
&oid
,
192 const std::map
<std::string
, ceph::buffer::list
> &to_set
,
193 const SequencerPosition
*spos
=0
197 const ghobject_t
&oid
,
198 const std::set
<std::string
> &to_remove
,
199 const SequencerPosition
*spos
=0
203 const ghobject_t
&oid
,
204 const ghobject_t
&target
,
205 const SequencerPosition
*spos
=0
209 const ghobject_t
&from
,
210 const ghobject_t
&to
,
211 const SequencerPosition
*spos
=0
215 const ghobject_t
&oid
,
216 const ghobject_t
&target
,
217 const SequencerPosition
*spos
=0
220 /// Read initial state from backing store
222 /// Write current state settings to DB
224 /// Read initial state and upgrade or initialize state
225 int init(bool upgrade
= false);
227 /// Upgrade store to current version
230 /// Consistency check, debug, there must be no parallel writes
231 int check(std::ostream
&out
, bool repair
= false, bool force
= false) override
;
233 /// Ensure that all previous operations are durable
234 int sync(const ghobject_t
*oid
=0, const SequencerPosition
*spos
=0) override
;
236 void compact() override
{
241 /// Util, get all objects, there must be no other concurrent access
242 int list_objects(std::vector
<ghobject_t
> *objs
///< [out] objects
246 // Util, get all object headers, there must be no other concurrent access
247 int list_object_headers(std::vector
<_Header
> *out
///< [out] headers
250 ObjectMapIterator
get_iterator(const ghobject_t
&oid
) override
;
252 static const std::string USER_PREFIX
;
253 static const std::string XATTR_PREFIX
;
254 static const std::string SYS_PREFIX
;
255 static const std::string COMPLETE_PREFIX
;
256 static const std::string HEADER_KEY
;
257 static const std::string USER_HEADER_KEY
;
258 static const std::string GLOBAL_STATE_KEY
;
259 static const std::string HOBJECT_TO_SEQ
;
262 static const std::string LEAF_PREFIX
;
263 static const std::string REVERSE_LEAF_PREFIX
;
265 /// persistent state for store @see generate_header
267 static const __u8 CUR_VERSION
= 3;
270 // legacy is false when complete regions never used
272 State() : v(0), seq(1), legacy(false) {}
273 explicit State(uint64_t seq
) : v(0), seq(seq
), legacy(false) {}
275 void encode(ceph::buffer::list
&bl
) const {
276 ENCODE_START(3, 1, bl
);
283 void decode(ceph::buffer::list::const_iterator
&bl
) {
297 void dump(ceph::Formatter
*f
) const {
298 f
->dump_unsigned("v", v
);
299 f
->dump_unsigned("seq", seq
);
300 f
->dump_bool("legacy", legacy
);
303 static void generate_test_instances(std::list
<State
*> &o
) {
304 o
.push_back(new State(0));
305 o
.push_back(new State(20));
312 uint64_t num_children
;
316 SequencerPosition spos
;
318 void encode(ceph::buffer::list
&bl
) const {
320 ENCODE_START(2, 1, bl
);
323 encode(num_children
, bl
);
330 void decode(ceph::buffer::list::const_iterator
&bl
) {
335 decode(num_children
, bl
);
343 void dump(ceph::Formatter
*f
) const {
344 f
->dump_unsigned("seq", seq
);
345 f
->dump_unsigned("parent", parent
);
346 f
->dump_unsigned("num_children", num_children
);
347 f
->dump_stream("oid") << oid
;
350 static void generate_test_instances(std::list
<_Header
*> &o
) {
351 o
.push_back(new _Header
);
352 o
.push_back(new _Header
);
353 o
.back()->parent
= 20;
358 return sizeof(_Header
);
361 _Header() : seq(0), parent(0), num_children(1) {}
364 /// String munging (public for testing)
365 static std::string
ghobject_key(const ghobject_t
&oid
);
366 static std::string
ghobject_key_v0(coll_t c
, const ghobject_t
&oid
);
367 static int is_buggy_ghobject_key_v1(CephContext
* cct
,
368 const std::string
&in
);
370 /// Implicit lock on Header->seq
371 typedef std::shared_ptr
<_Header
> Header
;
372 ceph::mutex cache_lock
= ceph::make_mutex("DBObjectMap::CacheLock");
373 SimpleLRU
<ghobject_t
, _Header
> caches
;
375 std::string
map_header_key(const ghobject_t
&oid
);
376 std::string
header_key(uint64_t seq
);
377 std::string
complete_prefix(Header header
);
378 std::string
user_prefix(Header header
);
379 std::string
sys_prefix(Header header
);
380 std::string
xattr_prefix(Header header
);
381 std::string
sys_parent_prefix(_Header header
);
382 std::string
sys_parent_prefix(Header header
) {
383 return sys_parent_prefix(*header
);
386 class EmptyIteratorImpl
: public ObjectMapIteratorImpl
{
388 int seek_to_first() override
{ return 0; }
/// Nothing to seek over in an empty iterator; always returns 0.
int seek_to_last()
{
  return 0;
}
390 int upper_bound(const std::string
&after
) override
{ return 0; }
391 int lower_bound(const std::string
&to
) override
{ return 0; }
392 bool valid() override
{ return false; }
393 int next() override
{ ceph_abort(); return 0; }
394 std::string
key() override
{ ceph_abort(); return ""; }
395 ceph::buffer::list
value() override
{ ceph_abort(); return ceph::buffer::list(); }
396 int status() override
{ return 0; }
401 class DBObjectMapIteratorImpl
: public ObjectMapIteratorImpl
{
405 /// NOTE: implicit lock hlock->get_locked() when returned out of the class
407 /// NOTE: implicit lock on header->seq AND for all ancestors
410 /// parent_iter == NULL iff no parent
411 std::shared_ptr
<DBObjectMapIteratorImpl
> parent_iter
;
412 KeyValueDB::Iterator key_iter
;
413 KeyValueDB::Iterator complete_iter
;
415 /// cur_iter points to currently valid iterator
416 std::shared_ptr
<ObjectMapIteratorImpl
> cur_iter
;
419 /// init() called, key_iter, complete_iter, parent_iter filled in
424 DBObjectMapIteratorImpl(DBObjectMap
*map
, Header header
) :
425 map(map
), hlock(map
), header(header
), r(0), ready(false), invalid(true) {}
426 int seek_to_first() override
;
428 int upper_bound(const std::string
&after
) override
;
429 int lower_bound(const std::string
&to
) override
;
430 bool valid() override
;
432 std::string
key() override
;
433 ceph::buffer::list
value() override
;
434 int status() override
;
437 return cur_iter
== parent_iter
;
440 /// skips to next valid parent entry
443 /// first parent() >= to
444 int lower_bound_parent(const std::string
&to
);
447 * Tests whether to_test is in complete region
449 * postcondition: complete_iter will be max s.t. complete_iter->value > to_test
451 int in_complete_region(const std::string
&to_test
, ///< [in] key to test
452 std::string
*begin
, ///< [out] beginning of region
453 std::string
*end
///< [out] end of region
454 ); ///< @returns true if to_test is in the complete region, else false
462 typedef std::shared_ptr
<DBObjectMapIteratorImpl
> DBObjectMapIterator
;
463 DBObjectMapIterator
_get_iterator(Header header
) {
464 return std::make_shared
<DBObjectMapIteratorImpl
>(this, header
);
469 /// Removes node corresponding to header
470 void clear_header(Header header
, KeyValueDB::Transaction t
);
472 /// Set node containing input to new contents
473 void set_header(Header input
, KeyValueDB::Transaction t
);
475 /// Remove leaf node corresponding to oid
476 void remove_map_header(
477 const MapHeaderLock
&l
,
478 const ghobject_t
&oid
,
480 KeyValueDB::Transaction t
);
482 /// Set leaf node for oid to the value of header
484 const MapHeaderLock
&l
,
485 const ghobject_t
&oid
, _Header header
,
486 KeyValueDB::Transaction t
);
488 /// Check oid against sequencer position spos (NOTE(review): the previous text here was a copy of the set-leaf-node comment; confirm exact semantics)
489 bool check_spos(const ghobject_t
&oid
,
491 const SequencerPosition
*spos
);
493 /// Lookup or create header for oid
494 Header
lookup_create_map_header(
495 const MapHeaderLock
&l
,
496 const ghobject_t
&oid
,
497 KeyValueDB::Transaction t
);
500 * Generate new header for oid with new seq number
502 * Has the side effect of synchronously saving the new DBObjectMap state
504 Header
_generate_new_header(const ghobject_t
&oid
, Header parent
);
505 Header
generate_new_header(const ghobject_t
&oid
, Header parent
) {
506 std::lock_guard l
{header_lock
};
507 return _generate_new_header(oid
, parent
);
510 /// Lookup leaf header for oid
511 Header
_lookup_map_header(
512 const MapHeaderLock
&l
,
513 const ghobject_t
&oid
);
514 Header
lookup_map_header(
515 const MapHeaderLock
&l2
,
516 const ghobject_t
&oid
) {
517 std::lock_guard l
{header_lock
};
518 return _lookup_map_header(l2
, oid
);
521 /// Lookup header node for input
522 Header
lookup_parent(Header input
);
526 int _get_header(Header header
, ceph::buffer::list
*bl
);
528 /// Scan keys in header into out_keys and out_values (if nonnull)
529 int scan(Header header
,
530 const std::set
<std::string
> &in_keys
,
531 std::set
<std::string
> *out_keys
,
532 std::map
<std::string
, ceph::buffer::list
> *out_values
);
534 /// Remove header and all related prefixes
535 int _clear(Header header
,
536 KeyValueDB::Transaction t
);
538 /* Scan complete region bumping *begin to the beginning of any
539 * containing region and adding all complete region keys between
540 * the updated begin and end to the complete_keys_to_remove set */
541 int merge_new_complete(DBObjectMapIterator
&iter
,
543 const std::string
&end
,
544 std::set
<std::string
> *complete_keys_to_remove
);
546 /// Writes out State (mainly next_seq)
547 int write_state(KeyValueDB::Transaction _t
=
548 KeyValueDB::Transaction());
550 /// Copies header entry from parent @see rm_keys
551 int copy_up_header(Header header
,
552 KeyValueDB::Transaction t
);
554 /// Sets header @see set_header
555 void _set_header(Header header
, const ceph::buffer::list
&bl
,
556 KeyValueDB::Transaction t
);
559 * Removes header seq lock and possibly object lock
560 * once Header is out of scope
562 * @see generate_new_header
564 class RemoveOnDelete
{
567 explicit RemoveOnDelete(DBObjectMap
*db
) :
569 void operator() (_Header
*header
) {
570 std::lock_guard l
{db
->header_lock
};
571 ceph_assert(db
->in_use
.count(header
->seq
));
572 db
->in_use
.erase(header
->seq
);
573 db
->header_cond
.notify_all();
577 friend class RemoveOnDelete
;
579 WRITE_CLASS_ENCODER(DBObjectMap::_Header
)
580 WRITE_CLASS_ENCODER(DBObjectMap::State
)
582 std::ostream
& operator<<(std::ostream
& out
, const DBObjectMap::_Header
& h
);