]> git.proxmox.com Git - ceph.git/blob - ceph/src/os/filestore/DBObjectMap.h
update sources to v12.1.1
[ceph.git] / ceph / src / os / filestore / DBObjectMap.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 #ifndef DBOBJECTMAP_DB_H
3 #define DBOBJECTMAP_DB_H
4
5 #include "include/buffer_fwd.h"
6 #include <set>
7 #include <map>
8 #include <string>
9
10 #include <vector>
11 #include "include/memory.h"
12 #include <boost/scoped_ptr.hpp>
13
14 #include "os/ObjectMap.h"
15 #include "kv/KeyValueDB.h"
16 #include "osd/osd_types.h"
17 #include "common/Mutex.h"
18 #include "common/Cond.h"
19 #include "common/simple_cache.hpp"
20 #include <boost/optional/optional_io.hpp>
21
22 #include "SequencerPosition.h"
23
/**
 * DBObjectMap: Implements ObjectMap in terms of KeyValueDB
 *
 * Prefix space structure:
 *
 * @see complete_prefix
 * @see user_prefix
 * @see sys_prefix
 *
 * - HOBJECT_TO_SEQ: Contains leaf mapping from ghobject_t->header.seq and
 *                   corresponding omap header
 * - SYS_PREFIX: GLOBAL_STATE_KEY - contains next seq number
 *                   @see State
 *                   @see write_state
 *                   @see init
 *                   @see generate_new_header
 * - USER_PREFIX + header_key(header->seq) + USER_PREFIX
 *                   : key->value for header->seq
 * - USER_PREFIX + header_key(header->seq) + COMPLETE_PREFIX: see below
 * - USER_PREFIX + header_key(header->seq) + XATTR_PREFIX: xattrs
 * - USER_PREFIX + header_key(header->seq) + SYS_PREFIX
 *                   : USER_HEADER_KEY - omap header for header->seq
 *                   : HEADER_KEY - encoding of header for header->seq
 *
 * For each node (represented by a header), we
 * store three mappings: the key mapping, the complete mapping, and the parent.
 * The complete mapping (COMPLETE_PREFIX space) is key->key. Each x->y entry in
 * this mapping indicates that the key mapping contains all entries on [x,y).
 * Note, max string is represented by "", so ""->"" indicates that the parent
 * is unnecessary (@see rm_keys). When looking up a key not contained in
 * the complete set, we have to check the parent if we don't find it in the
 * key set. During rm_keys, we copy keys from the parent and update the
 * complete set to reflect the change (@see rm_keys).
 */
58 class DBObjectMap : public ObjectMap {
59 public:
60 boost::scoped_ptr<KeyValueDB> db;
61
62 /**
63 * Serializes access to next_seq as well as the in_use set
64 */
65 Mutex header_lock;
66 Cond header_cond;
67 Cond map_header_cond;
68
69 /**
70 * Set of headers currently in use
71 */
72 set<uint64_t> in_use;
73 set<ghobject_t> map_header_in_use;
74
75 /**
76 * Takes the map_header_in_use entry in constructor, releases in
77 * destructor
78 */
79 class MapHeaderLock {
80 DBObjectMap *db;
81 boost::optional<ghobject_t> locked;
82
83 MapHeaderLock(const MapHeaderLock &);
84 MapHeaderLock &operator=(const MapHeaderLock &);
85 public:
86 explicit MapHeaderLock(DBObjectMap *db) : db(db) {}
87 MapHeaderLock(DBObjectMap *db, const ghobject_t &oid) : db(db), locked(oid) {
88 Mutex::Locker l(db->header_lock);
89 while (db->map_header_in_use.count(*locked))
90 db->map_header_cond.Wait(db->header_lock);
91 db->map_header_in_use.insert(*locked);
92 }
93
94 const ghobject_t &get_locked() const {
95 assert(locked);
96 return *locked;
97 }
98
99 void swap(MapHeaderLock &o) {
100 assert(db == o.db);
101
102 // centos6's boost optional doesn't seem to have swap :(
103 boost::optional<ghobject_t> _locked = o.locked;
104 o.locked = locked;
105 locked = _locked;
106 }
107
108 ~MapHeaderLock() {
109 if (locked) {
110 Mutex::Locker l(db->header_lock);
111 assert(db->map_header_in_use.count(*locked));
112 db->map_header_cond.Signal();
113 db->map_header_in_use.erase(*locked);
114 }
115 }
116 };
117
118 DBObjectMap(CephContext* cct, KeyValueDB *db)
119 : ObjectMap(cct), db(db), header_lock("DBOBjectMap"),
120 cache_lock("DBObjectMap::CacheLock"),
121 caches(cct->_conf->filestore_omap_header_cache_size)
122 {}
123
124 int set_keys(
125 const ghobject_t &oid,
126 const map<string, bufferlist> &set,
127 const SequencerPosition *spos=0
128 ) override;
129
130 int set_header(
131 const ghobject_t &oid,
132 const bufferlist &bl,
133 const SequencerPosition *spos=0
134 ) override;
135
136 int get_header(
137 const ghobject_t &oid,
138 bufferlist *bl
139 ) override;
140
141 int clear(
142 const ghobject_t &oid,
143 const SequencerPosition *spos=0
144 ) override;
145
146 int clear_keys_header(
147 const ghobject_t &oid,
148 const SequencerPosition *spos=0
149 ) override;
150
151 int rm_keys(
152 const ghobject_t &oid,
153 const set<string> &to_clear,
154 const SequencerPosition *spos=0
155 ) override;
156
157 int get(
158 const ghobject_t &oid,
159 bufferlist *header,
160 map<string, bufferlist> *out
161 ) override;
162
163 int get_keys(
164 const ghobject_t &oid,
165 set<string> *keys
166 ) override;
167
168 int get_values(
169 const ghobject_t &oid,
170 const set<string> &keys,
171 map<string, bufferlist> *out
172 ) override;
173
174 int check_keys(
175 const ghobject_t &oid,
176 const set<string> &keys,
177 set<string> *out
178 ) override;
179
180 int get_xattrs(
181 const ghobject_t &oid,
182 const set<string> &to_get,
183 map<string, bufferlist> *out
184 ) override;
185
186 int get_all_xattrs(
187 const ghobject_t &oid,
188 set<string> *out
189 ) override;
190
191 int set_xattrs(
192 const ghobject_t &oid,
193 const map<string, bufferlist> &to_set,
194 const SequencerPosition *spos=0
195 ) override;
196
197 int remove_xattrs(
198 const ghobject_t &oid,
199 const set<string> &to_remove,
200 const SequencerPosition *spos=0
201 ) override;
202
203 int clone(
204 const ghobject_t &oid,
205 const ghobject_t &target,
206 const SequencerPosition *spos=0
207 ) override;
208
209 int rename(
210 const ghobject_t &from,
211 const ghobject_t &to,
212 const SequencerPosition *spos=0
213 );
214
215 int legacy_clone(
216 const ghobject_t &oid,
217 const ghobject_t &target,
218 const SequencerPosition *spos=0
219 );
220
221 /// Read initial state from backing store
222 int init(bool upgrade = false);
223
224 /// Upgrade store to current version
225 int upgrade_to_v2();
226
227 /// Consistency check, debug, there must be no parallel writes
228 int check(std::ostream &out, bool repair = false) override;
229
230 /// Ensure that all previous operations are durable
231 int sync(const ghobject_t *oid=0, const SequencerPosition *spos=0) override;
232
233 void compact() override {
234 assert(db);
235 db->compact();
236 }
237
238 /// Util, get all objects, there must be no other concurrent access
239 int list_objects(vector<ghobject_t> *objs ///< [out] objects
240 );
241
242 struct _Header;
243 // Util, get all object headers, there must be no other concurrent access
244 int list_object_headers(vector<_Header> *out ///< [out] headers
245 );
246
247 ObjectMapIterator get_iterator(const ghobject_t &oid) override;
248
// Prefix and key constants; values are defined out of line. See the class
// comment for how they compose into the prefix space.
static const std::string USER_PREFIX;
static const std::string XATTR_PREFIX;
static const std::string SYS_PREFIX;
static const std::string COMPLETE_PREFIX;
static const std::string HEADER_KEY;
static const std::string USER_HEADER_KEY;
static const std::string GLOBAL_STATE_KEY;
static const std::string HOBJECT_TO_SEQ;

/// Legacy
static const std::string LEAF_PREFIX;
static const std::string REVERSE_LEAF_PREFIX;
261
262 /// persistent state for store @see generate_header
263 struct State {
264 __u8 v;
265 uint64_t seq;
266 State() : v(0), seq(1) {}
267 explicit State(uint64_t seq) : v(0), seq(seq) {}
268
269 void encode(bufferlist &bl) const {
270 ENCODE_START(2, 1, bl);
271 ::encode(v, bl);
272 ::encode(seq, bl);
273 ENCODE_FINISH(bl);
274 }
275
276 void decode(bufferlist::iterator &bl) {
277 DECODE_START(2, bl);
278 if (struct_v >= 2)
279 ::decode(v, bl);
280 else
281 v = 0;
282 ::decode(seq, bl);
283 DECODE_FINISH(bl);
284 }
285
286 void dump(Formatter *f) const {
287 f->dump_unsigned("seq", seq);
288 }
289
290 static void generate_test_instances(list<State*> &o) {
291 o.push_back(new State(0));
292 o.push_back(new State(20));
293 }
294 } state;
295
296 struct _Header {
297 uint64_t seq;
298 uint64_t parent;
299 uint64_t num_children;
300
301 ghobject_t oid;
302
303 SequencerPosition spos;
304
305 void encode(bufferlist &bl) const {
306 coll_t unused;
307 ENCODE_START(2, 1, bl);
308 ::encode(seq, bl);
309 ::encode(parent, bl);
310 ::encode(num_children, bl);
311 ::encode(unused, bl);
312 ::encode(oid, bl);
313 ::encode(spos, bl);
314 ENCODE_FINISH(bl);
315 }
316
317 void decode(bufferlist::iterator &bl) {
318 coll_t unused;
319 DECODE_START(2, bl);
320 ::decode(seq, bl);
321 ::decode(parent, bl);
322 ::decode(num_children, bl);
323 ::decode(unused, bl);
324 ::decode(oid, bl);
325 if (struct_v >= 2)
326 ::decode(spos, bl);
327 DECODE_FINISH(bl);
328 }
329
330 void dump(Formatter *f) const {
331 f->dump_unsigned("seq", seq);
332 f->dump_unsigned("parent", parent);
333 f->dump_unsigned("num_children", num_children);
334 f->dump_stream("oid") << oid;
335 }
336
337 static void generate_test_instances(list<_Header*> &o) {
338 o.push_back(new _Header);
339 o.push_back(new _Header);
340 o.back()->parent = 20;
341 o.back()->seq = 30;
342 }
343
344 _Header() : seq(0), parent(0), num_children(1) {}
345 };
346
347 /// String munging (public for testing)
348 static string ghobject_key(const ghobject_t &oid);
349 static string ghobject_key_v0(coll_t c, const ghobject_t &oid);
350 static int is_buggy_ghobject_key_v1(CephContext* cct,
351 const string &in);
352 private:
353 /// Implicit lock on Header->seq
354 typedef ceph::shared_ptr<_Header> Header;
355 Mutex cache_lock;
356 SimpleLRU<ghobject_t, _Header> caches;
357
358 string map_header_key(const ghobject_t &oid);
359 string header_key(uint64_t seq);
360 string complete_prefix(Header header);
361 string user_prefix(Header header);
362 string sys_prefix(Header header);
363 string xattr_prefix(Header header);
364 string sys_parent_prefix(_Header header);
365 string sys_parent_prefix(Header header) {
366 return sys_parent_prefix(*header);
367 }
368
369 class EmptyIteratorImpl : public ObjectMapIteratorImpl {
370 public:
371 int seek_to_first() override { return 0; }
372 int seek_to_last() { return 0; }
373 int upper_bound(const string &after) override { return 0; }
374 int lower_bound(const string &to) override { return 0; }
375 bool valid() override { return false; }
376 int next(bool validate=true) override { ceph_abort(); return 0; }
377 string key() override { ceph_abort(); return ""; }
378 bufferlist value() override { ceph_abort(); return bufferlist(); }
379 int status() override { return 0; }
380 };
381
382
383 /// Iterator
384 class DBObjectMapIteratorImpl : public ObjectMapIteratorImpl {
385 public:
386 DBObjectMap *map;
387
388 /// NOTE: implicit lock hlock->get_locked() when returned out of the class
389 MapHeaderLock hlock;
390 /// NOTE: implicit lock on header->seq AND for all ancestors
391 Header header;
392
393 /// parent_iter == NULL iff no parent
394 ceph::shared_ptr<DBObjectMapIteratorImpl> parent_iter;
395 KeyValueDB::Iterator key_iter;
396 KeyValueDB::Iterator complete_iter;
397
398 /// cur_iter points to currently valid iterator
399 ceph::shared_ptr<ObjectMapIteratorImpl> cur_iter;
400 int r;
401
402 /// init() called, key_iter, complete_iter, parent_iter filled in
403 bool ready;
404 /// past end
405 bool invalid;
406
407 DBObjectMapIteratorImpl(DBObjectMap *map, Header header) :
408 map(map), hlock(map), header(header), r(0), ready(false), invalid(true) {}
409 int seek_to_first() override;
410 int seek_to_last();
411 int upper_bound(const string &after) override;
412 int lower_bound(const string &to) override;
413 bool valid() override;
414 int next(bool validate=true) override;
415 string key() override;
416 bufferlist value() override;
417 int status() override;
418
419 bool on_parent() {
420 return cur_iter == parent_iter;
421 }
422
423 /// skips to next valid parent entry
424 int next_parent();
425
426 /// first parent() >= to
427 int lower_bound_parent(const string &to);
428
429 /**
430 * Tests whether to_test is in complete region
431 *
432 * postcondition: complete_iter will be max s.t. complete_iter->value > to_test
433 */
434 int in_complete_region(const string &to_test, ///< [in] key to test
435 string *begin, ///< [out] beginning of region
436 string *end ///< [out] end of region
437 ); ///< @returns true if to_test is in the complete region, else false
438
439 private:
440 int init();
441 bool valid_parent();
442 int adjust();
443 };
444
445 typedef ceph::shared_ptr<DBObjectMapIteratorImpl> DBObjectMapIterator;
446 DBObjectMapIterator _get_iterator(Header header) {
447 return std::make_shared<DBObjectMapIteratorImpl>(this, header);
448 }
449
450 /// sys
451
452 /// Removes node corresponding to header
453 void clear_header(Header header, KeyValueDB::Transaction t);
454
455 /// Set node containing input to new contents
456 void set_header(Header input, KeyValueDB::Transaction t);
457
458 /// Remove leaf node corresponding to oid in c
459 void remove_map_header(
460 const MapHeaderLock &l,
461 const ghobject_t &oid,
462 Header header,
463 KeyValueDB::Transaction t);
464
465 /// Set leaf node for c and oid to the value of header
466 void set_map_header(
467 const MapHeaderLock &l,
468 const ghobject_t &oid, _Header header,
469 KeyValueDB::Transaction t);
470
471 /// Set leaf node for c and oid to the value of header
472 bool check_spos(const ghobject_t &oid,
473 Header header,
474 const SequencerPosition *spos);
475
476 /// Lookup or create header for c oid
477 Header lookup_create_map_header(
478 const MapHeaderLock &l,
479 const ghobject_t &oid,
480 KeyValueDB::Transaction t);
481
482 /**
483 * Generate new header for c oid with new seq number
484 *
485 * Has the side effect of syncronously saving the new DBObjectMap state
486 */
487 Header _generate_new_header(const ghobject_t &oid, Header parent);
488 Header generate_new_header(const ghobject_t &oid, Header parent) {
489 Mutex::Locker l(header_lock);
490 return _generate_new_header(oid, parent);
491 }
492
493 /// Lookup leaf header for c oid
494 Header _lookup_map_header(
495 const MapHeaderLock &l,
496 const ghobject_t &oid);
497 Header lookup_map_header(
498 const MapHeaderLock &l2,
499 const ghobject_t &oid) {
500 Mutex::Locker l(header_lock);
501 return _lookup_map_header(l2, oid);
502 }
503
504 /// Lookup header node for input
505 Header lookup_parent(Header input);
506
507
508 /// Helpers
509 int _get_header(Header header, bufferlist *bl);
510
511 /// Scan keys in header into out_keys and out_values (if nonnull)
512 int scan(Header header,
513 const set<string> &in_keys,
514 set<string> *out_keys,
515 map<string, bufferlist> *out_values);
516
517 /// Remove header and all related prefixes
518 int _clear(Header header,
519 KeyValueDB::Transaction t);
520
521 /* Scan complete region bumping *begin to the beginning of any
522 * containing region and adding all complete region keys between
523 * the updated begin and end to the complete_keys_to_remove set */
524 int merge_new_complete(DBObjectMapIterator &iter,
525 string *begin,
526 const string &end,
527 set<string> *complete_keys_to_remove);
528
529 /// Writes out State (mainly next_seq)
530 int write_state(KeyValueDB::Transaction _t =
531 KeyValueDB::Transaction());
532
533 /// Copies header entry from parent @see rm_keys
534 int copy_up_header(Header header,
535 KeyValueDB::Transaction t);
536
537 /// Sets header @see set_header
538 void _set_header(Header header, const bufferlist &bl,
539 KeyValueDB::Transaction t);
540
541 /**
542 * Removes header seq lock and possibly object lock
543 * once Header is out of scope
544 * @see lookup_parent
545 * @see generate_new_header
546 */
547 class RemoveOnDelete {
548 public:
549 DBObjectMap *db;
550 explicit RemoveOnDelete(DBObjectMap *db) :
551 db(db) {}
552 void operator() (_Header *header) {
553 Mutex::Locker l(db->header_lock);
554 assert(db->in_use.count(header->seq));
555 db->in_use.erase(header->seq);
556 db->header_cond.Signal();
557 delete header;
558 }
559 };
560 friend class RemoveOnDelete;
561 };
562 WRITE_CLASS_ENCODER(DBObjectMap::_Header)
563 WRITE_CLASS_ENCODER(DBObjectMap::State)
564
565 ostream& operator<<(ostream& out, const DBObjectMap::_Header& h);
566
567 #endif