]> git.proxmox.com Git - ceph.git/blame - ceph/src/os/filestore/DBObjectMap.h
import 15.2.0 Octopus source
[ceph.git] / ceph / src / os / filestore / DBObjectMap.h
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2#ifndef DBOBJECTMAP_DB_H
3#define DBOBJECTMAP_DB_H
4
5#include "include/buffer_fwd.h"
6#include <set>
7#include <map>
8#include <string>
9
10#include <vector>
7c673cae
FG
11#include <boost/scoped_ptr.hpp>
12
13#include "os/ObjectMap.h"
14#include "kv/KeyValueDB.h"
15#include "osd/osd_types.h"
9f95a23c 16#include "common/ceph_mutex.h"
7c673cae
FG
17#include "common/simple_cache.hpp"
18#include <boost/optional/optional_io.hpp>
19
20#include "SequencerPosition.h"
21
22/**
23 * DBObjectMap: Implements ObjectMap in terms of KeyValueDB
24 *
25 * Prefix space structure:
26 *
27 * @see complete_prefix
28 * @see user_prefix
29 * @see sys_prefix
30 *
31 * - HOBJECT_TO_SEQ: Contains leaf mapping from ghobject_t->header.seq and
32 * corresponding omap header
33 * - SYS_PREFIX: GLOBAL_STATE_KEY - contains next seq number
34 * @see State
35 * @see write_state
36 * @see init
37 * @see generate_new_header
38 * - USER_PREFIX + header_key(header->seq) + USER_PREFIX
39 * : key->value for header->seq
40 * - USER_PREFIX + header_key(header->seq) + COMPLETE_PREFIX: see below
41 * - USER_PREFIX + header_key(header->seq) + XATTR_PREFIX: xattrs
42 * - USER_PREFIX + header_key(header->seq) + SYS_PREFIX
43 * : USER_HEADER_KEY - omap header for header->seq
44 * : HEADER_KEY - encoding of header for header->seq
45 *
46 * For each node (represented by a header), we
47 * store three mappings: the key mapping, the complete mapping, and the parent.
48 * The complete mapping (COMPLETE_PREFIX space) is key->key. Each x->y entry in
49 * this mapping indicates that the key mapping contains all entries on [x,y).
50 * Note, max string is represented by "", so ""->"" indicates that the parent
51 * is unnecessary (@see rm_keys). When looking up a key not contained in the
52 * complete set, we have to check the parent if we don't find it in the
53 * key set. During rm_keys, we copy keys from the parent and update the
54 * complete set to reflect the change @see rm_keys.
55 */
56class DBObjectMap : public ObjectMap {
57public:
11fdf7f2
TL
58
59 KeyValueDB *get_db() override { return db.get(); }
7c673cae
FG
60
61 /**
62 * Serializes access to next_seq as well as the in_use set
63 */
9f95a23c
TL
64 ceph::mutex header_lock = ceph::make_mutex("DBOBjectMap");
65 ceph::condition_variable header_cond;
66 ceph::condition_variable map_header_cond;
7c673cae
FG
67
68 /**
69 * Set of headers currently in use
70 */
71 set<uint64_t> in_use;
72 set<ghobject_t> map_header_in_use;
73
74 /**
75 * Takes the map_header_in_use entry in constructor, releases in
76 * destructor
77 */
78 class MapHeaderLock {
79 DBObjectMap *db;
80 boost::optional<ghobject_t> locked;
81
82 MapHeaderLock(const MapHeaderLock &);
83 MapHeaderLock &operator=(const MapHeaderLock &);
84 public:
85 explicit MapHeaderLock(DBObjectMap *db) : db(db) {}
86 MapHeaderLock(DBObjectMap *db, const ghobject_t &oid) : db(db), locked(oid) {
9f95a23c
TL
87 std::unique_lock l{db->header_lock};
88 db->map_header_cond.wait(l, [db, this] {
89 return !db->map_header_in_use.count(*locked);
90 });
7c673cae
FG
91 db->map_header_in_use.insert(*locked);
92 }
93
94 const ghobject_t &get_locked() const {
11fdf7f2 95 ceph_assert(locked);
7c673cae
FG
96 return *locked;
97 }
98
99 void swap(MapHeaderLock &o) {
11fdf7f2 100 ceph_assert(db == o.db);
7c673cae
FG
101
102 // centos6's boost optional doesn't seem to have swap :(
103 boost::optional<ghobject_t> _locked = o.locked;
104 o.locked = locked;
105 locked = _locked;
106 }
107
108 ~MapHeaderLock() {
109 if (locked) {
9f95a23c 110 std::lock_guard l{db->header_lock};
11fdf7f2 111 ceph_assert(db->map_header_in_use.count(*locked));
9f95a23c 112 db->map_header_cond.notify_all();
7c673cae
FG
113 db->map_header_in_use.erase(*locked);
114 }
115 }
116 };
117
118 DBObjectMap(CephContext* cct, KeyValueDB *db)
9f95a23c 119 : ObjectMap(cct, db),
7c673cae
FG
120 caches(cct->_conf->filestore_omap_header_cache_size)
121 {}
122
123 int set_keys(
124 const ghobject_t &oid,
125 const map<string, bufferlist> &set,
126 const SequencerPosition *spos=0
127 ) override;
128
129 int set_header(
130 const ghobject_t &oid,
131 const bufferlist &bl,
132 const SequencerPosition *spos=0
133 ) override;
134
135 int get_header(
136 const ghobject_t &oid,
137 bufferlist *bl
138 ) override;
139
140 int clear(
141 const ghobject_t &oid,
142 const SequencerPosition *spos=0
143 ) override;
144
145 int clear_keys_header(
146 const ghobject_t &oid,
147 const SequencerPosition *spos=0
148 ) override;
149
150 int rm_keys(
151 const ghobject_t &oid,
152 const set<string> &to_clear,
153 const SequencerPosition *spos=0
154 ) override;
155
156 int get(
157 const ghobject_t &oid,
158 bufferlist *header,
159 map<string, bufferlist> *out
160 ) override;
161
162 int get_keys(
163 const ghobject_t &oid,
164 set<string> *keys
165 ) override;
166
167 int get_values(
168 const ghobject_t &oid,
169 const set<string> &keys,
170 map<string, bufferlist> *out
171 ) override;
172
173 int check_keys(
174 const ghobject_t &oid,
175 const set<string> &keys,
176 set<string> *out
177 ) override;
178
179 int get_xattrs(
180 const ghobject_t &oid,
181 const set<string> &to_get,
182 map<string, bufferlist> *out
183 ) override;
184
185 int get_all_xattrs(
186 const ghobject_t &oid,
187 set<string> *out
188 ) override;
189
190 int set_xattrs(
191 const ghobject_t &oid,
192 const map<string, bufferlist> &to_set,
193 const SequencerPosition *spos=0
194 ) override;
195
196 int remove_xattrs(
197 const ghobject_t &oid,
198 const set<string> &to_remove,
199 const SequencerPosition *spos=0
200 ) override;
201
202 int clone(
203 const ghobject_t &oid,
204 const ghobject_t &target,
205 const SequencerPosition *spos=0
206 ) override;
207
208 int rename(
209 const ghobject_t &from,
210 const ghobject_t &to,
211 const SequencerPosition *spos=0
212 );
213
214 int legacy_clone(
215 const ghobject_t &oid,
216 const ghobject_t &target,
217 const SequencerPosition *spos=0
218 );
219
220 /// Read initial state from backing store
3efd9988
FG
221 int get_state();
222 /// Write current state settings to DB
223 void set_state();
224 /// Read initial state and upgrade or initialize state
7c673cae
FG
225 int init(bool upgrade = false);
226
227 /// Upgrade store to current version
228 int upgrade_to_v2();
229
230 /// Consistency check, debug, there must be no parallel writes
3efd9988 231 int check(std::ostream &out, bool repair = false, bool force = false) override;
7c673cae
FG
232
233 /// Ensure that all previous operations are durable
234 int sync(const ghobject_t *oid=0, const SequencerPosition *spos=0) override;
235
224ce89b 236 void compact() override {
11fdf7f2 237 ceph_assert(db);
224ce89b
WB
238 db->compact();
239 }
240
7c673cae
FG
241 /// Util, get all objects, there must be no other concurrent access
242 int list_objects(vector<ghobject_t> *objs ///< [out] objects
243 );
244
245 struct _Header;
246 // Util, get all object headers, there must be no other concurrent access
247 int list_object_headers(vector<_Header> *out ///< [out] headers
248 );
249
250 ObjectMapIterator get_iterator(const ghobject_t &oid) override;
251
252 static const string USER_PREFIX;
253 static const string XATTR_PREFIX;
254 static const string SYS_PREFIX;
255 static const string COMPLETE_PREFIX;
256 static const string HEADER_KEY;
257 static const string USER_HEADER_KEY;
258 static const string GLOBAL_STATE_KEY;
259 static const string HOBJECT_TO_SEQ;
260
261 /// Legacy
262 static const string LEAF_PREFIX;
263 static const string REVERSE_LEAF_PREFIX;
264
265 /// persistent state for store @see generate_header
266 struct State {
3efd9988 267 static const __u8 CUR_VERSION = 3;
7c673cae
FG
268 __u8 v;
269 uint64_t seq;
3efd9988
FG
270 // legacy is false when complete regions never used
271 bool legacy;
272 State() : v(0), seq(1), legacy(false) {}
273 explicit State(uint64_t seq) : v(0), seq(seq), legacy(false) {}
7c673cae
FG
274
275 void encode(bufferlist &bl) const {
3efd9988 276 ENCODE_START(3, 1, bl);
11fdf7f2
TL
277 encode(v, bl);
278 encode(seq, bl);
279 encode(legacy, bl);
7c673cae
FG
280 ENCODE_FINISH(bl);
281 }
282
11fdf7f2 283 void decode(bufferlist::const_iterator &bl) {
3efd9988 284 DECODE_START(3, bl);
7c673cae 285 if (struct_v >= 2)
11fdf7f2 286 decode(v, bl);
7c673cae
FG
287 else
288 v = 0;
11fdf7f2 289 decode(seq, bl);
3efd9988 290 if (struct_v >= 3)
11fdf7f2 291 decode(legacy, bl);
3efd9988
FG
292 else
293 legacy = false;
7c673cae
FG
294 DECODE_FINISH(bl);
295 }
296
297 void dump(Formatter *f) const {
3efd9988 298 f->dump_unsigned("v", v);
7c673cae 299 f->dump_unsigned("seq", seq);
3efd9988 300 f->dump_bool("legacy", legacy);
7c673cae
FG
301 }
302
303 static void generate_test_instances(list<State*> &o) {
304 o.push_back(new State(0));
305 o.push_back(new State(20));
306 }
307 } state;
308
309 struct _Header {
310 uint64_t seq;
311 uint64_t parent;
312 uint64_t num_children;
313
314 ghobject_t oid;
315
316 SequencerPosition spos;
317
318 void encode(bufferlist &bl) const {
319 coll_t unused;
320 ENCODE_START(2, 1, bl);
11fdf7f2
TL
321 encode(seq, bl);
322 encode(parent, bl);
323 encode(num_children, bl);
324 encode(unused, bl);
325 encode(oid, bl);
326 encode(spos, bl);
7c673cae
FG
327 ENCODE_FINISH(bl);
328 }
329
11fdf7f2 330 void decode(bufferlist::const_iterator &bl) {
7c673cae
FG
331 coll_t unused;
332 DECODE_START(2, bl);
11fdf7f2
TL
333 decode(seq, bl);
334 decode(parent, bl);
335 decode(num_children, bl);
336 decode(unused, bl);
337 decode(oid, bl);
7c673cae 338 if (struct_v >= 2)
11fdf7f2 339 decode(spos, bl);
7c673cae
FG
340 DECODE_FINISH(bl);
341 }
342
343 void dump(Formatter *f) const {
344 f->dump_unsigned("seq", seq);
345 f->dump_unsigned("parent", parent);
346 f->dump_unsigned("num_children", num_children);
347 f->dump_stream("oid") << oid;
348 }
349
350 static void generate_test_instances(list<_Header*> &o) {
351 o.push_back(new _Header);
352 o.push_back(new _Header);
353 o.back()->parent = 20;
354 o.back()->seq = 30;
355 }
356
eafe8130
TL
357 size_t length() {
358 return sizeof(_Header);
359 }
360
7c673cae
FG
361 _Header() : seq(0), parent(0), num_children(1) {}
362 };
363
364 /// String munging (public for testing)
365 static string ghobject_key(const ghobject_t &oid);
366 static string ghobject_key_v0(coll_t c, const ghobject_t &oid);
367 static int is_buggy_ghobject_key_v1(CephContext* cct,
368 const string &in);
369private:
370 /// Implicit lock on Header->seq
11fdf7f2 371 typedef std::shared_ptr<_Header> Header;
9f95a23c 372 ceph::mutex cache_lock = ceph::make_mutex("DBObjectMap::CacheLock");
7c673cae
FG
373 SimpleLRU<ghobject_t, _Header> caches;
374
375 string map_header_key(const ghobject_t &oid);
376 string header_key(uint64_t seq);
377 string complete_prefix(Header header);
378 string user_prefix(Header header);
379 string sys_prefix(Header header);
380 string xattr_prefix(Header header);
381 string sys_parent_prefix(_Header header);
382 string sys_parent_prefix(Header header) {
383 return sys_parent_prefix(*header);
384 }
385
386 class EmptyIteratorImpl : public ObjectMapIteratorImpl {
387 public:
388 int seek_to_first() override { return 0; }
389 int seek_to_last() { return 0; }
390 int upper_bound(const string &after) override { return 0; }
391 int lower_bound(const string &to) override { return 0; }
392 bool valid() override { return false; }
11fdf7f2 393 int next() override { ceph_abort(); return 0; }
7c673cae
FG
394 string key() override { ceph_abort(); return ""; }
395 bufferlist value() override { ceph_abort(); return bufferlist(); }
396 int status() override { return 0; }
397 };
398
399
400 /// Iterator
401 class DBObjectMapIteratorImpl : public ObjectMapIteratorImpl {
402 public:
403 DBObjectMap *map;
404
405 /// NOTE: implicit lock hlock->get_locked() when returned out of the class
406 MapHeaderLock hlock;
407 /// NOTE: implicit lock on header->seq AND for all ancestors
408 Header header;
409
410 /// parent_iter == NULL iff no parent
11fdf7f2 411 std::shared_ptr<DBObjectMapIteratorImpl> parent_iter;
7c673cae
FG
412 KeyValueDB::Iterator key_iter;
413 KeyValueDB::Iterator complete_iter;
414
415 /// cur_iter points to currently valid iterator
11fdf7f2 416 std::shared_ptr<ObjectMapIteratorImpl> cur_iter;
7c673cae
FG
417 int r;
418
419 /// init() called, key_iter, complete_iter, parent_iter filled in
420 bool ready;
421 /// past end
422 bool invalid;
423
424 DBObjectMapIteratorImpl(DBObjectMap *map, Header header) :
425 map(map), hlock(map), header(header), r(0), ready(false), invalid(true) {}
426 int seek_to_first() override;
427 int seek_to_last();
428 int upper_bound(const string &after) override;
429 int lower_bound(const string &to) override;
430 bool valid() override;
11fdf7f2 431 int next() override;
7c673cae
FG
432 string key() override;
433 bufferlist value() override;
434 int status() override;
435
436 bool on_parent() {
437 return cur_iter == parent_iter;
438 }
439
440 /// skips to next valid parent entry
441 int next_parent();
442
443 /// first parent() >= to
444 int lower_bound_parent(const string &to);
445
446 /**
447 * Tests whether to_test is in complete region
448 *
449 * postcondition: complete_iter will be max s.t. complete_iter->value > to_test
450 */
451 int in_complete_region(const string &to_test, ///< [in] key to test
452 string *begin, ///< [out] beginning of region
453 string *end ///< [out] end of region
454 ); ///< @returns true if to_test is in the complete region, else false
455
456 private:
457 int init();
458 bool valid_parent();
459 int adjust();
460 };
461
11fdf7f2 462 typedef std::shared_ptr<DBObjectMapIteratorImpl> DBObjectMapIterator;
7c673cae
FG
463 DBObjectMapIterator _get_iterator(Header header) {
464 return std::make_shared<DBObjectMapIteratorImpl>(this, header);
465 }
466
467 /// sys
468
469 /// Removes node corresponding to header
470 void clear_header(Header header, KeyValueDB::Transaction t);
471
472 /// Set node containing input to new contents
473 void set_header(Header input, KeyValueDB::Transaction t);
474
475 /// Remove leaf node corresponding to oid in c
476 void remove_map_header(
477 const MapHeaderLock &l,
478 const ghobject_t &oid,
479 Header header,
480 KeyValueDB::Transaction t);
481
482 /// Set leaf node for c and oid to the value of header
483 void set_map_header(
484 const MapHeaderLock &l,
485 const ghobject_t &oid, _Header header,
486 KeyValueDB::Transaction t);
487
488 /// Set leaf node for c and oid to the value of header
489 bool check_spos(const ghobject_t &oid,
490 Header header,
491 const SequencerPosition *spos);
492
493 /// Lookup or create header for c oid
494 Header lookup_create_map_header(
495 const MapHeaderLock &l,
496 const ghobject_t &oid,
497 KeyValueDB::Transaction t);
498
499 /**
500 * Generate new header for c oid with new seq number
501 *
11fdf7f2 502 * Has the side effect of synchronously saving the new DBObjectMap state
7c673cae
FG
503 */
504 Header _generate_new_header(const ghobject_t &oid, Header parent);
505 Header generate_new_header(const ghobject_t &oid, Header parent) {
9f95a23c 506 std::lock_guard l{header_lock};
7c673cae
FG
507 return _generate_new_header(oid, parent);
508 }
509
510 /// Lookup leaf header for c oid
511 Header _lookup_map_header(
512 const MapHeaderLock &l,
513 const ghobject_t &oid);
514 Header lookup_map_header(
515 const MapHeaderLock &l2,
516 const ghobject_t &oid) {
9f95a23c 517 std::lock_guard l{header_lock};
7c673cae
FG
518 return _lookup_map_header(l2, oid);
519 }
520
521 /// Lookup header node for input
522 Header lookup_parent(Header input);
523
524
525 /// Helpers
526 int _get_header(Header header, bufferlist *bl);
527
528 /// Scan keys in header into out_keys and out_values (if nonnull)
529 int scan(Header header,
530 const set<string> &in_keys,
531 set<string> *out_keys,
532 map<string, bufferlist> *out_values);
533
534 /// Remove header and all related prefixes
535 int _clear(Header header,
536 KeyValueDB::Transaction t);
537
538 /* Scan complete region bumping *begin to the beginning of any
539 * containing region and adding all complete region keys between
540 * the updated begin and end to the complete_keys_to_remove set */
541 int merge_new_complete(DBObjectMapIterator &iter,
542 string *begin,
543 const string &end,
544 set<string> *complete_keys_to_remove);
545
546 /// Writes out State (mainly next_seq)
547 int write_state(KeyValueDB::Transaction _t =
548 KeyValueDB::Transaction());
549
550 /// Copies header entry from parent @see rm_keys
551 int copy_up_header(Header header,
552 KeyValueDB::Transaction t);
553
554 /// Sets header @see set_header
555 void _set_header(Header header, const bufferlist &bl,
556 KeyValueDB::Transaction t);
557
558 /**
559 * Removes header seq lock and possibly object lock
560 * once Header is out of scope
561 * @see lookup_parent
562 * @see generate_new_header
563 */
564 class RemoveOnDelete {
565 public:
566 DBObjectMap *db;
567 explicit RemoveOnDelete(DBObjectMap *db) :
568 db(db) {}
569 void operator() (_Header *header) {
9f95a23c 570 std::lock_guard l{db->header_lock};
11fdf7f2 571 ceph_assert(db->in_use.count(header->seq));
7c673cae 572 db->in_use.erase(header->seq);
9f95a23c 573 db->header_cond.notify_all();
7c673cae
FG
574 delete header;
575 }
576 };
577 friend class RemoveOnDelete;
578};
579WRITE_CLASS_ENCODER(DBObjectMap::_Header)
580WRITE_CLASS_ENCODER(DBObjectMap::State)
581
582ostream& operator<<(ostream& out, const DBObjectMap::_Header& h);
583
584#endif