]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | #ifndef DBOBJECTMAP_DB_H | |
3 | #define DBOBJECTMAP_DB_H | |
4 | ||
5 | #include "include/buffer_fwd.h" | |
6 | #include <set> | |
7 | #include <map> | |
8 | #include <string> | |
9 | ||
10 | #include <vector> | |
7c673cae FG |
11 | #include <boost/scoped_ptr.hpp> |
12 | ||
13 | #include "os/ObjectMap.h" | |
14 | #include "kv/KeyValueDB.h" | |
15 | #include "osd/osd_types.h" | |
9f95a23c | 16 | #include "common/ceph_mutex.h" |
7c673cae FG |
17 | #include "common/simple_cache.hpp" |
18 | #include <boost/optional/optional_io.hpp> | |
19 | ||
20 | #include "SequencerPosition.h" | |
21 | ||
22 | /** | |
23 | * DBObjectMap: Implements ObjectMap in terms of KeyValueDB | |
24 | * | |
25 | * Prefix space structure: | |
26 | * | |
27 | * @see complete_prefix | |
28 | * @see user_prefix | |
29 | * @see sys_prefix | |
30 | * | |
31 | * - HOBJECT_TO_SEQ: Contains leaf mapping from ghobject_t->header.seq and | |
32 | * corresponding omap header | |
33 | * - SYS_PREFIX: GLOBAL_STATE_KEY - contains next seq number | |
34 | * @see State | |
35 | * @see write_state | |
36 | * @see init | |
37 | * @see generate_new_header | |
38 | * - USER_PREFIX + header_key(header->seq) + USER_PREFIX | |
39 | * : key->value for header->seq | |
40 | * - USER_PREFIX + header_key(header->seq) + COMPLETE_PREFIX: see below | |
41 | * - USER_PREFIX + header_key(header->seq) + XATTR_PREFIX: xattrs | |
42 | * - USER_PREFIX + header_key(header->seq) + SYS_PREFIX | |
43 | * : USER_HEADER_KEY - omap header for header->seq | |
44 | * : HEADER_KEY - encoding of header for header->seq | |
45 | * | |
46 | * For each node (represented by a header), we | |
47 | * store three mappings: the key mapping, the complete mapping, and the parent. | |
48 | * The complete mapping (COMPLETE_PREFIX space) is key->key. Each x->y entry in | |
49 | * this mapping indicates that the key mapping contains all entries on [x,y). | |
50 | * Note, max string is represented by "", so ""->"" indicates that the parent | |
51 | * is unnecessary (@see rm_keys). When looking up a key not contained in the | |
52 | * complete set, we have to check the parent if we don't find it in the | |
53 | * key set. During rm_keys, we copy keys from the parent and update the | |
54 | * complete set to reflect the change @see rm_keys. | |
55 | */ | |
56 | class DBObjectMap : public ObjectMap { | |
57 | public: | |
11fdf7f2 TL |
58 | |
59 | KeyValueDB *get_db() override { return db.get(); } | |
7c673cae FG |
60 | |
61 | /** | |
62 | * Serializes access to next_seq as well as the in_use and map_header_in_use sets | |
63 | */ | |
9f95a23c TL |
64 | ceph::mutex header_lock = ceph::make_mutex("DBOBjectMap"); |
65 | ceph::condition_variable header_cond; | |
66 | ceph::condition_variable map_header_cond; | |
7c673cae FG |
67 | |
68 | /** | |
69 | * Set of headers currently in use | |
70 | */ | |
71 | set<uint64_t> in_use; | |
72 | set<ghobject_t> map_header_in_use; | |
73 | ||
74 | /** | |
75 | * Takes the map_header_in_use entry for oid in the (db, oid) constructor, waiting on map_header_cond under header_lock while the entry is already present; releases in | |
76 | * destructor (erases the entry and notifies map_header_cond under header_lock). The db-only constructor holds no entry; swap() exchanges the held entry with another lock on the same db. | |
77 | */ | |
78 | class MapHeaderLock { | |
79 | DBObjectMap *db; | |
80 | boost::optional<ghobject_t> locked; | |
81 | ||
82 | MapHeaderLock(const MapHeaderLock &); | |
83 | MapHeaderLock &operator=(const MapHeaderLock &); | |
84 | public: | |
85 | explicit MapHeaderLock(DBObjectMap *db) : db(db) {} | |
86 | MapHeaderLock(DBObjectMap *db, const ghobject_t &oid) : db(db), locked(oid) { | |
9f95a23c TL |
87 | std::unique_lock l{db->header_lock}; |
88 | db->map_header_cond.wait(l, [db, this] { | |
89 | return !db->map_header_in_use.count(*locked); | |
90 | }); | |
7c673cae FG |
91 | db->map_header_in_use.insert(*locked); |
92 | } | |
93 | ||
94 | const ghobject_t &get_locked() const { | |
11fdf7f2 | 95 | ceph_assert(locked); |
7c673cae FG |
96 | return *locked; |
97 | } | |
98 | ||
99 | void swap(MapHeaderLock &o) { | |
11fdf7f2 | 100 | ceph_assert(db == o.db); |
7c673cae FG |
101 |
102 | // centos6's boost optional doesn't seem to have swap :( so exchange through a temporary copy | |
103 | boost::optional<ghobject_t> _locked = o.locked; | |
104 | o.locked = locked; | |
105 | locked = _locked; | |
106 | } | |
107 | ||
108 | ~MapHeaderLock() { | |
109 | if (locked) { | |
9f95a23c | 110 | std::lock_guard l{db->header_lock}; |
11fdf7f2 | 111 | ceph_assert(db->map_header_in_use.count(*locked)); |
9f95a23c | 112 | db->map_header_cond.notify_all(); |
7c673cae FG |
113 | db->map_header_in_use.erase(*locked); |
114 | } | |
115 | } | |
116 | }; | |
117 | ||
118 | DBObjectMap(CephContext* cct, KeyValueDB *db) | |
9f95a23c | 119 | : ObjectMap(cct, db), |
7c673cae FG |
120 | caches(cct->_conf->filestore_omap_header_cache_size) |
121 | {} | |
122 | ||
123 | int set_keys( | |
124 | const ghobject_t &oid, | |
125 | const map<string, bufferlist> &set, | |
126 | const SequencerPosition *spos=0 | |
127 | ) override; | |
128 | ||
129 | int set_header( | |
130 | const ghobject_t &oid, | |
131 | const bufferlist &bl, | |
132 | const SequencerPosition *spos=0 | |
133 | ) override; | |
134 | ||
135 | int get_header( | |
136 | const ghobject_t &oid, | |
137 | bufferlist *bl | |
138 | ) override; | |
139 | ||
140 | int clear( | |
141 | const ghobject_t &oid, | |
142 | const SequencerPosition *spos=0 | |
143 | ) override; | |
144 | ||
145 | int clear_keys_header( | |
146 | const ghobject_t &oid, | |
147 | const SequencerPosition *spos=0 | |
148 | ) override; | |
149 | ||
150 | int rm_keys( | |
151 | const ghobject_t &oid, | |
152 | const set<string> &to_clear, | |
153 | const SequencerPosition *spos=0 | |
154 | ) override; | |
155 | ||
156 | int get( | |
157 | const ghobject_t &oid, | |
158 | bufferlist *header, | |
159 | map<string, bufferlist> *out | |
160 | ) override; | |
161 | ||
162 | int get_keys( | |
163 | const ghobject_t &oid, | |
164 | set<string> *keys | |
165 | ) override; | |
166 | ||
167 | int get_values( | |
168 | const ghobject_t &oid, | |
169 | const set<string> &keys, | |
170 | map<string, bufferlist> *out | |
171 | ) override; | |
172 | ||
173 | int check_keys( | |
174 | const ghobject_t &oid, | |
175 | const set<string> &keys, | |
176 | set<string> *out | |
177 | ) override; | |
178 | ||
179 | int get_xattrs( | |
180 | const ghobject_t &oid, | |
181 | const set<string> &to_get, | |
182 | map<string, bufferlist> *out | |
183 | ) override; | |
184 | ||
185 | int get_all_xattrs( | |
186 | const ghobject_t &oid, | |
187 | set<string> *out | |
188 | ) override; | |
189 | ||
190 | int set_xattrs( | |
191 | const ghobject_t &oid, | |
192 | const map<string, bufferlist> &to_set, | |
193 | const SequencerPosition *spos=0 | |
194 | ) override; | |
195 | ||
196 | int remove_xattrs( | |
197 | const ghobject_t &oid, | |
198 | const set<string> &to_remove, | |
199 | const SequencerPosition *spos=0 | |
200 | ) override; | |
201 | ||
202 | int clone( | |
203 | const ghobject_t &oid, | |
204 | const ghobject_t &target, | |
205 | const SequencerPosition *spos=0 | |
206 | ) override; | |
207 | ||
208 | int rename( | |
209 | const ghobject_t &from, | |
210 | const ghobject_t &to, | |
211 | const SequencerPosition *spos=0 | |
212 | ); | |
213 | ||
214 | int legacy_clone( | |
215 | const ghobject_t &oid, | |
216 | const ghobject_t &target, | |
217 | const SequencerPosition *spos=0 | |
218 | ); | |
219 | ||
220 | /// Read initial state from backing store | |
3efd9988 FG |
221 | int get_state(); |
222 | /// Write current state settings to DB | |
223 | void set_state(); | |
224 | /// Read initial state and upgrade or initialize state | |
7c673cae FG |
225 | int init(bool upgrade = false); |
226 | ||
227 | /// Upgrade store to current version | |
228 | int upgrade_to_v2(); | |
229 | ||
230 | /// Consistency check, debug, there must be no parallel writes | |
3efd9988 | 231 | int check(std::ostream &out, bool repair = false, bool force = false) override; |
7c673cae FG |
232 | |
233 | /// Ensure that all previous operations are durable | |
234 | int sync(const ghobject_t *oid=0, const SequencerPosition *spos=0) override; | |
235 | ||
224ce89b | 236 | void compact() override { |
11fdf7f2 | 237 | ceph_assert(db); |
224ce89b WB |
238 | db->compact(); |
239 | } | |
240 | ||
7c673cae FG |
241 | /// Util, get all objects, there must be no other concurrent access |
242 | int list_objects(vector<ghobject_t> *objs ///< [out] objects | |
243 | ); | |
244 | ||
245 | struct _Header; | |
246 | // Util, get all object headers, there must be no other concurrent access | |
247 | int list_object_headers(vector<_Header> *out ///< [out] headers | |
248 | ); | |
249 | ||
250 | ObjectMapIterator get_iterator(const ghobject_t &oid) override; | |
251 | ||
252 | static const string USER_PREFIX; | |
253 | static const string XATTR_PREFIX; | |
254 | static const string SYS_PREFIX; | |
255 | static const string COMPLETE_PREFIX; | |
256 | static const string HEADER_KEY; | |
257 | static const string USER_HEADER_KEY; | |
258 | static const string GLOBAL_STATE_KEY; | |
259 | static const string HOBJECT_TO_SEQ; | |
260 | ||
261 | /// Legacy | |
262 | static const string LEAF_PREFIX; | |
263 | static const string REVERSE_LEAF_PREFIX; | |
264 | ||
265 | /// persistent state for store @see generate_new_header | |
266 | struct State { | |
3efd9988 | 267 | static const __u8 CUR_VERSION = 3; |
7c673cae FG |
268 | __u8 v; |
269 | uint64_t seq; | |
3efd9988 FG |
270 | // legacy is false when complete regions were never used |
271 | bool legacy; | |
272 | State() : v(0), seq(1), legacy(false) {} | |
273 | explicit State(uint64_t seq) : v(0), seq(seq), legacy(false) {} | |
7c673cae FG |
274 | |
275 | void encode(bufferlist &bl) const { | |
3efd9988 | 276 | ENCODE_START(3, 1, bl); |
11fdf7f2 TL |
277 | encode(v, bl); |
278 | encode(seq, bl); | |
279 | encode(legacy, bl); | |
7c673cae FG |
280 | ENCODE_FINISH(bl); |
281 | } | |
282 | ||
11fdf7f2 | 283 | void decode(bufferlist::const_iterator &bl) { |
3efd9988 | 284 | DECODE_START(3, bl); |
7c673cae | 285 | if (struct_v >= 2) |
11fdf7f2 | 286 | decode(v, bl); |
7c673cae FG |
287 | else |
288 | v = 0; | |
11fdf7f2 | 289 | decode(seq, bl); |
3efd9988 | 290 | if (struct_v >= 3) |
11fdf7f2 | 291 | decode(legacy, bl); |
3efd9988 FG |
292 | else |
293 | legacy = false; | |
7c673cae FG |
294 | DECODE_FINISH(bl); |
295 | } | |
296 | ||
297 | void dump(Formatter *f) const { | |
3efd9988 | 298 | f->dump_unsigned("v", v); |
7c673cae | 299 | f->dump_unsigned("seq", seq); |
3efd9988 | 300 | f->dump_bool("legacy", legacy); |
7c673cae FG |
301 | } |
302 | ||
303 | static void generate_test_instances(list<State*> &o) { | |
304 | o.push_back(new State(0)); | |
305 | o.push_back(new State(20)); | |
306 | } | |
307 | } state; | |
308 | ||
309 | struct _Header { | |
310 | uint64_t seq; | |
311 | uint64_t parent; | |
312 | uint64_t num_children; | |
313 | ||
314 | ghobject_t oid; | |
315 | ||
316 | SequencerPosition spos; | |
317 | ||
318 | void encode(bufferlist &bl) const { | |
319 | coll_t unused; | |
320 | ENCODE_START(2, 1, bl); | |
11fdf7f2 TL |
321 | encode(seq, bl); |
322 | encode(parent, bl); | |
323 | encode(num_children, bl); | |
324 | encode(unused, bl); | |
325 | encode(oid, bl); | |
326 | encode(spos, bl); | |
7c673cae FG |
327 | ENCODE_FINISH(bl); |
328 | } | |
329 | ||
11fdf7f2 | 330 | void decode(bufferlist::const_iterator &bl) { |
7c673cae FG |
331 | coll_t unused; |
332 | DECODE_START(2, bl); | |
11fdf7f2 TL |
333 | decode(seq, bl); |
334 | decode(parent, bl); | |
335 | decode(num_children, bl); | |
336 | decode(unused, bl); | |
337 | decode(oid, bl); | |
7c673cae | 338 | if (struct_v >= 2) |
11fdf7f2 | 339 | decode(spos, bl); |
7c673cae FG |
340 | DECODE_FINISH(bl); |
341 | } | |
342 | ||
343 | void dump(Formatter *f) const { | |
344 | f->dump_unsigned("seq", seq); | |
345 | f->dump_unsigned("parent", parent); | |
346 | f->dump_unsigned("num_children", num_children); | |
347 | f->dump_stream("oid") << oid; | |
348 | } | |
349 | ||
350 | static void generate_test_instances(list<_Header*> &o) { | |
351 | o.push_back(new _Header); | |
352 | o.push_back(new _Header); | |
353 | o.back()->parent = 20; | |
354 | o.back()->seq = 30; | |
355 | } | |
356 | ||
eafe8130 TL |
357 | size_t length() { |
358 | return sizeof(_Header); | |
359 | } | |
360 | ||
7c673cae FG |
361 | _Header() : seq(0), parent(0), num_children(1) {} |
362 | }; | |
363 | ||
364 | /// String munging (public for testing) | |
365 | static string ghobject_key(const ghobject_t &oid); | |
366 | static string ghobject_key_v0(coll_t c, const ghobject_t &oid); | |
367 | static int is_buggy_ghobject_key_v1(CephContext* cct, | |
368 | const string &in); | |
369 | private: | |
370 | /// Implicit lock on Header->seq | |
11fdf7f2 | 371 | typedef std::shared_ptr<_Header> Header; |
9f95a23c | 372 | ceph::mutex cache_lock = ceph::make_mutex("DBObjectMap::CacheLock"); |
7c673cae FG |
373 | SimpleLRU<ghobject_t, _Header> caches; |
374 | ||
375 | string map_header_key(const ghobject_t &oid); | |
376 | string header_key(uint64_t seq); | |
377 | string complete_prefix(Header header); | |
378 | string user_prefix(Header header); | |
379 | string sys_prefix(Header header); | |
380 | string xattr_prefix(Header header); | |
381 | string sys_parent_prefix(_Header header); | |
382 | string sys_parent_prefix(Header header) { | |
383 | return sys_parent_prefix(*header); | |
384 | } | |
385 | ||
386 | class EmptyIteratorImpl : public ObjectMapIteratorImpl { | |
387 | public: | |
388 | int seek_to_first() override { return 0; } | |
389 | int seek_to_last() { return 0; } | |
390 | int upper_bound(const string &after) override { return 0; } | |
391 | int lower_bound(const string &to) override { return 0; } | |
392 | bool valid() override { return false; } | |
11fdf7f2 | 393 | int next() override { ceph_abort(); return 0; } |
7c673cae FG |
394 | string key() override { ceph_abort(); return ""; } |
395 | bufferlist value() override { ceph_abort(); return bufferlist(); } | |
396 | int status() override { return 0; } | |
397 | }; | |
398 | ||
399 | ||
400 | /// Iterator | |
401 | class DBObjectMapIteratorImpl : public ObjectMapIteratorImpl { | |
402 | public: | |
403 | DBObjectMap *map; | |
404 | ||
405 | /// NOTE: implicit lock hlock.get_locked() when returned out of the class | |
406 | MapHeaderLock hlock; | |
407 | /// NOTE: implicit lock on header->seq AND for all ancestors | |
408 | Header header; | |
409 | ||
410 | /// parent_iter == NULL iff no parent | |
11fdf7f2 | 411 | std::shared_ptr<DBObjectMapIteratorImpl> parent_iter; |
7c673cae FG |
412 | KeyValueDB::Iterator key_iter; |
413 | KeyValueDB::Iterator complete_iter; | |
414 | ||
415 | /// cur_iter points to currently valid iterator | |
11fdf7f2 | 416 | std::shared_ptr<ObjectMapIteratorImpl> cur_iter; |
7c673cae FG |
417 | int r; |
418 | ||
419 | /// init() called, key_iter, complete_iter, parent_iter filled in | |
420 | bool ready; | |
421 | /// past end | |
422 | bool invalid; | |
423 | ||
424 | DBObjectMapIteratorImpl(DBObjectMap *map, Header header) : | |
425 | map(map), hlock(map), header(header), r(0), ready(false), invalid(true) {} | |
426 | int seek_to_first() override; | |
427 | int seek_to_last(); | |
428 | int upper_bound(const string &after) override; | |
429 | int lower_bound(const string &to) override; | |
430 | bool valid() override; | |
11fdf7f2 | 431 | int next() override; |
7c673cae FG |
432 | string key() override; |
433 | bufferlist value() override; | |
434 | int status() override; | |
435 | ||
436 | bool on_parent() { | |
437 | return cur_iter == parent_iter; | |
438 | } | |
439 | ||
440 | /// skips to next valid parent entry | |
441 | int next_parent(); | |
442 | ||
443 | /// first parent() >= to | |
444 | int lower_bound_parent(const string &to); | |
445 | ||
446 | /** | |
447 | * Tests whether to_test is in complete region | |
448 | * | |
449 | * postcondition: complete_iter will be max s.t. complete_iter->value > to_test | |
450 | */ | |
451 | int in_complete_region(const string &to_test, ///< [in] key to test | |
452 | string *begin, ///< [out] beginning of region | |
453 | string *end ///< [out] end of region | |
454 | ); ///< @returns true if to_test is in the complete region, else false | |
455 | ||
456 | private: | |
457 | int init(); | |
458 | bool valid_parent(); | |
459 | int adjust(); | |
460 | }; | |
461 | ||
11fdf7f2 | 462 | typedef std::shared_ptr<DBObjectMapIteratorImpl> DBObjectMapIterator; |
7c673cae FG |
463 | DBObjectMapIterator _get_iterator(Header header) { |
464 | return std::make_shared<DBObjectMapIteratorImpl>(this, header); | |
465 | } | |
466 | ||
467 | /// sys | |
468 | ||
469 | /// Removes node corresponding to header | |
470 | void clear_header(Header header, KeyValueDB::Transaction t); | |
471 | ||
472 | /// Set node containing input to new contents | |
473 | void set_header(Header input, KeyValueDB::Transaction t); | |
474 | ||
475 | /// Remove leaf node corresponding to oid in c | |
476 | void remove_map_header( | |
477 | const MapHeaderLock &l, | |
478 | const ghobject_t &oid, | |
479 | Header header, | |
480 | KeyValueDB::Transaction t); | |
481 | ||
482 | /// Set leaf node for c and oid to the value of header | |
483 | void set_map_header( | |
484 | const MapHeaderLock &l, | |
485 | const ghobject_t &oid, _Header header, | |
486 | KeyValueDB::Transaction t); | |
487 | ||
488 | /// Check spos against header for oid (presumably tests, via header->spos, whether the op at spos was already applied - TODO confirm against the implementation) | |
489 | bool check_spos(const ghobject_t &oid, | |
490 | Header header, | |
491 | const SequencerPosition *spos); | |
492 | ||
493 | /// Lookup or create header for c oid | |
494 | Header lookup_create_map_header( | |
495 | const MapHeaderLock &l, | |
496 | const ghobject_t &oid, | |
497 | KeyValueDB::Transaction t); | |
498 | ||
499 | /** | |
500 | * Generate new header for c oid with new seq number | |
501 | * generate_new_header() acquires header_lock and then calls _generate_new_header(); callers of the underscore variant presumably must already hold header_lock (confirm against the implementation) | |
11fdf7f2 | 502 | * Has the side effect of synchronously saving the new DBObjectMap state |
7c673cae FG |
503 | */ |
504 | Header _generate_new_header(const ghobject_t &oid, Header parent); | |
505 | Header generate_new_header(const ghobject_t &oid, Header parent) { | |
9f95a23c | 506 | std::lock_guard l{header_lock}; |
7c673cae FG |
507 | return _generate_new_header(oid, parent); |
508 | } | |
509 | ||
510 | /// Lookup leaf header for c oid; lookup_map_header() acquires header_lock and forwards to _lookup_map_header() (which presumably expects header_lock to be held already - confirm against the implementation) | |
511 | Header _lookup_map_header( | |
512 | const MapHeaderLock &l, | |
513 | const ghobject_t &oid); | |
514 | Header lookup_map_header( | |
515 | const MapHeaderLock &l2, | |
516 | const ghobject_t &oid) { | |
9f95a23c | 517 | std::lock_guard l{header_lock}; |
7c673cae FG |
518 | return _lookup_map_header(l2, oid); |
519 | } | |
520 | ||
521 | /// Lookup header node for input | |
522 | Header lookup_parent(Header input); | |
523 | ||
524 | ||
525 | /// Helpers | |
526 | int _get_header(Header header, bufferlist *bl); | |
527 | ||
528 | /// Scan keys in header into out_keys and out_values (if nonnull) | |
529 | int scan(Header header, | |
530 | const set<string> &in_keys, | |
531 | set<string> *out_keys, | |
532 | map<string, bufferlist> *out_values); | |
533 | ||
534 | /// Remove header and all related prefixes | |
535 | int _clear(Header header, | |
536 | KeyValueDB::Transaction t); | |
537 | ||
538 | /* Scan complete region bumping *begin to the beginning of any | |
539 | * containing region and adding all complete region keys between | |
540 | * the updated begin and end to the complete_keys_to_remove set */ | |
541 | int merge_new_complete(DBObjectMapIterator &iter, | |
542 | string *begin, | |
543 | const string &end, | |
544 | set<string> *complete_keys_to_remove); | |
545 | ||
546 | /// Writes out State (mainly next_seq) | |
547 | int write_state(KeyValueDB::Transaction _t = | |
548 | KeyValueDB::Transaction()); | |
549 | ||
550 | /// Copies header entry from parent @see rm_keys | |
551 | int copy_up_header(Header header, | |
552 | KeyValueDB::Transaction t); | |
553 | ||
554 | /// Sets header @see set_header | |
555 | void _set_header(Header header, const bufferlist &bl, | |
556 | KeyValueDB::Transaction t); | |
557 | ||
558 | /** | |
559 | * Removes header seq lock and possibly object lock | |
560 | * once Header is out of scope: operator() takes header_lock, asserts header->seq is in in_use, erases it, notifies header_cond, then deletes the _Header. NOTE(review): nothing in this functor touches map_header_in_use, so the object lock wording may be stale - confirm. | |
561 | * @see lookup_parent | |
562 | * @see generate_new_header | |
563 | */ | |
564 | class RemoveOnDelete { | |
565 | public: | |
566 | DBObjectMap *db; | |
567 | explicit RemoveOnDelete(DBObjectMap *db) : | |
568 | db(db) {} | |
569 | void operator() (_Header *header) { | |
9f95a23c | 570 | std::lock_guard l{db->header_lock}; |
11fdf7f2 | 571 | ceph_assert(db->in_use.count(header->seq)); |
7c673cae | 572 | db->in_use.erase(header->seq); |
9f95a23c | 573 | db->header_cond.notify_all(); |
7c673cae FG |
574 | delete header; |
575 | } | |
576 | }; | |
577 | friend class RemoveOnDelete; | |
578 | }; | |
579 | WRITE_CLASS_ENCODER(DBObjectMap::_Header) | |
580 | WRITE_CLASS_ENCODER(DBObjectMap::State) | |
581 | ||
582 | ostream& operator<<(ostream& out, const DBObjectMap::_Header& h); | |
583 | ||
584 | #endif |