]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | #ifndef DBOBJECTMAP_DB_H | |
3 | #define DBOBJECTMAP_DB_H | |
4 | ||
5 | #include "include/buffer_fwd.h" | |
6 | #include <set> | |
7 | #include <map> | |
8 | #include <string> | |
9 | ||
10 | #include <vector> | |
11 | #include "include/memory.h" | |
12 | #include <boost/scoped_ptr.hpp> | |
13 | ||
14 | #include "os/ObjectMap.h" | |
15 | #include "kv/KeyValueDB.h" | |
16 | #include "osd/osd_types.h" | |
17 | #include "common/Mutex.h" | |
18 | #include "common/Cond.h" | |
19 | #include "common/simple_cache.hpp" | |
20 | #include <boost/optional/optional_io.hpp> | |
21 | ||
22 | #include "SequencerPosition.h" | |
23 | ||
24 | /** | |
25 | * DBObjectMap: Implements ObjectMap in terms of KeyValueDB | |
26 | * | |
27 | * Prefix space structure: | |
28 | * | |
29 | * @see complete_prefix | |
30 | * @see user_prefix | |
31 | * @see sys_prefix | |
32 | * | |
33 | * - HOBJECT_TO_SEQ: Contains leaf mapping from ghobject_t->header.seq and | |
34 | * corresponding omap header | |
35 | * - SYS_PREFIX: GLOBAL_STATE_KEY - contains next seq number | |
36 | * @see State | |
37 | * @see write_state | |
38 | * @see init | |
39 | * @see generate_new_header | |
40 | * - USER_PREFIX + header_key(header->seq) + USER_PREFIX | |
41 | * : key->value for header->seq | |
42 | * - USER_PREFIX + header_key(header->seq) + COMPLETE_PREFIX: see below | |
43 | * - USER_PREFIX + header_key(header->seq) + XATTR_PREFIX: xattrs | |
44 | * - USER_PREFIX + header_key(header->seq) + SYS_PREFIX | |
45 | * : USER_HEADER_KEY - omap header for header->seq | |
46 | * : HEADER_KEY - encoding of header for header->seq | |
47 | * | |
48 | * For each node (represented by a header), we | |
49 | * store three mappings: the key mapping, the complete mapping, and the parent. | |
50 | * The complete mapping (COMPLETE_PREFIX space) is key->key. Each x->y entry in | |
51 | * this mapping indicates that the key mapping contains all entries on [x,y). | |
52 | * Note, max string is represented by "", so ""->"" indicates that the parent | |
53 | * is unnecessary (@see rm_keys). When looking up a key not contained in the | |
54 | * complete set, we have to check the parent if we don't find it in the | |
55 | * key set. During rm_keys, we copy keys from the parent and update the | |
56 | * complete set to reflect the change @see rm_keys. | |
57 | */ | |
58 | class DBObjectMap : public ObjectMap { | |
59 | public: | |
60 | boost::scoped_ptr<KeyValueDB> db; | |
61 | ||
62 | /** | |
63 | * Serializes access to next_seq as well as the in_use set | |
64 | */ | |
65 | Mutex header_lock; | |
66 | Cond header_cond; | |
67 | Cond map_header_cond; | |
68 | ||
69 | /** | |
70 | * Set of headers currently in use | |
71 | */ | |
72 | set<uint64_t> in_use; | |
73 | set<ghobject_t> map_header_in_use; | |
74 | ||
75 | /** | |
76 | * Takes the map_header_in_use entry in constructor, releases in | |
77 | * destructor | |
78 | */ | |
79 | class MapHeaderLock { /* scoped holder of one oid's entry in DBObjectMap::map_header_in_use */ | |
80 | DBObjectMap *db; /* map whose header_lock, map_header_cond and map_header_in_use are used */ | |
81 | boost::optional<ghobject_t> locked; /* oid currently held; empty when nothing is held */ | |
82 | ||
83 | MapHeaderLock(const MapHeaderLock &); /* declared in the private section with no body here: not copyable by users */ | |
84 | MapHeaderLock &operator=(const MapHeaderLock &); /* same for copy assignment */ | |
85 | public: | |
86 | explicit MapHeaderLock(DBObjectMap *db) : db(db) {} /* holds no oid, so the destructor has nothing to release */ | |
87 | MapHeaderLock(DBObjectMap *db, const ghobject_t &oid) : db(db), locked(oid) { /* waits until oid is free, then marks it in use */ | |
88 | Mutex::Locker l(db->header_lock); /* map_header_in_use is read and written under header_lock */ | |
89 | while (db->map_header_in_use.count(*locked)) /* oid already held elsewhere: wait on map_header_cond and re-check */ | |
90 | db->map_header_cond.Wait(db->header_lock); | |
91 | db->map_header_in_use.insert(*locked); | |
92 | } | |
93 | ||
94 | const ghobject_t &get_locked() const { /* returns the held oid */ | |
95 | assert(locked); /* must not be called on a lock that holds nothing */ | |
96 | return *locked; | |
97 | } | |
98 | ||
99 | void swap(MapHeaderLock &o) { /* exchanges the held oids of the two locks */ | |
100 | assert(db == o.db); /* both locks must refer to the same DBObjectMap */ | |
101 | ||
102 | // centos6's boost optional doesn't seem to have swap :( | |
103 | boost::optional<ghobject_t> _locked = o.locked; /* manual three-step exchange through a temporary copy */ | |
104 | o.locked = locked; | |
105 | locked = _locked; | |
106 | } | |
107 | ||
108 | ~MapHeaderLock() { /* releases the held oid, if any */ | |
109 | if (locked) { | |
110 | Mutex::Locker l(db->header_lock); | |
111 | assert(db->map_header_in_use.count(*locked)); /* the entry inserted at construction (or swapped in) must still be there */ | |
112 | db->map_header_cond.Signal(); /* signalled while header_lock is still held, before the erase below */ | |
113 | db->map_header_in_use.erase(*locked); | |
114 | } | |
115 | } | |
116 | }; | |
117 | ||
118 | DBObjectMap(CephContext* cct, KeyValueDB *db) /* db is handed to the boost::scoped_ptr member, which then owns it */ | |
119 | : ObjectMap(cct), db(db), header_lock("DBOBjectMap"), /* NOTE(review): lock name is spelled "DBOBjectMap" (capital B); runtime string, confirm before changing */ | |
120 | cache_lock("DBObjectMap::CacheLock"), | |
121 | caches(cct->_conf->filestore_omap_header_cache_size) /* header cache is sized from the filestore_omap_header_cache_size config value */ | |
122 | {} | |
123 | ||
124 | int set_keys( | |
125 | const ghobject_t &oid, | |
126 | const map<string, bufferlist> &set, | |
127 | const SequencerPosition *spos=0 | |
128 | ) override; | |
129 | ||
130 | int set_header( | |
131 | const ghobject_t &oid, | |
132 | const bufferlist &bl, | |
133 | const SequencerPosition *spos=0 | |
134 | ) override; | |
135 | ||
136 | int get_header( | |
137 | const ghobject_t &oid, | |
138 | bufferlist *bl | |
139 | ) override; | |
140 | ||
141 | int clear( | |
142 | const ghobject_t &oid, | |
143 | const SequencerPosition *spos=0 | |
144 | ) override; | |
145 | ||
146 | int clear_keys_header( | |
147 | const ghobject_t &oid, | |
148 | const SequencerPosition *spos=0 | |
149 | ) override; | |
150 | ||
151 | int rm_keys( | |
152 | const ghobject_t &oid, | |
153 | const set<string> &to_clear, | |
154 | const SequencerPosition *spos=0 | |
155 | ) override; | |
156 | ||
157 | int get( | |
158 | const ghobject_t &oid, | |
159 | bufferlist *header, | |
160 | map<string, bufferlist> *out | |
161 | ) override; | |
162 | ||
163 | int get_keys( | |
164 | const ghobject_t &oid, | |
165 | set<string> *keys | |
166 | ) override; | |
167 | ||
168 | int get_values( | |
169 | const ghobject_t &oid, | |
170 | const set<string> &keys, | |
171 | map<string, bufferlist> *out | |
172 | ) override; | |
173 | ||
174 | int check_keys( | |
175 | const ghobject_t &oid, | |
176 | const set<string> &keys, | |
177 | set<string> *out | |
178 | ) override; | |
179 | ||
180 | int get_xattrs( | |
181 | const ghobject_t &oid, | |
182 | const set<string> &to_get, | |
183 | map<string, bufferlist> *out | |
184 | ) override; | |
185 | ||
186 | int get_all_xattrs( | |
187 | const ghobject_t &oid, | |
188 | set<string> *out | |
189 | ) override; | |
190 | ||
191 | int set_xattrs( | |
192 | const ghobject_t &oid, | |
193 | const map<string, bufferlist> &to_set, | |
194 | const SequencerPosition *spos=0 | |
195 | ) override; | |
196 | ||
197 | int remove_xattrs( | |
198 | const ghobject_t &oid, | |
199 | const set<string> &to_remove, | |
200 | const SequencerPosition *spos=0 | |
201 | ) override; | |
202 | ||
203 | int clone( | |
204 | const ghobject_t &oid, | |
205 | const ghobject_t &target, | |
206 | const SequencerPosition *spos=0 | |
207 | ) override; | |
208 | ||
209 | int rename( | |
210 | const ghobject_t &from, | |
211 | const ghobject_t &to, | |
212 | const SequencerPosition *spos=0 | |
213 | ); | |
214 | ||
215 | int legacy_clone( | |
216 | const ghobject_t &oid, | |
217 | const ghobject_t &target, | |
218 | const SequencerPosition *spos=0 | |
219 | ); | |
220 | ||
221 | /// Read initial state from backing store | |
222 | int init(bool upgrade = false); | |
223 | ||
224 | /// Upgrade store to current version | |
225 | int upgrade_to_v2(); | |
226 | ||
227 | /// Consistency check, debug, there must be no parallel writes | |
228 | int check(std::ostream &out, bool repair = false) override; | |
229 | ||
230 | /// Ensure that all previous operations are durable | |
231 | int sync(const ghobject_t *oid=0, const SequencerPosition *spos=0) override; | |
232 | ||
224ce89b WB |
233 | void compact() override { /* forwards to compact() on the backing KeyValueDB */ |
234 | assert(db); /* the scoped_ptr must hold a database */ | |
235 | db->compact(); | |
236 | } | |
237 | ||
7c673cae FG |
238 | /// Util, get all objects, there must be no other concurrent access |
239 | int list_objects(vector<ghobject_t> *objs ///< [out] objects | |
240 | ); | |
241 | ||
242 | struct _Header; | |
243 | // Util, get all object headers, there must be no other concurrent access | |
244 | int list_object_headers(vector<_Header> *out ///< [out] headers | |
245 | ); | |
246 | ||
247 | ObjectMapIterator get_iterator(const ghobject_t &oid) override; | |
248 | ||
249 | static const string USER_PREFIX; | |
250 | static const string XATTR_PREFIX; | |
251 | static const string SYS_PREFIX; | |
252 | static const string COMPLETE_PREFIX; | |
253 | static const string HEADER_KEY; | |
254 | static const string USER_HEADER_KEY; | |
255 | static const string GLOBAL_STATE_KEY; | |
256 | static const string HOBJECT_TO_SEQ; | |
257 | ||
258 | /// Legacy | |
259 | static const string LEAF_PREFIX; | |
260 | static const string REVERSE_LEAF_PREFIX; | |
261 | ||
262 | /// persistent state for store @see generate_new_header | |
263 | struct State { | |
264 | __u8 v; /* presumably the store format version -- TODO confirm against upgrade_to_v2 */ | |
265 | uint64_t seq; /* sequence counter; the class comment describes the stored value as the next seq number */ | |
266 | State() : v(0), seq(1) {} /* default state: v 0, seq 1 */ | |
267 | explicit State(uint64_t seq) : v(0), seq(seq) {} /* v is always 0 here, only seq is caller-supplied */ | |
268 | ||
269 | void encode(bufferlist &bl) const { /* writes v then seq inside an ENCODE_START/ENCODE_FINISH envelope */ | |
270 | ENCODE_START(2, 1, bl); /* arguments 2 and 1 -- presumably struct version and compat version; confirm in the encoding macros */ | |
271 | ::encode(v, bl); | |
272 | ::encode(seq, bl); | |
273 | ENCODE_FINISH(bl); | |
274 | } | |
275 | ||
276 | void decode(bufferlist::iterator &bl) { /* reads the fields back in the order encode wrote them */ | |
277 | DECODE_START(2, bl); | |
278 | if (struct_v >= 2) /* v is only present when the encoded struct_v is at least 2 */ | |
279 | ::decode(v, bl); | |
280 | else | |
281 | v = 0; /* older encoding without v: fall back to 0 */ | |
282 | ::decode(seq, bl); | |
283 | DECODE_FINISH(bl); | |
284 | } | |
285 | ||
286 | void dump(Formatter *f) const { /* emits seq only; v is not dumped */ | |
287 | f->dump_unsigned("seq", seq); | |
288 | } | |
289 | ||
290 | static void generate_test_instances(list<State*> &o) { /* appends two heap-allocated samples, seq 0 and seq 20 */ | |
291 | o.push_back(new State(0)); | |
292 | o.push_back(new State(20)); | |
293 | } | |
294 | } state; /* the State member of this DBObjectMap */ | |
295 | ||
296 | struct _Header { /* one node of the header tree described in the class comment */ | |
297 | uint64_t seq; /* sequence number of this node */ | |
298 | uint64_t parent; /* parent field; defaults to 0 -- presumably the parent's seq with 0 meaning none, TODO confirm */ | |
299 | uint64_t num_children; /* defaults to 1 in the constructor below */ | |
300 | ||
301 | ghobject_t oid; | |
302 | ||
303 | SequencerPosition spos; | |
304 | ||
305 | void encode(bufferlist &bl) const { /* writes seq, parent, num_children, a placeholder coll_t, oid, spos */ | |
306 | coll_t unused; /* default-constructed placeholder, encoded only to fill its slot in the layout */ | |
307 | ENCODE_START(2, 1, bl); | |
308 | ::encode(seq, bl); | |
309 | ::encode(parent, bl); | |
310 | ::encode(num_children, bl); | |
311 | ::encode(unused, bl); | |
312 | ::encode(oid, bl); | |
313 | ::encode(spos, bl); | |
314 | ENCODE_FINISH(bl); | |
315 | } | |
316 | ||
317 | void decode(bufferlist::iterator &bl) { /* reads the fields in the order encode wrote them */ | |
318 | coll_t unused; /* decoded and then discarded */ | |
319 | DECODE_START(2, bl); | |
320 | ::decode(seq, bl); | |
321 | ::decode(parent, bl); | |
322 | ::decode(num_children, bl); | |
323 | ::decode(unused, bl); | |
324 | ::decode(oid, bl); | |
325 | if (struct_v >= 2) /* spos is only read when the encoded struct_v is at least 2; otherwise it is left untouched */ | |
326 | ::decode(spos, bl); | |
327 | DECODE_FINISH(bl); | |
328 | } | |
329 | ||
330 | void dump(Formatter *f) const { /* emits seq, parent, num_children and oid; spos is not dumped */ | |
331 | f->dump_unsigned("seq", seq); | |
332 | f->dump_unsigned("parent", parent); | |
333 | f->dump_unsigned("num_children", num_children); | |
334 | f->dump_stream("oid") << oid; | |
335 | } | |
336 | ||
337 | static void generate_test_instances(list<_Header*> &o) { /* appends a default header and one with parent 20, seq 30 */ | |
338 | o.push_back(new _Header); | |
339 | o.push_back(new _Header); | |
340 | o.back()->parent = 20; | |
341 | o.back()->seq = 30; | |
342 | } | |
343 | ||
344 | _Header() : seq(0), parent(0), num_children(1) {} /* oid and spos are default-constructed */ | |
345 | }; | |
346 | ||
347 | /// String munging (public for testing) | |
348 | static string ghobject_key(const ghobject_t &oid); | |
349 | static string ghobject_key_v0(coll_t c, const ghobject_t &oid); | |
350 | static int is_buggy_ghobject_key_v1(CephContext* cct, | |
351 | const string &in); | |
352 | private: | |
353 | /// Implicit lock on Header->seq | |
354 | typedef ceph::shared_ptr<_Header> Header; | |
355 | Mutex cache_lock; | |
356 | SimpleLRU<ghobject_t, _Header> caches; | |
357 | ||
358 | string map_header_key(const ghobject_t &oid); | |
359 | string header_key(uint64_t seq); | |
360 | string complete_prefix(Header header); | |
361 | string user_prefix(Header header); | |
362 | string sys_prefix(Header header); | |
363 | string xattr_prefix(Header header); | |
364 | string sys_parent_prefix(_Header header); | |
365 | string sys_parent_prefix(Header header) { /* shared-pointer convenience overload */ | |
366 | return sys_parent_prefix(*header); /* dereferences header, so it must be non-null; forwards to the _Header overload */ | |
367 | } | |
368 | ||
369 | class EmptyIteratorImpl : public ObjectMapIteratorImpl { /* iterator over nothing: never valid, every seek succeeds */ | |
370 | public: | |
371 | int seek_to_first() override { return 0; } | |
372 | int seek_to_last() { return 0; } /* not marked override, unlike its neighbours */ | |
373 | int upper_bound(const string &after) override { return 0; } /* argument is ignored */ | |
374 | int lower_bound(const string &to) override { return 0; } /* argument is ignored */ | |
375 | bool valid() override { return false; } /* always reports invalid */ | |
376 | int next(bool validate=true) override { ceph_abort(); return 0; } /* aborts: callers are expected to check valid() first */ | |
377 | string key() override { ceph_abort(); return ""; } /* aborts: there is never a current entry */ | |
378 | bufferlist value() override { ceph_abort(); return bufferlist(); } /* aborts: there is never a current entry */ | |
379 | int status() override { return 0; } | |
380 | }; | |
381 | ||
382 | ||
383 | /// Iterator | |
384 | class DBObjectMapIteratorImpl : public ObjectMapIteratorImpl { | |
385 | public: | |
386 | DBObjectMap *map; /* owning DBObjectMap; also passed to hlock */ | |
387 | ||
388 | /// NOTE: implicit lock hlock.get_locked() when returned out of the class | |
389 | MapHeaderLock hlock; | |
390 | /// NOTE: implicit lock on header->seq AND for all ancestors | |
391 | Header header; | |
392 | ||
393 | /// parent_iter == NULL iff no parent | |
394 | ceph::shared_ptr<DBObjectMapIteratorImpl> parent_iter; | |
395 | KeyValueDB::Iterator key_iter; | |
396 | KeyValueDB::Iterator complete_iter; | |
397 | ||
398 | /// cur_iter points to currently valid iterator | |
399 | ceph::shared_ptr<ObjectMapIteratorImpl> cur_iter; | |
400 | int r; /* initialised to 0 by the constructor */ | |
401 | ||
402 | /// init() called, key_iter, complete_iter, parent_iter filled in | |
403 | bool ready; | |
404 | /// past end | |
405 | bool invalid; | |
406 | ||
407 | DBObjectMapIteratorImpl(DBObjectMap *map, Header header) : | |
408 | map(map), hlock(map), header(header), r(0), ready(false), invalid(true) {} /* starts with no oid held in hlock, not ready, and marked invalid */ | |
409 | int seek_to_first() override; | |
410 | int seek_to_last(); | |
411 | int upper_bound(const string &after) override; | |
412 | int lower_bound(const string &to) override; | |
413 | bool valid() override; | |
414 | int next(bool validate=true) override; | |
415 | string key() override; | |
416 | bufferlist value() override; | |
417 | int status() override; | |
418 | ||
419 | bool on_parent() { /* true when the current iterator is the parent iterator */ | |
420 | return cur_iter == parent_iter; /* NOTE(review): pointer comparison, so also true when both are null */ | |
421 | } | |
422 | ||
423 | /// skips to next valid parent entry | |
424 | int next_parent(); | |
425 | ||
426 | /// first parent() >= to | |
427 | int lower_bound_parent(const string &to); | |
428 | ||
429 | /** | |
430 | * Tests whether to_test is in complete region | |
431 | * | |
432 | * postcondition: complete_iter will be max s.t. complete_iter->value > to_test | |
433 | */ | |
434 | int in_complete_region(const string &to_test, ///< [in] key to test | |
435 | string *begin, ///< [out] beginning of region | |
436 | string *end ///< [out] end of region | |
437 | ); ///< @returns true if to_test is in the complete region, else false | |
438 | ||
439 | private: | |
440 | int init(); | |
441 | bool valid_parent(); | |
442 | int adjust(); | |
443 | }; | |
444 | ||
445 | typedef ceph::shared_ptr<DBObjectMapIteratorImpl> DBObjectMapIterator; | |
446 | DBObjectMapIterator _get_iterator(Header header) { /* builds a fresh iterator over header, bound to this map */ | |
447 | return std::make_shared<DBObjectMapIteratorImpl>(this, header); /* NOTE(review): std::make_shared result is returned as ceph::shared_ptr -- assumes the two are the same type; confirm in include/memory.h */ | |
448 | } | |
449 | ||
450 | /// sys | |
451 | ||
452 | /// Removes node corresponding to header | |
453 | void clear_header(Header header, KeyValueDB::Transaction t); | |
454 | ||
455 | /// Set node containing input to new contents | |
456 | void set_header(Header input, KeyValueDB::Transaction t); | |
457 | ||
458 | /// Remove leaf node corresponding to oid in c | |
459 | void remove_map_header( | |
460 | const MapHeaderLock &l, | |
461 | const ghobject_t &oid, | |
462 | Header header, | |
463 | KeyValueDB::Transaction t); | |
464 | ||
465 | /// Set leaf node for c and oid to the value of header | |
466 | void set_map_header( | |
467 | const MapHeaderLock &l, | |
468 | const ghobject_t &oid, _Header header, | |
469 | KeyValueDB::Transaction t); | |
470 | ||
471 | /// Checks header for oid against the sequencer position spos (see the definition for the meaning of the bool result) | |
472 | bool check_spos(const ghobject_t &oid, | |
473 | Header header, | |
474 | const SequencerPosition *spos); | |
475 | ||
476 | /// Lookup or create header for c oid | |
477 | Header lookup_create_map_header( | |
478 | const MapHeaderLock &l, | |
479 | const ghobject_t &oid, | |
480 | KeyValueDB::Transaction t); | |
481 | ||
482 | /** | |
483 | * Generate new header for c oid with new seq number | |
484 | * | |
485 | * Has the side effect of synchronously saving the new DBObjectMap state | |
486 | */ | |
487 | Header _generate_new_header(const ghobject_t &oid, Header parent); | |
488 | Header generate_new_header(const ghobject_t &oid, Header parent) { /* locking wrapper around _generate_new_header */ | |
489 | Mutex::Locker l(header_lock); /* held for the duration of the call below */ | |
490 | return _generate_new_header(oid, parent); /* the underscore variant is called with header_lock already held */ | |
491 | } | |
492 | ||
493 | /// Lookup leaf header for c oid | |
494 | Header _lookup_map_header( | |
495 | const MapHeaderLock &l, | |
496 | const ghobject_t &oid); | |
497 | Header lookup_map_header( /* locking wrapper around _lookup_map_header */ | |
498 | const MapHeaderLock &l2, /* passed through unchanged */ | |
499 | const ghobject_t &oid) { | |
500 | Mutex::Locker l(header_lock); /* held for the duration of the call below */ | |
501 | return _lookup_map_header(l2, oid); /* the underscore variant is called with header_lock already held */ | |
502 | } | |
503 | ||
504 | /// Lookup header node for input | |
505 | Header lookup_parent(Header input); | |
506 | ||
507 | ||
508 | /// Helpers | |
509 | int _get_header(Header header, bufferlist *bl); | |
510 | ||
511 | /// Scan keys in header into out_keys and out_values (if nonnull) | |
512 | int scan(Header header, | |
513 | const set<string> &in_keys, | |
514 | set<string> *out_keys, | |
515 | map<string, bufferlist> *out_values); | |
516 | ||
517 | /// Remove header and all related prefixes | |
518 | int _clear(Header header, | |
519 | KeyValueDB::Transaction t); | |
520 | ||
521 | /* Scan complete region bumping *begin to the beginning of any | |
522 | * containing region and adding all complete region keys between | |
523 | * the updated begin and end to the complete_keys_to_remove set */ | |
524 | int merge_new_complete(DBObjectMapIterator &iter, | |
525 | string *begin, | |
526 | const string &end, | |
527 | set<string> *complete_keys_to_remove); | |
528 | ||
529 | /// Writes out State (mainly next_seq) | |
530 | int write_state(KeyValueDB::Transaction _t = | |
531 | KeyValueDB::Transaction()); | |
532 | ||
533 | /// Copies header entry from parent @see rm_keys | |
534 | int copy_up_header(Header header, | |
535 | KeyValueDB::Transaction t); | |
536 | ||
537 | /// Sets header @see set_header | |
538 | void _set_header(Header header, const bufferlist &bl, | |
539 | KeyValueDB::Transaction t); | |
540 | ||
541 | /** | |
542 | * Removes header seq lock and possibly object lock | |
543 | * once Header is out of scope | |
544 | * @see lookup_parent | |
545 | * @see generate_new_header | |
546 | */ | |
547 | class RemoveOnDelete { /* callable that disposes of a _Header; presumably installed as the deleter of Header shared pointers -- confirm in DBObjectMap.cc */ | |
548 | public: | |
549 | DBObjectMap *db; /* map whose in_use set, header_lock and header_cond are updated */ | |
550 | explicit RemoveOnDelete(DBObjectMap *db) : | |
551 | db(db) {} | |
552 | void operator() (_Header *header) { /* drops header->seq from in_use, then frees the header */ | |
553 | Mutex::Locker l(db->header_lock); /* in_use is modified under header_lock */ | |
554 | assert(db->in_use.count(header->seq)); /* the seq must have been registered as in use */ | |
555 | db->in_use.erase(header->seq); | |
556 | db->header_cond.Signal(); /* signalled after the seq has left in_use, still under header_lock */ | |
557 | delete header; /* the header is freed while header_lock is still held */ | |
558 | } | |
559 | }; | |
560 | friend class RemoveOnDelete; | |
561 | }; | |
562 | WRITE_CLASS_ENCODER(DBObjectMap::_Header) | |
563 | WRITE_CLASS_ENCODER(DBObjectMap::State) | |
564 | ||
565 | ostream& operator<<(ostream& out, const DBObjectMap::_Header& h); | |
566 | ||
567 | #endif |