]> git.proxmox.com Git - ceph.git/blame - ceph/src/key_value_store/kv_flat_btree_async.h
update sources to ceph Nautilus 14.2.1
[ceph.git] / ceph / src / key_value_store / kv_flat_btree_async.h
CommitLineData
7c673cae
FG
1/*
2 * Uses a two-level B-tree to store a set of key-value pairs.
3 *
4 * September 2, 2012
5 * Eleanor Cawthon
6 * eleanor.cawthon@inktank.com
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 */
13
14#ifndef KVFLATBTREEASYNC_H_
15#define KVFLATBTREEASYNC_H_
16
17#define ESUICIDE 134
18#define EPREFIX 136
19#define EFIRSTOBJ 138
20
21#include "key_value_store/key_value_structure.h"
22#include "include/utime.h"
23#include "include/types.h"
24#include "include/encoding.h"
25#include "common/Mutex.h"
26#include "common/Clock.h"
27#include "common/Formatter.h"
28#include "global/global_context.h"
29#include "include/rados/librados.hpp"
30#include <cfloat>
31#include <queue>
32#include <sstream>
33#include <stdarg.h>
34
7c673cae
FG
35using ceph::bufferlist;
36
37enum {
38 ADD_PREFIX = 1,
39 MAKE_OBJECT = 2,
40 UNWRITE_OBJECT = 3,
41 RESTORE_OBJECT = 4,
42 REMOVE_OBJECT = 5,
43 REMOVE_PREFIX = 6,
44 AIO_MAKE_OBJECT = 7
45};
46
47struct rebalance_args;
48
49
50/**
51 * stores information about a key in the index.
52 *
53 * prefix is "0" unless key is "", in which case it is "1". This ensures that
54 * the object with key "" will always be the highest key in the index.
55 */
56struct key_data {
57 string raw_key;
58 string prefix;
59
60 key_data()
61 {}
62
63 /**
64 * @pre: key is a raw key (does not contain a prefix)
65 */
66 key_data(string key)
67 : raw_key(key)
68 {
69 raw_key == "" ? prefix = "1" : prefix = "0";
70 }
71
72 bool operator==(key_data k) const {
73 return ((raw_key == k.raw_key) && (prefix == k.prefix));
74 }
75
76 bool operator!=(key_data k) const {
77 return ((raw_key != k.raw_key) || (prefix != k.prefix));
78 }
79
80 bool operator<(key_data k) const {
81 return this->encoded() < k.encoded();
82 }
83
84 bool operator>(key_data k) const {
85 return this->encoded() > k.encoded();
86 }
87
88 /**
89 * parses the prefix from encoded and stores the data in this.
90 *
91 * @pre: encoded has a prefix
92 */
93 void parse(string encoded) {
94 prefix = encoded[0];
95 raw_key = encoded.substr(1,encoded.length());
96 }
97
98 /**
99 * returns a string containing the encoded (prefixed) key
100 */
101 string encoded() const {
102 return prefix + raw_key;
103 }
104
105 void encode(bufferlist &bl) const {
106 ENCODE_START(1,1,bl);
11fdf7f2
TL
107 encode(raw_key, bl);
108 encode(prefix, bl);
7c673cae
FG
109 ENCODE_FINISH(bl);
110 }
11fdf7f2 111 void decode(bufferlist::const_iterator &p) {
7c673cae 112 DECODE_START(1, p);
11fdf7f2
TL
113 decode(raw_key, p);
114 decode(prefix, p);
7c673cae
FG
115 DECODE_FINISH(p);
116 }
117};
118WRITE_CLASS_ENCODER(key_data)
119
120
121/**
122 * Stores information read from a librados object.
123 */
124struct object_data {
125 key_data min_kdata; //the max key from the previous index entry
126 key_data max_kdata; //the max key, from the index
127 string name; //the object's name
128 map<std::string, bufferlist> omap; // the omap of the object
129 bool unwritable; // an xattr that, if false, means an op is in
130 // progress and other clients should not write to it.
131 uint64_t version; //the version at time of read
132 uint64_t size; //the number of elements in the omap
133
134 object_data()
135 : unwritable(false),
136 version(0),
137 size(0)
138 {}
139
140 object_data(string the_name)
141 : name(the_name),
142 unwritable(false),
143 version(0),
144 size(0)
145 {}
146
147 object_data(key_data min, key_data kdat, string the_name)
148 : min_kdata(min),
149 max_kdata(kdat),
150 name(the_name),
151 unwritable(false),
152 version(0),
153 size(0)
154 {}
155
156 object_data(key_data min, key_data kdat, string the_name,
157 map<std::string, bufferlist> the_omap)
158 : min_kdata(min),
159 max_kdata(kdat),
160 name(the_name),
161 omap(the_omap),
162 unwritable(false),
163 version(0),
164 size(0)
165 {}
166
167 object_data(key_data min, key_data kdat, string the_name, int the_version)
168 : min_kdata(min),
169 max_kdata(kdat),
170 name(the_name),
171 unwritable(false),
172 version(the_version),
173 size(0)
174 {}
175
176 void encode(bufferlist &bl) const {
177 ENCODE_START(1,1,bl);
11fdf7f2
TL
178 encode(min_kdata, bl);
179 encode(max_kdata, bl);
180 encode(name, bl);
181 encode(omap, bl);
182 encode(unwritable, bl);
183 encode(version, bl);
184 encode(size, bl);
7c673cae
FG
185 ENCODE_FINISH(bl);
186 }
11fdf7f2 187 void decode(bufferlist::const_iterator &p) {
7c673cae 188 DECODE_START(1, p);
11fdf7f2
TL
189 decode(min_kdata, p);
190 decode(max_kdata, p);
191 decode(name, p);
192 decode(omap, p);
193 decode(unwritable, p);
194 decode(version, p);
195 decode(size, p);
7c673cae
FG
196 DECODE_FINISH(p);
197 }
198};
199WRITE_CLASS_ENCODER(object_data)
200
201/**
202 * information about objects to be created by a split or merge - stored in the
203 * index_data.
204 */
205struct create_data {
206 key_data min;
207 key_data max;
208 string obj;
209
210 create_data()
211 {}
212
213 create_data(key_data n, key_data x, string o)
214 : min(n),
215 max(x),
216 obj(o)
217 {}
218
219 create_data(object_data o)
220 : min(o.min_kdata),
221 max(o.max_kdata),
222 obj(o.name)
223 {}
224
225 create_data & operator=(const create_data &c) {
226 min = c.min;
227 max = c.max;
228 obj = c.obj;
229 return *this;
230 }
231
232 void encode(bufferlist &bl) const {
233 ENCODE_START(1,1,bl);
11fdf7f2
TL
234 encode(min, bl);
235 encode(max, bl);
236 encode(obj, bl);
7c673cae
FG
237 ENCODE_FINISH(bl);
238 }
11fdf7f2 239 void decode(bufferlist::const_iterator &p) {
7c673cae 240 DECODE_START(1, p);
11fdf7f2
TL
241 decode(min, p);
242 decode(max, p);
243 decode(obj, p);
7c673cae
FG
244 DECODE_FINISH(p);
245 }
246};
247WRITE_CLASS_ENCODER(create_data)
248
249/**
250 * information about objects to be deleted by a split or merge - stored in the
251 * index_data.
252 */
253struct delete_data {
254 key_data min;
255 key_data max;
256 string obj;
257 uint64_t version;
258
259 delete_data()
260 : version(0)
261 {}
262
263 delete_data(key_data n, key_data x, string o, uint64_t v)
264 : min(n),
265 max(x),
266 obj(o),
267 version(v)
268 {}
269
270 delete_data & operator=(const delete_data &d) {
271 min = d.min;
272 max = d.max;
273 obj = d.obj;
274 version = d.version;
275 return *this;
276 }
277
278
279 void encode(bufferlist &bl) const {
280 ENCODE_START(1,1,bl);
11fdf7f2
TL
281 encode(min, bl);
282 encode(max, bl);
283 encode(obj, bl);
284 encode(version, bl);
7c673cae
FG
285 ENCODE_FINISH(bl);
286 }
11fdf7f2 287 void decode(bufferlist::const_iterator &p) {
7c673cae 288 DECODE_START(1, p);
11fdf7f2
TL
289 decode(min, p);
290 decode(max, p);
291 decode(obj, p);
292 decode(version, p);
7c673cae
FG
293 DECODE_FINISH(p);
294 }
295};
296WRITE_CLASS_ENCODER(delete_data)
297
298/**
299 * The index object is a key value map that stores
300 * the highest key stored in an object as keys, and an index_data
301 * as the corresponding value. The index_data contains the encoded
302 * high and low keys (where keys in this object are > min_kdata and
303 * <= kdata), the name of the librados object where keys containing
304 * that range of keys are located, and information about split and
305 * merge operations that may need to be cleaned up if a client dies.
306 */
307struct index_data {
308 //the encoded key corresponding to the object
309 key_data kdata;
310
311 //"1" if there is a prefix (because a split or merge is
312 //in progress), otherwise ""
313 string prefix;
314
315 //the kdata of the previous index entry
316 key_data min_kdata;
317
318 utime_t ts; //time that a split/merge started
319
320 //objects to be created
321 vector<create_data > to_create;
322
323 //objects to be deleted
324 vector<delete_data > to_delete;
325
326 //the name of the object where the key range is located.
327 string obj;
328
329 index_data()
330 {}
331
332 index_data(string raw_key)
333 : kdata(raw_key)
334 {}
335
336 index_data(key_data max, key_data min, string o)
337 : kdata(max),
338 min_kdata(min),
339 obj(o)
340 {}
341
342 index_data(create_data c)
343 : kdata(c.max),
344 min_kdata(c.min),
345 obj(c.obj)
346 {}
347
348 bool operator<(const index_data &other) const {
349 return (kdata.encoded() < other.kdata.encoded());
350 }
351
352 //true if there is a prefix and now - ts > timeout.
353 bool is_timed_out(utime_t now, utime_t timeout) const;
354
355 void encode(bufferlist &bl) const {
356 ENCODE_START(1,1,bl);
11fdf7f2
TL
357 encode(prefix, bl);
358 encode(min_kdata, bl);
359 encode(kdata, bl);
360 encode(ts, bl);
361 encode(to_create, bl);
362 encode(to_delete, bl);
363 encode(obj, bl);
7c673cae
FG
364 ENCODE_FINISH(bl);
365 }
11fdf7f2 366 void decode(bufferlist::const_iterator &p) {
7c673cae 367 DECODE_START(1, p);
11fdf7f2
TL
368 decode(prefix, p);
369 decode(min_kdata, p);
370 decode(kdata, p);
371 decode(ts, p);
372 decode(to_create, p);
373 decode(to_delete, p);
374 decode(obj, p);
7c673cae
FG
375 DECODE_FINISH(p);
376 }
377
378 /*
379 * Prints a string representation of the information, in the following format:
380 * (min_kdata/
381 * kdata,
382 * prefix
383 * ts
384 * elements of to_create, organized into (high key| obj name)
385 * ;
386 * elements of to_delete, organized into (high key| obj name | version number)
387 * :
388 * val)
389 */
390 string str() const {
391 stringstream strm;
392 strm << '(' << min_kdata.encoded() << "/" << kdata.encoded() << ','
393 << prefix;
394 if (prefix == "1") {
395 strm << ts.sec() << '.' << ts.usec();
396 for(vector<create_data>::const_iterator it = to_create.begin();
397 it != to_create.end(); ++it) {
398 strm << '(' << it->min.encoded() << '/' << it->max.encoded() << '|'
399 << it->obj << ')';
400 }
401 strm << ';';
402 for(vector<delete_data >::const_iterator it = to_delete.begin();
403 it != to_delete.end(); ++it) {
404 strm << '(' << it->min.encoded() << '/' << it->max.encoded() << '|'
405 << it->obj << '|'
406 << it->version << ')';
407 }
408 strm << ':';
409 }
410 strm << obj << ')';
411 return strm.str();
412 }
413};
414WRITE_CLASS_ENCODER(index_data)
415
416/**
417 * Structure to store information read from the index for reuse.
418 */
419class IndexCache {
420protected:
421 map<key_data, pair<index_data, utime_t> > k2itmap;
422 map<utime_t, key_data> t2kmap;
423 int cache_size;
424
425public:
426 IndexCache(int n)
427 : cache_size(n)
428 {}
429 /**
430 * Inserts idata into the cache and removes whatever key mapped to before.
431 * If the cache is full, pops the oldest entry.
432 */
433 void push(const string &key, const index_data &idata);
434
435 /**
436 * Inserts idata into the cache. If idata.kdata is already in the cache,
437 * replaces the old one. Pops the oldest entry if the cache is full.
438 */
439 void push(const index_data &idata);
440
441 /**
442 * Removes the oldest entry from the cache
443 */
444 void pop();
445
446 /**
447 * Removes the value associated with kdata from both maps
448 */
449 void erase(key_data kdata);
450
451 /**
452 * gets the idata where key belongs. If none, returns -ENODATA.
453 */
454 int get(const string &key, index_data *idata) const;
455
456 /**
457 * Gets the idata where key goes and the one after it. If there are not
458 * valid entries for both of them, returns -ENODATA.
459 */
460 int get(const string &key, index_data *idata, index_data * next_idata) const;
461 void clear();
462};
463
464class KvFlatBtreeAsync;
465
466
467/**
468 * These are used internally to translate aio operations into useful thread
469 * arguments.
470 */
471struct aio_set_args {
472 KvFlatBtreeAsync * kvba;
473 string key;
474 bufferlist val;
475 bool exc;
476 callback cb;
477 void * cb_args;
478 int * err;
479};
480
481struct aio_rm_args {
482 KvFlatBtreeAsync * kvba;
483 string key;
484 callback cb;
485 void * cb_args;
486 int * err;
487};
488
489struct aio_get_args {
490 KvFlatBtreeAsync * kvba;
491 string key;
492 bufferlist * val;
493 bool exc;
494 callback cb;
495 void * cb_args;
496 int * err;
497};
498
499class KvFlatBtreeAsync : public KeyValueStructure {
500protected:
501
502 //don't change these once operations start being called - they are not
503 //protected with mutexes!
504 int k;
505 string index_name;
506 librados::IoCtx io_ctx;
507 string rados_id;
508 string client_name;
509 librados::Rados rados;
510 string pool_name;
511 injection_t interrupt;
512 int wait_ms;
513 utime_t timeout; //declare a client dead if it goes this long without
514 //finishing a split/merge
515 int cache_size;
516 double cache_refresh; //read cache_size / cache_refresh entries each time the
517 //index is read
518 bool verbose;//if true, display lots of debug output
519
520 //shared variables protected with mutexes
521 Mutex client_index_lock;
522 int client_index; //names of new objects are client_name.client_index
523 Mutex icache_lock;
524 IndexCache icache;
525 friend struct index_data;
526
527 /**
528 * finds the object in the index with the lowest key value that is greater
529 * than idata.kdata. If idata.kdata is the max key, returns -EOVERFLOW. If
530 * idata has a prefix and has timed out, cleans up.
531 *
532 * @param idata: idata for the object to search for.
533 * @param out_data: the idata for the next object.
534 *
535 * @pre: idata must contain a key_data.
536 * @post: out_data contains complete information
537 */
538 int next(const index_data &idata, index_data * out_data);
539
540 /**
541 * finds the object in the index with the lowest key value that is greater
542 * than idata.kdata. If idata.kdata is the lowest key, returns -ERANGE If
543 * idata has a prefix and has timed out, cleans up.
544 *
545 * @param idata: idata for the object to search for.
546 * @param out_data: the idata for the next object.
547 *
548 * @pre: idata must contain a key_data.
549 * @post: out_data contains complete information
550 */
551 int prev(const index_data &idata, index_data * out_data);
552
553 /**
554 * finds the index_data where a key belongs, from cache if possible. If it
555 * reads the index object, it will read the first cache_size entries after
556 * key and put them in the cache.
557 *
558 * @param key: the key to search for
559 * @param idata: the index_data for the first index value such that idata.key
560 * is greater than key.
561 * @param next_idata: if not NULL, this will be set to the idata after idata
562 * @param force_update: if false, will try to read from cache first.
563 *
564 * @pre: key is not encoded
565 * @post: idata contains complete information
566 * stored
567 */
568 int read_index(const string &key, index_data * idata,
569 index_data * next_idata, bool force_update);
570
571 /**
572 * Reads obj and generates information about it. Iff the object has >= 2k
573 * entries, reads the whole omap and then splits it.
574 *
575 * @param idata: index data for the object being split
576 * @pre: idata contains a key and an obj
577 * @post: idata.obj has been split and icache has been updated
578 * @return -EBALANCE if obj does not need to be split, 0 if split successful,
579 * error from read_object or perform_ops if there is one.
580 */
581 int split(const index_data &idata);
582
583 /**
584 * reads o1 and the next object after o1 and, if necessary, rebalances them.
585 * if hk1 is the highest key in the index, calls rebalance on the next highest
586 * key.
587 *
588 * @param idata: index data for the object being rebalanced
589 * @param next_idata: index data for the next object. If blank, will read.
590 * @pre: idata contains a key and an obj
591 * @post: idata.obj has been rebalanced and icache has been updated
592 * @return -EBALANCE if no change needed, -ENOENT if o1 does not exist,
593 * -ECANCELED if second object does not exist, otherwise, error from
594 * perform_ops
595 */
596 int rebalance(const index_data &idata1, const index_data &next_idata);
597
598 /**
599 * performs an ObjectReadOperation to populate odata
600 *
601 * @post: odata has all information about obj except for key (which is "")
602 */
603 int read_object(const string &obj, object_data * odata);
604
605 /**
606 * performs a maybe_read_for_balance ObjectOperation so the omap is only
607 * read if the object is out of bounds.
608 */
609 int read_object(const string &obj, rebalance_args * args);
610
611 /**
612 * sets up owo to change the index in preparation for a split/merge.
613 *
614 * @param to_create: vector of object_data to be created.
615 * @param to_delete: vector of object_data to be deleted.
616 * @param owo: the ObjectWriteOperation to set up
617 * @param idata: will be populated by index data for this op.
618 * @param err: error code reference to pass to omap_cmp
619 * @pre: entries in to_create and to_delete must have keys and names.
620 */
621 void set_up_prefix_index(
622 const vector<object_data> &to_create,
623 const vector<object_data> &to_delete,
624 librados::ObjectWriteOperation * owo,
625 index_data * idata,
626 int * err);
627
628 /**
629 * sets up all make, mark, restore, and delete ops, as well as the remove
630 * prefix op, based on idata.
631 *
632 * @param create_vector: vector of data about the objects to be created.
633 * @pre: entries in create_data must have names and omaps and be in idata
634 * order
635 * @param delete_vector: vector of data about the objects to be deleted
636 * @pre: entries in to_delete must have versions and be in idata order
637 * @param ops: the owos to set up. the pair is a pair of op identifiers
638 * and names of objects - set_up_ops fills these in.
639 * @pre: ops must be the correct size and the ObjectWriteOperation pointers
640 * must be valid.
641 * @param idata: the idata with information about how to set up the ops
642 * @pre: idata has valid to_create and to_delete
643 * @param err: the int to get the error value for omap_cmp
644 */
645 void set_up_ops(
646 const vector<object_data> &create_vector,
647 const vector<object_data> &delete_vector,
648 vector<pair<pair<int, string>, librados::ObjectWriteOperation*> > * ops,
649 const index_data &idata,
650 int * err);
651
652 /**
653 * sets up owo to exclusive create, set omap to to_set, and set
654 * unwritable to "0"
655 */
656 void set_up_make_object(
657 const map<std::string, bufferlist> &to_set,
658 librados::ObjectWriteOperation *owo);
659
660 /**
661 * sets up owo to assert object version and that object version is
662 * writable,
663 * then mark it unwritable.
664 *
665 * @param ver: if this is 0, no version is asserted.
666 */
667 void set_up_unwrite_object(
668 const int &ver, librados::ObjectWriteOperation *owo);
669
670 /**
671 * sets up owo to assert that an object is unwritable and then mark it
672 * writable
673 */
674 void set_up_restore_object(
675 librados::ObjectWriteOperation *owo);
676
677 /**
678 * sets up owo to assert that the object is unwritable and then remove it
679 */
680 void set_up_delete_object(
681 librados::ObjectWriteOperation *owo);
682
683 /**
684 * perform the operations in ops and handles errors.
685 *
686 * @param debug_prefix: what to print at the beginning of debug output
687 * @param idata: the idata for the object being operated on, to be
688 * passed to cleanup if necessary
689 * @param ops: this contains an int identifying the type of op,
690 * a string that is the name of the object to operate on, and a pointer
691 * to the ObjectWriteOperation to use. All of this must be complete.
692 * @post: all operations are performed and most errors are handled
693 * (e.g., cleans up if an assertion fails). If an unknown error is found,
694 * returns it.
695 */
696 int perform_ops( const string &debug_prefix,
697 const index_data &idata,
698 vector<pair<pair<int, string>, librados::ObjectWriteOperation*> > * ops);
699
700 /**
701 * Called when a client discovers that another client has died during a
702 * split or a merge. cleans up after that client.
703 *
704 * @param idata: the index data parsed from the index entry left by the dead
705 * client.
706 * @param error: the error that caused the client to realize the other client
707 * died (should be -ENOENT or -ETIMEDOUT)
708 * @post: rolls forward if -ENOENT, otherwise rolls back.
709 */
710 int cleanup(const index_data &idata, const int &error);
711
712 /**
713 * does the ObjectWriteOperation and splits, reads the index, and/or retries
714 * until success.
715 */
716 int set_op(const string &key, const bufferlist &val,
717 bool update_on_existing, index_data &idata);
718
719 /**
720 * does the ObjectWriteOperation and merges, reads the index, and/or retries
721 * until success.
722 */
723 int remove_op(const string &key, index_data &idata, index_data &next_idata);
724
725 /**
726 * does the ObjectWriteOperation and reads the index and/or retries
727 * until success.
728 */
729 int get_op(const string &key, bufferlist * val, index_data &idata);
730
731 /**
732 * does the ObjectWriteOperation and splits, reads the index, and/or retries
733 * until success.
734 */
735 int handle_set_rm_errors(int &err, string key, string obj,
736 index_data * idata, index_data * next_idata);
737
738 /**
739 * called by aio_set, aio_remove, and aio_get, respectively.
740 */
741 static void* pset(void *ptr);
742 static void* prm(void *ptr);
743 static void* pget(void *ptr);
744public:
745
746
747 //interruption methods, for correctness testing
748 /**
749 * returns 0
750 */
751 int nothing() override;
752 /**
753 * 10% chance of waiting wait_ms seconds
754 */
755 int wait() override;
756 /**
757 * 10% chance of killing the client.
758 */
759 int suicide() override;
760
761KvFlatBtreeAsync(int k_val, string name, int cache, double cache_r,
762 bool verb)
763 : k(k_val),
764 index_name("index_object"),
765 rados_id(name),
766 client_name(string(name).append(".")),
767 pool_name("rbd"),
768 interrupt(&KeyValueStructure::nothing),
769 wait_ms(0),
770 timeout(100000,0),
771 cache_size(cache),
772 cache_refresh(cache_r),
773 verbose(verb),
774 client_index_lock("client_index_lock"),
775 client_index(0),
776 icache_lock("icache_lock"),
777 icache(cache)
778 {}
779
780 /**
781 * creates a string with an int at the end.
782 *
783 * @param s: the string on the left
784 * @param i: the int to be appended to the string
785 * @return the string
786 */
787 static string to_string(string s, int i);
788
789 /**
790 * returns in encoded
791 */
792 static bufferlist to_bl(const string &in) {
793 bufferlist bl;
794 bl.append(in);
795 return bl;
796 }
797
798 /**
799 * returns idata encoded;
800 */
801 static bufferlist to_bl(const index_data &idata) {
802 bufferlist bl;
803 idata.encode(bl);
804 return bl;
805 }
806
807 /**
808 * returns the rados_id of this KvFlatBtreeAsync
809 */
810 string get_name();
811
812 /**
813 * sets this kvba to call inject before every ObjectWriteOperation.
814 * If inject is wait and wait_time is set, wait will have a 10% chance of
11fdf7f2 815 * sleeping for waite_time milliseconds.
7c673cae
FG
816 */
817 void set_inject(injection_t inject, int wait_time) override;
818
819 /**
820 * sets up the rados and io_ctx of this KvFlatBtreeAsync. If the don't already
821 * exist, creates the index and max object.
822 */
823 int setup(int argc, const char** argv) override;
824
825 int set(const string &key, const bufferlist &val,
826 bool update_on_existing) override;
827
828 int remove(const string &key) override;
829
830 /**
831 * returns true if all of the following are true:
832 *
833 * all objects are accounted for in the index or a prefix
834 * (i.e., no floating objects)
835 * all objects have k <= size <= 2k
836 * all keys in an object are within the specified predicted by the index
837 *
838 * if any of those fails, states that the problem(s) are, and prints str().
839 *
840 * @pre: no operations are in progress
841 */
842 bool is_consistent() override;
843
844 /**
845 * returns an ASCII representation of the index and sub objects, showing
846 * stats about each object and all omaps. Don't use if you have more than
847 * about 10 objects.
848 */
849 string str() override;
850
851 int get(const string &key, bufferlist *val) override;
852
853 //async versions of these methods
854 void aio_get(const string &key, bufferlist *val, callback cb,
855 void *cb_args, int * err) override;
856 void aio_set(const string &key, const bufferlist &val, bool exclusive,
857 callback cb, void * cb_args, int * err) override;
858 void aio_remove(const string &key, callback cb, void *cb_args, int * err) override;
859
860 //these methods that deal with multiple keys at once are efficient, but make
861 //no guarantees about atomicity!
862
863 /**
864 * Removes all objects and resets the store as if setup had just run. Makes no
865 * attempt to do this safely - make sure this is the only operation running
866 * when it is called!
867 */
868 int remove_all() override;
869
870 /**
871 * This does not add prefixes to the index and therefore DOES NOT guarantee
872 * consistency! It is ONLY safe if there is only one instance at a time.
873 * It follows the same general logic as a rebalance, but
874 * with all objects that contain any of the keys in in_map. It is O(n), where
875 * n is the number of librados objects it has to change. Higher object sizes
876 * (i.e., k values) also decrease the efficiency of this method because it
877 * copies all of the entries in each object it modifies. Writing new objects
878 * is done in parallel.
879 *
880 * This is efficient if:
881 * * other clients are very unlikely to be modifying any of the objects while
882 * this operation is in progress
883 * * The entries in in_map are close together
884 * * It is especially efficient for initially entering lots of entries into
885 * an empty structure.
886 *
887 * It is very inefficient compared to setting one key and/or will starve if:
888 * * other clients are modifying the objects it tries to modify
889 * * The keys are distributed across the range of keys in the store
890 * * there is a small number of keys compared to k
891 */
892 int set_many(const map<string, bufferlist> &in_map) override;
893
894 int get_all_keys(std::set<string> *keys) override;
895 int get_all_keys_and_values(map<string,bufferlist> *kv_map) override;
896
897};
898
899#endif /* KVFLATBTREEASYNC_H_ */