]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | /* |
2 | * Uses a two-level B-tree to store a set of key-value pairs. | |
3 | * | |
4 | * September 2, 2012 | |
5 | * Eleanor Cawthon | |
6 | * eleanor.cawthon@inktank.com | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | */ | |
13 | ||
14 | #ifndef KVFLATBTREEASYNC_H_ | |
15 | #define KVFLATBTREEASYNC_H_ | |
16 | ||
17 | #define ESUICIDE 134 | |
18 | #define EPREFIX 136 | |
19 | #define EFIRSTOBJ 138 | |
20 | ||
21 | #include "key_value_store/key_value_structure.h" | |
22 | #include "include/utime.h" | |
23 | #include "include/types.h" | |
24 | #include "include/encoding.h" | |
25 | #include "common/Mutex.h" | |
26 | #include "common/Clock.h" | |
27 | #include "common/Formatter.h" | |
28 | #include "global/global_context.h" | |
29 | #include "include/rados/librados.hpp" | |
30 | #include <cfloat> | |
31 | #include <queue> | |
32 | #include <sstream> | |
33 | #include <stdarg.h> | |
34 | ||
7c673cae FG |
35 | using ceph::bufferlist; |
36 | ||
37 | enum { | |
38 | ADD_PREFIX = 1, | |
39 | MAKE_OBJECT = 2, | |
40 | UNWRITE_OBJECT = 3, | |
41 | RESTORE_OBJECT = 4, | |
42 | REMOVE_OBJECT = 5, | |
43 | REMOVE_PREFIX = 6, | |
44 | AIO_MAKE_OBJECT = 7 | |
45 | }; | |
46 | ||
47 | struct rebalance_args; | |
48 | ||
49 | ||
50 | /** | |
51 | * stores information about a key in the index. | |
52 | * | |
53 | * prefix is "0" unless key is "", in which case it is "1". This ensures that | |
54 | * the object with key "" will always be the highest key in the index. | |
55 | */ | |
56 | struct key_data { | |
57 | string raw_key; | |
58 | string prefix; | |
59 | ||
60 | key_data() | |
61 | {} | |
62 | ||
63 | /** | |
64 | * @pre: key is a raw key (does not contain a prefix) | |
65 | */ | |
66 | key_data(string key) | |
67 | : raw_key(key) | |
68 | { | |
69 | raw_key == "" ? prefix = "1" : prefix = "0"; | |
70 | } | |
71 | ||
72 | bool operator==(key_data k) const { | |
73 | return ((raw_key == k.raw_key) && (prefix == k.prefix)); | |
74 | } | |
75 | ||
76 | bool operator!=(key_data k) const { | |
77 | return ((raw_key != k.raw_key) || (prefix != k.prefix)); | |
78 | } | |
79 | ||
80 | bool operator<(key_data k) const { | |
81 | return this->encoded() < k.encoded(); | |
82 | } | |
83 | ||
84 | bool operator>(key_data k) const { | |
85 | return this->encoded() > k.encoded(); | |
86 | } | |
87 | ||
88 | /** | |
89 | * parses the prefix from encoded and stores the data in this. | |
90 | * | |
91 | * @pre: encoded has a prefix | |
92 | */ | |
93 | void parse(string encoded) { | |
94 | prefix = encoded[0]; | |
95 | raw_key = encoded.substr(1,encoded.length()); | |
96 | } | |
97 | ||
98 | /** | |
99 | * returns a string containing the encoded (prefixed) key | |
100 | */ | |
101 | string encoded() const { | |
102 | return prefix + raw_key; | |
103 | } | |
104 | ||
105 | void encode(bufferlist &bl) const { | |
106 | ENCODE_START(1,1,bl); | |
11fdf7f2 TL |
107 | encode(raw_key, bl); |
108 | encode(prefix, bl); | |
7c673cae FG |
109 | ENCODE_FINISH(bl); |
110 | } | |
11fdf7f2 | 111 | void decode(bufferlist::const_iterator &p) { |
7c673cae | 112 | DECODE_START(1, p); |
11fdf7f2 TL |
113 | decode(raw_key, p); |
114 | decode(prefix, p); | |
7c673cae FG |
115 | DECODE_FINISH(p); |
116 | } | |
117 | }; | |
118 | WRITE_CLASS_ENCODER(key_data) | |
119 | ||
120 | ||
121 | /** | |
122 | * Stores information read from a librados object. | |
123 | */ | |
124 | struct object_data { | |
125 | key_data min_kdata; //the max key from the previous index entry | |
126 | key_data max_kdata; //the max key, from the index | |
127 | string name; //the object's name | |
128 | map<std::string, bufferlist> omap; // the omap of the object | |
129 | bool unwritable; // an xattr that, if false, means an op is in | |
130 | // progress and other clients should not write to it. | |
131 | uint64_t version; //the version at time of read | |
132 | uint64_t size; //the number of elements in the omap | |
133 | ||
134 | object_data() | |
135 | : unwritable(false), | |
136 | version(0), | |
137 | size(0) | |
138 | {} | |
139 | ||
140 | object_data(string the_name) | |
141 | : name(the_name), | |
142 | unwritable(false), | |
143 | version(0), | |
144 | size(0) | |
145 | {} | |
146 | ||
147 | object_data(key_data min, key_data kdat, string the_name) | |
148 | : min_kdata(min), | |
149 | max_kdata(kdat), | |
150 | name(the_name), | |
151 | unwritable(false), | |
152 | version(0), | |
153 | size(0) | |
154 | {} | |
155 | ||
156 | object_data(key_data min, key_data kdat, string the_name, | |
157 | map<std::string, bufferlist> the_omap) | |
158 | : min_kdata(min), | |
159 | max_kdata(kdat), | |
160 | name(the_name), | |
161 | omap(the_omap), | |
162 | unwritable(false), | |
163 | version(0), | |
164 | size(0) | |
165 | {} | |
166 | ||
167 | object_data(key_data min, key_data kdat, string the_name, int the_version) | |
168 | : min_kdata(min), | |
169 | max_kdata(kdat), | |
170 | name(the_name), | |
171 | unwritable(false), | |
172 | version(the_version), | |
173 | size(0) | |
174 | {} | |
175 | ||
176 | void encode(bufferlist &bl) const { | |
177 | ENCODE_START(1,1,bl); | |
11fdf7f2 TL |
178 | encode(min_kdata, bl); |
179 | encode(max_kdata, bl); | |
180 | encode(name, bl); | |
181 | encode(omap, bl); | |
182 | encode(unwritable, bl); | |
183 | encode(version, bl); | |
184 | encode(size, bl); | |
7c673cae FG |
185 | ENCODE_FINISH(bl); |
186 | } | |
11fdf7f2 | 187 | void decode(bufferlist::const_iterator &p) { |
7c673cae | 188 | DECODE_START(1, p); |
11fdf7f2 TL |
189 | decode(min_kdata, p); |
190 | decode(max_kdata, p); | |
191 | decode(name, p); | |
192 | decode(omap, p); | |
193 | decode(unwritable, p); | |
194 | decode(version, p); | |
195 | decode(size, p); | |
7c673cae FG |
196 | DECODE_FINISH(p); |
197 | } | |
198 | }; | |
199 | WRITE_CLASS_ENCODER(object_data) | |
200 | ||
201 | /** | |
202 | * information about objects to be created by a split or merge - stored in the | |
203 | * index_data. | |
204 | */ | |
205 | struct create_data { | |
206 | key_data min; | |
207 | key_data max; | |
208 | string obj; | |
209 | ||
210 | create_data() | |
211 | {} | |
212 | ||
213 | create_data(key_data n, key_data x, string o) | |
214 | : min(n), | |
215 | max(x), | |
216 | obj(o) | |
217 | {} | |
218 | ||
219 | create_data(object_data o) | |
220 | : min(o.min_kdata), | |
221 | max(o.max_kdata), | |
222 | obj(o.name) | |
223 | {} | |
224 | ||
225 | create_data & operator=(const create_data &c) { | |
226 | min = c.min; | |
227 | max = c.max; | |
228 | obj = c.obj; | |
229 | return *this; | |
230 | } | |
231 | ||
232 | void encode(bufferlist &bl) const { | |
233 | ENCODE_START(1,1,bl); | |
11fdf7f2 TL |
234 | encode(min, bl); |
235 | encode(max, bl); | |
236 | encode(obj, bl); | |
7c673cae FG |
237 | ENCODE_FINISH(bl); |
238 | } | |
11fdf7f2 | 239 | void decode(bufferlist::const_iterator &p) { |
7c673cae | 240 | DECODE_START(1, p); |
11fdf7f2 TL |
241 | decode(min, p); |
242 | decode(max, p); | |
243 | decode(obj, p); | |
7c673cae FG |
244 | DECODE_FINISH(p); |
245 | } | |
246 | }; | |
247 | WRITE_CLASS_ENCODER(create_data) | |
248 | ||
249 | /** | |
250 | * information about objects to be deleted by a split or merge - stored in the | |
251 | * index_data. | |
252 | */ | |
253 | struct delete_data { | |
254 | key_data min; | |
255 | key_data max; | |
256 | string obj; | |
257 | uint64_t version; | |
258 | ||
259 | delete_data() | |
260 | : version(0) | |
261 | {} | |
262 | ||
263 | delete_data(key_data n, key_data x, string o, uint64_t v) | |
264 | : min(n), | |
265 | max(x), | |
266 | obj(o), | |
267 | version(v) | |
268 | {} | |
269 | ||
270 | delete_data & operator=(const delete_data &d) { | |
271 | min = d.min; | |
272 | max = d.max; | |
273 | obj = d.obj; | |
274 | version = d.version; | |
275 | return *this; | |
276 | } | |
277 | ||
278 | ||
279 | void encode(bufferlist &bl) const { | |
280 | ENCODE_START(1,1,bl); | |
11fdf7f2 TL |
281 | encode(min, bl); |
282 | encode(max, bl); | |
283 | encode(obj, bl); | |
284 | encode(version, bl); | |
7c673cae FG |
285 | ENCODE_FINISH(bl); |
286 | } | |
11fdf7f2 | 287 | void decode(bufferlist::const_iterator &p) { |
7c673cae | 288 | DECODE_START(1, p); |
11fdf7f2 TL |
289 | decode(min, p); |
290 | decode(max, p); | |
291 | decode(obj, p); | |
292 | decode(version, p); | |
7c673cae FG |
293 | DECODE_FINISH(p); |
294 | } | |
295 | }; | |
296 | WRITE_CLASS_ENCODER(delete_data) | |
297 | ||
298 | /** | |
299 | * The index object is a key value map that stores | |
300 | * the highest key stored in an object as keys, and an index_data | |
301 | * as the corresponding value. The index_data contains the encoded | |
302 | * high and low keys (where keys in this object are > min_kdata and | |
303 | * <= kdata), the name of the librados object where keys containing | |
304 | * that range of keys are located, and information about split and | |
305 | * merge operations that may need to be cleaned up if a client dies. | |
306 | */ | |
307 | struct index_data { | |
308 | //the encoded key corresponding to the object | |
309 | key_data kdata; | |
310 | ||
311 | //"1" if there is a prefix (because a split or merge is | |
312 | //in progress), otherwise "" | |
313 | string prefix; | |
314 | ||
315 | //the kdata of the previous index entry | |
316 | key_data min_kdata; | |
317 | ||
318 | utime_t ts; //time that a split/merge started | |
319 | ||
320 | //objects to be created | |
321 | vector<create_data > to_create; | |
322 | ||
323 | //objects to be deleted | |
324 | vector<delete_data > to_delete; | |
325 | ||
326 | //the name of the object where the key range is located. | |
327 | string obj; | |
328 | ||
329 | index_data() | |
330 | {} | |
331 | ||
332 | index_data(string raw_key) | |
333 | : kdata(raw_key) | |
334 | {} | |
335 | ||
336 | index_data(key_data max, key_data min, string o) | |
337 | : kdata(max), | |
338 | min_kdata(min), | |
339 | obj(o) | |
340 | {} | |
341 | ||
342 | index_data(create_data c) | |
343 | : kdata(c.max), | |
344 | min_kdata(c.min), | |
345 | obj(c.obj) | |
346 | {} | |
347 | ||
348 | bool operator<(const index_data &other) const { | |
349 | return (kdata.encoded() < other.kdata.encoded()); | |
350 | } | |
351 | ||
352 | //true if there is a prefix and now - ts > timeout. | |
353 | bool is_timed_out(utime_t now, utime_t timeout) const; | |
354 | ||
355 | void encode(bufferlist &bl) const { | |
356 | ENCODE_START(1,1,bl); | |
11fdf7f2 TL |
357 | encode(prefix, bl); |
358 | encode(min_kdata, bl); | |
359 | encode(kdata, bl); | |
360 | encode(ts, bl); | |
361 | encode(to_create, bl); | |
362 | encode(to_delete, bl); | |
363 | encode(obj, bl); | |
7c673cae FG |
364 | ENCODE_FINISH(bl); |
365 | } | |
11fdf7f2 | 366 | void decode(bufferlist::const_iterator &p) { |
7c673cae | 367 | DECODE_START(1, p); |
11fdf7f2 TL |
368 | decode(prefix, p); |
369 | decode(min_kdata, p); | |
370 | decode(kdata, p); | |
371 | decode(ts, p); | |
372 | decode(to_create, p); | |
373 | decode(to_delete, p); | |
374 | decode(obj, p); | |
7c673cae FG |
375 | DECODE_FINISH(p); |
376 | } | |
377 | ||
378 | /* | |
379 | * Prints a string representation of the information, in the following format: | |
380 | * (min_kdata/ | |
381 | * kdata, | |
382 | * prefix | |
383 | * ts | |
384 | * elements of to_create, organized into (high key| obj name) | |
385 | * ; | |
386 | * elements of to_delete, organized into (high key| obj name | version number) | |
387 | * : | |
388 | * val) | |
389 | */ | |
390 | string str() const { | |
391 | stringstream strm; | |
392 | strm << '(' << min_kdata.encoded() << "/" << kdata.encoded() << ',' | |
393 | << prefix; | |
394 | if (prefix == "1") { | |
395 | strm << ts.sec() << '.' << ts.usec(); | |
396 | for(vector<create_data>::const_iterator it = to_create.begin(); | |
397 | it != to_create.end(); ++it) { | |
398 | strm << '(' << it->min.encoded() << '/' << it->max.encoded() << '|' | |
399 | << it->obj << ')'; | |
400 | } | |
401 | strm << ';'; | |
402 | for(vector<delete_data >::const_iterator it = to_delete.begin(); | |
403 | it != to_delete.end(); ++it) { | |
404 | strm << '(' << it->min.encoded() << '/' << it->max.encoded() << '|' | |
405 | << it->obj << '|' | |
406 | << it->version << ')'; | |
407 | } | |
408 | strm << ':'; | |
409 | } | |
410 | strm << obj << ')'; | |
411 | return strm.str(); | |
412 | } | |
413 | }; | |
414 | WRITE_CLASS_ENCODER(index_data) | |
415 | ||
416 | /** | |
417 | * Structure to store information read from the index for reuse. | |
418 | */ | |
419 | class IndexCache { | |
420 | protected: | |
421 | map<key_data, pair<index_data, utime_t> > k2itmap; | |
422 | map<utime_t, key_data> t2kmap; | |
423 | int cache_size; | |
424 | ||
425 | public: | |
426 | IndexCache(int n) | |
427 | : cache_size(n) | |
428 | {} | |
429 | /** | |
430 | * Inserts idata into the cache and removes whatever key mapped to before. | |
431 | * If the cache is full, pops the oldest entry. | |
432 | */ | |
433 | void push(const string &key, const index_data &idata); | |
434 | ||
435 | /** | |
436 | * Inserts idata into the cache. If idata.kdata is already in the cache, | |
437 | * replaces the old one. Pops the oldest entry if the cache is full. | |
438 | */ | |
439 | void push(const index_data &idata); | |
440 | ||
441 | /** | |
442 | * Removes the oldest entry from the cache | |
443 | */ | |
444 | void pop(); | |
445 | ||
446 | /** | |
447 | * Removes the value associated with kdata from both maps | |
448 | */ | |
449 | void erase(key_data kdata); | |
450 | ||
451 | /** | |
452 | * gets the idata where key belongs. If none, returns -ENODATA. | |
453 | */ | |
454 | int get(const string &key, index_data *idata) const; | |
455 | ||
456 | /** | |
457 | * Gets the idata where key goes and the one after it. If there are not | |
458 | * valid entries for both of them, returns -ENODATA. | |
459 | */ | |
460 | int get(const string &key, index_data *idata, index_data * next_idata) const; | |
461 | void clear(); | |
462 | }; | |
463 | ||
464 | class KvFlatBtreeAsync; | |
465 | ||
466 | ||
467 | /** | |
468 | * These are used internally to translate aio operations into useful thread | |
469 | * arguments. | |
470 | */ | |
471 | struct aio_set_args { | |
472 | KvFlatBtreeAsync * kvba; | |
473 | string key; | |
474 | bufferlist val; | |
475 | bool exc; | |
476 | callback cb; | |
477 | void * cb_args; | |
478 | int * err; | |
479 | }; | |
480 | ||
481 | struct aio_rm_args { | |
482 | KvFlatBtreeAsync * kvba; | |
483 | string key; | |
484 | callback cb; | |
485 | void * cb_args; | |
486 | int * err; | |
487 | }; | |
488 | ||
489 | struct aio_get_args { | |
490 | KvFlatBtreeAsync * kvba; | |
491 | string key; | |
492 | bufferlist * val; | |
493 | bool exc; | |
494 | callback cb; | |
495 | void * cb_args; | |
496 | int * err; | |
497 | }; | |
498 | ||
499 | class KvFlatBtreeAsync : public KeyValueStructure { | |
500 | protected: | |
501 | ||
502 | //don't change these once operations start being called - they are not | |
503 | //protected with mutexes! | |
504 | int k; | |
505 | string index_name; | |
506 | librados::IoCtx io_ctx; | |
507 | string rados_id; | |
508 | string client_name; | |
509 | librados::Rados rados; | |
510 | string pool_name; | |
511 | injection_t interrupt; | |
512 | int wait_ms; | |
513 | utime_t timeout; //declare a client dead if it goes this long without | |
514 | //finishing a split/merge | |
515 | int cache_size; | |
516 | double cache_refresh; //read cache_size / cache_refresh entries each time the | |
517 | //index is read | |
518 | bool verbose;//if true, display lots of debug output | |
519 | ||
520 | //shared variables protected with mutexes | |
521 | Mutex client_index_lock; | |
522 | int client_index; //names of new objects are client_name.client_index | |
523 | Mutex icache_lock; | |
524 | IndexCache icache; | |
525 | friend struct index_data; | |
526 | ||
527 | /** | |
528 | * finds the object in the index with the lowest key value that is greater | |
529 | * than idata.kdata. If idata.kdata is the max key, returns -EOVERFLOW. If | |
530 | * idata has a prefix and has timed out, cleans up. | |
531 | * | |
532 | * @param idata: idata for the object to search for. | |
533 | * @param out_data: the idata for the next object. | |
534 | * | |
535 | * @pre: idata must contain a key_data. | |
536 | * @post: out_data contains complete information | |
537 | */ | |
538 | int next(const index_data &idata, index_data * out_data); | |
539 | ||
540 | /** | |
541 | * finds the object in the index with the lowest key value that is greater | |
542 | * than idata.kdata. If idata.kdata is the lowest key, returns -ERANGE If | |
543 | * idata has a prefix and has timed out, cleans up. | |
544 | * | |
545 | * @param idata: idata for the object to search for. | |
546 | * @param out_data: the idata for the next object. | |
547 | * | |
548 | * @pre: idata must contain a key_data. | |
549 | * @post: out_data contains complete information | |
550 | */ | |
551 | int prev(const index_data &idata, index_data * out_data); | |
552 | ||
553 | /** | |
554 | * finds the index_data where a key belongs, from cache if possible. If it | |
555 | * reads the index object, it will read the first cache_size entries after | |
556 | * key and put them in the cache. | |
557 | * | |
558 | * @param key: the key to search for | |
559 | * @param idata: the index_data for the first index value such that idata.key | |
560 | * is greater than key. | |
561 | * @param next_idata: if not NULL, this will be set to the idata after idata | |
562 | * @param force_update: if false, will try to read from cache first. | |
563 | * | |
564 | * @pre: key is not encoded | |
565 | * @post: idata contains complete information | |
566 | * stored | |
567 | */ | |
568 | int read_index(const string &key, index_data * idata, | |
569 | index_data * next_idata, bool force_update); | |
570 | ||
571 | /** | |
572 | * Reads obj and generates information about it. Iff the object has >= 2k | |
573 | * entries, reads the whole omap and then splits it. | |
574 | * | |
575 | * @param idata: index data for the object being split | |
576 | * @pre: idata contains a key and an obj | |
577 | * @post: idata.obj has been split and icache has been updated | |
578 | * @return -EBALANCE if obj does not need to be split, 0 if split successful, | |
579 | * error from read_object or perform_ops if there is one. | |
580 | */ | |
581 | int split(const index_data &idata); | |
582 | ||
583 | /** | |
584 | * reads o1 and the next object after o1 and, if necessary, rebalances them. | |
585 | * if hk1 is the highest key in the index, calls rebalance on the next highest | |
586 | * key. | |
587 | * | |
588 | * @param idata: index data for the object being rebalanced | |
589 | * @param next_idata: index data for the next object. If blank, will read. | |
590 | * @pre: idata contains a key and an obj | |
591 | * @post: idata.obj has been rebalanced and icache has been updated | |
592 | * @return -EBALANCE if no change needed, -ENOENT if o1 does not exist, | |
593 | * -ECANCELED if second object does not exist, otherwise, error from | |
594 | * perform_ops | |
595 | */ | |
596 | int rebalance(const index_data &idata1, const index_data &next_idata); | |
597 | ||
598 | /** | |
599 | * performs an ObjectReadOperation to populate odata | |
600 | * | |
601 | * @post: odata has all information about obj except for key (which is "") | |
602 | */ | |
603 | int read_object(const string &obj, object_data * odata); | |
604 | ||
605 | /** | |
606 | * performs a maybe_read_for_balance ObjectOperation so the omap is only | |
607 | * read if the object is out of bounds. | |
608 | */ | |
609 | int read_object(const string &obj, rebalance_args * args); | |
610 | ||
611 | /** | |
612 | * sets up owo to change the index in preparation for a split/merge. | |
613 | * | |
614 | * @param to_create: vector of object_data to be created. | |
615 | * @param to_delete: vector of object_data to be deleted. | |
616 | * @param owo: the ObjectWriteOperation to set up | |
617 | * @param idata: will be populated by index data for this op. | |
618 | * @param err: error code reference to pass to omap_cmp | |
619 | * @pre: entries in to_create and to_delete must have keys and names. | |
620 | */ | |
621 | void set_up_prefix_index( | |
622 | const vector<object_data> &to_create, | |
623 | const vector<object_data> &to_delete, | |
624 | librados::ObjectWriteOperation * owo, | |
625 | index_data * idata, | |
626 | int * err); | |
627 | ||
628 | /** | |
629 | * sets up all make, mark, restore, and delete ops, as well as the remove | |
630 | * prefix op, based on idata. | |
631 | * | |
632 | * @param create_vector: vector of data about the objects to be created. | |
633 | * @pre: entries in create_data must have names and omaps and be in idata | |
634 | * order | |
635 | * @param delete_vector: vector of data about the objects to be deleted | |
636 | * @pre: entries in to_delete must have versions and be in idata order | |
637 | * @param ops: the owos to set up. the pair is a pair of op identifiers | |
638 | * and names of objects - set_up_ops fills these in. | |
639 | * @pre: ops must be the correct size and the ObjectWriteOperation pointers | |
640 | * must be valid. | |
641 | * @param idata: the idata with information about how to set up the ops | |
642 | * @pre: idata has valid to_create and to_delete | |
643 | * @param err: the int to get the error value for omap_cmp | |
644 | */ | |
645 | void set_up_ops( | |
646 | const vector<object_data> &create_vector, | |
647 | const vector<object_data> &delete_vector, | |
648 | vector<pair<pair<int, string>, librados::ObjectWriteOperation*> > * ops, | |
649 | const index_data &idata, | |
650 | int * err); | |
651 | ||
652 | /** | |
653 | * sets up owo to exclusive create, set omap to to_set, and set | |
654 | * unwritable to "0" | |
655 | */ | |
656 | void set_up_make_object( | |
657 | const map<std::string, bufferlist> &to_set, | |
658 | librados::ObjectWriteOperation *owo); | |
659 | ||
660 | /** | |
661 | * sets up owo to assert object version and that object version is | |
662 | * writable, | |
663 | * then mark it unwritable. | |
664 | * | |
665 | * @param ver: if this is 0, no version is asserted. | |
666 | */ | |
667 | void set_up_unwrite_object( | |
668 | const int &ver, librados::ObjectWriteOperation *owo); | |
669 | ||
670 | /** | |
671 | * sets up owo to assert that an object is unwritable and then mark it | |
672 | * writable | |
673 | */ | |
674 | void set_up_restore_object( | |
675 | librados::ObjectWriteOperation *owo); | |
676 | ||
677 | /** | |
678 | * sets up owo to assert that the object is unwritable and then remove it | |
679 | */ | |
680 | void set_up_delete_object( | |
681 | librados::ObjectWriteOperation *owo); | |
682 | ||
683 | /** | |
684 | * perform the operations in ops and handles errors. | |
685 | * | |
686 | * @param debug_prefix: what to print at the beginning of debug output | |
687 | * @param idata: the idata for the object being operated on, to be | |
688 | * passed to cleanup if necessary | |
689 | * @param ops: this contains an int identifying the type of op, | |
690 | * a string that is the name of the object to operate on, and a pointer | |
691 | * to the ObjectWriteOperation to use. All of this must be complete. | |
692 | * @post: all operations are performed and most errors are handled | |
693 | * (e.g., cleans up if an assertion fails). If an unknown error is found, | |
694 | * returns it. | |
695 | */ | |
696 | int perform_ops( const string &debug_prefix, | |
697 | const index_data &idata, | |
698 | vector<pair<pair<int, string>, librados::ObjectWriteOperation*> > * ops); | |
699 | ||
700 | /** | |
701 | * Called when a client discovers that another client has died during a | |
702 | * split or a merge. cleans up after that client. | |
703 | * | |
704 | * @param idata: the index data parsed from the index entry left by the dead | |
705 | * client. | |
706 | * @param error: the error that caused the client to realize the other client | |
707 | * died (should be -ENOENT or -ETIMEDOUT) | |
708 | * @post: rolls forward if -ENOENT, otherwise rolls back. | |
709 | */ | |
710 | int cleanup(const index_data &idata, const int &error); | |
711 | ||
712 | /** | |
713 | * does the ObjectWriteOperation and splits, reads the index, and/or retries | |
714 | * until success. | |
715 | */ | |
716 | int set_op(const string &key, const bufferlist &val, | |
717 | bool update_on_existing, index_data &idata); | |
718 | ||
719 | /** | |
720 | * does the ObjectWriteOperation and merges, reads the index, and/or retries | |
721 | * until success. | |
722 | */ | |
723 | int remove_op(const string &key, index_data &idata, index_data &next_idata); | |
724 | ||
725 | /** | |
726 | * does the ObjectWriteOperation and reads the index and/or retries | |
727 | * until success. | |
728 | */ | |
729 | int get_op(const string &key, bufferlist * val, index_data &idata); | |
730 | ||
731 | /** | |
732 | * does the ObjectWriteOperation and splits, reads the index, and/or retries | |
733 | * until success. | |
734 | */ | |
735 | int handle_set_rm_errors(int &err, string key, string obj, | |
736 | index_data * idata, index_data * next_idata); | |
737 | ||
738 | /** | |
739 | * called by aio_set, aio_remove, and aio_get, respectively. | |
740 | */ | |
741 | static void* pset(void *ptr); | |
742 | static void* prm(void *ptr); | |
743 | static void* pget(void *ptr); | |
744 | public: | |
745 | ||
746 | ||
747 | //interruption methods, for correctness testing | |
748 | /** | |
749 | * returns 0 | |
750 | */ | |
751 | int nothing() override; | |
752 | /** | |
753 | * 10% chance of waiting wait_ms seconds | |
754 | */ | |
755 | int wait() override; | |
756 | /** | |
757 | * 10% chance of killing the client. | |
758 | */ | |
759 | int suicide() override; | |
760 | ||
761 | KvFlatBtreeAsync(int k_val, string name, int cache, double cache_r, | |
762 | bool verb) | |
763 | : k(k_val), | |
764 | index_name("index_object"), | |
765 | rados_id(name), | |
766 | client_name(string(name).append(".")), | |
767 | pool_name("rbd"), | |
768 | interrupt(&KeyValueStructure::nothing), | |
769 | wait_ms(0), | |
770 | timeout(100000,0), | |
771 | cache_size(cache), | |
772 | cache_refresh(cache_r), | |
773 | verbose(verb), | |
774 | client_index_lock("client_index_lock"), | |
775 | client_index(0), | |
776 | icache_lock("icache_lock"), | |
777 | icache(cache) | |
778 | {} | |
779 | ||
780 | /** | |
781 | * creates a string with an int at the end. | |
782 | * | |
783 | * @param s: the string on the left | |
784 | * @param i: the int to be appended to the string | |
785 | * @return the string | |
786 | */ | |
787 | static string to_string(string s, int i); | |
788 | ||
789 | /** | |
790 | * returns in encoded | |
791 | */ | |
792 | static bufferlist to_bl(const string &in) { | |
793 | bufferlist bl; | |
794 | bl.append(in); | |
795 | return bl; | |
796 | } | |
797 | ||
798 | /** | |
799 | * returns idata encoded; | |
800 | */ | |
801 | static bufferlist to_bl(const index_data &idata) { | |
802 | bufferlist bl; | |
803 | idata.encode(bl); | |
804 | return bl; | |
805 | } | |
806 | ||
807 | /** | |
808 | * returns the rados_id of this KvFlatBtreeAsync | |
809 | */ | |
810 | string get_name(); | |
811 | ||
812 | /** | |
813 | * sets this kvba to call inject before every ObjectWriteOperation. | |
814 | * If inject is wait and wait_time is set, wait will have a 10% chance of | |
11fdf7f2 | 815 | * sleeping for waite_time milliseconds. |
7c673cae FG |
816 | */ |
817 | void set_inject(injection_t inject, int wait_time) override; | |
818 | ||
819 | /** | |
820 | * sets up the rados and io_ctx of this KvFlatBtreeAsync. If the don't already | |
821 | * exist, creates the index and max object. | |
822 | */ | |
823 | int setup(int argc, const char** argv) override; | |
824 | ||
825 | int set(const string &key, const bufferlist &val, | |
826 | bool update_on_existing) override; | |
827 | ||
828 | int remove(const string &key) override; | |
829 | ||
830 | /** | |
831 | * returns true if all of the following are true: | |
832 | * | |
833 | * all objects are accounted for in the index or a prefix | |
834 | * (i.e., no floating objects) | |
835 | * all objects have k <= size <= 2k | |
836 | * all keys in an object are within the specified predicted by the index | |
837 | * | |
838 | * if any of those fails, states that the problem(s) are, and prints str(). | |
839 | * | |
840 | * @pre: no operations are in progress | |
841 | */ | |
842 | bool is_consistent() override; | |
843 | ||
844 | /** | |
845 | * returns an ASCII representation of the index and sub objects, showing | |
846 | * stats about each object and all omaps. Don't use if you have more than | |
847 | * about 10 objects. | |
848 | */ | |
849 | string str() override; | |
850 | ||
851 | int get(const string &key, bufferlist *val) override; | |
852 | ||
853 | //async versions of these methods | |
854 | void aio_get(const string &key, bufferlist *val, callback cb, | |
855 | void *cb_args, int * err) override; | |
856 | void aio_set(const string &key, const bufferlist &val, bool exclusive, | |
857 | callback cb, void * cb_args, int * err) override; | |
858 | void aio_remove(const string &key, callback cb, void *cb_args, int * err) override; | |
859 | ||
860 | //these methods that deal with multiple keys at once are efficient, but make | |
861 | //no guarantees about atomicity! | |
862 | ||
863 | /** | |
864 | * Removes all objects and resets the store as if setup had just run. Makes no | |
865 | * attempt to do this safely - make sure this is the only operation running | |
866 | * when it is called! | |
867 | */ | |
868 | int remove_all() override; | |
869 | ||
870 | /** | |
871 | * This does not add prefixes to the index and therefore DOES NOT guarantee | |
872 | * consistency! It is ONLY safe if there is only one instance at a time. | |
873 | * It follows the same general logic as a rebalance, but | |
874 | * with all objects that contain any of the keys in in_map. It is O(n), where | |
875 | * n is the number of librados objects it has to change. Higher object sizes | |
876 | * (i.e., k values) also decrease the efficiency of this method because it | |
877 | * copies all of the entries in each object it modifies. Writing new objects | |
878 | * is done in parallel. | |
879 | * | |
880 | * This is efficient if: | |
881 | * * other clients are very unlikely to be modifying any of the objects while | |
882 | * this operation is in progress | |
883 | * * The entries in in_map are close together | |
884 | * * It is especially efficient for initially entering lots of entries into | |
885 | * an empty structure. | |
886 | * | |
887 | * It is very inefficient compared to setting one key and/or will starve if: | |
888 | * * other clients are modifying the objects it tries to modify | |
889 | * * The keys are distributed across the range of keys in the store | |
890 | * * there is a small number of keys compared to k | |
891 | */ | |
892 | int set_many(const map<string, bufferlist> &in_map) override; | |
893 | ||
894 | int get_all_keys(std::set<string> *keys) override; | |
895 | int get_all_keys_and_values(map<string,bufferlist> *kv_map) override; | |
896 | ||
897 | }; | |
898 | ||
899 | #endif /* KVFLATBTREEASYNC_H_ */ |