]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | #ifndef CEPH_OBJECTSTORE_H | |
15 | #define CEPH_OBJECTSTORE_H | |
16 | ||
17 | #include "include/Context.h" | |
18 | #include "include/buffer.h" | |
19 | #include "include/types.h" | |
20 | #include "osd/osd_types.h" | |
21 | #include "common/TrackedOp.h" | |
22 | #include "common/WorkQueue.h" | |
23 | #include "ObjectMap.h" | |
24 | ||
25 | #include <errno.h> | |
26 | #include <sys/stat.h> | |
27 | #include <vector> | |
28 | #include <map> | |
29 | ||
30 | #if defined(DARWIN) || defined(__FreeBSD__) || defined(__sun) | |
31 | #include <sys/statvfs.h> | |
32 | #else | |
33 | #include <sys/vfs.h> /* or <sys/statfs.h> */ | |
34 | #endif /* DARWIN */ | |
35 | ||
36 | #define OPS_PER_PTR 32 | |
37 | ||
38 | class CephContext; | |
39 | ||
40 | using std::vector; | |
41 | using std::string; | |
42 | using std::map; | |
43 | ||
44 | namespace ceph { | |
45 | class Formatter; | |
46 | } | |
47 | ||
48 | /* | |
49 | * low-level interface to the local OSD file system | |
50 | */ | |
51 | ||
52 | class Logger; | |
53 | ||
54 | ||
55 | static inline void encode(const map<string,bufferptr> *attrset, bufferlist &bl) { | |
56 | ::encode(*attrset, bl); | |
57 | } | |
58 | ||
59 | // Helper decoders, only declared here (this isn't the best place for these, but...). | |
// NOTE(review): judging by the names, they read a string->string map / a string set
// from `p` and write the result into `*out` -- confirm against the definitions.
60 | void decode_str_str_map_to_bl(bufferlist::iterator& p, bufferlist *out); | |
61 | void decode_str_set_to_bl(bufferlist::iterator& p, bufferlist *out); | |
62 | ||
63 | // Flag bits: single-bit values; osflagbits_t is the type of the `flags` argument of ObjectStore::create. | |
// NOTE(review): the constants themselves are declared `const int`, not osflagbits_t.
64 | typedef uint32_t osflagbits_t; | |
65 | const int SKIP_JOURNAL_REPLAY = 1 << 0; | |
66 | const int SKIP_MOUNT_OMAP = 1 << 1; | |
67 | ||
68 | class ObjectStore { | |
69 | protected: | |
70 | string path; | |
71 | ||
72 | public: | |
73 | CephContext* cct; | |
74 | /** | |
75 | * create - create an ObjectStore instance. | |
76 | * | |
77 | * This is invoked once at initialization time. | |
78 | * | |
 * @param cct context pointer for the new store (how it is used is not visible in this declaration)
79 | * @param type type of store. This is a string from the configuration file. | |
80 | * @param data path (or other descriptor) for data | |
81 | * @param journal path (or other descriptor) for journal (optional) | |
82 | * @param flags which filestores should check if applicable (osflagbits_t bitmask, defaults to 0) | |
 * @return pointer to the created store. NOTE(review): ownership of the result and
 *         the outcome for an unrecognized type are not visible here -- confirm.
83 | */ | |
84 | static ObjectStore *create(CephContext *cct, | |
85 | const string& type, | |
86 | const string& data, | |
87 | const string& journal, | |
88 | osflagbits_t flags = 0); | |
89 | ||
90 | /** | |
91 | * probe a block device to learn the uuid of the owning OSD | |
92 | * | |
93 | * @param cct cct | |
94 | * @param path path to device | |
95 | * @param fsid [out] osd uuid | |
 * @return int status. NOTE(review): presumably 0 on success and a negative
 *         errno-style value on failure -- confirm against the definition.
96 | */ | |
97 | static int probe_block_device_fsid( | |
98 | CephContext *cct, | |
99 | const string& path, | |
100 | uuid_d *fsid); | |
101 | ||
102 | /** | |
103 | * Fetch Object Store statistics. | |
104 | * | |
105 | * Currently only latency of write and apply times are measured. | |
106 | * | |
107 | * This appears to be called with nothing locked. | |
 *
 * Pure virtual: each concrete store supplies its own implementation.
 * @return the statistics, returned by value
108 | */ | |
109 | virtual objectstore_perf_stat_t get_cur_stats() = 0; | |
110 | ||
111 | /** | |
112 | * Fetch Object Store performance counters. | |
113 | * | |
114 | * | |
115 | * This appears to be called with nothing locked. | |
 *
 * Pure virtual and const: each concrete store supplies its own implementation.
 * @return pointer to const PerfCounters. NOTE(review): lifetime/ownership of the
 *         pointee is not visible here -- confirm.
116 | */ | |
117 | virtual const PerfCounters* get_perf_counters() const = 0; | |
118 | ||
119 | /** | |
120 | * a sequencer orders transactions | |
121 | * | |
122 | * Any transactions queued under a given sequencer will be applied in | |
123 | * sequence. Transactions queued under different sequencers may run | |
124 | * in parallel. | |
125 | * | |
126 | * Clients of ObjectStore create and maintain their own Sequencer objects. | |
127 | * When a list of transactions is queued the caller specifies a Sequencer to be used. | |
128 | * | |
129 | */ | |
130 | ||
131 | /** | |
132 | * ABC for Sequencer implementation, private to the ObjectStore derived class. | |
133 | * created in ...::queue_transaction(s) | |
 *
 * Derives from RefCountedObject and is held through Sequencer_implRef
 * (a boost::intrusive_ptr), declared right after the struct.
134 | */ | |
135 | struct Sequencer_impl : public RefCountedObject { | |
136 | CephContext* cct; | |
137 | ||
138 | // block until any previous transactions are visible. specifically, | |
139 | // collection_list and collection_empty need to reflect prior operations. | |
140 | virtual void flush() = 0; | |
141 | ||
142 | // called when we are done with the impl. the impl may have a different | |
143 | // (longer) lifecycle than the Sequencer. | |
// The default implementation does nothing.
144 | virtual void discard() {} | |
145 | ||
146 | /** | |
147 | * Async flush_commit | |
148 | * | |
149 | * There are two cases: | |
150 | * 1) sequencer is currently idle: the method returns true. c is | |
151 | * not touched. | |
152 | * 2) sequencer is not idle: the method returns false and c is | |
153 | * called asynchronously with a value of 0 once all transactions | |
154 | * queued on this sequencer prior to the call have been applied | |
155 | * and committed. | |
156 | */ | |
157 | virtual bool flush_commit( | |
158 | Context *c ///< [in] context to call upon flush/commit | |
159 | ) = 0; ///< @return true if idle, false otherwise | |
160 | ||
// Stores cct and passes (NULL, 0) to the RefCountedObject constructor.
161 | Sequencer_impl(CephContext* cct) : RefCountedObject(NULL, 0), cct(cct) {} | |
162 | ~Sequencer_impl() override {} | |
163 | }; | |
164 | typedef boost::intrusive_ptr<Sequencer_impl> Sequencer_implRef; | |
165 | ||
166 | /** | |
167 | * External (opaque) sequencer implementation | |
168 | */ | |
169 | struct Sequencer { | |
170 | string name; | |
171 | spg_t shard_hint; | |
172 | Sequencer_implRef p; | |
173 | ||
174 | explicit Sequencer(string n) | |
175 | : name(n), shard_hint(spg_t()), p(NULL) { | |
176 | } | |
177 | ~Sequencer() { | |
178 | if (p) | |
179 | p->discard(); // tell impl we are done with it | |
180 | } | |
181 | ||
182 | /// return a unique string identifier for this sequencer | |
183 | const string& get_name() const { | |
184 | return name; | |
185 | } | |
186 | /// wait for any queued transactions on this sequencer to apply | |
187 | void flush() { | |
188 | if (p) | |
189 | p->flush(); | |
190 | } | |
191 | ||
192 | /// @see Sequencer_impl::flush_commit() | |
193 | bool flush_commit(Context *c) { | |
194 | if (!p) { | |
195 | return true; | |
196 | } else { | |
197 | return p->flush_commit(c); | |
198 | } | |
199 | } | |
200 | }; | |
201 | ||
// Abstract, reference-counted base for collection handles; concrete types
// must implement get_cid(). Held through CollectionHandle (boost::intrusive_ptr).
202 | struct CollectionImpl : public RefCountedObject { | |
// @return reference to the coll_t this handle refers to
203 | virtual const coll_t &get_cid() = 0; | |
// Passes (NULL, 0) to the RefCountedObject constructor.
204 | CollectionImpl() : RefCountedObject(NULL, 0) {} | |
205 | }; | |
206 | typedef boost::intrusive_ptr<CollectionImpl> CollectionHandle; | |
207 | ||
208 | struct CompatCollectionHandle : public CollectionImpl { | |
209 | coll_t cid; | |
210 | explicit CompatCollectionHandle(coll_t c) : cid(c) {} | |
211 | const coll_t &get_cid() override { | |
212 | return cid; | |
213 | } | |
214 | }; | |
215 | ||
216 | /********************************* | |
217 | * | |
218 | * Object Contents and semantics | |
219 | * | |
220 | * All ObjectStore objects are identified as a named object | |
221 | * (ghobject_t and hobject_t) in a named collection (coll_t). | |
222 | * ObjectStore operations support the creation, mutation, deletion | |
223 | * and enumeration of objects within a collection. Enumeration is | |
224 | * in sorted key order (where keys are sorted by hash). Object names | |
225 | * are globally unique. | |
226 | * | |
227 | * Each object has four distinct parts: byte data, xattrs, omap_header | |
228 | * and omap entries. | |
229 | * | |
230 | * The data portion of an object is conceptually equivalent to a | |
231 | * file in a file system. Random and Partial access for both read | |
232 | * and write operations is required. The ability to have a sparse | |
233 | * implementation of the data portion of an object is beneficial for | |
234 | * some workloads, but not required. There is a system-wide limit on | |
235 | * the maximum size of an object, which is typically around 100 MB. | |
236 | * | |
237 | * Xattrs are equivalent to the extended attributes of file | |
238 | * systems. Xattrs are a set of key/value pairs. Sub-value access | |
239 | * is not required. It is possible to enumerate the set of xattrs in | |
240 | * key order. At the implementation level, xattrs are used | |
241 | * exclusively internal to Ceph and the implementer can expect the | |
242 | * total size of all of the xattrs on an object to be relatively | |
243 | * small, i.e., less than 64KB. Much of Ceph assumes that accessing | |
244 | * xattrs on temporally adjacent object accesses (recent past or | |
245 | * near future) is inexpensive. | |
246 | * | |
247 | * omap_header is a single blob of data. It can be read or written | |
248 | * in total. | |
249 | * | |
250 | * Omap entries are conceptually the same as xattrs | |
251 | * but in a different address space. In other words, you can have | |
252 | * the same key as an xattr and an omap entry and they have distinct | |
253 | * values. Enumeration of xattrs doesn't include omap entries and | |
254 | * vice versa. The size and access characteristics of omap entries | |
255 | * are very different from xattrs. In particular, the value portion | |
256 | * of an omap entry can be quite large (MBs). More importantly, the | |
257 | * interface must support efficient range queries on omap entries even | |
258 | * when there is a large number of entries. | |
259 | * | |
260 | *********************************/ | |
261 | ||
262 | /******************************* | |
263 | * | |
264 | * Collections | |
265 | * | |
266 | * A collection is simply a grouping of objects. Collections have | |
267 | * names (coll_t) and can be enumerated in order. Like an | |
268 | * individual object, a collection also has a set of xattrs. | |
269 | * | |
270 | * | |
271 | */ | |
272 | ||
273 | ||
274 | /********************************* | |
275 | * transaction | |
276 | * | |
277 | * A Transaction represents a sequence of primitive mutation | |
278 | * operations. | |
279 | * | |
280 | * Three events in the life of a Transaction result in | |
281 | * callbacks. Any Transaction can contain any number of callback | |
282 | * objects (Context) for any combination of the three classes of | |
283 | * callbacks: | |
284 | * | |
285 | * on_applied_sync, on_applied, and on_commit. | |
286 | * | |
287 | * The "on_applied" and "on_applied_sync" callbacks are invoked when | |
288 | * the modifications requested by the Transaction are visible to | |
289 | * subsequent ObjectStore operations, i.e., the results are | |
290 | * readable. The only conceptual difference between on_applied and | |
291 | * on_applied_sync is the specific thread and locking environment in | |
292 | * which the callbacks operate. "on_applied_sync" is called | |
293 | * directly by an ObjectStore execution thread. It is expected to | |
294 | * execute quickly and must not acquire any locks of the calling | |
295 | * environment. Conversely, "on_applied" is called from the separate | |
296 | * Finisher thread, meaning that it can contend for calling | |
297 | * environment locks. NB, on_applied and on_applied_sync are | |
298 | * sometimes called on_readable and on_readable_sync. | |
299 | * | |
300 | * The "on_commit" callback is also called from the Finisher thread | |
301 | * and indicates that all of the mutations have been durably | |
302 | * committed to stable storage (i.e., are now software/hardware | |
303 | * crashproof). | |
304 | * | |
305 | * At the implementation level, each mutation primitive (and its | |
306 | * associated data) can be serialized to a single buffer. That | |
307 | * serialization, however, does not copy any data, but (using the | |
308 | * bufferlist library) will reference the original buffers. This | |
309 | * implies that the buffer that contains the data being submitted | |
310 | * must remain stable until the on_commit callback completes. In | |
311 | * practice, bufferlist handles all of this for you and this | |
312 | * subtlety is only relevant if you are referencing an existing | |
313 | * buffer via buffer::raw_static. | |
314 | * | |
315 | * Some implementations of ObjectStore choose to implement their own | |
316 | * form of journaling that uses the serialized form of a | |
317 | * Transaction. This requires that the encode/decode logic properly | |
318 | * version itself and handle version upgrades that might change the | |
319 | * format of the encoded Transaction. This has already happened a | |
320 | * couple of times and the Transaction object contains some helper | |
321 | * variables that aid in this legacy decoding: | |
322 | * | |
323 | * sobject_encoding detects an older/simpler version of oid | |
324 | * present in pre-bobtail versions of ceph. use_pool_override | |
325 | * also detects a situation where the pool of an oid can be | |
326 | * overridden for legacy operations/buffers. For non-legacy | |
327 | * implementation of ObjectStore, neither of these fields is | |
328 | * relevant. | |
329 | * | |
330 | * | |
331 | * TRANSACTION ISOLATION | |
332 | * | |
333 | * Except as noted below, isolation is the responsibility of the | |
334 | * caller. In other words, if any storage element (storage element | |
335 | * == any of the four portions of an object as described above) is | |
336 | * altered by a transaction (including deletion), the caller | |
337 | * promises not to attempt to read that element while the | |
338 | * transaction is pending (here pending means from the time of | |
339 | * issuance until the "on_applied_sync" callback has been | |
340 | * received). Violations of isolation need not be detected by | |
341 | * ObjectStore and there is no corresponding error mechanism for | |
342 | * reporting an isolation violation (crashing would be the | |
343 | * appropriate way to report an isolation violation if detected). | |
344 | * | |
345 | * Enumeration operations may violate transaction isolation as | |
346 | * described above when a storage element is being created or | |
347 | * deleted as part of a transaction. In this case, ObjectStore is | |
348 | * allowed to consider the enumeration operation to either precede | |
349 | * or follow the violating transaction element. In other words, the | |
350 | * presence/absence of the mutated element in the enumeration is | |
351 | * entirely at the discretion of ObjectStore. The arbitrary ordering | |
352 | * applies independently to each transaction element. For example, | |
353 | * if a transaction contains two mutating elements "create A" and | |
354 | * "delete B", and an enumeration operation is performed while this | |
355 | * transaction is pending, it is permissible for ObjectStore to | |
356 | * report any of the four possible combinations of the existence of | |
357 | * A and B. | |
358 | * | |
359 | */ | |
360 | class Transaction { | |
361 | public: | |
// Opcode stored in Op::op. Every value is assigned explicitly. The trailing
// comment on each line lists the operands that opcode carries.
// NOTE(review): transactions are serialized, so existing values presumably
// must never be renumbered -- confirm before touching any of them.
362 | enum { | |
363 | OP_NOP = 0, | |
364 | OP_TOUCH = 9, // cid, oid | |
365 | OP_WRITE = 10, // cid, oid, offset, len, bl | |
366 | OP_ZERO = 11, // cid, oid, offset, len | |
367 | OP_TRUNCATE = 12, // cid, oid, len | |
368 | OP_REMOVE = 13, // cid, oid | |
369 | OP_SETATTR = 14, // cid, oid, attrname, bl | |
370 | OP_SETATTRS = 15, // cid, oid, attrset | |
371 | OP_RMATTR = 16, // cid, oid, attrname | |
372 | OP_CLONE = 17, // cid, oid, newoid | |
373 | OP_CLONERANGE = 18, // cid, oid, newoid, offset, len | |
374 | OP_CLONERANGE2 = 30, // cid, oid, newoid, srcoff, len, dstoff | |
375 | ||
376 | OP_TRIMCACHE = 19, // cid, oid, offset, len **DEPRECATED** | |
377 | ||
378 | OP_MKCOLL = 20, // cid | |
379 | OP_RMCOLL = 21, // cid | |
380 | OP_COLL_ADD = 22, // cid, oldcid, oid | |
381 | OP_COLL_REMOVE = 23, // cid, oid | |
382 | OP_COLL_SETATTR = 24, // cid, attrname, bl | |
383 | OP_COLL_RMATTR = 25, // cid, attrname | |
384 | OP_COLL_SETATTRS = 26, // cid, attrset | |
385 | OP_COLL_MOVE = 8, // newcid, oldcid, oid | |
386 | ||
387 | OP_STARTSYNC = 27, // start a sync | |
388 | ||
389 | OP_RMATTRS = 28, // cid, oid | |
390 | OP_COLL_RENAME = 29, // cid, newcid | |
391 | ||
392 | OP_OMAP_CLEAR = 31, // cid, oid (_update_op remaps both indices) | |
393 | OP_OMAP_SETKEYS = 32, // cid, oid, attrset | |
394 | OP_OMAP_RMKEYS = 33, // cid, oid, keyset | |
395 | OP_OMAP_SETHEADER = 34, // cid, oid, header | |
396 | OP_SPLIT_COLLECTION = 35, // cid, bits, destination | |
397 | OP_SPLIT_COLLECTION2 = 36, /* cid, bits, destination | |
398 | doesn't create the destination */ | |
399 | OP_OMAP_RMKEYRANGE = 37, // cid, oid, firstkey, lastkey | |
400 | OP_COLL_MOVE_RENAME = 38, // oldcid, oldoid, newcid, newoid | |
401 | ||
402 | OP_SETALLOCHINT = 39, // cid, oid, object_size, write_size | |
403 | OP_COLL_HINT = 40, // cid, type, bl | |
404 | ||
405 | OP_TRY_RENAME = 41, // oldcid, oldoid, newoid | |
406 | ||
407 | OP_COLL_SET_BITS = 42, // cid, bits | |
408 | }; | |
409 | ||
410 | // Transaction hint type (Op::hint_type is annotated as belonging to OP_COLL_HINT) | |
411 | enum { | |
412 | COLL_HINT_EXPECTED_NUM_OBJECTS = 1, | |
413 | }; | |
414 | ||
// Fixed-size record describing one operation. The struct is packed and is
// accessed in place inside op buffers (_update_op_bl walks a buffer in
// sizeof(Op) steps), so member order and types must not change.
// cid/oid/dest_cid/dest_oid are indices that _update_op translates through
// its cm/om tables, not the collection/object names themselves.
415 | struct Op { | |
416 | __le32 op; | |
417 | __le32 cid; | |
418 | __le32 oid; | |
419 | __le64 off; | |
420 | __le64 len; | |
421 | __le32 dest_cid; | |
422 | __le32 dest_oid; //OP_CLONE, OP_CLONERANGE | |
423 | __le64 dest_off; //OP_CLONERANGE | |
// hint_type and alloc_hint_flags share the same 4 bytes
424 | union { | |
425 | struct { | |
426 | __le32 hint_type; //OP_COLL_HINT | |
427 | }; | |
428 | struct { | |
429 | __le32 alloc_hint_flags; //OP_SETALLOCHINT | |
430 | }; | |
431 | }; | |
432 | __le64 expected_object_size; //OP_SETALLOCHINT | |
433 | __le64 expected_write_size; //OP_SETALLOCHINT | |
434 | __le32 split_bits; //OP_SPLIT_COLLECTION2,OP_COLL_SET_BITS, | |
435 | //OP_MKCOLL | |
436 | __le32 split_rem; //OP_SPLIT_COLLECTION2 | |
437 | } __attribute__ ((packed)) ; | |
438 | ||
439 | struct TransactionData { | |
440 | __le64 ops; | |
441 | __le32 largest_data_len; | |
442 | __le32 largest_data_off; | |
443 | __le32 largest_data_off_in_data_bl; | |
444 | __le32 fadvise_flags; | |
445 | ||
446 | TransactionData() noexcept : | |
447 | ops(0), | |
448 | largest_data_len(0), | |
449 | largest_data_off(0), | |
450 | largest_data_off_in_data_bl(0), | |
451 | fadvise_flags(0) { } | |
452 | ||
453 | // override default move operations to reset default values | |
454 | TransactionData(TransactionData&& other) noexcept : | |
455 | ops(other.ops), | |
456 | largest_data_len(other.largest_data_len), | |
457 | largest_data_off(other.largest_data_off), | |
458 | largest_data_off_in_data_bl(other.largest_data_off_in_data_bl), | |
459 | fadvise_flags(other.fadvise_flags) { | |
460 | other.ops = 0; | |
461 | other.largest_data_len = 0; | |
462 | other.largest_data_off = 0; | |
463 | other.largest_data_off_in_data_bl = 0; | |
464 | other.fadvise_flags = 0; | |
465 | } | |
466 | TransactionData& operator=(TransactionData&& other) noexcept { | |
467 | ops = other.ops; | |
468 | largest_data_len = other.largest_data_len; | |
469 | largest_data_off = other.largest_data_off; | |
470 | largest_data_off_in_data_bl = other.largest_data_off_in_data_bl; | |
471 | fadvise_flags = other.fadvise_flags; | |
472 | other.ops = 0; | |
473 | other.largest_data_len = 0; | |
474 | other.largest_data_off = 0; | |
475 | other.largest_data_off_in_data_bl = 0; | |
476 | other.fadvise_flags = 0; | |
477 | return *this; | |
478 | } | |
479 | ||
480 | TransactionData(const TransactionData& other) = default; | |
481 | TransactionData& operator=(const TransactionData& other) = default; | |
482 | ||
483 | void encode(bufferlist& bl) const { | |
484 | bl.append((char*)this, sizeof(TransactionData)); | |
485 | } | |
486 | void decode(bufferlist::iterator &bl) { | |
487 | bl.copy(sizeof(TransactionData), (char*)this); | |
488 | } | |
489 | } __attribute__ ((packed)) ; | |
490 | ||
491 | private: | |
492 | TransactionData data; | |
493 | ||
494 | void *osr {nullptr}; // NULL on replay | |
495 | ||
496 | map<coll_t, __le32> coll_index; | |
497 | map<ghobject_t, __le32> object_index; | |
498 | ||
499 | __le32 coll_id {0}; | |
500 | __le32 object_id {0}; | |
501 | ||
502 | bufferlist data_bl; | |
503 | bufferlist op_bl; | |
504 | ||
505 | bufferptr op_ptr; | |
506 | ||
507 | list<Context *> on_applied; | |
508 | list<Context *> on_commit; | |
509 | list<Context *> on_applied_sync; | |
510 | ||
511 | public: | |
512 | Transaction() = default; | |
513 | ||
514 | explicit Transaction(bufferlist::iterator &dp) { | |
515 | decode(dp); | |
516 | } | |
517 | explicit Transaction(bufferlist &nbl) { | |
518 | bufferlist::iterator dp = nbl.begin(); | |
519 | decode(dp); | |
520 | } | |
521 | ||
522 | // override default move operations to reset default values | |
// Move constructor: takes over every member of `other`. The plain scalars
// (osr, coll_id, object_id) are copied and then reset in `other` to
// nullptr/0/0, since copying alone would leave the old values behind.
// `data` resets its source in TransactionData's own move constructor.
523 | Transaction(Transaction&& other) noexcept : | |
524 | data(std::move(other.data)), | |
525 | osr(other.osr), | |
526 | coll_index(std::move(other.coll_index)), | |
527 | object_index(std::move(other.object_index)), | |
528 | coll_id(other.coll_id), | |
529 | object_id(other.object_id), | |
530 | data_bl(std::move(other.data_bl)), | |
531 | op_bl(std::move(other.op_bl)), | |
532 | op_ptr(std::move(other.op_ptr)), | |
533 | on_applied(std::move(other.on_applied)), | |
534 | on_commit(std::move(other.on_commit)), | |
535 | on_applied_sync(std::move(other.on_applied_sync)) { | |
536 | other.osr = nullptr; | |
537 | other.coll_id = 0; | |
538 | other.object_id = 0; | |
539 | } | |
540 | ||
541 | Transaction& operator=(Transaction&& other) noexcept { | |
542 | data = std::move(other.data); | |
543 | osr = other.osr; | |
544 | coll_index = std::move(other.coll_index); | |
545 | object_index = std::move(other.object_index); | |
546 | coll_id = other.coll_id; | |
547 | object_id = other.object_id; | |
548 | data_bl = std::move(other.data_bl); | |
549 | op_bl = std::move(other.op_bl); | |
550 | op_ptr = std::move(other.op_ptr); | |
551 | on_applied = std::move(other.on_applied); | |
552 | on_commit = std::move(other.on_commit); | |
553 | on_applied_sync = std::move(other.on_applied_sync); | |
554 | other.osr = nullptr; | |
555 | other.coll_id = 0; | |
556 | other.object_id = 0; | |
557 | return *this; | |
558 | } | |
559 | ||
560 | Transaction(const Transaction& other) = default; | |
561 | Transaction& operator=(const Transaction& other) = default; | |
562 | ||
563 | /* Operations on callback contexts */ | |
564 | void register_on_applied(Context *c) { | |
565 | if (!c) return; | |
566 | on_applied.push_back(c); | |
567 | } | |
568 | void register_on_commit(Context *c) { | |
569 | if (!c) return; | |
570 | on_commit.push_back(c); | |
571 | } | |
572 | void register_on_applied_sync(Context *c) { | |
573 | if (!c) return; | |
574 | on_applied_sync.push_back(c); | |
575 | } | |
576 | void register_on_complete(Context *c) { | |
577 | if (!c) return; | |
578 | RunOnDeleteRef _complete (std::make_shared<RunOnDelete>(c)); | |
579 | register_on_applied(new ContainerContext<RunOnDeleteRef>(_complete)); | |
580 | register_on_commit(new ContainerContext<RunOnDeleteRef>(_complete)); | |
581 | } | |
582 | ||
583 | static void collect_contexts( | |
584 | vector<Transaction>& t, | |
585 | Context **out_on_applied, | |
586 | Context **out_on_commit, | |
587 | Context **out_on_applied_sync) { | |
588 | assert(out_on_applied); | |
589 | assert(out_on_commit); | |
590 | assert(out_on_applied_sync); | |
591 | list<Context *> on_applied, on_commit, on_applied_sync; | |
592 | for (vector<Transaction>::iterator i = t.begin(); | |
593 | i != t.end(); | |
594 | ++i) { | |
595 | on_applied.splice(on_applied.end(), (*i).on_applied); | |
596 | on_commit.splice(on_commit.end(), (*i).on_commit); | |
597 | on_applied_sync.splice(on_applied_sync.end(), (*i).on_applied_sync); | |
598 | } | |
599 | *out_on_applied = C_Contexts::list_to_context(on_applied); | |
600 | *out_on_commit = C_Contexts::list_to_context(on_commit); | |
601 | *out_on_applied_sync = C_Contexts::list_to_context(on_applied_sync); | |
602 | } | |
603 | ||
604 | Context *get_on_applied() { | |
605 | return C_Contexts::list_to_context(on_applied); | |
606 | } | |
607 | Context *get_on_commit() { | |
608 | return C_Contexts::list_to_context(on_commit); | |
609 | } | |
610 | Context *get_on_applied_sync() { | |
611 | return C_Contexts::list_to_context(on_applied_sync); | |
612 | } | |
613 | ||
614 | void set_fadvise_flags(uint32_t flags) { | |
615 | data.fadvise_flags = flags; | |
616 | } | |
617 | void set_fadvise_flag(uint32_t flag) { | |
618 | data.fadvise_flags = data.fadvise_flags | flag; | |
619 | } | |
620 | uint32_t get_fadvise_flags() { return data.fadvise_flags; } | |
621 | ||
622 | void swap(Transaction& other) noexcept { | |
623 | std::swap(data, other.data); | |
624 | std::swap(on_applied, other.on_applied); | |
625 | std::swap(on_commit, other.on_commit); | |
626 | std::swap(on_applied_sync, other.on_applied_sync); | |
627 | ||
628 | std::swap(coll_index, other.coll_index); | |
629 | std::swap(object_index, other.object_index); | |
630 | std::swap(coll_id, other.coll_id); | |
631 | std::swap(object_id, other.object_id); | |
632 | op_bl.swap(other.op_bl); | |
633 | data_bl.swap(other.data_bl); | |
634 | } | |
635 | ||
636 | void _update_op(Op* op, | |
637 | vector<__le32> &cm, | |
638 | vector<__le32> &om) { | |
639 | ||
640 | switch (op->op) { | |
641 | case OP_NOP: | |
642 | case OP_STARTSYNC: | |
643 | break; | |
644 | ||
645 | case OP_TOUCH: | |
646 | case OP_REMOVE: | |
647 | case OP_SETATTR: | |
648 | case OP_SETATTRS: | |
649 | case OP_RMATTR: | |
650 | case OP_RMATTRS: | |
651 | case OP_COLL_REMOVE: | |
652 | case OP_OMAP_CLEAR: | |
653 | case OP_OMAP_SETKEYS: | |
654 | case OP_OMAP_RMKEYS: | |
655 | case OP_OMAP_RMKEYRANGE: | |
656 | case OP_OMAP_SETHEADER: | |
657 | case OP_WRITE: | |
658 | case OP_ZERO: | |
659 | case OP_TRUNCATE: | |
660 | case OP_SETALLOCHINT: | |
661 | assert(op->cid < cm.size()); | |
662 | assert(op->oid < om.size()); | |
663 | op->cid = cm[op->cid]; | |
664 | op->oid = om[op->oid]; | |
665 | break; | |
666 | ||
667 | case OP_CLONERANGE2: | |
668 | case OP_CLONE: | |
669 | assert(op->cid < cm.size()); | |
670 | assert(op->oid < om.size()); | |
671 | assert(op->dest_oid < om.size()); | |
672 | op->cid = cm[op->cid]; | |
673 | op->oid = om[op->oid]; | |
674 | op->dest_oid = om[op->dest_oid]; | |
675 | break; | |
676 | ||
677 | case OP_MKCOLL: | |
678 | case OP_RMCOLL: | |
679 | case OP_COLL_SETATTR: | |
680 | case OP_COLL_RMATTR: | |
681 | case OP_COLL_SETATTRS: | |
682 | case OP_COLL_HINT: | |
683 | case OP_COLL_SET_BITS: | |
684 | assert(op->cid < cm.size()); | |
685 | op->cid = cm[op->cid]; | |
686 | break; | |
687 | ||
688 | case OP_COLL_ADD: | |
689 | assert(op->cid < cm.size()); | |
690 | assert(op->oid < om.size()); | |
691 | assert(op->dest_cid < om.size()); | |
692 | op->cid = cm[op->cid]; | |
693 | op->dest_cid = cm[op->dest_cid]; | |
694 | op->oid = om[op->oid]; | |
695 | break; | |
696 | ||
697 | case OP_COLL_MOVE_RENAME: | |
698 | assert(op->cid < cm.size()); | |
699 | assert(op->oid < om.size()); | |
700 | assert(op->dest_cid < cm.size()); | |
701 | assert(op->dest_oid < om.size()); | |
702 | op->cid = cm[op->cid]; | |
703 | op->oid = om[op->oid]; | |
704 | op->dest_cid = cm[op->dest_cid]; | |
705 | op->dest_oid = om[op->dest_oid]; | |
706 | break; | |
707 | ||
708 | case OP_TRY_RENAME: | |
709 | assert(op->cid < cm.size()); | |
710 | assert(op->oid < om.size()); | |
711 | assert(op->dest_oid < om.size()); | |
712 | op->cid = cm[op->cid]; | |
713 | op->oid = om[op->oid]; | |
714 | op->dest_oid = om[op->dest_oid]; | |
715 | break; | |
716 | ||
717 | case OP_SPLIT_COLLECTION2: | |
718 | assert(op->cid < cm.size()); | |
719 | assert(op->dest_cid < cm.size()); | |
720 | op->cid = cm[op->cid]; | |
721 | op->dest_cid = cm[op->dest_cid]; | |
722 | break; | |
723 | ||
724 | default: | |
725 | assert(0 == "Unkown OP"); | |
726 | } | |
727 | } | |
728 | void _update_op_bl( | |
729 | bufferlist& bl, | |
730 | vector<__le32> &cm, | |
731 | vector<__le32> &om) { | |
732 | ||
733 | list<bufferptr> list = bl.buffers(); | |
734 | std::list<bufferptr>::iterator p; | |
735 | ||
736 | for(p = list.begin(); p != list.end(); ++p) { | |
737 | assert(p->length() % sizeof(Op) == 0); | |
738 | ||
739 | char* raw_p = p->c_str(); | |
740 | char* raw_end = raw_p + p->length(); | |
741 | while (raw_p < raw_end) { | |
742 | _update_op(reinterpret_cast<Op*>(raw_p), cm, om); | |
743 | raw_p += sizeof(Op); | |
744 | } | |
745 | } | |
746 | } | |
747 | /// Append the operations of the parameter to this Transaction. Those operations are removed from the parameter Transaction | |
748 | void append(Transaction& other) { | |
749 | ||
750 | data.ops += other.data.ops; | |
751 | if (other.data.largest_data_len > data.largest_data_len) { | |
752 | data.largest_data_len = other.data.largest_data_len; | |
753 | data.largest_data_off = other.data.largest_data_off; | |
754 | data.largest_data_off_in_data_bl = data_bl.length() + other.data.largest_data_off_in_data_bl; | |
755 | } | |
756 | data.fadvise_flags |= other.data.fadvise_flags; | |
757 | on_applied.splice(on_applied.end(), other.on_applied); | |
758 | on_commit.splice(on_commit.end(), other.on_commit); | |
759 | on_applied_sync.splice(on_applied_sync.end(), other.on_applied_sync); | |
760 | ||
761 | //append coll_index & object_index | |
762 | vector<__le32> cm(other.coll_index.size()); | |
763 | map<coll_t, __le32>::iterator coll_index_p; | |
764 | for (coll_index_p = other.coll_index.begin(); | |
765 | coll_index_p != other.coll_index.end(); | |
766 | ++coll_index_p) { | |
767 | cm[coll_index_p->second] = _get_coll_id(coll_index_p->first); | |
768 | } | |
769 | ||
770 | vector<__le32> om(other.object_index.size()); | |
771 | map<ghobject_t, __le32>::iterator object_index_p; | |
772 | for (object_index_p = other.object_index.begin(); | |
773 | object_index_p != other.object_index.end(); | |
774 | ++object_index_p) { | |
775 | om[object_index_p->second] = _get_object_id(object_index_p->first); | |
776 | } | |
777 | ||
778 | //the other.op_bl SHOULD NOT be changes during append operation, | |
779 | //we use additional bufferlist to avoid this problem | |
780 | bufferptr other_op_bl_ptr(other.op_bl.length()); | |
781 | other.op_bl.copy(0, other.op_bl.length(), other_op_bl_ptr.c_str()); | |
782 | bufferlist other_op_bl; | |
783 | other_op_bl.append(other_op_bl_ptr); | |
784 | ||
785 | //update other_op_bl with cm & om | |
786 | //When the other is appended to current transaction, all coll_index and | |
787 | //object_index in other.op_buffer should be updated by new index of the | |
788 | //combined transaction | |
789 | _update_op_bl(other_op_bl, cm, om); | |
790 | ||
791 | //append op_bl | |
792 | op_bl.append(other_op_bl); | |
793 | //append data_bl | |
794 | data_bl.append(other.data_bl); | |
795 | } | |
796 | ||
797 | /** Inquires about the Transaction as a whole. */ | |
798 | ||
799 | /// How big is the encoded Transaction buffer? | |
800 | uint64_t get_encoded_bytes() { | |
801 | //layout: data_bl + op_bl + coll_index + object_index + data | |
802 | ||
803 | // coll_index size, object_index size and sizeof(transaction_data) | |
804 | // all here, so they may be computed at compile-time | |
805 | size_t final_size = sizeof(__u32) * 2 + sizeof(data); | |
806 | ||
807 | // coll_index second and object_index second | |
808 | final_size += (coll_index.size() + object_index.size()) * sizeof(__le32); | |
809 | ||
810 | // coll_index first | |
811 | for (auto p = coll_index.begin(); p != coll_index.end(); ++p) { | |
812 | final_size += p->first.encoded_size(); | |
813 | } | |
814 | ||
815 | // object_index first | |
816 | for (auto p = object_index.begin(); p != object_index.end(); ++p) { | |
817 | final_size += p->first.encoded_size(); | |
818 | } | |
819 | ||
820 | return data_bl.length() + | |
821 | op_bl.length() + | |
822 | final_size; | |
823 | } | |
824 | ||
825 | /// Retain old version for regression testing purposes | |
826 | uint64_t get_encoded_bytes_test() { | |
827 | //layout: data_bl + op_bl + coll_index + object_index + data | |
828 | bufferlist bl; | |
829 | ::encode(coll_index, bl); | |
830 | ::encode(object_index, bl); | |
831 | ||
832 | return data_bl.length() + | |
833 | op_bl.length() + | |
834 | bl.length() + | |
835 | sizeof(data); | |
836 | } | |
837 | ||
838 | uint64_t get_num_bytes() { | |
839 | return get_encoded_bytes(); | |
840 | } | |
841 | /// Size of largest data buffer to the "write" operation encountered so far | |
842 | uint32_t get_data_length() { | |
843 | return data.largest_data_len; | |
844 | } | |
845 | /// offset within the encoded buffer to the start of the largest data buffer that's encoded | |
846 | uint32_t get_data_offset() { | |
847 | if (data.largest_data_off_in_data_bl) { | |
848 | return data.largest_data_off_in_data_bl + | |
849 | sizeof(__u8) + // encode struct_v | |
850 | sizeof(__u8) + // encode compat_v | |
851 | sizeof(__u32) + // encode len | |
852 | sizeof(__u32); // data_bl len | |
853 | } | |
854 | return 0; // none | |
855 | } | |
856 | /// offset of buffer as aligned to destination within object. | |
857 | int get_data_alignment() { | |
858 | if (!data.largest_data_len) | |
859 | return -1; | |
860 | return (0 - get_data_offset()) & ~CEPH_PAGE_MASK; | |
861 | } | |
862 | /// Is the Transaction empty (no operations) | |
863 | bool empty() { | |
864 | return !data.ops; | |
865 | } | |
866 | /// Number of operations in the transaction | |
867 | int get_num_ops() { | |
868 | return data.ops; | |
869 | } | |
870 | ||
871 | void set_osr(void *s) { | |
872 | osr = s; | |
873 | } | |
874 | ||
875 | void *get_osr() { | |
876 | return osr; | |
877 | } | |
878 | ||
879 | /** | |
880 | * iterator | |
881 | * | |
882 | * Helper object to parse Transactions. | |
883 | * | |
884 | * ObjectStore instances use this object to step down the encoded | |
885 | * buffer decoding operation codes and parameters as we go. | |
886 | * | |
887 | */ | |
888 | class iterator { | |
889 | Transaction *t; | |
890 | ||
891 | uint64_t ops; | |
892 | char* op_buffer_p; | |
893 | ||
894 | bufferlist::iterator data_bl_p; | |
895 | ||
896 | public: | |
897 | vector<coll_t> colls; | |
898 | vector<ghobject_t> objects; | |
899 | ||
900 | private: | |
901 | explicit iterator(Transaction *t) | |
902 | : t(t), | |
903 | data_bl_p(t->data_bl.begin()), | |
904 | colls(t->coll_index.size()), | |
905 | objects(t->object_index.size()) { | |
906 | ||
907 | ops = t->data.ops; | |
908 | op_buffer_p = t->op_bl.get_contiguous(0, t->data.ops * sizeof(Op)); | |
909 | ||
910 | map<coll_t, __le32>::iterator coll_index_p; | |
911 | for (coll_index_p = t->coll_index.begin(); | |
912 | coll_index_p != t->coll_index.end(); | |
913 | ++coll_index_p) { | |
914 | colls[coll_index_p->second] = coll_index_p->first; | |
915 | } | |
916 | ||
917 | map<ghobject_t, __le32>::iterator object_index_p; | |
918 | for (object_index_p = t->object_index.begin(); | |
919 | object_index_p != t->object_index.end(); | |
920 | ++object_index_p) { | |
921 | objects[object_index_p->second] = object_index_p->first; | |
922 | } | |
923 | } | |
924 | ||
925 | friend class Transaction; | |
926 | ||
927 | public: | |
928 | ||
929 | bool have_op() { | |
930 | return ops > 0; | |
931 | } | |
932 | Op* decode_op() { | |
933 | assert(ops > 0); | |
934 | ||
935 | Op* op = reinterpret_cast<Op*>(op_buffer_p); | |
936 | op_buffer_p += sizeof(Op); | |
937 | ops--; | |
938 | ||
939 | return op; | |
940 | } | |
941 | string decode_string() { | |
942 | string s; | |
943 | ::decode(s, data_bl_p); | |
944 | return s; | |
945 | } | |
946 | void decode_bp(bufferptr& bp) { | |
947 | ::decode(bp, data_bl_p); | |
948 | } | |
949 | void decode_bl(bufferlist& bl) { | |
950 | ::decode(bl, data_bl_p); | |
951 | } | |
952 | void decode_attrset(map<string,bufferptr>& aset) { | |
953 | ::decode(aset, data_bl_p); | |
954 | } | |
955 | void decode_attrset(map<string,bufferlist>& aset) { | |
956 | ::decode(aset, data_bl_p); | |
957 | } | |
958 | void decode_attrset_bl(bufferlist *pbl) { | |
959 | decode_str_str_map_to_bl(data_bl_p, pbl); | |
960 | } | |
961 | void decode_keyset(set<string> &keys){ | |
962 | ::decode(keys, data_bl_p); | |
963 | } | |
964 | void decode_keyset_bl(bufferlist *pbl){ | |
965 | decode_str_set_to_bl(data_bl_p, pbl); | |
966 | } | |
967 | ||
968 | const ghobject_t &get_oid(__le32 oid_id) { | |
969 | assert(oid_id < objects.size()); | |
970 | return objects[oid_id]; | |
971 | } | |
972 | const coll_t &get_cid(__le32 cid_id) { | |
973 | assert(cid_id < colls.size()); | |
974 | return colls[cid_id]; | |
975 | } | |
976 | uint32_t get_fadvise_flags() const { | |
977 | return t->get_fadvise_flags(); | |
978 | } | |
979 | }; | |
980 | ||
981 | iterator begin() { | |
982 | return iterator(this); | |
983 | } | |
984 | ||
985 | private: | |
// Declared only; no body appears at this point in the header.
void _build_actions_from_tbl();
987 | ||
988 | /** | |
989 | * Helper functions to encode the various mutation elements of a | |
990 | * transaction. These are 1:1 with the operation codes (see | |
991 | * enumeration above). These routines ensure that the | |
992 | * encoder/creator of a transaction gets the right data in the | |
993 | * right place. Sadly, there's no corresponding version nor any | |
994 | * form of seat belts for the decoder. | |
995 | */ | |
996 | Op* _get_next_op() { | |
997 | if (op_ptr.length() == 0 || op_ptr.offset() >= op_ptr.length()) { | |
998 | op_ptr = bufferptr(sizeof(Op) * OPS_PER_PTR); | |
999 | } | |
1000 | bufferptr ptr(op_ptr, 0, sizeof(Op)); | |
1001 | op_bl.append(ptr); | |
1002 | ||
1003 | op_ptr.set_offset(op_ptr.offset() + sizeof(Op)); | |
1004 | ||
1005 | char* p = ptr.c_str(); | |
1006 | memset(p, 0, sizeof(Op)); | |
1007 | return reinterpret_cast<Op*>(p); | |
1008 | } | |
1009 | __le32 _get_coll_id(const coll_t& coll) { | |
1010 | map<coll_t, __le32>::iterator c = coll_index.find(coll); | |
1011 | if (c != coll_index.end()) | |
1012 | return c->second; | |
1013 | ||
1014 | __le32 index_id = coll_id++; | |
1015 | coll_index[coll] = index_id; | |
1016 | return index_id; | |
1017 | } | |
1018 | __le32 _get_object_id(const ghobject_t& oid) { | |
1019 | map<ghobject_t, __le32>::iterator o = object_index.find(oid); | |
1020 | if (o != object_index.end()) | |
1021 | return o->second; | |
1022 | ||
1023 | __le32 index_id = object_id++; | |
1024 | object_index[oid] = index_id; | |
1025 | return index_id; | |
1026 | } | |
1027 | ||
1028 | public: | |
1029 | /// Commence a global file system sync operation. | |
1030 | void start_sync() { | |
1031 | Op* _op = _get_next_op(); | |
1032 | _op->op = OP_STARTSYNC; | |
1033 | data.ops++; | |
1034 | } | |
1035 | /// noop. 'nuf said | |
1036 | void nop() { | |
1037 | Op* _op = _get_next_op(); | |
1038 | _op->op = OP_NOP; | |
1039 | data.ops++; | |
1040 | } | |
1041 | /** | |
1042 | * touch | |
1043 | * | |
1044 | * Ensure the existence of an object in a collection. Create an | |
1045 | * empty object if necessary | |
1046 | */ | |
1047 | void touch(const coll_t& cid, const ghobject_t& oid) { | |
1048 | Op* _op = _get_next_op(); | |
1049 | _op->op = OP_TOUCH; | |
1050 | _op->cid = _get_coll_id(cid); | |
1051 | _op->oid = _get_object_id(oid); | |
1052 | data.ops++; | |
1053 | } | |
1054 | /** | |
1055 | * Write data to an offset within an object. If the object is too | |
1056 | * small, it is expanded as needed. It is possible to specify an | |
1057 | * offset beyond the current end of an object and it will be | |
1058 | * expanded as needed. Simple implementations of ObjectStore will | |
1059 | * just zero the data between the old end of the object and the | |
1060 | * newly provided data. More sophisticated implementations of | |
1061 | * ObjectStore will omit the untouched data and store it as a | |
1062 | * "hole" in the file. | |
1063 | */ | |
1064 | void write(const coll_t& cid, const ghobject_t& oid, uint64_t off, uint64_t len, | |
1065 | const bufferlist& write_data, uint32_t flags = 0) { | |
1066 | uint32_t orig_len = data_bl.length(); | |
1067 | Op* _op = _get_next_op(); | |
1068 | _op->op = OP_WRITE; | |
1069 | _op->cid = _get_coll_id(cid); | |
1070 | _op->oid = _get_object_id(oid); | |
1071 | _op->off = off; | |
1072 | _op->len = len; | |
1073 | ::encode(write_data, data_bl); | |
1074 | ||
1075 | assert(len == write_data.length()); | |
1076 | data.fadvise_flags = data.fadvise_flags | flags; | |
1077 | if (write_data.length() > data.largest_data_len) { | |
1078 | data.largest_data_len = write_data.length(); | |
1079 | data.largest_data_off = off; | |
1080 | data.largest_data_off_in_data_bl = orig_len + sizeof(__u32); // we are about to | |
1081 | } | |
1082 | data.ops++; | |
1083 | } | |
1084 | /** | |
1085 | * zero out the indicated byte range within an object. Some | |
1086 | * ObjectStore instances may optimize this to release the | |
1087 | * underlying storage space. | |
1088 | */ | |
1089 | void zero(const coll_t& cid, const ghobject_t& oid, uint64_t off, uint64_t len) { | |
1090 | Op* _op = _get_next_op(); | |
1091 | _op->op = OP_ZERO; | |
1092 | _op->cid = _get_coll_id(cid); | |
1093 | _op->oid = _get_object_id(oid); | |
1094 | _op->off = off; | |
1095 | _op->len = len; | |
1096 | data.ops++; | |
1097 | } | |
1098 | /// Discard all data in the object beyond the specified size. | |
1099 | void truncate(const coll_t& cid, const ghobject_t& oid, uint64_t off) { | |
1100 | Op* _op = _get_next_op(); | |
1101 | _op->op = OP_TRUNCATE; | |
1102 | _op->cid = _get_coll_id(cid); | |
1103 | _op->oid = _get_object_id(oid); | |
1104 | _op->off = off; | |
1105 | data.ops++; | |
1106 | } | |
1107 | /// Remove an object. All four parts of the object are removed. | |
1108 | void remove(const coll_t& cid, const ghobject_t& oid) { | |
1109 | Op* _op = _get_next_op(); | |
1110 | _op->op = OP_REMOVE; | |
1111 | _op->cid = _get_coll_id(cid); | |
1112 | _op->oid = _get_object_id(oid); | |
1113 | data.ops++; | |
1114 | } | |
1115 | /// Set an xattr of an object | |
1116 | void setattr(const coll_t& cid, const ghobject_t& oid, const char* name, bufferlist& val) { | |
1117 | string n(name); | |
1118 | setattr(cid, oid, n, val); | |
1119 | } | |
1120 | /// Set an xattr of an object | |
1121 | void setattr(const coll_t& cid, const ghobject_t& oid, const string& s, bufferlist& val) { | |
1122 | Op* _op = _get_next_op(); | |
1123 | _op->op = OP_SETATTR; | |
1124 | _op->cid = _get_coll_id(cid); | |
1125 | _op->oid = _get_object_id(oid); | |
1126 | ::encode(s, data_bl); | |
1127 | ::encode(val, data_bl); | |
1128 | data.ops++; | |
1129 | } | |
1130 | /// Set multiple xattrs of an object | |
1131 | void setattrs(const coll_t& cid, const ghobject_t& oid, const map<string,bufferptr>& attrset) { | |
1132 | Op* _op = _get_next_op(); | |
1133 | _op->op = OP_SETATTRS; | |
1134 | _op->cid = _get_coll_id(cid); | |
1135 | _op->oid = _get_object_id(oid); | |
1136 | ::encode(attrset, data_bl); | |
1137 | data.ops++; | |
1138 | } | |
1139 | /// Set multiple xattrs of an object | |
1140 | void setattrs(const coll_t& cid, const ghobject_t& oid, const map<string,bufferlist>& attrset) { | |
1141 | Op* _op = _get_next_op(); | |
1142 | _op->op = OP_SETATTRS; | |
1143 | _op->cid = _get_coll_id(cid); | |
1144 | _op->oid = _get_object_id(oid); | |
1145 | ::encode(attrset, data_bl); | |
1146 | data.ops++; | |
1147 | } | |
1148 | /// remove an xattr from an object | |
1149 | void rmattr(const coll_t& cid, const ghobject_t& oid, const char *name) { | |
1150 | string n(name); | |
1151 | rmattr(cid, oid, n); | |
1152 | } | |
1153 | /// remove an xattr from an object | |
1154 | void rmattr(const coll_t& cid, const ghobject_t& oid, const string& s) { | |
1155 | Op* _op = _get_next_op(); | |
1156 | _op->op = OP_RMATTR; | |
1157 | _op->cid = _get_coll_id(cid); | |
1158 | _op->oid = _get_object_id(oid); | |
1159 | ::encode(s, data_bl); | |
1160 | data.ops++; | |
1161 | } | |
1162 | /// remove all xattrs from an object | |
1163 | void rmattrs(const coll_t& cid, const ghobject_t& oid) { | |
1164 | Op* _op = _get_next_op(); | |
1165 | _op->op = OP_RMATTRS; | |
1166 | _op->cid = _get_coll_id(cid); | |
1167 | _op->oid = _get_object_id(oid); | |
1168 | data.ops++; | |
1169 | } | |
1170 | /** | |
1171 | * Clone an object into another object. | |
1172 | * | |
1173 | * Low-cost (e.g., O(1)) cloning (if supported) is best, but | |
1174 | * fallback to an O(n) copy is allowed. All four parts of the | |
1175 | * object are cloned (data, xattrs, omap header, omap | |
1176 | * entries). | |
1177 | * | |
1178 | * The destination named object may already exist, in | |
1179 | * which case its previous contents are discarded. | |
1180 | */ | |
1181 | void clone(const coll_t& cid, const ghobject_t& oid, | |
1182 | const ghobject_t& noid) { | |
1183 | Op* _op = _get_next_op(); | |
1184 | _op->op = OP_CLONE; | |
1185 | _op->cid = _get_coll_id(cid); | |
1186 | _op->oid = _get_object_id(oid); | |
1187 | _op->dest_oid = _get_object_id(noid); | |
1188 | data.ops++; | |
1189 | } | |
1190 | /** | |
1191 | * Clone a byte range from one object to another. | |
1192 | * | |
1193 | * The data portion of the destination object receives a copy of a | |
1194 | * portion of the data from the source object. None of the other | |
1195 | * three parts of an object is copied from the source. | |
1196 | * | |
1197 | * The destination object size may be extended to the dstoff + len. | |
1198 | * | |
1199 | * The source range *must* overlap with the source object data. If it does | |
1200 | * not the result is undefined. | |
1201 | */ | |
1202 | void clone_range(const coll_t& cid, const ghobject_t& oid, | |
1203 | const ghobject_t& noid, | |
1204 | uint64_t srcoff, uint64_t srclen, uint64_t dstoff) { | |
1205 | Op* _op = _get_next_op(); | |
1206 | _op->op = OP_CLONERANGE2; | |
1207 | _op->cid = _get_coll_id(cid); | |
1208 | _op->oid = _get_object_id(oid); | |
1209 | _op->dest_oid = _get_object_id(noid); | |
1210 | _op->off = srcoff; | |
1211 | _op->len = srclen; | |
1212 | _op->dest_off = dstoff; | |
1213 | data.ops++; | |
1214 | } | |
1215 | ||
1216 | /// Create the collection | |
1217 | void create_collection(const coll_t& cid, int bits) { | |
1218 | Op* _op = _get_next_op(); | |
1219 | _op->op = OP_MKCOLL; | |
1220 | _op->cid = _get_coll_id(cid); | |
1221 | _op->split_bits = bits; | |
1222 | data.ops++; | |
1223 | } | |
1224 | ||
1225 | /** | |
1226 | * Give the collection a hint. | |
1227 | * | |
1228 | * @param cid - collection id. | |
1229 | * @param type - hint type. | |
1230 | * @param hint - the hint payload, which contains the customized | |
1231 | * data along with the hint type. | |
1232 | */ | |
1233 | void collection_hint(const coll_t& cid, uint32_t type, const bufferlist& hint) { | |
1234 | Op* _op = _get_next_op(); | |
1235 | _op->op = OP_COLL_HINT; | |
1236 | _op->cid = _get_coll_id(cid); | |
1237 | _op->hint_type = type; | |
1238 | ::encode(hint, data_bl); | |
1239 | data.ops++; | |
1240 | } | |
1241 | ||
1242 | /// remove the collection, the collection must be empty | |
1243 | void remove_collection(const coll_t& cid) { | |
1244 | Op* _op = _get_next_op(); | |
1245 | _op->op = OP_RMCOLL; | |
1246 | _op->cid = _get_coll_id(cid); | |
1247 | data.ops++; | |
1248 | } | |
1249 | void collection_move(const coll_t& cid, coll_t oldcid, const ghobject_t& oid) | |
1250 | __attribute__ ((deprecated)) { | |
1251 | // NOTE: we encode this as a fixed combo of ADD + REMOVE. they | |
1252 | // always appear together, so this is effectively a single MOVE. | |
1253 | Op* _op = _get_next_op(); | |
1254 | _op->op = OP_COLL_ADD; | |
1255 | _op->cid = _get_coll_id(oldcid); | |
1256 | _op->oid = _get_object_id(oid); | |
1257 | _op->dest_cid = _get_coll_id(cid); | |
1258 | data.ops++; | |
1259 | ||
1260 | _op = _get_next_op(); | |
1261 | _op->op = OP_COLL_REMOVE; | |
1262 | _op->cid = _get_coll_id(oldcid); | |
1263 | _op->oid = _get_object_id(oid); | |
1264 | data.ops++; | |
1265 | } | |
1266 | void collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid, | |
1267 | coll_t cid, const ghobject_t& oid) { | |
1268 | Op* _op = _get_next_op(); | |
1269 | _op->op = OP_COLL_MOVE_RENAME; | |
1270 | _op->cid = _get_coll_id(oldcid); | |
1271 | _op->oid = _get_object_id(oldoid); | |
1272 | _op->dest_cid = _get_coll_id(cid); | |
1273 | _op->dest_oid = _get_object_id(oid); | |
1274 | data.ops++; | |
1275 | } | |
1276 | void try_rename(coll_t cid, const ghobject_t& oldoid, | |
1277 | const ghobject_t& oid) { | |
1278 | Op* _op = _get_next_op(); | |
1279 | _op->op = OP_TRY_RENAME; | |
1280 | _op->cid = _get_coll_id(cid); | |
1281 | _op->oid = _get_object_id(oldoid); | |
1282 | _op->dest_oid = _get_object_id(oid); | |
1283 | data.ops++; | |
1284 | } | |
1285 | ||
1286 | /// Remove omap from oid | |
1287 | void omap_clear( | |
1288 | coll_t cid, ///< [in] Collection containing oid | |
1289 | const ghobject_t &oid ///< [in] Object from which to remove omap | |
1290 | ) { | |
1291 | Op* _op = _get_next_op(); | |
1292 | _op->op = OP_OMAP_CLEAR; | |
1293 | _op->cid = _get_coll_id(cid); | |
1294 | _op->oid = _get_object_id(oid); | |
1295 | data.ops++; | |
1296 | } | |
1297 | /// Set keys on oid omap. Replaces duplicate keys. | |
1298 | void omap_setkeys( | |
1299 | const coll_t& cid, ///< [in] Collection containing oid | |
1300 | const ghobject_t &oid, ///< [in] Object to update | |
1301 | const map<string, bufferlist> &attrset ///< [in] Replacement keys and values | |
1302 | ) { | |
1303 | Op* _op = _get_next_op(); | |
1304 | _op->op = OP_OMAP_SETKEYS; | |
1305 | _op->cid = _get_coll_id(cid); | |
1306 | _op->oid = _get_object_id(oid); | |
1307 | ::encode(attrset, data_bl); | |
1308 | data.ops++; | |
1309 | } | |
1310 | ||
1311 | /// Set keys on an oid omap (bufferlist variant). | |
1312 | void omap_setkeys( | |
1313 | coll_t cid, ///< [in] Collection containing oid | |
1314 | const ghobject_t &oid, ///< [in] Object to update | |
1315 | const bufferlist &attrset_bl ///< [in] Replacement keys and values | |
1316 | ) { | |
1317 | Op* _op = _get_next_op(); | |
1318 | _op->op = OP_OMAP_SETKEYS; | |
1319 | _op->cid = _get_coll_id(cid); | |
1320 | _op->oid = _get_object_id(oid); | |
1321 | data_bl.append(attrset_bl); | |
1322 | data.ops++; | |
1323 | } | |
1324 | ||
1325 | /// Remove keys from oid omap | |
1326 | void omap_rmkeys( | |
1327 | coll_t cid, ///< [in] Collection containing oid | |
1328 | const ghobject_t &oid, ///< [in] Object from which to remove the omap | |
1329 | const set<string> &keys ///< [in] Keys to clear | |
1330 | ) { | |
1331 | Op* _op = _get_next_op(); | |
1332 | _op->op = OP_OMAP_RMKEYS; | |
1333 | _op->cid = _get_coll_id(cid); | |
1334 | _op->oid = _get_object_id(oid); | |
1335 | ::encode(keys, data_bl); | |
1336 | data.ops++; | |
1337 | } | |
1338 | ||
1339 | /// Remove keys from oid omap | |
1340 | void omap_rmkeys( | |
1341 | coll_t cid, ///< [in] Collection containing oid | |
1342 | const ghobject_t &oid, ///< [in] Object from which to remove the omap | |
1343 | const bufferlist &keys_bl ///< [in] Keys to clear | |
1344 | ) { | |
1345 | Op* _op = _get_next_op(); | |
1346 | _op->op = OP_OMAP_RMKEYS; | |
1347 | _op->cid = _get_coll_id(cid); | |
1348 | _op->oid = _get_object_id(oid); | |
1349 | data_bl.append(keys_bl); | |
1350 | data.ops++; | |
1351 | } | |
1352 | ||
1353 | /// Remove key range from oid omap | |
1354 | void omap_rmkeyrange( | |
1355 | coll_t cid, ///< [in] Collection containing oid | |
1356 | const ghobject_t &oid, ///< [in] Object from which to remove the omap keys | |
1357 | const string& first, ///< [in] first key in range | |
1358 | const string& last ///< [in] first key past range, range is [first,last) | |
1359 | ) { | |
1360 | Op* _op = _get_next_op(); | |
1361 | _op->op = OP_OMAP_RMKEYRANGE; | |
1362 | _op->cid = _get_coll_id(cid); | |
1363 | _op->oid = _get_object_id(oid); | |
1364 | ::encode(first, data_bl); | |
1365 | ::encode(last, data_bl); | |
1366 | data.ops++; | |
1367 | } | |
1368 | ||
1369 | /// Set omap header | |
1370 | void omap_setheader( | |
1371 | coll_t cid, ///< [in] Collection containing oid | |
1372 | const ghobject_t &oid, ///< [in] Object | |
1373 | const bufferlist &bl ///< [in] Header value | |
1374 | ) { | |
1375 | Op* _op = _get_next_op(); | |
1376 | _op->op = OP_OMAP_SETHEADER; | |
1377 | _op->cid = _get_coll_id(cid); | |
1378 | _op->oid = _get_object_id(oid); | |
1379 | ::encode(bl, data_bl); | |
1380 | data.ops++; | |
1381 | } | |
1382 | ||
1383 | /// Split collection based on given prefixes, objects matching the specified bits/rem are | |
1384 | /// moved to the new collection | |
1385 | void split_collection( | |
1386 | coll_t cid, | |
1387 | uint32_t bits, | |
1388 | uint32_t rem, | |
1389 | coll_t destination) { | |
1390 | Op* _op = _get_next_op(); | |
1391 | _op->op = OP_SPLIT_COLLECTION2; | |
1392 | _op->cid = _get_coll_id(cid); | |
1393 | _op->dest_cid = _get_coll_id(destination); | |
1394 | _op->split_bits = bits; | |
1395 | _op->split_rem = rem; | |
1396 | data.ops++; | |
1397 | } | |
1398 | ||
1399 | void collection_set_bits( | |
1400 | coll_t cid, | |
1401 | int bits) { | |
1402 | Op* _op = _get_next_op(); | |
1403 | _op->op = OP_COLL_SET_BITS; | |
1404 | _op->cid = _get_coll_id(cid); | |
1405 | _op->split_bits = bits; | |
1406 | data.ops++; | |
1407 | } | |
1408 | ||
1409 | /// Set allocation hint for an object | |
1410 | /// make 0 values(expected_object_size, expected_write_size) noops for all implementations | |
1411 | void set_alloc_hint( | |
1412 | coll_t cid, | |
1413 | const ghobject_t &oid, | |
1414 | uint64_t expected_object_size, | |
1415 | uint64_t expected_write_size, | |
1416 | uint32_t flags | |
1417 | ) { | |
1418 | Op* _op = _get_next_op(); | |
1419 | _op->op = OP_SETALLOCHINT; | |
1420 | _op->cid = _get_coll_id(cid); | |
1421 | _op->oid = _get_object_id(oid); | |
1422 | _op->expected_object_size = expected_object_size; | |
1423 | _op->expected_write_size = expected_write_size; | |
1424 | _op->alloc_hint_flags = flags; | |
1425 | data.ops++; | |
1426 | } | |
1427 | ||
1428 | void encode(bufferlist& bl) const { | |
1429 | //layout: data_bl + op_bl + coll_index + object_index + data | |
1430 | ENCODE_START(9, 9, bl); | |
1431 | ::encode(data_bl, bl); | |
1432 | ::encode(op_bl, bl); | |
1433 | ::encode(coll_index, bl); | |
1434 | ::encode(object_index, bl); | |
1435 | data.encode(bl); | |
1436 | ENCODE_FINISH(bl); | |
1437 | } | |
1438 | ||
1439 | void decode(bufferlist::iterator &bl) { | |
1440 | DECODE_START(9, bl); | |
1441 | DECODE_OLDEST(9); | |
1442 | ||
1443 | ::decode(data_bl, bl); | |
1444 | ::decode(op_bl, bl); | |
1445 | ::decode(coll_index, bl); | |
1446 | ::decode(object_index, bl); | |
1447 | data.decode(bl); | |
1448 | coll_id = coll_index.size(); | |
1449 | object_id = object_index.size(); | |
1450 | ||
1451 | DECODE_FINISH(bl); | |
1452 | } | |
1453 | ||
1454 | void dump(ceph::Formatter *f); | |
1455 | static void generate_test_instances(list<Transaction*>& o); | |
1456 | }; | |
1457 | ||
1458 | // synchronous wrappers | |
1459 | unsigned apply_transaction(Sequencer *osr, Transaction&& t, Context *ondisk=0) { | |
1460 | vector<Transaction> tls; | |
1461 | tls.push_back(std::move(t)); | |
1462 | return apply_transactions(osr, tls, ondisk); | |
1463 | } | |
1464 | unsigned apply_transactions(Sequencer *osr, vector<Transaction>& tls, Context *ondisk=0); | |
1465 | ||
1466 | int queue_transaction(Sequencer *osr, Transaction&& t, Context *onreadable, Context *ondisk=0, | |
1467 | Context *onreadable_sync=0, | |
1468 | TrackedOpRef op = TrackedOpRef(), | |
1469 | ThreadPool::TPHandle *handle = NULL) { | |
1470 | vector<Transaction> tls; | |
1471 | tls.push_back(std::move(t)); | |
1472 | return queue_transactions(osr, tls, onreadable, ondisk, onreadable_sync, | |
1473 | op, handle); | |
1474 | } | |
1475 | ||
1476 | int queue_transactions(Sequencer *osr, vector<Transaction>& tls, | |
1477 | Context *onreadable, Context *ondisk=0, | |
1478 | Context *onreadable_sync=0, | |
1479 | TrackedOpRef op = TrackedOpRef(), | |
1480 | ThreadPool::TPHandle *handle = NULL) { | |
1481 | assert(!tls.empty()); | |
1482 | tls.back().register_on_applied(onreadable); | |
1483 | tls.back().register_on_commit(ondisk); | |
1484 | tls.back().register_on_applied_sync(onreadable_sync); | |
1485 | return queue_transactions(osr, tls, op, handle); | |
1486 | } | |
1487 | ||
1488 | virtual int queue_transactions( | |
1489 | Sequencer *osr, vector<Transaction>& tls, | |
1490 | TrackedOpRef op = TrackedOpRef(), | |
1491 | ThreadPool::TPHandle *handle = NULL) = 0; | |
1492 | ||
1493 | ||
1494 | int queue_transactions( | |
1495 | Sequencer *osr, | |
1496 | vector<Transaction>& tls, | |
1497 | Context *onreadable, | |
1498 | Context *oncommit, | |
1499 | Context *onreadable_sync, | |
1500 | Context *oncomplete, | |
1501 | TrackedOpRef op); | |
1502 | ||
1503 | int queue_transaction( | |
1504 | Sequencer *osr, | |
1505 | Transaction&& t, | |
1506 | Context *onreadable, | |
1507 | Context *oncommit, | |
1508 | Context *onreadable_sync, | |
1509 | Context *oncomplete, | |
1510 | TrackedOpRef op) { | |
1511 | ||
1512 | vector<Transaction> tls; | |
1513 | tls.push_back(std::move(t)); | |
1514 | return queue_transactions( | |
1515 | osr, tls, onreadable, oncommit, onreadable_sync, oncomplete, op); | |
1516 | } | |
1517 | ||
1518 | public: | |
1519 | ObjectStore(CephContext* cct, | |
1520 | const std::string& path_) : path(path_), cct(cct) {} | |
1521 | virtual ~ObjectStore() {} | |
1522 | ||
1523 | // no copying | |
1524 | explicit ObjectStore(const ObjectStore& o) = delete; | |
1525 | const ObjectStore& operator=(const ObjectStore& o) = delete; | |
1526 | ||
1527 | // versioning | |
1528 | virtual int upgrade() { | |
1529 | return 0; | |
1530 | } | |
1531 | ||
1532 | virtual void get_db_statistics(Formatter *f) { } | |
1533 | virtual void generate_db_histogram(Formatter *f) { } | |
1534 | virtual void flush_cache() { } | |
1535 | virtual void dump_perf_counters(Formatter *f) {} | |
1536 | ||
1537 | virtual string get_type() = 0; | |
1538 | ||
1539 | // mgmt | |
1540 | virtual bool test_mount_in_use() = 0; | |
1541 | virtual int mount() = 0; | |
1542 | virtual int umount() = 0; | |
1543 | virtual int fsck(bool deep) { | |
1544 | return -EOPNOTSUPP; | |
1545 | } | |
3efd9988 FG |
1546 | virtual int repair(bool deep) { |
1547 | return -EOPNOTSUPP; | |
1548 | } | |
7c673cae FG |
1549 | |
1550 | virtual void set_cache_shards(unsigned num) { } | |
1551 | ||
1552 | /** | |
1553 | * Returns 0 if the hobject is valid, -error otherwise | |
1554 | * | |
1555 | * Errors: | |
1556 | * -ENAMETOOLONG: locator/namespace/name too large | |
1557 | */ | |
1558 | virtual int validate_hobject_key(const hobject_t &obj) const = 0; | |
1559 | ||
1560 | virtual unsigned get_max_attr_name_length() = 0; | |
1561 | virtual int mkfs() = 0; // wipe | |
1562 | virtual int mkjournal() = 0; // journal only | |
1563 | virtual bool needs_journal() = 0; //< requires a journal | |
1564 | virtual bool wants_journal() = 0; //< prefers a journal | |
1565 | virtual bool allows_journal() = 0; //< allows a journal | |
1566 | ||
31f18b77 FG |
1567 | /** |
1568 | * is_rotational | |
1569 | * | |
1570 | * Check whether store is backed by a rotational (HDD) or non-rotational | |
1571 | * (SSD) device. | |
1572 | * | |
1573 | * This must be usable *before* the store is mounted. | |
1574 | * | |
1575 | * @return true for HDD, false for SSD | |
1576 | */ | |
1577 | virtual bool is_rotational() { | |
1578 | return true; | |
1579 | } | |
1580 | ||
d2e6a577 FG |
1581 | /** |
1582 | * is_journal_rotational | |
1583 | * | |
1584 | * Check whether journal is backed by a rotational (HDD) or non-rotational | |
1585 | * (SSD) device. | |
1586 | * | |
1587 | * | |
1588 | * @return true for HDD, false for SSD | |
1589 | */ | |
1590 | virtual bool is_journal_rotational() { | |
1591 | return true; | |
1592 | } | |
1593 | ||
224ce89b WB |
1594 | virtual string get_default_device_class() { |
1595 | return is_rotational() ? "hdd" : "ssd"; | |
1596 | } | |
1597 | ||
7c673cae FG |
1598 | virtual bool can_sort_nibblewise() { |
1599 | return false; // assume a backend cannot, unless it says otherwise | |
1600 | } | |
1601 | ||
1602 | virtual int statfs(struct store_statfs_t *buf) = 0; | |
1603 | ||
1604 | virtual void collect_metadata(map<string,string> *pm) { } | |
1605 | ||
1606 | /** | |
1607 | * write_meta - write a simple configuration key out-of-band | |
1608 | * | |
1609 | * Write a simple key/value pair for basic store configuration | |
1610 | * (e.g., a uuid or magic number) to an unopened/unmounted store. | |
1611 | * The default implementation writes this to a plaintext file in the | |
1612 | * path. | |
1613 | * | |
1614 | * A newline is appended. | |
1615 | * | |
1616 | * @param key key name (e.g., "fsid") | |
1617 | * @param value value (e.g., a uuid rendered as a string) | |
1618 | * @returns 0 for success, or an error code | |
1619 | */ | |
1620 | virtual int write_meta(const std::string& key, | |
1621 | const std::string& value); | |
1622 | ||
1623 | /** | |
1624 | * read_meta - read a simple configuration key out-of-band | |
1625 | * | |
1626 | * Read a simple key value from an unopened/mounted store. | |
1627 | * | |
1628 | * Trailing whitespace is stripped off. | |
1629 | * | |
1630 | * @param key key name | |
1631 | * @param value pointer to value string | |
1632 | * @returns 0 for success, or an error code | |
1633 | */ | |
1634 | virtual int read_meta(const std::string& key, | |
1635 | std::string *value); | |
1636 | ||
1637 | /** | |
1638 | * get ideal max value for collection_list() | |
1639 | * | |
1640 | * default to some arbitrary values; the implementation will override. | |
1641 | */ | |
1642 | virtual int get_ideal_list_max() { return 64; } | |
1643 | ||
1644 | ||
1645 | /** | |
1646 | * get a collection handle | |
1647 | * | |
1648 | * Provide a trivial handle as a default to avoid converting legacy | |
1649 | * implementations. | |
1650 | */ | |
1651 | virtual CollectionHandle open_collection(const coll_t &cid) { | |
1652 | return new CompatCollectionHandle(cid); | |
1653 | } | |
1654 | ||
1655 | ||
1656 | /** | |
1657 | * Synchronous read operations | |
1658 | */ | |
1659 | ||
1660 | /** | |
1661 | * exists -- Test for existence of object | |
1662 | * | |
1663 | * @param cid collection for object | |
1664 | * @param oid oid of object | |
1665 | * @returns true if object exists, false otherwise | |
1666 | */ | |
1667 | virtual bool exists(const coll_t& cid, const ghobject_t& oid) = 0; // useful? | |
1668 | virtual bool exists(CollectionHandle& c, const ghobject_t& oid) { | |
1669 | return exists(c->get_cid(), oid); | |
1670 | } | |
1671 | /** | |
1672 | * set_collection_opts -- set pool options for a collection | |
1673 | * | |
1674 | * @param cid collection | |
1675 | * @param opts new collection options | |
1676 | * @returns 0 on success, negative error code on failure. | |
1677 | */ | |
1678 | virtual int set_collection_opts( | |
1679 | const coll_t& cid, | |
1680 | const pool_opts_t& opts) = 0; | |
1681 | ||
1682 | /** | |
1683 | * stat -- get information for an object | |
1684 | * | |
1685 | * @param cid collection for object | |
1686 | * @param oid oid of object | |
1687 | * @param st output information for the object | |
1688 | * @param allow_eio if false, assert on -EIO operation failure | |
1689 | * @returns 0 on success, negative error code on failure. | |
1690 | */ | |
1691 | virtual int stat( | |
1692 | const coll_t& cid, | |
1693 | const ghobject_t& oid, | |
1694 | struct stat *st, | |
1695 | bool allow_eio = false) = 0; // struct stat? | |
1696 | virtual int stat( | |
1697 | CollectionHandle &c, | |
1698 | const ghobject_t& oid, | |
1699 | struct stat *st, | |
1700 | bool allow_eio = false) { | |
1701 | return stat(c->get_cid(), oid, st, allow_eio); | |
1702 | } | |
1703 | ||
1704 | /** | |
1705 | * read -- read a byte range of data from an object | |
1706 | * | |
1707 | * Note: if reading from an offset past the end of the object, we | |
1708 | * return 0 (not, say, -EINVAL). | |
1709 | * | |
1710 | * @param cid collection for object | |
1711 | * @param oid oid of object | |
1712 | * @param offset location offset of first byte to be read | |
1713 | * @param len number of bytes to be read | |
1714 | * @param bl output bufferlist | |
1715 | * @param op_flags is CEPH_OSD_OP_FLAG_* | |
1716 | * @param allow_eio if false, assert on -EIO operation failure | |
1717 | * @returns number of bytes read on success, or negative error code on failure. | |
1718 | */ | |
1719 | virtual int read( | |
1720 | const coll_t& cid, | |
1721 | const ghobject_t& oid, | |
1722 | uint64_t offset, | |
1723 | size_t len, | |
1724 | bufferlist& bl, | |
224ce89b | 1725 | uint32_t op_flags = 0) = 0; |
7c673cae FG |
1726 | virtual int read( |
1727 | CollectionHandle &c, | |
1728 | const ghobject_t& oid, | |
1729 | uint64_t offset, | |
1730 | size_t len, | |
1731 | bufferlist& bl, | |
224ce89b WB |
1732 | uint32_t op_flags = 0) { |
1733 | return read(c->get_cid(), oid, offset, len, bl, op_flags); | |
7c673cae FG |
1734 | } |
1735 | ||
1736 | /** | |
1737 | * fiemap -- get extent map of data of an object | |
1738 | * | |
1739 | * Returns an encoded map of the extents of an object's data portion | |
1740 | * (map<offset,size>). | |
1741 | * | |
1742 | * A non-enlightened implementation is free to return the extent (offset, len) | |
1743 | * as the sole extent. | |
1744 | * | |
1745 | * @param cid collection for object | |
1746 | * @param oid oid of object | |
1747 | * @param offset location offset of first byte to be read | |
1748 | * @param len number of bytes to be read | |
1749 | * @param bl output bufferlist for extent map information. | |
1750 | * @returns 0 on success, negative error code on failure. | |
1751 | */ | |
1752 | virtual int fiemap(const coll_t& cid, const ghobject_t& oid, | |
1753 | uint64_t offset, size_t len, bufferlist& bl) = 0; | |
1754 | virtual int fiemap(const coll_t& cid, const ghobject_t& oid, | |
1755 | uint64_t offset, size_t len, | |
1756 | map<uint64_t, uint64_t>& destmap) = 0; | |
1757 | virtual int fiemap(CollectionHandle& c, const ghobject_t& oid, | |
1758 | uint64_t offset, size_t len, bufferlist& bl) { | |
1759 | return fiemap(c->get_cid(), oid, offset, len, bl); | |
1760 | } | |
1761 | virtual int fiemap(CollectionHandle& c, const ghobject_t& oid, | |
1762 | uint64_t offset, size_t len, map<uint64_t, uint64_t>& destmap) { | |
1763 | return fiemap(c->get_cid(), oid, offset, len, destmap); | |
1764 | } | |
1765 | ||
1766 | /** | |
1767 | * getattr -- get an xattr of an object | |
1768 | * | |
1769 | * @param cid collection for object | |
1770 | * @param oid oid of object | |
1771 | * @param name name of attr to read | |
1772 | * @param value place to put output result. | |
1773 | * @returns 0 on success, negative error code on failure. | |
1774 | */ | |
1775 | virtual int getattr(const coll_t& cid, const ghobject_t& oid, | |
1776 | const char *name, bufferptr& value) = 0; | |
1777 | virtual int getattr(CollectionHandle &c, const ghobject_t& oid, | |
1778 | const char *name, bufferptr& value) { | |
1779 | return getattr(c->get_cid(), oid, name, value); | |
1780 | } | |
1781 | ||
1782 | /** | |
1783 | * getattr -- get an xattr of an object | |
1784 | * | |
1785 | * @param cid collection for object | |
1786 | * @param oid oid of object | |
1787 | * @param name name of attr to read | |
1788 | * @param value place to put output result. | |
1789 | * @returns 0 on success, negative error code on failure. | |
1790 | */ | |
1791 | int getattr(const coll_t& cid, const ghobject_t& oid, const char *name, bufferlist& value) { | |
1792 | bufferptr bp; | |
1793 | int r = getattr(cid, oid, name, bp); | |
1794 | if (bp.length()) | |
1795 | value.push_back(bp); | |
1796 | return r; | |
1797 | } | |
1798 | int getattr( | |
1799 | coll_t cid, const ghobject_t& oid, | |
1800 | const string& name, bufferlist& value) { | |
1801 | bufferptr bp; | |
1802 | int r = getattr(cid, oid, name.c_str(), bp); | |
1803 | value.push_back(bp); | |
1804 | return r; | |
1805 | } | |
1806 | int getattr( | |
1807 | CollectionHandle &c, const ghobject_t& oid, | |
1808 | const string& name, bufferlist& value) { | |
1809 | bufferptr bp; | |
1810 | int r = getattr(c, oid, name.c_str(), bp); | |
1811 | value.push_back(bp); | |
1812 | return r; | |
1813 | } | |
1814 | ||
1815 | /** | |
1816 | * getattrs -- get all of the xattrs of an object | |
1817 | * | |
1818 | * @param cid collection for object | |
1819 | * @param oid oid of object | |
1820 | * @param aset place to put output result. | |
1821 | * @returns 0 on success, negative error code on failure. | |
1822 | */ | |
1823 | virtual int getattrs(const coll_t& cid, const ghobject_t& oid, | |
1824 | map<string,bufferptr>& aset) = 0; | |
1825 | virtual int getattrs(CollectionHandle &c, const ghobject_t& oid, | |
1826 | map<string,bufferptr>& aset) { | |
1827 | return getattrs(c->get_cid(), oid, aset); | |
1828 | } | |
1829 | ||
1830 | /** | |
1831 | * getattrs -- get all of the xattrs of an object | |
1832 | * | |
1833 | * @param cid collection for object | |
1834 | * @param oid oid of object | |
1835 | * @param aset place to put output result. | |
1836 | * @returns 0 on success, negative error code on failure. | |
1837 | */ | |
1838 | int getattrs(const coll_t& cid, const ghobject_t& oid, map<string,bufferlist>& aset) { | |
1839 | map<string,bufferptr> bmap; | |
1840 | int r = getattrs(cid, oid, bmap); | |
1841 | for (map<string,bufferptr>::iterator i = bmap.begin(); | |
1842 | i != bmap.end(); | |
1843 | ++i) { | |
1844 | aset[i->first].append(i->second); | |
1845 | } | |
1846 | return r; | |
1847 | } | |
1848 | int getattrs(CollectionHandle &c, const ghobject_t& oid, | |
1849 | map<string,bufferlist>& aset) { | |
1850 | map<string,bufferptr> bmap; | |
1851 | int r = getattrs(c, oid, bmap); | |
1852 | for (map<string,bufferptr>::iterator i = bmap.begin(); | |
1853 | i != bmap.end(); | |
1854 | ++i) { | |
1855 | aset[i->first].append(i->second); | |
1856 | } | |
1857 | return r; | |
1858 | } | |
1859 | ||
1860 | ||
1861 | // collections | |
1862 | ||
1863 | /** | |
1864 | * list_collections -- get all of the collections known to this ObjectStore | |
1865 | * | |
1866 | * @param ls list of the collections in sorted order. | |
1867 | * @returns 0 on success, negative error code on failure. | |
1868 | */ | |
1869 | virtual int list_collections(vector<coll_t>& ls) = 0; | |
1870 | ||
1871 | /** | |
1872 | * does a collection exist? | |
1873 | * | |
1874 | * @param c collection | |
1875 | * @returns true if it exists, false otherwise | |
1876 | */ | |
1877 | virtual bool collection_exists(const coll_t& c) = 0; | |
1878 | ||
1879 | /** | |
1880 | * is a collection empty? | |
1881 | * | |
1882 | * @param c collection | |
1883 | * @param empty true if the specified collection is empty, false otherwise | |
1884 | * @returns 0 on success, negative error code on failure. | |
1885 | */ | |
1886 | virtual int collection_empty(const coll_t& c, bool *empty) = 0; | |
1887 | ||
1888 | /** | |
1889 | * return the number of significant bits of the coll_t::pgid. | |
1890 | * | |
1891 | * This should return what the last create_collection or split_collection | |
1892 | * set. A legacy backend may return -EAGAIN if the value is unavailable | |
1893 | * (because we upgraded from an older version, e.g., FileStore). | |
1894 | */ | |
1895 | virtual int collection_bits(const coll_t& c) = 0; | |
1896 | ||
1897 | ||
1898 | /** | |
1899 | * list contents of a collection that fall in the range [start, end), returning no more than a specified number of results | |
1900 | * | |
1901 | * @param c collection | |
1902 | * @param start list objects that sort >= this value | |
1903 | * @param end list objects that sort < this value | |
1904 | * @param max return no more than this many results | |
1905 | * @param seq return no objects with snap < seq | |
1906 | * @param ls [out] result | |
1907 | * @param next [out] next item sorts >= this value | |
1908 | * @return zero on success, or negative error | |
1909 | */ | |
1910 | virtual int collection_list(const coll_t& c, | |
1911 | const ghobject_t& start, const ghobject_t& end, | |
1912 | int max, | |
1913 | vector<ghobject_t> *ls, ghobject_t *next) = 0; | |
1914 | virtual int collection_list(CollectionHandle &c, | |
1915 | const ghobject_t& start, const ghobject_t& end, | |
1916 | int max, | |
1917 | vector<ghobject_t> *ls, ghobject_t *next) { | |
1918 | return collection_list(c->get_cid(), start, end, max, ls, next); | |
1919 | } | |
1920 | ||
1921 | ||
1922 | /// OMAP | |
1923 | /// Get omap contents | |
1924 | virtual int omap_get( | |
1925 | const coll_t& c, ///< [in] Collection containing oid | |
1926 | const ghobject_t &oid, ///< [in] Object containing omap | |
1927 | bufferlist *header, ///< [out] omap header | |
1928 | map<string, bufferlist> *out /// < [out] Key to value map | |
1929 | ) = 0; | |
1930 | virtual int omap_get( | |
1931 | CollectionHandle &c, ///< [in] Collection containing oid | |
1932 | const ghobject_t &oid, ///< [in] Object containing omap | |
1933 | bufferlist *header, ///< [out] omap header | |
1934 | map<string, bufferlist> *out /// < [out] Key to value map | |
1935 | ) { | |
1936 | return omap_get(c->get_cid(), oid, header, out); | |
1937 | } | |
1938 | ||
1939 | /// Get omap header | |
1940 | virtual int omap_get_header( | |
1941 | const coll_t& c, ///< [in] Collection containing oid | |
1942 | const ghobject_t &oid, ///< [in] Object containing omap | |
1943 | bufferlist *header, ///< [out] omap header | |
1944 | bool allow_eio = false ///< [in] don't assert on eio | |
1945 | ) = 0; | |
1946 | virtual int omap_get_header( | |
1947 | CollectionHandle &c, ///< [in] Collection containing oid | |
1948 | const ghobject_t &oid, ///< [in] Object containing omap | |
1949 | bufferlist *header, ///< [out] omap header | |
1950 | bool allow_eio = false ///< [in] don't assert on eio | |
1951 | ) { | |
1952 | return omap_get_header(c->get_cid(), oid, header, allow_eio); | |
1953 | } | |
1954 | ||
1955 | /// Get keys defined on oid | |
1956 | virtual int omap_get_keys( | |
1957 | const coll_t& c, ///< [in] Collection containing oid | |
1958 | const ghobject_t &oid, ///< [in] Object containing omap | |
1959 | set<string> *keys ///< [out] Keys defined on oid | |
1960 | ) = 0; | |
1961 | virtual int omap_get_keys( | |
1962 | CollectionHandle &c, ///< [in] Collection containing oid | |
1963 | const ghobject_t &oid, ///< [in] Object containing omap | |
1964 | set<string> *keys ///< [out] Keys defined on oid | |
1965 | ) { | |
1966 | return omap_get_keys(c->get_cid(), oid, keys); | |
1967 | } | |
1968 | ||
1969 | /// Get key values | |
1970 | virtual int omap_get_values( | |
1971 | const coll_t& c, ///< [in] Collection containing oid | |
1972 | const ghobject_t &oid, ///< [in] Object containing omap | |
1973 | const set<string> &keys, ///< [in] Keys to get | |
1974 | map<string, bufferlist> *out ///< [out] Returned keys and values | |
1975 | ) = 0; | |
1976 | virtual int omap_get_values( | |
1977 | CollectionHandle &c, ///< [in] Collection containing oid | |
1978 | const ghobject_t &oid, ///< [in] Object containing omap | |
1979 | const set<string> &keys, ///< [in] Keys to get | |
1980 | map<string, bufferlist> *out ///< [out] Returned keys and values | |
1981 | ) { | |
1982 | return omap_get_values(c->get_cid(), oid, keys, out); | |
1983 | } | |
1984 | ||
1985 | /// Filters keys into out which are defined on oid | |
1986 | virtual int omap_check_keys( | |
1987 | const coll_t& c, ///< [in] Collection containing oid | |
1988 | const ghobject_t &oid, ///< [in] Object containing omap | |
1989 | const set<string> &keys, ///< [in] Keys to check | |
1990 | set<string> *out ///< [out] Subset of keys defined on oid | |
1991 | ) = 0; | |
1992 | virtual int omap_check_keys( | |
1993 | CollectionHandle &c, ///< [in] Collection containing oid | |
1994 | const ghobject_t &oid, ///< [in] Object containing omap | |
1995 | const set<string> &keys, ///< [in] Keys to check | |
1996 | set<string> *out ///< [out] Subset of keys defined on oid | |
1997 | ) { | |
1998 | return omap_check_keys(c->get_cid(), oid, keys, out); | |
1999 | } | |
2000 | ||
2001 | /** | |
2002 | * Returns an object map iterator | |
2003 | * | |
2004 | * Warning! The returned iterator is an implicit lock on filestore | |
2005 | * operations in c. Do not use filestore methods on c while the returned | |
2006 | * iterator is live. (Filling in a transaction is no problem). | |
2007 | * | |
2008 | * @return iterator, null on error | |
2009 | */ | |
2010 | virtual ObjectMap::ObjectMapIterator get_omap_iterator( | |
2011 | const coll_t& c, ///< [in] collection | |
2012 | const ghobject_t &oid ///< [in] object | |
2013 | ) = 0; | |
2014 | virtual ObjectMap::ObjectMapIterator get_omap_iterator( | |
2015 | CollectionHandle &c, ///< [in] collection | |
2016 | const ghobject_t &oid ///< [in] object | |
2017 | ) { | |
2018 | return get_omap_iterator(c->get_cid(), oid); | |
2019 | } | |
2020 | ||
2021 | virtual int flush_journal() { return -EOPNOTSUPP; } | |
2022 | ||
2023 | virtual int dump_journal(ostream& out) { return -EOPNOTSUPP; } | |
2024 | ||
2025 | virtual int snapshot(const string& name) { return -EOPNOTSUPP; } | |
2026 | ||
2027 | /** | |
2028 | * Set and get internal fsid for this instance. No external data is modified | |
2029 | */ | |
2030 | virtual void set_fsid(uuid_d u) = 0; | |
2031 | virtual uuid_d get_fsid() = 0; | |
2032 | ||
2033 | /** | |
2034 | * Estimates the additional disk space used by the specified number of objects, caused by file allocation granularity and the metadata store | |
2035 | * - num_objects - total (including whiteouts) object count to measure used space for. | |
2036 | */ | |
2037 | virtual uint64_t estimate_objects_overhead(uint64_t num_objects) = 0; | |
2038 | ||
2039 | ||
2040 | // DEBUG | |
2041 | virtual void inject_data_error(const ghobject_t &oid) {} | |
2042 | virtual void inject_mdata_error(const ghobject_t &oid) {} | |
224ce89b WB |
2043 | |
2044 | virtual void compact() {} | |
7c673cae FG |
2045 | }; |
2046 | WRITE_CLASS_ENCODER(ObjectStore::Transaction) | |
2047 | WRITE_CLASS_ENCODER(ObjectStore::Transaction::TransactionData) | |
2048 | ||
2049 | static inline void intrusive_ptr_add_ref(ObjectStore::Sequencer_impl *s) { | |
2050 | s->get(); | |
2051 | } | |
2052 | static inline void intrusive_ptr_release(ObjectStore::Sequencer_impl *s) { | |
2053 | s->put(); | |
2054 | } | |
2055 | ||
2056 | ostream& operator<<(ostream& out, const ObjectStore::Sequencer& s); | |
2057 | ostream& operator<<(ostream& out, const ObjectStore::Transaction& tx); | |
2058 | ||
2059 | #endif |