]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | #ifndef CEPH_OBJECTSTORE_H | |
15 | #define CEPH_OBJECTSTORE_H | |
16 | ||
17 | #include "include/Context.h" | |
18 | #include "include/buffer.h" | |
19 | #include "include/types.h" | |
11fdf7f2 | 20 | #include "include/stringify.h" |
7c673cae FG |
21 | #include "osd/osd_types.h" |
22 | #include "common/TrackedOp.h" | |
23 | #include "common/WorkQueue.h" | |
24 | #include "ObjectMap.h" | |
25 | ||
26 | #include <errno.h> | |
27 | #include <sys/stat.h> | |
28 | #include <vector> | |
29 | #include <map> | |
30 | ||
11fdf7f2 | 31 | #if defined(__APPLE__) || defined(__FreeBSD__) || defined(__sun) |
7c673cae FG |
32 | #include <sys/statvfs.h> |
33 | #else | |
34 | #include <sys/vfs.h> /* or <sys/statfs.h> */ | |
11fdf7f2 | 35 | #endif |
7c673cae FG |
36 | |
37 | #define OPS_PER_PTR 32 | |
38 | ||
39 | class CephContext; | |
40 | ||
41 | using std::vector; | |
42 | using std::string; | |
43 | using std::map; | |
44 | ||
45 | namespace ceph { | |
46 | class Formatter; | |
47 | } | |
48 | ||
49 | /* | |
50 | * low-level interface to the local OSD file system | |
51 | */ | |
52 | ||
53 | class Logger; | |
11fdf7f2 | 54 | class ContextQueue; |
7c673cae FG |
55 | |
56 | static inline void encode(const map<string,bufferptr> *attrset, bufferlist &bl) { | |
11fdf7f2 | 57 | encode(*attrset, bl); |
7c673cae FG |
58 | } |
59 | ||
60 | // this isn't the best place for these, but... | |
11fdf7f2 TL |
61 | void decode_str_str_map_to_bl(bufferlist::const_iterator& p, bufferlist *out); |
62 | void decode_str_set_to_bl(bufferlist::const_iterator& p, bufferlist *out); | |
7c673cae FG |
63 | |
// Flag bits. osflagbits_t is the type of the `flags` argument taken by
// ObjectStore::create().
using osflagbits_t = uint32_t;
const int SKIP_JOURNAL_REPLAY = 0x1;  // bit 0
const int SKIP_MOUNT_OMAP = 0x2;      // bit 1
68 | ||
69 | class ObjectStore { | |
70 | protected: | |
71 | string path; | |
72 | ||
73 | public: | |
74 | CephContext* cct; | |
75 | /** | |
76 | * create - create an ObjectStore instance. | |
77 | * | |
78 | * This is invoked once at initialization time. | |
79 | * | |
80 | * @param type type of store. This is a string from the configuration file. | |
81 | * @param data path (or other descriptor) for data | |
82 | * @param journal path (or other descriptor) for journal (optional) | |
83 | * @param flags which filestores should check if applicable | |
84 | */ | |
85 | static ObjectStore *create(CephContext *cct, | |
86 | const string& type, | |
87 | const string& data, | |
88 | const string& journal, | |
89 | osflagbits_t flags = 0); | |
90 | ||
91 | /** | |
92 | * probe a block device to learn the uuid of the owning OSD | |
93 | * | |
94 | * @param cct cct | |
95 | * @param path path to device | |
96 | * @param fsid [out] osd uuid | |
97 | */ | |
98 | static int probe_block_device_fsid( | |
99 | CephContext *cct, | |
100 | const string& path, | |
101 | uuid_d *fsid); | |
102 | ||
103 | /** | |
104 | * Fetch Object Store statistics. | |
105 | * | |
106 | * Currently only latency of write and apply times are measured. | |
107 | * | |
108 | * This appears to be called with nothing locked. | |
109 | */ | |
110 | virtual objectstore_perf_stat_t get_cur_stats() = 0; | |
111 | ||
112 | /** | |
113 | * Fetch Object Store performance counters. | |
114 | * | |
115 | * | |
116 | * This appears to be called with nothing locked. | |
117 | */ | |
118 | virtual const PerfCounters* get_perf_counters() const = 0; | |
119 | ||
120 | /** | |
11fdf7f2 | 121 | * a collection also orders transactions |
7c673cae | 122 | * |
11fdf7f2 TL |
123 | * Any transactions queued under a given collection will be applied in |
124 | * sequence. Transactions queued under different collections may run | |
7c673cae FG |
125 | * in parallel. |
126 | * | |
11fdf7f2 TL |
127 | * ObjectStore users may get collection handles with open_collection() (or, |
128 | * for bootstrapping a new collection, create_new_collection()). | |
7c673cae | 129 | */ |
11fdf7f2 TL |
130 | struct CollectionImpl : public RefCountedObject { |
131 | const coll_t cid; | |
7c673cae | 132 | |
11fdf7f2 TL |
133 | CollectionImpl(const coll_t& c) |
134 | : RefCountedObject(NULL, 0), | |
135 | cid(c) {} | |
7c673cae | 136 | |
11fdf7f2 | 137 | /// wait for any queued transactions to apply |
7c673cae FG |
138 | // block until any previous transactions are visible. specifically, |
139 | // collection_list and collection_empty need to reflect prior operations. | |
140 | virtual void flush() = 0; | |
141 | ||
7c673cae FG |
142 | /** |
143 | * Async flush_commit | |
144 | * | |
145 | * There are two cases: | |
11fdf7f2 | 146 | * 1) collection is currently idle: the method returns true. c is |
7c673cae | 147 | * not touched. |
11fdf7f2 TL |
148 | * 2) collection is not idle: the method returns false and c is |
149 | * called asynchronously with a value of 0 once all transactions | |
150 | * queued on this collection prior to the call have been applied | |
7c673cae FG |
151 | * and committed. |
152 | */ | |
11fdf7f2 | 153 | virtual bool flush_commit(Context *c) = 0; |
7c673cae | 154 | |
11fdf7f2 TL |
155 | const coll_t &get_cid() { |
156 | return cid; | |
7c673cae FG |
157 | } |
158 | }; | |
7c673cae FG |
159 | typedef boost::intrusive_ptr<CollectionImpl> CollectionHandle; |
160 | ||
7c673cae FG |
161 | |
162 | /********************************* | |
163 | * | |
164 | * Object Contents and semantics | |
165 | * | |
166 | * All ObjectStore objects are identified as a named object | |
167 | * (ghobject_t and hobject_t) in a named collection (coll_t). | |
168 | * ObjectStore operations support the creation, mutation, deletion | |
169 | * and enumeration of objects within a collection. Enumeration is | |
170 | * in sorted key order (where keys are sorted by hash). Object names | |
171 | * are globally unique. | |
172 | * | |
173 | * Each object has four distinct parts: byte data, xattrs, omap_header | |
174 | * and omap entries. | |
175 | * | |
176 | * The data portion of an object is conceptually equivalent to a | |
177 | * file in a file system. Random and Partial access for both read | |
178 | * and write operations is required. The ability to have a sparse | |
179 | * implementation of the data portion of an object is beneficial for | |
180 | * some workloads, but not required. There is a system-wide limit on | |
181 | * the maximum size of an object, which is typically around 100 MB. | |
182 | * | |
183 | * Xattrs are equivalent to the extended attributes of file | |
184 | * systems. Xattrs are a set of key/value pairs. Sub-value access | |
185 | * is not required. It is possible to enumerate the set of xattrs in | |
186 | * key order. At the implementation level, xattrs are used | |
187 | * exclusively internal to Ceph and the implementer can expect the | |
188 | * total size of all of the xattrs on an object to be relatively | |
189 | * small, i.e., less than 64KB. Much of Ceph assumes that accessing | |
190 | * xattrs on temporally adjacent object accesses (recent past or | |
191 | * near future) is inexpensive. | |
192 | * | |
193 | * omap_header is a single blob of data. It can be read or written | |
194 | * in total. | |
195 | * | |
196 | * Omap entries are conceptually the same as xattrs | |
197 | * but in a different address space. In other words, you can have | |
198 | * the same key as an xattr and an omap entry and they have distinct | |
199 | * values. Enumeration of xattrs doesn't include omap entries and | |
200 | * vice versa. The size and access characteristics of omap entries | |
201 | * are very different from xattrs. In particular, the value portion | |
202 | * of an omap entry can be quite large (MBs). More importantly, the | |
203 | * interface must support efficient range queries on omap entries even | |
204 | * when there is a large number of entries. | |
205 | * | |
206 | *********************************/ | |
207 | ||
208 | /******************************* | |
209 | * | |
210 | * Collections | |
211 | * | |
212 | * A collection is simply a grouping of objects. Collections have | |
213 | * names (coll_t) and can be enumerated in order. Like an | |
214 | * individual object, a collection also has a set of xattrs. | |
215 | * | |
216 | * | |
217 | */ | |
218 | ||
219 | ||
220 | /********************************* | |
221 | * transaction | |
222 | * | |
223 | * A Transaction represents a sequence of primitive mutation | |
224 | * operations. | |
225 | * | |
226 | * Three events in the life of a Transaction result in | |
227 | * callbacks. Any Transaction can contain any number of callback | |
228 | * objects (Context) for any combination of the three classes of | |
229 | * callbacks: | |
230 | * | |
231 | * on_applied_sync, on_applied, and on_commit. | |
232 | * | |
233 | * The "on_applied" and "on_applied_sync" callbacks are invoked when | |
234 | * the modifications requested by the Transaction are visible to | |
235 | * subsequent ObjectStore operations, i.e., the results are | |
236 | * readable. The only conceptual difference between on_applied and | |
237 | * on_applied_sync is the specific thread and locking environment in | |
238 | * which the callbacks operate. "on_applied_sync" is called | |
239 | * directly by an ObjectStore execution thread. It is expected to | |
240 | * execute quickly and must not acquire any locks of the calling | |
241 | * environment. Conversely, "on_applied" is called from the separate | |
242 | * Finisher thread, meaning that it can contend for calling | |
243 | * environment locks. NB, on_applied and on_applied_sync are | |
244 | * sometimes called on_readable and on_readable_sync. | |
245 | * | |
246 | * The "on_commit" callback is also called from the Finisher thread | |
247 | * and indicates that all of the mutations have been durably | |
248 | * committed to stable storage (i.e., are now software/hardware | |
249 | * crashproof). | |
250 | * | |
251 | * At the implementation level, each mutation primitive (and its | |
252 | * associated data) can be serialized to a single buffer. That | |
253 | * serialization, however, does not copy any data, but (using the | |
254 | * bufferlist library) will reference the original buffers. This | |
255 | * implies that the buffer that contains the data being submitted | |
256 | * must remain stable until the on_commit callback completes. In | |
257 | * practice, bufferlist handles all of this for you and this | |
258 | * subtlety is only relevant if you are referencing an existing | |
259 | * buffer via buffer::raw_static. | |
260 | * | |
261 | * Some implementations of ObjectStore choose to implement their own | |
262 | * form of journaling that uses the serialized form of a | |
263 | * Transaction. This requires that the encode/decode logic properly | |
264 | * version itself and handle version upgrades that might change the | |
265 | * format of the encoded Transaction. This has already happened a | |
266 | * couple of times and the Transaction object contains some helper | |
267 | * variables that aid in this legacy decoding: | |
268 | * | |
269 | * sobject_encoding detects an older/simpler version of oid | |
270 | * present in pre-bobtail versions of ceph. use_pool_override | |
271 | * also detects a situation where the pool of an oid can be | |
11fdf7f2 TL |
272 | * overridden for legacy operations/buffers. For non-legacy |
273 | * implementations of ObjectStore, neither of these fields are | |
7c673cae FG |
274 | * relevant. |
275 | * | |
276 | * | |
277 | * TRANSACTION ISOLATION | |
278 | * | |
11fdf7f2 | 279 | * Except as noted above, isolation is the responsibility of the |
7c673cae FG |
280 | * caller. In other words, if any storage element (storage element |
281 | * == any of the four portions of an object as described above) is | |
282 | * altered by a transaction (including deletion), the caller | |
283 | * promises not to attempt to read that element while the | |
284 | * transaction is pending (here pending means from the time of | |
285 | * issuance until the "on_applied_sync" callback has been | |
286 | * received). Violations of isolation need not be detected by | |
287 | * ObjectStore and there is no corresponding error mechanism for | |
288 | * reporting an isolation violation (crashing would be the | |
289 | * appropriate way to report an isolation violation if detected). | |
290 | * | |
291 | * Enumeration operations may violate transaction isolation as | |
292 | * described above when a storage element is being created or | |
293 | * deleted as part of a transaction. In this case, ObjectStore is | |
294 | * allowed to consider the enumeration operation to either precede | |
295 | * or follow the violating transaction element. In other words, the | |
296 | * presence/absence of the mutated element in the enumeration is | |
297 | * entirely at the discretion of ObjectStore. The arbitrary ordering | |
298 | * applies independently to each transaction element. For example, | |
299 | * if a transaction contains two mutating elements "create A" and | |
300 | * "delete B", and an enumeration operation is performed while this | |
11fdf7f2 | 301 | * transaction is pending, it is permissible for ObjectStore to |
7c673cae FG |
302 | * report any of the four possible combinations of the existence of |
303 | * A and B. | |
304 | * | |
305 | */ | |
306 | class Transaction { | |
307 | public: | |
// Operation codes stored in Op::op, listed in numeric order.  The values
// are part of the encoded Transaction format, so they must never be
// renumbered.  The trailing comment on each entry names its operands.
enum {
  OP_NOP               = 0,
  OP_COLL_MOVE         = 8,   // newcid, oldcid, oid
  OP_TOUCH             = 9,   // cid, oid
  OP_WRITE             = 10,  // cid, oid, offset, len, bl
  OP_ZERO              = 11,  // cid, oid, offset, len
  OP_TRUNCATE          = 12,  // cid, oid, len
  OP_REMOVE            = 13,  // cid, oid
  OP_SETATTR           = 14,  // cid, oid, attrname, bl
  OP_SETATTRS          = 15,  // cid, oid, attrset
  OP_RMATTR            = 16,  // cid, oid, attrname
  OP_CLONE             = 17,  // cid, oid, newoid
  OP_CLONERANGE        = 18,  // cid, oid, newoid, offset, len
  OP_TRIMCACHE         = 19,  // cid, oid, offset, len  **DEPRECATED**
  OP_MKCOLL            = 20,  // cid
  OP_RMCOLL            = 21,  // cid
  OP_COLL_ADD          = 22,  // cid, oldcid, oid
  OP_COLL_REMOVE       = 23,  // cid, oid
  OP_COLL_SETATTR      = 24,  // cid, attrname, bl
  OP_COLL_RMATTR       = 25,  // cid, attrname
  OP_COLL_SETATTRS     = 26,  // cid, attrset
  OP_RMATTRS           = 28,  // cid, oid
  OP_COLL_RENAME       = 29,  // cid, newcid
  OP_CLONERANGE2       = 30,  // cid, oid, newoid, srcoff, len, dstoff
  OP_OMAP_CLEAR        = 31,  // cid, oid
  OP_OMAP_SETKEYS      = 32,  // cid, oid, attrset
  OP_OMAP_RMKEYS       = 33,  // cid, oid, keyset
  OP_OMAP_SETHEADER    = 34,  // cid, oid, header
  OP_SPLIT_COLLECTION  = 35,  // cid, bits, destination
  OP_SPLIT_COLLECTION2 = 36,  /* cid, bits, destination
                                 doesn't create the destination */
  OP_OMAP_RMKEYRANGE   = 37,  // cid, oid, firstkey, lastkey
  OP_COLL_MOVE_RENAME  = 38,  // oldcid, oldoid, newcid, newoid
  OP_SETALLOCHINT      = 39,  // cid, oid, object_size, write_size
  OP_COLL_HINT         = 40,  // cid, type, bl
  OP_TRY_RENAME        = 41,  // oldcid, oldoid, newoid
  OP_COLL_SET_BITS     = 42,  // cid, bits
  OP_MERGE_COLLECTION  = 43,  // cid, destination
};

// Transaction hint type (value carried in Op::hint_type for OP_COLL_HINT)
enum {
  COLL_HINT_EXPECTED_NUM_OBJECTS = 1,
};
360 | ||
361 | struct Op { | |
362 | __le32 op; | |
363 | __le32 cid; | |
364 | __le32 oid; | |
365 | __le64 off; | |
366 | __le64 len; | |
367 | __le32 dest_cid; | |
368 | __le32 dest_oid; //OP_CLONE, OP_CLONERANGE | |
369 | __le64 dest_off; //OP_CLONERANGE | |
370 | union { | |
371 | struct { | |
372 | __le32 hint_type; //OP_COLL_HINT | |
373 | }; | |
374 | struct { | |
375 | __le32 alloc_hint_flags; //OP_SETALLOCHINT | |
376 | }; | |
377 | }; | |
378 | __le64 expected_object_size; //OP_SETALLOCHINT | |
379 | __le64 expected_write_size; //OP_SETALLOCHINT | |
380 | __le32 split_bits; //OP_SPLIT_COLLECTION2,OP_COLL_SET_BITS, | |
381 | //OP_MKCOLL | |
382 | __le32 split_rem; //OP_SPLIT_COLLECTION2 | |
383 | } __attribute__ ((packed)) ; | |
384 | ||
385 | struct TransactionData { | |
386 | __le64 ops; | |
387 | __le32 largest_data_len; | |
388 | __le32 largest_data_off; | |
389 | __le32 largest_data_off_in_data_bl; | |
390 | __le32 fadvise_flags; | |
391 | ||
392 | TransactionData() noexcept : | |
393 | ops(0), | |
394 | largest_data_len(0), | |
395 | largest_data_off(0), | |
396 | largest_data_off_in_data_bl(0), | |
397 | fadvise_flags(0) { } | |
398 | ||
399 | // override default move operations to reset default values | |
400 | TransactionData(TransactionData&& other) noexcept : | |
401 | ops(other.ops), | |
402 | largest_data_len(other.largest_data_len), | |
403 | largest_data_off(other.largest_data_off), | |
404 | largest_data_off_in_data_bl(other.largest_data_off_in_data_bl), | |
405 | fadvise_flags(other.fadvise_flags) { | |
406 | other.ops = 0; | |
407 | other.largest_data_len = 0; | |
408 | other.largest_data_off = 0; | |
409 | other.largest_data_off_in_data_bl = 0; | |
410 | other.fadvise_flags = 0; | |
411 | } | |
412 | TransactionData& operator=(TransactionData&& other) noexcept { | |
413 | ops = other.ops; | |
414 | largest_data_len = other.largest_data_len; | |
415 | largest_data_off = other.largest_data_off; | |
416 | largest_data_off_in_data_bl = other.largest_data_off_in_data_bl; | |
417 | fadvise_flags = other.fadvise_flags; | |
418 | other.ops = 0; | |
419 | other.largest_data_len = 0; | |
420 | other.largest_data_off = 0; | |
421 | other.largest_data_off_in_data_bl = 0; | |
422 | other.fadvise_flags = 0; | |
423 | return *this; | |
424 | } | |
425 | ||
426 | TransactionData(const TransactionData& other) = default; | |
427 | TransactionData& operator=(const TransactionData& other) = default; | |
428 | ||
429 | void encode(bufferlist& bl) const { | |
430 | bl.append((char*)this, sizeof(TransactionData)); | |
431 | } | |
11fdf7f2 | 432 | void decode(bufferlist::const_iterator &bl) { |
7c673cae FG |
433 | bl.copy(sizeof(TransactionData), (char*)this); |
434 | } | |
435 | } __attribute__ ((packed)) ; | |
436 | ||
437 | private: | |
438 | TransactionData data; | |
439 | ||
7c673cae FG |
440 | map<coll_t, __le32> coll_index; |
441 | map<ghobject_t, __le32> object_index; | |
442 | ||
443 | __le32 coll_id {0}; | |
444 | __le32 object_id {0}; | |
445 | ||
446 | bufferlist data_bl; | |
447 | bufferlist op_bl; | |
448 | ||
7c673cae FG |
449 | list<Context *> on_applied; |
450 | list<Context *> on_commit; | |
451 | list<Context *> on_applied_sync; | |
452 | ||
453 | public: | |
454 | Transaction() = default; | |
455 | ||
11fdf7f2 | 456 | explicit Transaction(bufferlist::const_iterator &dp) { |
7c673cae FG |
457 | decode(dp); |
458 | } | |
459 | explicit Transaction(bufferlist &nbl) { | |
11fdf7f2 | 460 | auto dp = nbl.cbegin(); |
7c673cae FG |
461 | decode(dp); |
462 | } | |
463 | ||
464 | // override default move operations to reset default values | |
465 | Transaction(Transaction&& other) noexcept : | |
466 | data(std::move(other.data)), | |
7c673cae FG |
467 | coll_index(std::move(other.coll_index)), |
468 | object_index(std::move(other.object_index)), | |
469 | coll_id(other.coll_id), | |
470 | object_id(other.object_id), | |
471 | data_bl(std::move(other.data_bl)), | |
472 | op_bl(std::move(other.op_bl)), | |
7c673cae FG |
473 | on_applied(std::move(other.on_applied)), |
474 | on_commit(std::move(other.on_commit)), | |
475 | on_applied_sync(std::move(other.on_applied_sync)) { | |
7c673cae FG |
476 | other.coll_id = 0; |
477 | other.object_id = 0; | |
478 | } | |
479 | ||
480 | Transaction& operator=(Transaction&& other) noexcept { | |
481 | data = std::move(other.data); | |
7c673cae FG |
482 | coll_index = std::move(other.coll_index); |
483 | object_index = std::move(other.object_index); | |
484 | coll_id = other.coll_id; | |
485 | object_id = other.object_id; | |
486 | data_bl = std::move(other.data_bl); | |
487 | op_bl = std::move(other.op_bl); | |
7c673cae FG |
488 | on_applied = std::move(other.on_applied); |
489 | on_commit = std::move(other.on_commit); | |
490 | on_applied_sync = std::move(other.on_applied_sync); | |
7c673cae FG |
491 | other.coll_id = 0; |
492 | other.object_id = 0; | |
493 | return *this; | |
494 | } | |
495 | ||
496 | Transaction(const Transaction& other) = default; | |
497 | Transaction& operator=(const Transaction& other) = default; | |
498 | ||
11fdf7f2 TL |
499 | // expose object_index for FileStore::Op's benefit |
500 | const map<ghobject_t, __le32>& get_object_index() const { | |
501 | return object_index; | |
502 | } | |
503 | ||
7c673cae FG |
504 | /* Operations on callback contexts */ |
505 | void register_on_applied(Context *c) { | |
506 | if (!c) return; | |
507 | on_applied.push_back(c); | |
508 | } | |
509 | void register_on_commit(Context *c) { | |
510 | if (!c) return; | |
511 | on_commit.push_back(c); | |
512 | } | |
513 | void register_on_applied_sync(Context *c) { | |
514 | if (!c) return; | |
515 | on_applied_sync.push_back(c); | |
516 | } | |
517 | void register_on_complete(Context *c) { | |
518 | if (!c) return; | |
519 | RunOnDeleteRef _complete (std::make_shared<RunOnDelete>(c)); | |
520 | register_on_applied(new ContainerContext<RunOnDeleteRef>(_complete)); | |
521 | register_on_commit(new ContainerContext<RunOnDeleteRef>(_complete)); | |
522 | } | |
11fdf7f2 TL |
523 | bool has_contexts() const { |
524 | return | |
525 | !on_commit.empty() || | |
526 | !on_applied.empty() || | |
527 | !on_applied_sync.empty(); | |
528 | } | |
7c673cae FG |
529 | |
530 | static void collect_contexts( | |
531 | vector<Transaction>& t, | |
532 | Context **out_on_applied, | |
533 | Context **out_on_commit, | |
534 | Context **out_on_applied_sync) { | |
11fdf7f2 TL |
535 | ceph_assert(out_on_applied); |
536 | ceph_assert(out_on_commit); | |
537 | ceph_assert(out_on_applied_sync); | |
7c673cae | 538 | list<Context *> on_applied, on_commit, on_applied_sync; |
11fdf7f2 TL |
539 | for (auto& i : t) { |
540 | on_applied.splice(on_applied.end(), i.on_applied); | |
541 | on_commit.splice(on_commit.end(), i.on_commit); | |
542 | on_applied_sync.splice(on_applied_sync.end(), i.on_applied_sync); | |
7c673cae FG |
543 | } |
544 | *out_on_applied = C_Contexts::list_to_context(on_applied); | |
545 | *out_on_commit = C_Contexts::list_to_context(on_commit); | |
546 | *out_on_applied_sync = C_Contexts::list_to_context(on_applied_sync); | |
547 | } | |
11fdf7f2 TL |
548 | static void collect_contexts( |
549 | vector<Transaction>& t, | |
550 | list<Context*> *out_on_applied, | |
551 | list<Context*> *out_on_commit, | |
552 | list<Context*> *out_on_applied_sync) { | |
553 | ceph_assert(out_on_applied); | |
554 | ceph_assert(out_on_commit); | |
555 | ceph_assert(out_on_applied_sync); | |
556 | for (auto& i : t) { | |
557 | out_on_applied->splice(out_on_applied->end(), i.on_applied); | |
558 | out_on_commit->splice(out_on_commit->end(), i.on_commit); | |
559 | out_on_applied_sync->splice(out_on_applied_sync->end(), | |
560 | i.on_applied_sync); | |
561 | } | |
562 | } | |
7c673cae FG |
563 | |
564 | Context *get_on_applied() { | |
565 | return C_Contexts::list_to_context(on_applied); | |
566 | } | |
567 | Context *get_on_commit() { | |
568 | return C_Contexts::list_to_context(on_commit); | |
569 | } | |
570 | Context *get_on_applied_sync() { | |
571 | return C_Contexts::list_to_context(on_applied_sync); | |
572 | } | |
573 | ||
574 | void set_fadvise_flags(uint32_t flags) { | |
575 | data.fadvise_flags = flags; | |
576 | } | |
577 | void set_fadvise_flag(uint32_t flag) { | |
578 | data.fadvise_flags = data.fadvise_flags | flag; | |
579 | } | |
580 | uint32_t get_fadvise_flags() { return data.fadvise_flags; } | |
581 | ||
582 | void swap(Transaction& other) noexcept { | |
583 | std::swap(data, other.data); | |
584 | std::swap(on_applied, other.on_applied); | |
585 | std::swap(on_commit, other.on_commit); | |
586 | std::swap(on_applied_sync, other.on_applied_sync); | |
587 | ||
588 | std::swap(coll_index, other.coll_index); | |
589 | std::swap(object_index, other.object_index); | |
590 | std::swap(coll_id, other.coll_id); | |
591 | std::swap(object_id, other.object_id); | |
592 | op_bl.swap(other.op_bl); | |
593 | data_bl.swap(other.data_bl); | |
594 | } | |
595 | ||
596 | void _update_op(Op* op, | |
597 | vector<__le32> &cm, | |
598 | vector<__le32> &om) { | |
599 | ||
600 | switch (op->op) { | |
601 | case OP_NOP: | |
7c673cae FG |
602 | break; |
603 | ||
604 | case OP_TOUCH: | |
605 | case OP_REMOVE: | |
606 | case OP_SETATTR: | |
607 | case OP_SETATTRS: | |
608 | case OP_RMATTR: | |
609 | case OP_RMATTRS: | |
610 | case OP_COLL_REMOVE: | |
611 | case OP_OMAP_CLEAR: | |
612 | case OP_OMAP_SETKEYS: | |
613 | case OP_OMAP_RMKEYS: | |
614 | case OP_OMAP_RMKEYRANGE: | |
615 | case OP_OMAP_SETHEADER: | |
616 | case OP_WRITE: | |
617 | case OP_ZERO: | |
618 | case OP_TRUNCATE: | |
619 | case OP_SETALLOCHINT: | |
11fdf7f2 TL |
620 | ceph_assert(op->cid < cm.size()); |
621 | ceph_assert(op->oid < om.size()); | |
7c673cae FG |
622 | op->cid = cm[op->cid]; |
623 | op->oid = om[op->oid]; | |
624 | break; | |
625 | ||
626 | case OP_CLONERANGE2: | |
627 | case OP_CLONE: | |
11fdf7f2 TL |
628 | ceph_assert(op->cid < cm.size()); |
629 | ceph_assert(op->oid < om.size()); | |
630 | ceph_assert(op->dest_oid < om.size()); | |
7c673cae FG |
631 | op->cid = cm[op->cid]; |
632 | op->oid = om[op->oid]; | |
633 | op->dest_oid = om[op->dest_oid]; | |
634 | break; | |
635 | ||
636 | case OP_MKCOLL: | |
637 | case OP_RMCOLL: | |
638 | case OP_COLL_SETATTR: | |
639 | case OP_COLL_RMATTR: | |
640 | case OP_COLL_SETATTRS: | |
641 | case OP_COLL_HINT: | |
642 | case OP_COLL_SET_BITS: | |
11fdf7f2 | 643 | ceph_assert(op->cid < cm.size()); |
7c673cae FG |
644 | op->cid = cm[op->cid]; |
645 | break; | |
646 | ||
647 | case OP_COLL_ADD: | |
11fdf7f2 TL |
648 | ceph_assert(op->cid < cm.size()); |
649 | ceph_assert(op->oid < om.size()); | |
650 | ceph_assert(op->dest_cid < om.size()); | |
7c673cae FG |
651 | op->cid = cm[op->cid]; |
652 | op->dest_cid = cm[op->dest_cid]; | |
653 | op->oid = om[op->oid]; | |
654 | break; | |
655 | ||
656 | case OP_COLL_MOVE_RENAME: | |
11fdf7f2 TL |
657 | ceph_assert(op->cid < cm.size()); |
658 | ceph_assert(op->oid < om.size()); | |
659 | ceph_assert(op->dest_cid < cm.size()); | |
660 | ceph_assert(op->dest_oid < om.size()); | |
7c673cae FG |
661 | op->cid = cm[op->cid]; |
662 | op->oid = om[op->oid]; | |
663 | op->dest_cid = cm[op->dest_cid]; | |
664 | op->dest_oid = om[op->dest_oid]; | |
665 | break; | |
666 | ||
667 | case OP_TRY_RENAME: | |
11fdf7f2 TL |
668 | ceph_assert(op->cid < cm.size()); |
669 | ceph_assert(op->oid < om.size()); | |
670 | ceph_assert(op->dest_oid < om.size()); | |
7c673cae FG |
671 | op->cid = cm[op->cid]; |
672 | op->oid = om[op->oid]; | |
673 | op->dest_oid = om[op->dest_oid]; | |
674 | break; | |
675 | ||
676 | case OP_SPLIT_COLLECTION2: | |
11fdf7f2 TL |
677 | ceph_assert(op->cid < cm.size()); |
678 | ceph_assert(op->dest_cid < cm.size()); | |
679 | op->cid = cm[op->cid]; | |
680 | op->dest_cid = cm[op->dest_cid]; | |
681 | break; | |
682 | ||
683 | case OP_MERGE_COLLECTION: | |
684 | ceph_assert(op->cid < cm.size()); | |
685 | ceph_assert(op->dest_cid < cm.size()); | |
7c673cae FG |
686 | op->cid = cm[op->cid]; |
687 | op->dest_cid = cm[op->dest_cid]; | |
688 | break; | |
689 | ||
690 | default: | |
11fdf7f2 | 691 | ceph_abort_msg("Unknown OP"); |
7c673cae FG |
692 | } |
693 | } | |
694 | void _update_op_bl( | |
695 | bufferlist& bl, | |
696 | vector<__le32> &cm, | |
697 | vector<__le32> &om) { | |
11fdf7f2 TL |
698 | for (auto& bp : bl.buffers()) { |
699 | ceph_assert(bp.length() % sizeof(Op) == 0); | |
7c673cae | 700 | |
11fdf7f2 TL |
701 | char* raw_p = const_cast<char*>(bp.c_str()); |
702 | char* raw_end = raw_p + bp.length(); | |
7c673cae FG |
703 | while (raw_p < raw_end) { |
704 | _update_op(reinterpret_cast<Op*>(raw_p), cm, om); | |
705 | raw_p += sizeof(Op); | |
706 | } | |
707 | } | |
708 | } | |
709 | /// Append the operations of the parameter to this Transaction. Those operations are removed from the parameter Transaction | |
710 | void append(Transaction& other) { | |
711 | ||
712 | data.ops += other.data.ops; | |
713 | if (other.data.largest_data_len > data.largest_data_len) { | |
714 | data.largest_data_len = other.data.largest_data_len; | |
715 | data.largest_data_off = other.data.largest_data_off; | |
716 | data.largest_data_off_in_data_bl = data_bl.length() + other.data.largest_data_off_in_data_bl; | |
717 | } | |
718 | data.fadvise_flags |= other.data.fadvise_flags; | |
719 | on_applied.splice(on_applied.end(), other.on_applied); | |
720 | on_commit.splice(on_commit.end(), other.on_commit); | |
721 | on_applied_sync.splice(on_applied_sync.end(), other.on_applied_sync); | |
722 | ||
723 | //append coll_index & object_index | |
724 | vector<__le32> cm(other.coll_index.size()); | |
725 | map<coll_t, __le32>::iterator coll_index_p; | |
726 | for (coll_index_p = other.coll_index.begin(); | |
727 | coll_index_p != other.coll_index.end(); | |
728 | ++coll_index_p) { | |
729 | cm[coll_index_p->second] = _get_coll_id(coll_index_p->first); | |
730 | } | |
731 | ||
732 | vector<__le32> om(other.object_index.size()); | |
733 | map<ghobject_t, __le32>::iterator object_index_p; | |
734 | for (object_index_p = other.object_index.begin(); | |
735 | object_index_p != other.object_index.end(); | |
736 | ++object_index_p) { | |
737 | om[object_index_p->second] = _get_object_id(object_index_p->first); | |
738 | } | |
739 | ||
740 | //the other.op_bl SHOULD NOT be changes during append operation, | |
741 | //we use additional bufferlist to avoid this problem | |
7c673cae | 742 | bufferlist other_op_bl; |
11fdf7f2 TL |
743 | { |
744 | bufferptr other_op_bl_ptr(other.op_bl.length()); | |
745 | other.op_bl.copy(0, other.op_bl.length(), other_op_bl_ptr.c_str()); | |
746 | other_op_bl.append(std::move(other_op_bl_ptr)); | |
747 | } | |
7c673cae FG |
748 | |
749 | //update other_op_bl with cm & om | |
750 | //When the other is appended to current transaction, all coll_index and | |
751 | //object_index in other.op_buffer should be updated by new index of the | |
752 | //combined transaction | |
753 | _update_op_bl(other_op_bl, cm, om); | |
754 | ||
755 | //append op_bl | |
756 | op_bl.append(other_op_bl); | |
757 | //append data_bl | |
758 | data_bl.append(other.data_bl); | |
759 | } | |
760 | ||
761 | /** Inquires about the Transaction as a whole. */ | |
762 | ||
763 | /// How big is the encoded Transaction buffer? | |
764 | uint64_t get_encoded_bytes() { | |
765 | //layout: data_bl + op_bl + coll_index + object_index + data | |
766 | ||
767 | // coll_index size, object_index size and sizeof(transaction_data) | |
768 | // all here, so they may be computed at compile-time | |
769 | size_t final_size = sizeof(__u32) * 2 + sizeof(data); | |
770 | ||
771 | // coll_index second and object_index second | |
772 | final_size += (coll_index.size() + object_index.size()) * sizeof(__le32); | |
773 | ||
774 | // coll_index first | |
775 | for (auto p = coll_index.begin(); p != coll_index.end(); ++p) { | |
776 | final_size += p->first.encoded_size(); | |
777 | } | |
778 | ||
779 | // object_index first | |
780 | for (auto p = object_index.begin(); p != object_index.end(); ++p) { | |
781 | final_size += p->first.encoded_size(); | |
782 | } | |
783 | ||
784 | return data_bl.length() + | |
785 | op_bl.length() + | |
786 | final_size; | |
787 | } | |
788 | ||
789 | /// Retain old version for regression testing purposes | |
790 | uint64_t get_encoded_bytes_test() { | |
11fdf7f2 | 791 | using ceph::encode; |
7c673cae FG |
792 | //layout: data_bl + op_bl + coll_index + object_index + data |
793 | bufferlist bl; | |
11fdf7f2 TL |
794 | encode(coll_index, bl); |
795 | encode(object_index, bl); | |
7c673cae FG |
796 | |
797 | return data_bl.length() + | |
798 | op_bl.length() + | |
799 | bl.length() + | |
800 | sizeof(data); | |
801 | } | |
802 | ||
803 | uint64_t get_num_bytes() { | |
804 | return get_encoded_bytes(); | |
805 | } | |
806 | /// Size of largest data buffer to the "write" operation encountered so far | |
807 | uint32_t get_data_length() { | |
808 | return data.largest_data_len; | |
809 | } | |
810 | /// offset within the encoded buffer to the start of the largest data buffer that's encoded | |
811 | uint32_t get_data_offset() { | |
812 | if (data.largest_data_off_in_data_bl) { | |
813 | return data.largest_data_off_in_data_bl + | |
814 | sizeof(__u8) + // encode struct_v | |
815 | sizeof(__u8) + // encode compat_v | |
816 | sizeof(__u32) + // encode len | |
817 | sizeof(__u32); // data_bl len | |
818 | } | |
819 | return 0; // none | |
820 | } | |
821 | /// offset of buffer as aligned to destination within object. | |
822 | int get_data_alignment() { | |
823 | if (!data.largest_data_len) | |
11fdf7f2 | 824 | return 0; |
7c673cae FG |
825 | return (0 - get_data_offset()) & ~CEPH_PAGE_MASK; |
826 | } | |
827 | /// Is the Transaction empty (no operations) | |
828 | bool empty() { | |
829 | return !data.ops; | |
830 | } | |
11fdf7f2 | 831 | /// Number of operations in the transaction |
7c673cae FG |
832 | int get_num_ops() { |
833 | return data.ops; | |
834 | } | |
835 | ||
7c673cae FG |
836 | /** |
837 | * iterator | |
838 | * | |
839 | * Helper object to parse Transactions. | |
840 | * | |
841 | * ObjectStore instances use this object to step down the encoded | |
842 | * buffer decoding operation codes and parameters as we go. | |
843 | * | |
844 | */ | |
845 | class iterator { | |
846 | Transaction *t; | |
847 | ||
848 | uint64_t ops; | |
849 | char* op_buffer_p; | |
850 | ||
11fdf7f2 | 851 | bufferlist::const_iterator data_bl_p; |
7c673cae FG |
852 | |
853 | public: | |
854 | vector<coll_t> colls; | |
855 | vector<ghobject_t> objects; | |
856 | ||
857 | private: | |
858 | explicit iterator(Transaction *t) | |
859 | : t(t), | |
11fdf7f2 | 860 | data_bl_p(t->data_bl.cbegin()), |
7c673cae FG |
861 | colls(t->coll_index.size()), |
862 | objects(t->object_index.size()) { | |
863 | ||
864 | ops = t->data.ops; | |
11fdf7f2 | 865 | op_buffer_p = t->op_bl.c_str(); |
7c673cae FG |
866 | |
867 | map<coll_t, __le32>::iterator coll_index_p; | |
868 | for (coll_index_p = t->coll_index.begin(); | |
869 | coll_index_p != t->coll_index.end(); | |
870 | ++coll_index_p) { | |
871 | colls[coll_index_p->second] = coll_index_p->first; | |
872 | } | |
873 | ||
874 | map<ghobject_t, __le32>::iterator object_index_p; | |
875 | for (object_index_p = t->object_index.begin(); | |
876 | object_index_p != t->object_index.end(); | |
877 | ++object_index_p) { | |
878 | objects[object_index_p->second] = object_index_p->first; | |
879 | } | |
880 | } | |
881 | ||
882 | friend class Transaction; | |
883 | ||
884 | public: | |
885 | ||
886 | bool have_op() { | |
887 | return ops > 0; | |
888 | } | |
889 | Op* decode_op() { | |
11fdf7f2 | 890 | ceph_assert(ops > 0); |
7c673cae FG |
891 | |
892 | Op* op = reinterpret_cast<Op*>(op_buffer_p); | |
893 | op_buffer_p += sizeof(Op); | |
894 | ops--; | |
895 | ||
896 | return op; | |
897 | } | |
898 | string decode_string() { | |
11fdf7f2 | 899 | using ceph::decode; |
7c673cae | 900 | string s; |
11fdf7f2 | 901 | decode(s, data_bl_p); |
7c673cae FG |
902 | return s; |
903 | } | |
904 | void decode_bp(bufferptr& bp) { | |
11fdf7f2 TL |
905 | using ceph::decode; |
906 | decode(bp, data_bl_p); | |
7c673cae FG |
907 | } |
908 | void decode_bl(bufferlist& bl) { | |
11fdf7f2 TL |
909 | using ceph::decode; |
910 | decode(bl, data_bl_p); | |
7c673cae FG |
911 | } |
912 | void decode_attrset(map<string,bufferptr>& aset) { | |
11fdf7f2 TL |
913 | using ceph::decode; |
914 | decode(aset, data_bl_p); | |
7c673cae FG |
915 | } |
916 | void decode_attrset(map<string,bufferlist>& aset) { | |
11fdf7f2 TL |
917 | using ceph::decode; |
918 | decode(aset, data_bl_p); | |
7c673cae FG |
919 | } |
920 | void decode_attrset_bl(bufferlist *pbl) { | |
921 | decode_str_str_map_to_bl(data_bl_p, pbl); | |
922 | } | |
923 | void decode_keyset(set<string> &keys){ | |
11fdf7f2 TL |
924 | using ceph::decode; |
925 | decode(keys, data_bl_p); | |
7c673cae FG |
926 | } |
927 | void decode_keyset_bl(bufferlist *pbl){ | |
928 | decode_str_set_to_bl(data_bl_p, pbl); | |
929 | } | |
930 | ||
931 | const ghobject_t &get_oid(__le32 oid_id) { | |
11fdf7f2 | 932 | ceph_assert(oid_id < objects.size()); |
7c673cae FG |
933 | return objects[oid_id]; |
934 | } | |
935 | const coll_t &get_cid(__le32 cid_id) { | |
11fdf7f2 | 936 | ceph_assert(cid_id < colls.size()); |
7c673cae FG |
937 | return colls[cid_id]; |
938 | } | |
939 | uint32_t get_fadvise_flags() const { | |
940 | return t->get_fadvise_flags(); | |
941 | } | |
942 | }; | |
943 | ||
944 | iterator begin() { | |
945 | return iterator(this); | |
946 | } | |
947 | ||
948 | private: | |
949 | void _build_actions_from_tbl(); | |
950 | ||
951 | /** | |
952 | * Helper functions to encode the various mutation elements of a | |
953 | * transaction. These are 1:1 with the operation codes (see | |
954 | * enumeration above). These routines ensure that the | |
955 | * encoder/creator of a transaction gets the right data in the | |
956 | * right place. Sadly, there's no corresponding version nor any | |
957 | * form of seat belts for the decoder. | |
958 | */ | |
959 | Op* _get_next_op() { | |
11fdf7f2 TL |
960 | if (op_bl.get_append_buffer_unused_tail_length() < sizeof(Op)) { |
961 | op_bl.reserve(sizeof(Op) * OPS_PER_PTR); | |
7c673cae | 962 | } |
11fdf7f2 TL |
963 | // append_hole ensures bptr merging. Even a huge number of ops | |
964 | // shouldn't result in overpopulating bl::_buffers. | |
965 | char* const p = op_bl.append_hole(sizeof(Op)).c_str(); | |
7c673cae FG |
966 | memset(p, 0, sizeof(Op)); |
967 | return reinterpret_cast<Op*>(p); | |
968 | } | |
969 | __le32 _get_coll_id(const coll_t& coll) { | |
970 | map<coll_t, __le32>::iterator c = coll_index.find(coll); | |
971 | if (c != coll_index.end()) | |
972 | return c->second; | |
973 | ||
974 | __le32 index_id = coll_id++; | |
975 | coll_index[coll] = index_id; | |
976 | return index_id; | |
977 | } | |
978 | __le32 _get_object_id(const ghobject_t& oid) { | |
979 | map<ghobject_t, __le32>::iterator o = object_index.find(oid); | |
980 | if (o != object_index.end()) | |
981 | return o->second; | |
982 | ||
983 | __le32 index_id = object_id++; | |
984 | object_index[oid] = index_id; | |
985 | return index_id; | |
986 | } | |
987 | ||
988 | public: | |
7c673cae FG |
989 | /// noop. 'nuf said |
990 | void nop() { | |
991 | Op* _op = _get_next_op(); | |
992 | _op->op = OP_NOP; | |
993 | data.ops++; | |
994 | } | |
995 | /** | |
996 | * touch | |
997 | * | |
998 | * Ensure the existence of an object in a collection. Create an | |
999 | * empty object if necessary | |
1000 | */ | |
1001 | void touch(const coll_t& cid, const ghobject_t& oid) { | |
1002 | Op* _op = _get_next_op(); | |
1003 | _op->op = OP_TOUCH; | |
1004 | _op->cid = _get_coll_id(cid); | |
1005 | _op->oid = _get_object_id(oid); | |
1006 | data.ops++; | |
1007 | } | |
1008 | /** | |
1009 | * Write data to an offset within an object. If the object is too | |
1010 | * small, it is expanded as needed. It is possible to specify an | |
1011 | * offset beyond the current end of an object and it will be | |
1012 | * expanded as needed. Simple implementations of ObjectStore will | |
1013 | * just zero the data between the old end of the object and the | |
1014 | * newly provided data. More sophisticated implementations of | |
1015 | * ObjectStore will omit the untouched data and store it as a | |
1016 | * "hole" in the file. | |
b32b8144 FG |
1017 | * |
1018 | * Note that a 0-length write does not affect the size of the object. | |
7c673cae FG |
1019 | */ |
1020 | void write(const coll_t& cid, const ghobject_t& oid, uint64_t off, uint64_t len, | |
1021 | const bufferlist& write_data, uint32_t flags = 0) { | |
11fdf7f2 | 1022 | using ceph::encode; |
7c673cae FG |
1023 | uint32_t orig_len = data_bl.length(); |
1024 | Op* _op = _get_next_op(); | |
1025 | _op->op = OP_WRITE; | |
1026 | _op->cid = _get_coll_id(cid); | |
1027 | _op->oid = _get_object_id(oid); | |
1028 | _op->off = off; | |
1029 | _op->len = len; | |
11fdf7f2 | 1030 | encode(write_data, data_bl); |
7c673cae | 1031 | |
11fdf7f2 | 1032 | ceph_assert(len == write_data.length()); |
7c673cae FG |
1033 | data.fadvise_flags = data.fadvise_flags | flags; |
1034 | if (write_data.length() > data.largest_data_len) { | |
1035 | data.largest_data_len = write_data.length(); | |
1036 | data.largest_data_off = off; | |
1037 | data.largest_data_off_in_data_bl = orig_len + sizeof(__u32); // we are about to | |
1038 | } | |
1039 | data.ops++; | |
1040 | } | |
1041 | /** | |
1042 | * zero out the indicated byte range within an object. Some | |
1043 | * ObjectStore instances may optimize this to release the | |
1044 | * underlying storage space. | |
b32b8144 FG |
1045 | * |
1046 | * If the zero range extends beyond the end of the object, the object | |
1047 | * size is extended, just as if we were writing a buffer full of zeros. | |
1048 | * EXCEPT if the length is 0, in which case (just like a 0-length write) | |
1049 | * we do not adjust the object size. | |
7c673cae FG |
1050 | */ |
1051 | void zero(const coll_t& cid, const ghobject_t& oid, uint64_t off, uint64_t len) { | |
1052 | Op* _op = _get_next_op(); | |
1053 | _op->op = OP_ZERO; | |
1054 | _op->cid = _get_coll_id(cid); | |
1055 | _op->oid = _get_object_id(oid); | |
1056 | _op->off = off; | |
1057 | _op->len = len; | |
1058 | data.ops++; | |
1059 | } | |
1060 | /// Discard all data in the object beyond the specified size. | |
1061 | void truncate(const coll_t& cid, const ghobject_t& oid, uint64_t off) { | |
1062 | Op* _op = _get_next_op(); | |
1063 | _op->op = OP_TRUNCATE; | |
1064 | _op->cid = _get_coll_id(cid); | |
1065 | _op->oid = _get_object_id(oid); | |
1066 | _op->off = off; | |
1067 | data.ops++; | |
1068 | } | |
1069 | /// Remove an object. All four parts of the object are removed. | |
1070 | void remove(const coll_t& cid, const ghobject_t& oid) { | |
1071 | Op* _op = _get_next_op(); | |
1072 | _op->op = OP_REMOVE; | |
1073 | _op->cid = _get_coll_id(cid); | |
1074 | _op->oid = _get_object_id(oid); | |
1075 | data.ops++; | |
1076 | } | |
1077 | /// Set an xattr of an object | |
1078 | void setattr(const coll_t& cid, const ghobject_t& oid, const char* name, bufferlist& val) { | |
1079 | string n(name); | |
1080 | setattr(cid, oid, n, val); | |
1081 | } | |
1082 | /// Set an xattr of an object | |
1083 | void setattr(const coll_t& cid, const ghobject_t& oid, const string& s, bufferlist& val) { | |
11fdf7f2 | 1084 | using ceph::encode; |
7c673cae FG |
1085 | Op* _op = _get_next_op(); |
1086 | _op->op = OP_SETATTR; | |
1087 | _op->cid = _get_coll_id(cid); | |
1088 | _op->oid = _get_object_id(oid); | |
11fdf7f2 TL |
1089 | encode(s, data_bl); |
1090 | encode(val, data_bl); | |
7c673cae FG |
1091 | data.ops++; |
1092 | } | |
1093 | /// Set multiple xattrs of an object | |
1094 | void setattrs(const coll_t& cid, const ghobject_t& oid, const map<string,bufferptr>& attrset) { | |
11fdf7f2 | 1095 | using ceph::encode; |
7c673cae FG |
1096 | Op* _op = _get_next_op(); |
1097 | _op->op = OP_SETATTRS; | |
1098 | _op->cid = _get_coll_id(cid); | |
1099 | _op->oid = _get_object_id(oid); | |
11fdf7f2 | 1100 | encode(attrset, data_bl); |
7c673cae FG |
1101 | data.ops++; |
1102 | } | |
1103 | /// Set multiple xattrs of an object | |
1104 | void setattrs(const coll_t& cid, const ghobject_t& oid, const map<string,bufferlist>& attrset) { | |
11fdf7f2 | 1105 | using ceph::encode; |
7c673cae FG |
1106 | Op* _op = _get_next_op(); |
1107 | _op->op = OP_SETATTRS; | |
1108 | _op->cid = _get_coll_id(cid); | |
1109 | _op->oid = _get_object_id(oid); | |
11fdf7f2 | 1110 | encode(attrset, data_bl); |
7c673cae FG |
1111 | data.ops++; |
1112 | } | |
1113 | /// remove an xattr from an object | |
1114 | void rmattr(const coll_t& cid, const ghobject_t& oid, const char *name) { | |
1115 | string n(name); | |
1116 | rmattr(cid, oid, n); | |
1117 | } | |
1118 | /// remove an xattr from an object | |
1119 | void rmattr(const coll_t& cid, const ghobject_t& oid, const string& s) { | |
11fdf7f2 | 1120 | using ceph::encode; |
7c673cae FG |
1121 | Op* _op = _get_next_op(); |
1122 | _op->op = OP_RMATTR; | |
1123 | _op->cid = _get_coll_id(cid); | |
1124 | _op->oid = _get_object_id(oid); | |
11fdf7f2 | 1125 | encode(s, data_bl); |
7c673cae FG |
1126 | data.ops++; |
1127 | } | |
1128 | /// remove all xattrs from an object | |
1129 | void rmattrs(const coll_t& cid, const ghobject_t& oid) { | |
1130 | Op* _op = _get_next_op(); | |
1131 | _op->op = OP_RMATTRS; | |
1132 | _op->cid = _get_coll_id(cid); | |
1133 | _op->oid = _get_object_id(oid); | |
1134 | data.ops++; | |
1135 | } | |
1136 | /** | |
1137 | * Clone an object into another object. | |
1138 | * | |
1139 | * Low-cost (e.g., O(1)) cloning (if supported) is best, but | |
1140 | * fallback to an O(n) copy is allowed. All four parts of the | |
1141 | * object are cloned (data, xattrs, omap header, omap | |
1142 | * entries). | |
1143 | * | |
1144 | * The destination named object may already exist, in | |
1145 | * which case its previous contents are discarded. | |
1146 | */ | |
1147 | void clone(const coll_t& cid, const ghobject_t& oid, | |
1148 | const ghobject_t& noid) { | |
1149 | Op* _op = _get_next_op(); | |
1150 | _op->op = OP_CLONE; | |
1151 | _op->cid = _get_coll_id(cid); | |
1152 | _op->oid = _get_object_id(oid); | |
1153 | _op->dest_oid = _get_object_id(noid); | |
1154 | data.ops++; | |
1155 | } | |
1156 | /** | |
1157 | * Clone a byte range from one object to another. | |
1158 | * | |
1159 | * The data portion of the destination object receives a copy of a | |
1160 | * portion of the data from the source object. None of the other | |
1161 | * three parts of an object is copied from the source. | |
1162 | * | |
1163 | * The destination object size may be extended to the dstoff + len. | |
1164 | * | |
1165 | * The source range *must* overlap with the source object data. If it does | |
1166 | * not, the result is undefined. | |
1167 | */ | |
1168 | void clone_range(const coll_t& cid, const ghobject_t& oid, | |
1169 | const ghobject_t& noid, | |
1170 | uint64_t srcoff, uint64_t srclen, uint64_t dstoff) { | |
1171 | Op* _op = _get_next_op(); | |
1172 | _op->op = OP_CLONERANGE2; | |
1173 | _op->cid = _get_coll_id(cid); | |
1174 | _op->oid = _get_object_id(oid); | |
1175 | _op->dest_oid = _get_object_id(noid); | |
1176 | _op->off = srcoff; | |
1177 | _op->len = srclen; | |
1178 | _op->dest_off = dstoff; | |
1179 | data.ops++; | |
1180 | } | |
1181 | ||
1182 | /// Create the collection | |
1183 | void create_collection(const coll_t& cid, int bits) { | |
1184 | Op* _op = _get_next_op(); | |
1185 | _op->op = OP_MKCOLL; | |
1186 | _op->cid = _get_coll_id(cid); | |
1187 | _op->split_bits = bits; | |
1188 | data.ops++; | |
1189 | } | |
1190 | ||
1191 | /** | |
1192 | * Give the collection a hint. | |
1193 | * | |
1194 | * @param cid - collection id. | |
1195 | * @param type - hint type. | |
1196 | * @param hint - the hint payload, which contains the customized | |
1197 | * data along with the hint type. | |
1198 | */ | |
1199 | void collection_hint(const coll_t& cid, uint32_t type, const bufferlist& hint) { | |
11fdf7f2 | 1200 | using ceph::encode; |
7c673cae FG |
1201 | Op* _op = _get_next_op(); |
1202 | _op->op = OP_COLL_HINT; | |
1203 | _op->cid = _get_coll_id(cid); | |
1204 | _op->hint_type = type; | |
11fdf7f2 | 1205 | encode(hint, data_bl); |
7c673cae FG |
1206 | data.ops++; |
1207 | } | |
1208 | ||
1209 | /// remove the collection, the collection must be empty | |
1210 | void remove_collection(const coll_t& cid) { | |
1211 | Op* _op = _get_next_op(); | |
1212 | _op->op = OP_RMCOLL; | |
1213 | _op->cid = _get_coll_id(cid); | |
1214 | data.ops++; | |
1215 | } | |
11fdf7f2 | 1216 | void collection_move(const coll_t& cid, const coll_t &oldcid, const ghobject_t& oid) |
7c673cae FG |
1217 | __attribute__ ((deprecated)) { |
1218 | // NOTE: we encode this as a fixed combo of ADD + REMOVE. they | |
1219 | // always appear together, so this is effectively a single MOVE. | |
1220 | Op* _op = _get_next_op(); | |
1221 | _op->op = OP_COLL_ADD; | |
1222 | _op->cid = _get_coll_id(oldcid); | |
1223 | _op->oid = _get_object_id(oid); | |
1224 | _op->dest_cid = _get_coll_id(cid); | |
1225 | data.ops++; | |
1226 | ||
1227 | _op = _get_next_op(); | |
1228 | _op->op = OP_COLL_REMOVE; | |
1229 | _op->cid = _get_coll_id(oldcid); | |
1230 | _op->oid = _get_object_id(oid); | |
1231 | data.ops++; | |
1232 | } | |
1233 | void collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid, | |
11fdf7f2 | 1234 | const coll_t &cid, const ghobject_t& oid) { |
7c673cae FG |
1235 | Op* _op = _get_next_op(); |
1236 | _op->op = OP_COLL_MOVE_RENAME; | |
1237 | _op->cid = _get_coll_id(oldcid); | |
1238 | _op->oid = _get_object_id(oldoid); | |
1239 | _op->dest_cid = _get_coll_id(cid); | |
1240 | _op->dest_oid = _get_object_id(oid); | |
1241 | data.ops++; | |
1242 | } | |
11fdf7f2 | 1243 | void try_rename(const coll_t &cid, const ghobject_t& oldoid, |
7c673cae FG |
1244 | const ghobject_t& oid) { |
1245 | Op* _op = _get_next_op(); | |
1246 | _op->op = OP_TRY_RENAME; | |
1247 | _op->cid = _get_coll_id(cid); | |
1248 | _op->oid = _get_object_id(oldoid); | |
1249 | _op->dest_oid = _get_object_id(oid); | |
1250 | data.ops++; | |
1251 | } | |
1252 | ||
1253 | /// Remove omap from oid | |
1254 | void omap_clear( | |
11fdf7f2 | 1255 | const coll_t &cid, ///< [in] Collection containing oid |
7c673cae FG |
1256 | const ghobject_t &oid ///< [in] Object from which to remove omap |
1257 | ) { | |
1258 | Op* _op = _get_next_op(); | |
1259 | _op->op = OP_OMAP_CLEAR; | |
1260 | _op->cid = _get_coll_id(cid); | |
1261 | _op->oid = _get_object_id(oid); | |
1262 | data.ops++; | |
1263 | } | |
1264 | /// Set keys on oid omap. Replaces duplicate keys. | |
1265 | void omap_setkeys( | |
1266 | const coll_t& cid, ///< [in] Collection containing oid | |
1267 | const ghobject_t &oid, ///< [in] Object to update | |
1268 | const map<string, bufferlist> &attrset ///< [in] Replacement keys and values | |
1269 | ) { | |
11fdf7f2 | 1270 | using ceph::encode; |
7c673cae FG |
1271 | Op* _op = _get_next_op(); |
1272 | _op->op = OP_OMAP_SETKEYS; | |
1273 | _op->cid = _get_coll_id(cid); | |
1274 | _op->oid = _get_object_id(oid); | |
11fdf7f2 | 1275 | encode(attrset, data_bl); |
7c673cae FG |
1276 | data.ops++; |
1277 | } | |
1278 | ||
1279 | /// Set keys on an oid omap (bufferlist variant). | |
1280 | void omap_setkeys( | |
11fdf7f2 | 1281 | const coll_t &cid, ///< [in] Collection containing oid |
7c673cae FG |
1282 | const ghobject_t &oid, ///< [in] Object to update |
1283 | const bufferlist &attrset_bl ///< [in] Replacement keys and values | |
1284 | ) { | |
1285 | Op* _op = _get_next_op(); | |
1286 | _op->op = OP_OMAP_SETKEYS; | |
1287 | _op->cid = _get_coll_id(cid); | |
1288 | _op->oid = _get_object_id(oid); | |
1289 | data_bl.append(attrset_bl); | |
1290 | data.ops++; | |
1291 | } | |
1292 | ||
1293 | /// Remove keys from oid omap | |
1294 | void omap_rmkeys( | |
11fdf7f2 | 1295 | const coll_t &cid, ///< [in] Collection containing oid |
7c673cae FG |
1296 | const ghobject_t &oid, ///< [in] Object from which to remove the omap |
1297 | const set<string> &keys ///< [in] Keys to clear | |
1298 | ) { | |
11fdf7f2 | 1299 | using ceph::encode; |
7c673cae FG |
1300 | Op* _op = _get_next_op(); |
1301 | _op->op = OP_OMAP_RMKEYS; | |
1302 | _op->cid = _get_coll_id(cid); | |
1303 | _op->oid = _get_object_id(oid); | |
11fdf7f2 | 1304 | encode(keys, data_bl); |
7c673cae FG |
1305 | data.ops++; |
1306 | } | |
1307 | ||
1308 | /// Remove keys from oid omap | |
1309 | void omap_rmkeys( | |
11fdf7f2 | 1310 | const coll_t &cid, ///< [in] Collection containing oid |
7c673cae FG |
1311 | const ghobject_t &oid, ///< [in] Object from which to remove the omap |
1312 | const bufferlist &keys_bl ///< [in] Keys to clear | |
1313 | ) { | |
1314 | Op* _op = _get_next_op(); | |
1315 | _op->op = OP_OMAP_RMKEYS; | |
1316 | _op->cid = _get_coll_id(cid); | |
1317 | _op->oid = _get_object_id(oid); | |
1318 | data_bl.append(keys_bl); | |
1319 | data.ops++; | |
1320 | } | |
1321 | ||
1322 | /// Remove key range from oid omap | |
1323 | void omap_rmkeyrange( | |
11fdf7f2 | 1324 | const coll_t &cid, ///< [in] Collection containing oid |
7c673cae FG |
1325 | const ghobject_t &oid, ///< [in] Object from which to remove the omap keys |
1326 | const string& first, ///< [in] first key in range | |
1327 | const string& last ///< [in] first key past range, range is [first,last) | |
1328 | ) { | |
11fdf7f2 | 1329 | using ceph::encode; |
7c673cae FG |
1330 | Op* _op = _get_next_op(); |
1331 | _op->op = OP_OMAP_RMKEYRANGE; | |
1332 | _op->cid = _get_coll_id(cid); | |
1333 | _op->oid = _get_object_id(oid); | |
11fdf7f2 TL |
1334 | encode(first, data_bl); |
1335 | encode(last, data_bl); | |
7c673cae FG |
1336 | data.ops++; |
1337 | } | |
1338 | ||
1339 | /// Set omap header | |
1340 | void omap_setheader( | |
11fdf7f2 | 1341 | const coll_t &cid, ///< [in] Collection containing oid |
7c673cae FG |
1342 | const ghobject_t &oid, ///< [in] Object |
1343 | const bufferlist &bl ///< [in] Header value | |
1344 | ) { | |
11fdf7f2 | 1345 | using ceph::encode; |
7c673cae FG |
1346 | Op* _op = _get_next_op(); |
1347 | _op->op = OP_OMAP_SETHEADER; | |
1348 | _op->cid = _get_coll_id(cid); | |
1349 | _op->oid = _get_object_id(oid); | |
11fdf7f2 | 1350 | encode(bl, data_bl); |
7c673cae FG |
1351 | data.ops++; |
1352 | } | |
1353 | ||
1354 | /// Split collection based on given prefixes, objects matching the specified bits/rem are | |
1355 | /// moved to the new collection | |
1356 | void split_collection( | |
11fdf7f2 | 1357 | const coll_t &cid, |
7c673cae FG |
1358 | uint32_t bits, |
1359 | uint32_t rem, | |
11fdf7f2 | 1360 | const coll_t &destination) { |
7c673cae FG |
1361 | Op* _op = _get_next_op(); |
1362 | _op->op = OP_SPLIT_COLLECTION2; | |
1363 | _op->cid = _get_coll_id(cid); | |
1364 | _op->dest_cid = _get_coll_id(destination); | |
1365 | _op->split_bits = bits; | |
1366 | _op->split_rem = rem; | |
1367 | data.ops++; | |
1368 | } | |
1369 | ||
11fdf7f2 TL |
1370 | /// Merge collection into another. |
1371 | void merge_collection( | |
7c673cae | 1372 | coll_t cid, |
11fdf7f2 TL |
1373 | coll_t destination, |
1374 | uint32_t bits) { | |
1375 | Op* _op = _get_next_op(); | |
1376 | _op->op = OP_MERGE_COLLECTION; | |
1377 | _op->cid = _get_coll_id(cid); | |
1378 | _op->dest_cid = _get_coll_id(destination); | |
1379 | _op->split_bits = bits; | |
1380 | data.ops++; | |
1381 | } | |
1382 | ||
1383 | void collection_set_bits( | |
1384 | const coll_t &cid, | |
7c673cae FG |
1385 | int bits) { |
1386 | Op* _op = _get_next_op(); | |
1387 | _op->op = OP_COLL_SET_BITS; | |
1388 | _op->cid = _get_coll_id(cid); | |
1389 | _op->split_bits = bits; | |
1390 | data.ops++; | |
1391 | } | |
1392 | ||
1393 | /// Set allocation hint for an object | |
1394 | /// make 0 values(expected_object_size, expected_write_size) noops for all implementations | |
1395 | void set_alloc_hint( | |
11fdf7f2 | 1396 | const coll_t &cid, |
7c673cae FG |
1397 | const ghobject_t &oid, |
1398 | uint64_t expected_object_size, | |
1399 | uint64_t expected_write_size, | |
1400 | uint32_t flags | |
1401 | ) { | |
1402 | Op* _op = _get_next_op(); | |
1403 | _op->op = OP_SETALLOCHINT; | |
1404 | _op->cid = _get_coll_id(cid); | |
1405 | _op->oid = _get_object_id(oid); | |
1406 | _op->expected_object_size = expected_object_size; | |
1407 | _op->expected_write_size = expected_write_size; | |
1408 | _op->alloc_hint_flags = flags; | |
1409 | data.ops++; | |
1410 | } | |
1411 | ||
1412 | void encode(bufferlist& bl) const { | |
1413 | //layout: data_bl + op_bl + coll_index + object_index + data | |
1414 | ENCODE_START(9, 9, bl); | |
11fdf7f2 TL |
1415 | encode(data_bl, bl); |
1416 | encode(op_bl, bl); | |
1417 | encode(coll_index, bl); | |
1418 | encode(object_index, bl); | |
7c673cae FG |
1419 | data.encode(bl); |
1420 | ENCODE_FINISH(bl); | |
1421 | } | |
1422 | ||
11fdf7f2 | 1423 | void decode(bufferlist::const_iterator &bl) { |
7c673cae FG |
1424 | DECODE_START(9, bl); |
1425 | DECODE_OLDEST(9); | |
1426 | ||
11fdf7f2 TL |
1427 | decode(data_bl, bl); |
1428 | decode(op_bl, bl); | |
1429 | decode(coll_index, bl); | |
1430 | decode(object_index, bl); | |
7c673cae FG |
1431 | data.decode(bl); |
1432 | coll_id = coll_index.size(); | |
1433 | object_id = object_index.size(); | |
1434 | ||
1435 | DECODE_FINISH(bl); | |
1436 | } | |
1437 | ||
1438 | void dump(ceph::Formatter *f); | |
1439 | static void generate_test_instances(list<Transaction*>& o); | |
1440 | }; | |
1441 | ||
11fdf7f2 TL |
1442 | int queue_transaction(CollectionHandle& ch, |
1443 | Transaction&& t, | |
1444 | TrackedOpRef op = TrackedOpRef(), | |
1445 | ThreadPool::TPHandle *handle = NULL) { | |
7c673cae FG |
1446 | vector<Transaction> tls; |
1447 | tls.push_back(std::move(t)); | |
11fdf7f2 | 1448 | return queue_transactions(ch, tls, op, handle); |
7c673cae FG |
1449 | } |
1450 | ||
1451 | virtual int queue_transactions( | |
11fdf7f2 | 1452 | CollectionHandle& ch, vector<Transaction>& tls, |
7c673cae FG |
1453 | TrackedOpRef op = TrackedOpRef(), |
1454 | ThreadPool::TPHandle *handle = NULL) = 0; | |
1455 | ||
1456 | ||
7c673cae FG |
1457 | public: |
1458 | ObjectStore(CephContext* cct, | |
1459 | const std::string& path_) : path(path_), cct(cct) {} | |
1460 | virtual ~ObjectStore() {} | |
1461 | ||
1462 | // no copying | |
1463 | explicit ObjectStore(const ObjectStore& o) = delete; | |
1464 | const ObjectStore& operator=(const ObjectStore& o) = delete; | |
1465 | ||
1466 | // versioning | |
1467 | virtual int upgrade() { | |
1468 | return 0; | |
1469 | } | |
1470 | ||
1471 | virtual void get_db_statistics(Formatter *f) { } | |
1472 | virtual void generate_db_histogram(Formatter *f) { } | |
11fdf7f2 | 1473 | virtual int flush_cache(ostream *os = NULL) { return -1; } |
7c673cae | 1474 | virtual void dump_perf_counters(Formatter *f) {} |
11fdf7f2 TL |
1475 | virtual void dump_cache_stats(Formatter *f) {} |
1476 | virtual void dump_cache_stats(ostream& os) {} | |
7c673cae FG |
1477 | |
1478 | virtual string get_type() = 0; | |
1479 | ||
1480 | // mgmt | |
1481 | virtual bool test_mount_in_use() = 0; | |
1482 | virtual int mount() = 0; | |
1483 | virtual int umount() = 0; | |
1484 | virtual int fsck(bool deep) { | |
1485 | return -EOPNOTSUPP; | |
1486 | } | |
3efd9988 FG |
1487 | virtual int repair(bool deep) { |
1488 | return -EOPNOTSUPP; | |
1489 | } | |
7c673cae FG |
1490 | |
1491 | virtual void set_cache_shards(unsigned num) { } | |
1492 | ||
1493 | /** | |
1494 | * Returns 0 if the hobject is valid, -error otherwise | |
1495 | * | |
1496 | * Errors: | |
1497 | * -ENAMETOOLONG: locator/namespace/name too large | |
1498 | */ | |
1499 | virtual int validate_hobject_key(const hobject_t &obj) const = 0; | |
1500 | ||
1501 | virtual unsigned get_max_attr_name_length() = 0; | |
1502 | virtual int mkfs() = 0; // wipe | |
1503 | virtual int mkjournal() = 0; // journal only | |
1504 | virtual bool needs_journal() = 0; //< requires a journal | |
1505 | virtual bool wants_journal() = 0; //< prefers a journal | |
1506 | virtual bool allows_journal() = 0; //< allows a journal | |
1507 | ||
11fdf7f2 TL |
1508 | /// enumerate hardware devices (by 'devname', e.g., 'sda' as in /sys/block/sda) |
1509 | virtual int get_devices(std::set<string> *devls) { | |
1510 | return -EOPNOTSUPP; | |
1511 | } | |
1512 | ||
1513 | /// true if a txn is readable immediately after it is queued. | |
1514 | virtual bool is_sync_onreadable() const { | |
1515 | return true; | |
1516 | } | |
1517 | ||
31f18b77 FG |
1518 | /** |
1519 | * is_rotational | |
1520 | * | |
1521 | * Check whether store is backed by a rotational (HDD) or non-rotational | |
1522 | * (SSD) device. | |
1523 | * | |
1524 | * This must be usable *before* the store is mounted. | |
1525 | * | |
1526 | * @return true for HDD, false for SSD | |
1527 | */ | |
1528 | virtual bool is_rotational() { | |
1529 | return true; | |
1530 | } | |
1531 | ||
d2e6a577 FG |
1532 | /** |
1533 | * is_journal_rotational | |
1534 | * | |
1535 | * Check whether journal is backed by a rotational (HDD) or non-rotational | |
1536 | * (SSD) device. | |
1537 | * | |
1538 | * | |
1539 | * @return true for HDD, false for SSD | |
1540 | */ | |
1541 | virtual bool is_journal_rotational() { | |
1542 | return true; | |
1543 | } | |
1544 | ||
224ce89b WB |
1545 | virtual string get_default_device_class() { |
1546 | return is_rotational() ? "hdd" : "ssd"; | |
1547 | } | |
1548 | ||
11fdf7f2 TL |
1549 | virtual int get_numa_node( |
1550 | int *numa_node, | |
1551 | set<int> *nodes, | |
1552 | set<string> *failed) { | |
1553 | return -EOPNOTSUPP; | |
1554 | } | |
1555 | ||
1556 | ||
7c673cae FG |
1557 | virtual bool can_sort_nibblewise() { |
1558 | return false; // assume a backend cannot, unless it says otherwise | |
1559 | } | |
1560 | ||
11fdf7f2 TL |
1561 | virtual int statfs(struct store_statfs_t *buf, |
1562 | osd_alert_list_t* alerts = nullptr) = 0; | |
1563 | virtual int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf) = 0; | |
7c673cae FG |
1564 | |
1565 | virtual void collect_metadata(map<string,string> *pm) { } | |
1566 | ||
1567 | /** | |
1568 | * write_meta - write a simple configuration key out-of-band | |
1569 | * | |
1570 | * Write a simple key/value pair for basic store configuration | |
1571 | * (e.g., a uuid or magic number) to an unopened/unmounted store. | |
1572 | * The default implementation writes this to a plaintext file in the | |
1573 | * path. | |
1574 | * | |
1575 | * A newline is appended. | |
1576 | * | |
1577 | * @param key key name (e.g., "fsid") | |
1578 | * @param value value (e.g., a uuid rendered as a string) | |
1579 | * @returns 0 for success, or an error code | |
1580 | */ | |
1581 | virtual int write_meta(const std::string& key, | |
1582 | const std::string& value); | |
1583 | ||
1584 | /** | |
1585 | * read_meta - read a simple configuration key out-of-band | |
1586 | * | |
1587 | * Read a simple key value to an unopened/mounted store. | |
1588 | * | |
1589 | * Trailing whitespace is stripped off. | |
1590 | * | |
1591 | * @param key key name | |
1592 | * @param value pointer to value string | |
1593 | * @returns 0 for success, or an error code | |
1594 | */ | |
1595 | virtual int read_meta(const std::string& key, | |
1596 | std::string *value); | |
1597 | ||
1598 | /** | |
1599 | * get ideal max value for collection_list() | |
1600 | * | |
1601 | * default to some arbitrary values; the implementation will override. | |
1602 | */ | |
1603 | virtual int get_ideal_list_max() { return 64; } | |
1604 | ||
1605 | ||
1606 | /** | |
1607 | * get a collection handle | |
1608 | * | |
1609 | * Provide a trivial handle as a default to avoid converting legacy | |
1610 | * implementations. | |
1611 | */ | |
11fdf7f2 TL |
1612 | virtual CollectionHandle open_collection(const coll_t &cid) = 0; |
1613 | ||
1614 | /** | |
1615 | * get a collection handle for a soon-to-be-created collection | |
1616 | * | |
1617 | * The handle only becomes valid once it is used by a queue_transaction | |
1618 | * that includes a create_collection call; it then becomes the | |
1619 | * reference to the created collection. | |
1620 | */ | |
1621 | virtual CollectionHandle create_new_collection(const coll_t &cid) = 0; | |
7c673cae | 1622 | |
11fdf7f2 TL |
1623 | /** |
1624 | * set ContextQueue for a collection | |
1625 | * | |
1626 | * After this call, the oncommits of a Transaction are queued into | |
1627 | * commit_queue, and the osd ShardThread calls the oncommits. | |
1628 | */ | |
1629 | virtual void set_collection_commit_queue(const coll_t &cid, ContextQueue *commit_queue) = 0; | |
7c673cae FG |
1630 | |
1631 | /** | |
1632 | * Synchronous read operations | |
1633 | */ | |
1634 | ||
1635 | /** | |
1636 | * exists -- Test for existence of object | |
1637 | * | |
1638 | * @param c collection handle for object | |
1639 | * @param oid oid of object | |
1640 | * @returns true if object exists, false otherwise | |
1641 | */ | |
11fdf7f2 | 1642 | virtual bool exists(CollectionHandle& c, const ghobject_t& oid) = 0; |
7c673cae FG |
1643 | /** |
1644 | * set_collection_opts -- set pool options for a collection | |
1645 | * | |
1646 | * @param c collection handle | |
1647 | * @param opts new collection options | |
1648 | * @returns 0 on success, negative error code on failure. | |
1649 | */ | |
1650 | virtual int set_collection_opts( | |
11fdf7f2 | 1651 | CollectionHandle& c, |
7c673cae FG |
1652 | const pool_opts_t& opts) = 0; | |
1653 | ||
1654 | /** | |
1655 | * stat -- get information for an object | |
1656 | * | |
1657 | * @param c collection handle for object | |
1658 | * @param oid oid of object | |
1659 | * @param st output information for the object | |
1660 | * @param allow_eio if false (the default), assert on -EIO operation failure | |
1661 | * @returns 0 on success, negative error code on failure. | |
1662 | */ | |
7c673cae FG |
1663 | virtual int stat( |
1664 | CollectionHandle &c, | |
1665 | const ghobject_t& oid, | |
1666 | struct stat *st, | |
11fdf7f2 | 1667 | bool allow_eio = false) = 0; |
7c673cae FG |
1668 | /** |
1669 | * read -- read a byte range of data from an object | |
1670 | * | |
1671 | * Note: if reading from an offset past the end of the object, we | |
1672 | * return 0 (not, say, -EINVAL). | |
1673 | * | |
1674 | * @param c collection handle for object | |
1675 | * @param oid oid of object | |
1676 | * @param offset location offset of first byte to be read | |
1677 | * @param len number of bytes to be read | |
1678 | * @param bl output bufferlist | |
1679 | * @param op_flags is CEPH_OSD_OP_FLAG_* (defaults to 0) | |
7c673cae FG |
1680 | * @returns number of bytes read on success, or negative error code on failure. |
1681 | */ | |
7c673cae FG |
1682 | virtual int read( |
1683 | CollectionHandle &c, | |
1684 | const ghobject_t& oid, | |
1685 | uint64_t offset, | |
1686 | size_t len, | |
1687 | bufferlist& bl, | |
11fdf7f2 | 1688 | uint32_t op_flags = 0) = 0; |
7c673cae FG |
1689 | |
1690 | /** | |
1691 | * fiemap -- get extent map of data of an object | |
1692 | * | |
1693 | * Returns an encoded map of the extents of an object's data portion | |
1694 | * (map<offset,size>). Two overloads are declared: one takes bl, the other destmap. | |
1695 | * | |
1696 | * A non-enlightened implementation is free to return the extent (offset, len) | |
1697 | * as the sole extent. | |
1698 | * | |
1699 | * @param c collection handle for object | |
1700 | * @param oid oid of object | |
1701 | * @param offset location offset of first byte to be read | |
1702 | * @param len number of bytes to be read | |
1703 | * @param bl output bufferlist for extent map information (first overload). | |
1703 | * @param destmap map<uint64_t, uint64_t> taken by reference in the second overload; presumably receives the extents unencoded -- TODO confirm | |
1704 | * @returns 0 on success, negative error code on failure. | |
1705 | */ | |
7c673cae | 1706 | virtual int fiemap(CollectionHandle& c, const ghobject_t& oid, |
11fdf7f2 | 1707 | uint64_t offset, size_t len, bufferlist& bl) = 0; |
7c673cae | 1708 | virtual int fiemap(CollectionHandle& c, const ghobject_t& oid, |
11fdf7f2 | 1709 | uint64_t offset, size_t len, map<uint64_t, uint64_t>& destmap) = 0; |
7c673cae FG |
1710 | |
1711 | /** | |
1712 | * getattr -- get an xattr of an object | |
1713 | * | |
1714 | * @param c collection handle for object | |
1715 | * @param oid oid of object | |
1716 | * @param name name of attr to read (C string) | |
1717 | * @param value bufferptr in which to put the output result. | |
1718 | * @returns 0 on success, negative error code on failure. | |
1719 | */ | |
7c673cae | 1720 | virtual int getattr(CollectionHandle &c, const ghobject_t& oid, |
11fdf7f2 | 1721 | const char *name, bufferptr& value) = 0; |
7c673cae FG |
1722 | |
1723 | /** | |
1724 | * getattr -- get an xattr of an object (string name / bufferlist overload) | |
1725 | * | |
1726 | * Non-virtual wrapper: forwards to the (const char *, bufferptr&) overload. | |
1726 | * | |
1726 | * @param c collection handle for object | |
1727 | * @param oid oid of object | |
1728 | * @param name name of attr to read | |
1729 | * @param value bufferlist the fetched bufferptr is pushed onto; existing contents are kept. | |
1730 | * @returns the forwarded call's result (0 on success, negative error code on failure). | |
1731 | */ | |
7c673cae FG |
1732 | int getattr( |
1733 | CollectionHandle &c, const ghobject_t& oid, | |
1734 | const string& name, bufferlist& value) { | |
1735 | bufferptr bp; | |
1736 | int r = getattr(c, oid, name.c_str(), bp); | |
1737 | value.push_back(bp); /* done unconditionally, r is not checked first */ | |
1738 | return r; | |
1739 | } | |
1740 | ||
1741 | /** | |
1742 | * getattrs -- get all of the xattrs of an object | |
1743 | * | |
1744 | * @param c collection handle for object | |
1745 | * @param oid oid of object | |
1746 | * @param aset map (attr name to bufferptr) in which to put the output result. | |
1747 | * @returns 0 on success, negative error code on failure. | |
1748 | */ | |
7c673cae | 1749 | virtual int getattrs(CollectionHandle &c, const ghobject_t& oid, |
11fdf7f2 | 1750 | map<string,bufferptr>& aset) = 0; |
7c673cae FG |
1751 | |
1752 | /** | |
1753 | * getattrs -- get all of the xattrs of an object (bufferlist overload) | |
1754 | * | |
1754 | * Non-virtual wrapper: forwards to the map<string,bufferptr> overload and | |
1754 | * copies every returned entry into aset. | |
1754 | * | |
1755 | * @param c collection handle for object | |
1756 | * @param oid oid of object | |
1757 | * @param aset map whose aset[name] bufferlists get each value appended; aset is not cleared first. | |
1758 | * @returns the forwarded call's result (0 on success, negative error code on failure). | |
1759 | */ | |
7c673cae FG |
1760 | int getattrs(CollectionHandle &c, const ghobject_t& oid, |
1761 | map<string,bufferlist>& aset) { | |
1762 | map<string,bufferptr> bmap; | |
1763 | int r = getattrs(c, oid, bmap); | |
1764 | for (map<string,bufferptr>::iterator i = bmap.begin(); /* copy runs whatever r is */ | |
1765 | i != bmap.end(); | |
1766 | ++i) { | |
1767 | aset[i->first].append(i->second); | |
1768 | } | |
1769 | return r; | |
1770 | } | |
1771 | ||
1772 | ||
1773 | // collections | |
1774 | ||
1775 | /** | |
1776 | * list_collections -- get all of the collections known to this ObjectStore | |
1777 | * | |
1778 | * @param ls [out] vector receiving the list of the collections in sorted order. | |
1779 | * @returns 0 on success, negative error code on failure. | |
1780 | */ | |
1781 | virtual int list_collections(vector<coll_t>& ls) = 0; | |
1782 | ||
1783 | /** | |
1784 | * does a collection exist? | |
1785 | * | |
1786 | * @param c collection (identified by coll_t, not by a handle) | |
1787 | * @returns true if it exists, false otherwise | |
1788 | */ | |
1789 | virtual bool collection_exists(const coll_t& c) = 0; | |
1790 | ||
1791 | /** | |
1792 | * is a collection empty? | |
1793 | * | |
1794 | * @param c collection handle | |
1795 | * @param empty [out] true if the specified collection is empty, false otherwise | |
1796 | * @returns 0 on success, negative error code on failure. | |
1797 | */ | |
11fdf7f2 | 1798 | virtual int collection_empty(CollectionHandle& c, bool *empty) = 0; |
7c673cae FG |
1799 | |
1800 | /** | |
1801 | * return the number of significant bits of the coll_t::pgid for collection c. | |
1802 | * | |
1803 | * This should return what the last create_collection or split_collection | |
1804 | * set. A legacy backend may return -EAGAIN if the value is unavailable | |
1805 | * (because we upgraded from an older version, e.g., FileStore). | |
1806 | */ | |
11fdf7f2 | 1807 | virtual int collection_bits(CollectionHandle& c) = 0; |
7c673cae FG |
1808 | |
1809 | ||
1810 | /** | |
1811 | * list the contents of a collection that fall in the range [start, end), returning no more than max results | |
1812 | * | |
1813 | * @param c collection handle | |
1814 | * @param start list objects that sort >= this value | |
1815 | * @param end list objects that sort < this value | |
1816 | * @param max return no more than this many results | |
1818 | * @param ls [out] result | |
1819 | * @param next [out] next item sorts >= this value | |
1820 | * @return zero on success, or negative error | |
1821 | */ | |
7c673cae FG |
1822 | virtual int collection_list(CollectionHandle &c, |
1823 | const ghobject_t& start, const ghobject_t& end, | |
1824 | int max, | |
11fdf7f2 | 1825 | vector<ghobject_t> *ls, ghobject_t *next) = 0; |
7c673cae FG |
1826 | |
1827 | ||
1828 | /// OMAP | |
1829 | /// Get omap contents (header plus key to value map) | |
7c673cae FG |
1830 | virtual int omap_get( |
1831 | CollectionHandle &c, ///< [in] Collection containing oid | |
1832 | const ghobject_t &oid, ///< [in] Object containing omap | |
1833 | bufferlist *header, ///< [out] omap header | |
1834 | map<string, bufferlist> *out ///< [out] Key to value map | |
11fdf7f2 | 1835 | ) = 0; |
7c673cae FG |
1836 | |
1837 | /// Get omap header of oid into *header | |
7c673cae FG |
1838 | virtual int omap_get_header( |
1839 | CollectionHandle &c, ///< [in] Collection containing oid | |
1840 | const ghobject_t &oid, ///< [in] Object containing omap | |
1841 | bufferlist *header, ///< [out] omap header | |
1842 | bool allow_eio = false ///< [in] don't assert on eio (defaults to false) | |
11fdf7f2 | 1843 | ) = 0; |
7c673cae FG |
1844 | |
1845 | /// Get keys defined on oid into *keys | |
7c673cae FG |
1846 | virtual int omap_get_keys( |
1847 | CollectionHandle &c, ///< [in] Collection containing oid | |
1848 | const ghobject_t &oid, ///< [in] Object containing omap | |
1849 | set<string> *keys ///< [out] Keys defined on oid | |
11fdf7f2 | 1850 | ) = 0; |
7c673cae FG |
1851 | |
1852 | /// Get the values for the requested keys of oid | |
7c673cae FG |
1853 | virtual int omap_get_values( |
1854 | CollectionHandle &c, ///< [in] Collection containing oid | |
1855 | const ghobject_t &oid, ///< [in] Object containing omap | |
1856 | const set<string> &keys, ///< [in] Keys to get | |
1857 | map<string, bufferlist> *out ///< [out] Returned keys and values | |
11fdf7f2 | 1858 | ) = 0; |
7c673cae FG |
1859 | |
1860 | /// Filters keys into out: out receives the subset of keys which are defined on oid | |
7c673cae FG |
1861 | virtual int omap_check_keys( |
1862 | CollectionHandle &c, ///< [in] Collection containing oid | |
1863 | const ghobject_t &oid, ///< [in] Object containing omap | |
1864 | const set<string> &keys, ///< [in] Keys to check | |
1865 | set<string> *out ///< [out] Subset of keys defined on oid | |
11fdf7f2 | 1866 | ) = 0; |
7c673cae FG |
1867 | |
1868 | /** | |
1869 | * Returns an object map iterator for object oid in collection c | |
1870 | * | |
1871 | * Warning! The returned iterator is an implicit lock on filestore | |
1872 | * operations in c. Do not use filestore methods on c while the returned | |
1873 | * iterator is live. (Filling in a transaction is no problem). | |
1874 | * | |
1875 | * @return iterator, null on error | |
1876 | */ | |
7c673cae FG |
1877 | virtual ObjectMap::ObjectMapIterator get_omap_iterator( |
1878 | CollectionHandle &c, ///< [in] collection | |
1879 | const ghobject_t &oid ///< [in] object | |
11fdf7f2 | 1880 | ) = 0; |
7c673cae FG |
1881 | |
1882 | virtual int flush_journal() { return -EOPNOTSUPP; } /* default: returns -EOPNOTSUPP unless overridden */ | |
1883 | ||
1884 | virtual int dump_journal(ostream& out) { return -EOPNOTSUPP; } /* default: returns -EOPNOTSUPP and leaves out untouched */ | |
1885 | ||
1886 | virtual int snapshot(const string& name) { return -EOPNOTSUPP; } /* default: returns -EOPNOTSUPP unless overridden */ | |
1887 | ||
1888 | /** | |
1889 | * Set and get internal fsid for this instance. No external data is modified. Both accessors are pure virtual. | |
1890 | */ | |
1891 | virtual void set_fsid(uuid_d u) = 0; | |
1892 | virtual uuid_d get_fsid() = 0; | |
1893 | ||
1894 | /** | |
1895 | * Estimates the additional disk space used by the specified number of objects, caused by file allocation granularity and the metadata store | |
1896 | * @param num_objects total (including whiteouts) object count to measure used space for. | |
1897 | */ | |
1898 | virtual uint64_t estimate_objects_overhead(uint64_t num_objects) = 0; | |
1899 | ||
1900 | ||
1901 | // DEBUG: error-injection hooks; both default bodies are empty | |
1902 | virtual void inject_data_error(const ghobject_t &oid) {} | |
1903 | virtual void inject_mdata_error(const ghobject_t &oid) {} | |
224ce89b WB |
1904 | |
1905 | virtual void compact() {} /* default: empty body, does nothing unless overridden */ | |
28e407b8 AA |
1906 | virtual bool has_builtin_csum() const { /* default answer is false; an implementation may override */ |
1907 | return false; | |
1908 | } | |
7c673cae FG |
1909 | }; |
1910 | WRITE_CLASS_ENCODER(ObjectStore::Transaction) /* macro is defined outside this section; presumably declares the encode/decode helpers for the type -- TODO confirm */ | |
1911 | WRITE_CLASS_ENCODER(ObjectStore::Transaction::TransactionData) /* same macro, applied to the nested TransactionData type */ | |
1912 | ||
7c673cae FG |
1913 | ostream& operator<<(ostream& out, const ObjectStore::Transaction& tx); /* stream-output operator for a Transaction; declaration only, no body here */ |
1914 | ||
1915 | #endif |