1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
14 #ifndef CEPH_OBJECTSTORE_H
15 #define CEPH_OBJECTSTORE_H
17 #include "include/Context.h"
18 #include "include/buffer.h"
19 #include "include/types.h"
20 #include "osd/osd_types.h"
21 #include "common/TrackedOp.h"
22 #include "common/WorkQueue.h"
23 #include "ObjectMap.h"
30 #if defined(DARWIN) || defined(__FreeBSD__) || defined(__sun)
31 #include <sys/statvfs.h>
33 #include <sys/vfs.h> /* or <sys/statfs.h> */
36 #define OPS_PER_PTR 32
49 * low-level interface to the local OSD file system
55 static inline void encode(const map
<string
,bufferptr
> *attrset
, bufferlist
&bl
) {
56 ::encode(*attrset
, bl
);
59 // this isn't the best place for these, but...
60 void decode_str_str_map_to_bl(bufferlist::iterator
& p
, bufferlist
*out
);
61 void decode_str_set_to_bl(bufferlist::iterator
& p
, bufferlist
*out
);
64 typedef uint32_t osflagbits_t
;
65 const int SKIP_JOURNAL_REPLAY
= 1 << 0;
66 const int SKIP_MOUNT_OMAP
= 1 << 1;
75 * create - create an ObjectStore instance.
77 * This is invoked once at initialization time.
79 * @param type type of store. This is a string from the configuration file.
80 * @param data path (or other descriptor) for data
81 * @param journal path (or other descriptor) for journal (optional)
82 * @param flags which filestores should check if applicable
84 static ObjectStore
*create(CephContext
*cct
,
87 const string
& journal
,
88 osflagbits_t flags
= 0);
91 * probe a block device to learn the uuid of the owning OSD
94 * @param path path to device
95 * @param fsid [out] osd uuid
97 static int probe_block_device_fsid(
103 * Fetch Object Store statistics.
105 * Currently only latency of write and apply times are measured.
107 * This appears to be called with nothing locked.
109 virtual objectstore_perf_stat_t
get_cur_stats() = 0;
112 * Fetch Object Store performance counters.
115 * This appears to be called with nothing locked.
117 virtual const PerfCounters
* get_perf_counters() const = 0;
120 * a sequencer orders transactions
122 * Any transactions queued under a given sequencer will be applied in
123 * sequence. Transactions queued under different sequencers may run
126 * Clients of ObjectStore create and maintain their own Sequencer objects.
127 * When a list of transactions is queued the caller specifies a Sequencer to be used.
132 * ABC for Sequencer implementation, private to the ObjectStore derived class.
133 * created in ...::queue_transaction(s)
135 struct Sequencer_impl
: public RefCountedObject
{
138 // block until any previous transactions are visible. specifically,
139 // collection_list and collection_empty need to reflect prior operations.
140 virtual void flush() = 0;
142 // called when we are done with the impl. the impl may have a different
143 // (longer) lifecycle than the Sequencer.
144 virtual void discard() {}
149 * There are two cases:
150 * 1) sequencer is currently idle: the method returns true. c is
152 * 2) sequencer is not idle: the method returns false and c is
153 * called asynchronously with a value of 0 once all transactions
154 * queued on this sequencer prior to the call have been applied
157 virtual bool flush_commit(
158 Context
*c
///< [in] context to call upon flush/commit
159 ) = 0; ///< @return true if idle, false otherwise
161 Sequencer_impl(CephContext
* cct
) : RefCountedObject(NULL
, 0), cct(cct
) {}
162 ~Sequencer_impl() override
{}
164 typedef boost::intrusive_ptr
<Sequencer_impl
> Sequencer_implRef
;
167 * External (opaque) sequencer implementation
174 explicit Sequencer(string n
)
175 : name(n
), shard_hint(spg_t()), p(NULL
) {
179 p
->discard(); // tell impl we are done with it
182 /// return a unique string identifier for this sequencer
183 const string
& get_name() const {
186 /// wait for any queued transactions on this sequencer to apply
192 /// @see Sequencer_impl::flush_commit()
193 bool flush_commit(Context
*c
) {
197 return p
->flush_commit(c
);
202 struct CollectionImpl
: public RefCountedObject
{
203 virtual const coll_t
&get_cid() = 0;
204 CollectionImpl() : RefCountedObject(NULL
, 0) {}
206 typedef boost::intrusive_ptr
<CollectionImpl
> CollectionHandle
;
208 struct CompatCollectionHandle
: public CollectionImpl
{
210 explicit CompatCollectionHandle(coll_t c
) : cid(c
) {}
211 const coll_t
&get_cid() override
{
216 /*********************************
218 * Object Contents and semantics
220 * All ObjectStore objects are identified as a named object
221 * (ghobject_t and hobject_t) in a named collection (coll_t).
222 * ObjectStore operations support the creation, mutation, deletion
223 * and enumeration of objects within a collection. Enumeration is
224 * in sorted key order (where keys are sorted by hash). Object names
225 * are globally unique.
227 * Each object has four distinct parts: byte data, xattrs, omap_header
230 * The data portion of an object is conceptually equivalent to a
231 * file in a file system. Random and Partial access for both read
232 * and write operations is required. The ability to have a sparse
233 * implementation of the data portion of an object is beneficial for
234 * some workloads, but not required. There is a system-wide limit on
235 * the maximum size of an object, which is typically around 100 MB.
237 * Xattrs are equivalent to the extended attributes of file
238 * systems. Xattrs are a set of key/value pairs. Sub-value access
239 * is not required. It is possible to enumerate the set of xattrs in
240 * key order. At the implementation level, xattrs are used
241 * exclusively internal to Ceph and the implementer can expect the
242 * total size of all of the xattrs on an object to be relatively
243 * small, i.e., less than 64KB. Much of Ceph assumes that accessing
244 * xattrs on temporally adjacent object accesses (recent past or
245 * near future) is inexpensive.
247 * omap_header is a single blob of data. It can be read or written
250 * Omap entries are conceptually the same as xattrs
251 * but in a different address space. In other words, you can have
252 * the same key as an xattr and an omap entry and they have distinct
253 * values. Enumeration of xattrs doesn't include omap entries and
254 * vice versa. The size and access characteristics of omap entries
255 * are very different from xattrs. In particular, the value portion
256 * of an omap entry can be quite large (MBs). More importantly, the
257 * interface must support efficient range queries on omap entries even
258 * when there is a large number of entries.
260 *********************************/
262 /*******************************
266 * A collection is simply a grouping of objects. Collections have
267 * names (coll_t) and can be enumerated in order. Like an
268 * individual object, a collection also has a set of xattrs.
274 /*********************************
277 * A Transaction represents a sequence of primitive mutation
280 * Three events in the life of a Transaction result in
281 * callbacks. Any Transaction can contain any number of callback
282 * objects (Context) for any combination of the three classes of
285 * on_applied_sync, on_applied, and on_commit.
287 * The "on_applied" and "on_applied_sync" callbacks are invoked when
288 * the modifications requested by the Transaction are visible to
289 * subsequent ObjectStore operations, i.e., the results are
290 * readable. The only conceptual difference between on_applied and
291 * on_applied_sync is the specific thread and locking environment in
292 * which the callbacks operate. "on_applied_sync" is called
293 * directly by an ObjectStore execution thread. It is expected to
294 * execute quickly and must not acquire any locks of the calling
295 * environment. Conversely, "on_applied" is called from the separate
296 * Finisher thread, meaning that it can contend for calling
297 * environment locks. NB, on_applied and on_applied_sync are
298 * sometimes called on_readable and on_readable_sync.
300 * The "on_commit" callback is also called from the Finisher thread
301 * and indicates that all of the mutations have been durably
302 * committed to stable storage (i.e., are now software/hardware
305 * At the implementation level, each mutation primitive (and its
306 * associated data) can be serialized to a single buffer. That
307 * serialization, however, does not copy any data, but (using the
308 * bufferlist library) will reference the original buffers. This
309 * implies that the buffer that contains the data being submitted
310 * must remain stable until the on_commit callback completes. In
311 * practice, bufferlist handles all of this for you and this
312 * subtlety is only relevant if you are referencing an existing
313 * buffer via buffer::raw_static.
315 * Some implementations of ObjectStore choose to implement their own
316 * form of journaling that uses the serialized form of a
317 * Transaction. This requires that the encode/decode logic properly
318 * version itself and handle version upgrades that might change the
319 * format of the encoded Transaction. This has already happened a
320 * couple of times and the Transaction object contains some helper
321 * variables that aid in this legacy decoding:
323 * sobject_encoding detects an older/simpler version of oid
324 * present in pre-bobtail versions of ceph. use_pool_override
325 * also detects a situation where the pool of an oid can be
326 * overridden for legacy operations/buffers. For non-legacy
327 * implementation of ObjectStore, neither of these fields is
331 * TRANSACTION ISOLATION
333 * Except as noted below, isolation is the responsibility of the
334 * caller. In other words, if any storage element (storage element
335 * == any of the four portions of an object as described above) is
336 * altered by a transaction (including deletion), the caller
337 * promises not to attempt to read that element while the
338 * transaction is pending (here pending means from the time of
339 * issuance until the "on_applied_sync" callback has been
340 * received). Violations of isolation need not be detected by
341 * ObjectStore and there is no corresponding error mechanism for
342 * reporting an isolation violation (crashing would be the
343 * appropriate way to report an isolation violation if detected).
345 * Enumeration operations may violate transaction isolation as
346 * described above when a storage element is being created or
347 * deleted as part of a transaction. In this case, ObjectStore is
348 * allowed to consider the enumeration operation to either precede
349 * or follow the violating transaction element. In other words, the
350 * presence/absence of the mutated element in the enumeration is
351 * entirely at the discretion of ObjectStore. The arbitrary ordering
352 * applies independently to each transaction element. For example,
353 * if a transaction contains two mutating elements "create A" and
354 * "delete B", and an enumeration operation is performed while this
355 * transaction is pending, it is permissible for ObjectStore to
356 * report any of the four possible combinations of the existence of
364 OP_TOUCH
= 9, // cid, oid
365 OP_WRITE
= 10, // cid, oid, offset, len, bl
366 OP_ZERO
= 11, // cid, oid, offset, len
367 OP_TRUNCATE
= 12, // cid, oid, len
368 OP_REMOVE
= 13, // cid, oid
369 OP_SETATTR
= 14, // cid, oid, attrname, bl
370 OP_SETATTRS
= 15, // cid, oid, attrset
371 OP_RMATTR
= 16, // cid, oid, attrname
372 OP_CLONE
= 17, // cid, oid, newoid
373 OP_CLONERANGE
= 18, // cid, oid, newoid, offset, len
374 OP_CLONERANGE2
= 30, // cid, oid, newoid, srcoff, len, dstoff
376 OP_TRIMCACHE
= 19, // cid, oid, offset, len **DEPRECATED**
378 OP_MKCOLL
= 20, // cid
379 OP_RMCOLL
= 21, // cid
380 OP_COLL_ADD
= 22, // cid, oldcid, oid
381 OP_COLL_REMOVE
= 23, // cid, oid
382 OP_COLL_SETATTR
= 24, // cid, attrname, bl
383 OP_COLL_RMATTR
= 25, // cid, attrname
384 OP_COLL_SETATTRS
= 26, // cid, attrset
385 OP_COLL_MOVE
= 8, // newcid, oldcid, oid
387 OP_STARTSYNC
= 27, // start a sync
389 OP_RMATTRS
= 28, // cid, oid
390 OP_COLL_RENAME
= 29, // cid, newcid
392 OP_OMAP_CLEAR
= 31, // cid
393 OP_OMAP_SETKEYS
= 32, // cid, attrset
394 OP_OMAP_RMKEYS
= 33, // cid, keyset
395 OP_OMAP_SETHEADER
= 34, // cid, header
396 OP_SPLIT_COLLECTION
= 35, // cid, bits, destination
397 OP_SPLIT_COLLECTION2
= 36, /* cid, bits, destination
398 doesn't create the destination */
399 OP_OMAP_RMKEYRANGE
= 37, // cid, oid, firstkey, lastkey
400 OP_COLL_MOVE_RENAME
= 38, // oldcid, oldoid, newcid, newoid
402 OP_SETALLOCHINT
= 39, // cid, oid, object_size, write_size
403 OP_COLL_HINT
= 40, // cid, type, bl
405 OP_TRY_RENAME
= 41, // oldcid, oldoid, newoid
407 OP_COLL_SET_BITS
= 42, // cid, bits
410 // Transaction hint type
412 COLL_HINT_EXPECTED_NUM_OBJECTS
= 1,
422 __le32 dest_oid
; //OP_CLONE, OP_CLONERANGE
423 __le64 dest_off
; //OP_CLONERANGE
426 __le32 hint_type
; //OP_COLL_HINT
429 __le32 alloc_hint_flags
; //OP_SETALLOCHINT
432 __le64 expected_object_size
; //OP_SETALLOCHINT
433 __le64 expected_write_size
; //OP_SETALLOCHINT
434 __le32 split_bits
; //OP_SPLIT_COLLECTION2,OP_COLL_SET_BITS,
436 __le32 split_rem
; //OP_SPLIT_COLLECTION2
437 } __attribute__ ((packed
)) ;
439 struct TransactionData
{
441 __le32 largest_data_len
;
442 __le32 largest_data_off
;
443 __le32 largest_data_off_in_data_bl
;
444 __le32 fadvise_flags
;
446 TransactionData() noexcept
:
450 largest_data_off_in_data_bl(0),
453 // override default move operations to reset default values
454 TransactionData(TransactionData
&& other
) noexcept
:
456 largest_data_len(other
.largest_data_len
),
457 largest_data_off(other
.largest_data_off
),
458 largest_data_off_in_data_bl(other
.largest_data_off_in_data_bl
),
459 fadvise_flags(other
.fadvise_flags
) {
461 other
.largest_data_len
= 0;
462 other
.largest_data_off
= 0;
463 other
.largest_data_off_in_data_bl
= 0;
464 other
.fadvise_flags
= 0;
466 TransactionData
& operator=(TransactionData
&& other
) noexcept
{
468 largest_data_len
= other
.largest_data_len
;
469 largest_data_off
= other
.largest_data_off
;
470 largest_data_off_in_data_bl
= other
.largest_data_off_in_data_bl
;
471 fadvise_flags
= other
.fadvise_flags
;
473 other
.largest_data_len
= 0;
474 other
.largest_data_off
= 0;
475 other
.largest_data_off_in_data_bl
= 0;
476 other
.fadvise_flags
= 0;
480 TransactionData(const TransactionData
& other
) = default;
481 TransactionData
& operator=(const TransactionData
& other
) = default;
483 void encode(bufferlist
& bl
) const {
484 bl
.append((char*)this, sizeof(TransactionData
));
486 void decode(bufferlist::iterator
&bl
) {
487 bl
.copy(sizeof(TransactionData
), (char*)this);
489 } __attribute__ ((packed
)) ;
492 TransactionData data
;
494 void *osr
{nullptr}; // NULL on replay
496 map
<coll_t
, __le32
> coll_index
;
497 map
<ghobject_t
, __le32
> object_index
;
500 __le32 object_id
{0};
507 list
<Context
*> on_applied
;
508 list
<Context
*> on_commit
;
509 list
<Context
*> on_applied_sync
;
512 Transaction() = default;
514 explicit Transaction(bufferlist::iterator
&dp
) {
517 explicit Transaction(bufferlist
&nbl
) {
518 bufferlist::iterator dp
= nbl
.begin();
522 // override default move operations to reset default values
523 Transaction(Transaction
&& other
) noexcept
:
524 data(std::move(other
.data
)),
526 coll_index(std::move(other
.coll_index
)),
527 object_index(std::move(other
.object_index
)),
528 coll_id(other
.coll_id
),
529 object_id(other
.object_id
),
530 data_bl(std::move(other
.data_bl
)),
531 op_bl(std::move(other
.op_bl
)),
532 op_ptr(std::move(other
.op_ptr
)),
533 on_applied(std::move(other
.on_applied
)),
534 on_commit(std::move(other
.on_commit
)),
535 on_applied_sync(std::move(other
.on_applied_sync
)) {
541 Transaction
& operator=(Transaction
&& other
) noexcept
{
542 data
= std::move(other
.data
);
544 coll_index
= std::move(other
.coll_index
);
545 object_index
= std::move(other
.object_index
);
546 coll_id
= other
.coll_id
;
547 object_id
= other
.object_id
;
548 data_bl
= std::move(other
.data_bl
);
549 op_bl
= std::move(other
.op_bl
);
550 op_ptr
= std::move(other
.op_ptr
);
551 on_applied
= std::move(other
.on_applied
);
552 on_commit
= std::move(other
.on_commit
);
553 on_applied_sync
= std::move(other
.on_applied_sync
);
560 Transaction(const Transaction
& other
) = default;
561 Transaction
& operator=(const Transaction
& other
) = default;
563 /* Operations on callback contexts */
564 void register_on_applied(Context
*c
) {
566 on_applied
.push_back(c
);
568 void register_on_commit(Context
*c
) {
570 on_commit
.push_back(c
);
572 void register_on_applied_sync(Context
*c
) {
574 on_applied_sync
.push_back(c
);
576 void register_on_complete(Context
*c
) {
578 RunOnDeleteRef
_complete (std::make_shared
<RunOnDelete
>(c
));
579 register_on_applied(new ContainerContext
<RunOnDeleteRef
>(_complete
));
580 register_on_commit(new ContainerContext
<RunOnDeleteRef
>(_complete
));
583 static void collect_contexts(
584 vector
<Transaction
>& t
,
585 Context
**out_on_applied
,
586 Context
**out_on_commit
,
587 Context
**out_on_applied_sync
) {
588 assert(out_on_applied
);
589 assert(out_on_commit
);
590 assert(out_on_applied_sync
);
591 list
<Context
*> on_applied
, on_commit
, on_applied_sync
;
592 for (vector
<Transaction
>::iterator i
= t
.begin();
595 on_applied
.splice(on_applied
.end(), (*i
).on_applied
);
596 on_commit
.splice(on_commit
.end(), (*i
).on_commit
);
597 on_applied_sync
.splice(on_applied_sync
.end(), (*i
).on_applied_sync
);
599 *out_on_applied
= C_Contexts::list_to_context(on_applied
);
600 *out_on_commit
= C_Contexts::list_to_context(on_commit
);
601 *out_on_applied_sync
= C_Contexts::list_to_context(on_applied_sync
);
604 Context
*get_on_applied() {
605 return C_Contexts::list_to_context(on_applied
);
607 Context
*get_on_commit() {
608 return C_Contexts::list_to_context(on_commit
);
610 Context
*get_on_applied_sync() {
611 return C_Contexts::list_to_context(on_applied_sync
);
614 void set_fadvise_flags(uint32_t flags
) {
615 data
.fadvise_flags
= flags
;
617 void set_fadvise_flag(uint32_t flag
) {
618 data
.fadvise_flags
= data
.fadvise_flags
| flag
;
620 uint32_t get_fadvise_flags() { return data
.fadvise_flags
; }
622 void swap(Transaction
& other
) noexcept
{
623 std::swap(data
, other
.data
);
624 std::swap(on_applied
, other
.on_applied
);
625 std::swap(on_commit
, other
.on_commit
);
626 std::swap(on_applied_sync
, other
.on_applied_sync
);
628 std::swap(coll_index
, other
.coll_index
);
629 std::swap(object_index
, other
.object_index
);
630 std::swap(coll_id
, other
.coll_id
);
631 std::swap(object_id
, other
.object_id
);
632 op_bl
.swap(other
.op_bl
);
633 data_bl
.swap(other
.data_bl
);
636 void _update_op(Op
* op
,
638 vector
<__le32
> &om
) {
653 case OP_OMAP_SETKEYS
:
655 case OP_OMAP_RMKEYRANGE
:
656 case OP_OMAP_SETHEADER
:
660 case OP_SETALLOCHINT
:
661 assert(op
->cid
< cm
.size());
662 assert(op
->oid
< om
.size());
663 op
->cid
= cm
[op
->cid
];
664 op
->oid
= om
[op
->oid
];
669 assert(op
->cid
< cm
.size());
670 assert(op
->oid
< om
.size());
671 assert(op
->dest_oid
< om
.size());
672 op
->cid
= cm
[op
->cid
];
673 op
->oid
= om
[op
->oid
];
674 op
->dest_oid
= om
[op
->dest_oid
];
679 case OP_COLL_SETATTR
:
681 case OP_COLL_SETATTRS
:
683 case OP_COLL_SET_BITS
:
684 assert(op
->cid
< cm
.size());
685 op
->cid
= cm
[op
->cid
];
689 assert(op
->cid
< cm
.size());
690 assert(op
->oid
< om
.size());
691 assert(op
->dest_cid
< om
.size());
692 op
->cid
= cm
[op
->cid
];
693 op
->dest_cid
= cm
[op
->dest_cid
];
694 op
->oid
= om
[op
->oid
];
697 case OP_COLL_MOVE_RENAME
:
698 assert(op
->cid
< cm
.size());
699 assert(op
->oid
< om
.size());
700 assert(op
->dest_cid
< cm
.size());
701 assert(op
->dest_oid
< om
.size());
702 op
->cid
= cm
[op
->cid
];
703 op
->oid
= om
[op
->oid
];
704 op
->dest_cid
= cm
[op
->dest_cid
];
705 op
->dest_oid
= om
[op
->dest_oid
];
709 assert(op
->cid
< cm
.size());
710 assert(op
->oid
< om
.size());
711 assert(op
->dest_oid
< om
.size());
712 op
->cid
= cm
[op
->cid
];
713 op
->oid
= om
[op
->oid
];
714 op
->dest_oid
= om
[op
->dest_oid
];
717 case OP_SPLIT_COLLECTION2
:
718 assert(op
->cid
< cm
.size());
719 assert(op
->dest_cid
< cm
.size());
720 op
->cid
= cm
[op
->cid
];
721 op
->dest_cid
= cm
[op
->dest_cid
];
725 assert(0 == "Unkown OP");
731 vector
<__le32
> &om
) {
733 list
<bufferptr
> list
= bl
.buffers();
734 std::list
<bufferptr
>::iterator p
;
736 for(p
= list
.begin(); p
!= list
.end(); ++p
) {
737 assert(p
->length() % sizeof(Op
) == 0);
739 char* raw_p
= p
->c_str();
740 char* raw_end
= raw_p
+ p
->length();
741 while (raw_p
< raw_end
) {
742 _update_op(reinterpret_cast<Op
*>(raw_p
), cm
, om
);
747 /// Append the operations of the parameter to this Transaction. Those operations are removed from the parameter Transaction
748 void append(Transaction
& other
) {
750 data
.ops
+= other
.data
.ops
;
751 if (other
.data
.largest_data_len
> data
.largest_data_len
) {
752 data
.largest_data_len
= other
.data
.largest_data_len
;
753 data
.largest_data_off
= other
.data
.largest_data_off
;
754 data
.largest_data_off_in_data_bl
= data_bl
.length() + other
.data
.largest_data_off_in_data_bl
;
756 data
.fadvise_flags
|= other
.data
.fadvise_flags
;
757 on_applied
.splice(on_applied
.end(), other
.on_applied
);
758 on_commit
.splice(on_commit
.end(), other
.on_commit
);
759 on_applied_sync
.splice(on_applied_sync
.end(), other
.on_applied_sync
);
761 //append coll_index & object_index
762 vector
<__le32
> cm(other
.coll_index
.size());
763 map
<coll_t
, __le32
>::iterator coll_index_p
;
764 for (coll_index_p
= other
.coll_index
.begin();
765 coll_index_p
!= other
.coll_index
.end();
767 cm
[coll_index_p
->second
] = _get_coll_id(coll_index_p
->first
);
770 vector
<__le32
> om(other
.object_index
.size());
771 map
<ghobject_t
, __le32
>::iterator object_index_p
;
772 for (object_index_p
= other
.object_index
.begin();
773 object_index_p
!= other
.object_index
.end();
775 om
[object_index_p
->second
] = _get_object_id(object_index_p
->first
);
778 //the other.op_bl SHOULD NOT be changed during append operation,
779 //we use additional bufferlist to avoid this problem
780 bufferptr
other_op_bl_ptr(other
.op_bl
.length());
781 other
.op_bl
.copy(0, other
.op_bl
.length(), other_op_bl_ptr
.c_str());
782 bufferlist other_op_bl
;
783 other_op_bl
.append(other_op_bl_ptr
);
785 //update other_op_bl with cm & om
786 //When the other is appended to current transaction, all coll_index and
787 //object_index in other.op_buffer should be updated by new index of the
788 //combined transaction
789 _update_op_bl(other_op_bl
, cm
, om
);
792 op_bl
.append(other_op_bl
);
794 data_bl
.append(other
.data_bl
);
797 /** Inquires about the Transaction as a whole. */
799 /// How big is the encoded Transaction buffer?
800 uint64_t get_encoded_bytes() {
801 //layout: data_bl + op_bl + coll_index + object_index + data
803 // coll_index size, object_index size and sizeof(transaction_data)
804 // all here, so they may be computed at compile-time
805 size_t final_size
= sizeof(__u32
) * 2 + sizeof(data
);
807 // coll_index second and object_index second
808 final_size
+= (coll_index
.size() + object_index
.size()) * sizeof(__le32
);
811 for (auto p
= coll_index
.begin(); p
!= coll_index
.end(); ++p
) {
812 final_size
+= p
->first
.encoded_size();
815 // object_index first
816 for (auto p
= object_index
.begin(); p
!= object_index
.end(); ++p
) {
817 final_size
+= p
->first
.encoded_size();
820 return data_bl
.length() +
825 /// Retain old version for regression testing purposes
826 uint64_t get_encoded_bytes_test() {
827 //layout: data_bl + op_bl + coll_index + object_index + data
829 ::encode(coll_index
, bl
);
830 ::encode(object_index
, bl
);
832 return data_bl
.length() +
838 uint64_t get_num_bytes() {
839 return get_encoded_bytes();
841 /// Size of largest data buffer to the "write" operation encountered so far
842 uint32_t get_data_length() {
843 return data
.largest_data_len
;
845 /// offset within the encoded buffer to the start of the largest data buffer that's encoded
846 uint32_t get_data_offset() {
847 if (data
.largest_data_off_in_data_bl
) {
848 return data
.largest_data_off_in_data_bl
+
849 sizeof(__u8
) + // encode struct_v
850 sizeof(__u8
) + // encode compat_v
851 sizeof(__u32
) + // encode len
852 sizeof(__u32
); // data_bl len
856 /// offset of buffer as aligned to destination within object.
857 int get_data_alignment() {
858 if (!data
.largest_data_len
)
860 return (0 - get_data_offset()) & ~CEPH_PAGE_MASK
;
862 /// Is the Transaction empty (no operations)
866 /// Number of operations in the transaction
871 void set_osr(void *s
) {
882 * Helper object to parse Transactions.
884 * ObjectStore instances use this object to step down the encoded
885 * buffer decoding operation codes and parameters as we go.
894 bufferlist::iterator data_bl_p
;
897 vector
<coll_t
> colls
;
898 vector
<ghobject_t
> objects
;
901 explicit iterator(Transaction
*t
)
903 data_bl_p(t
->data_bl
.begin()),
904 colls(t
->coll_index
.size()),
905 objects(t
->object_index
.size()) {
908 op_buffer_p
= t
->op_bl
.get_contiguous(0, t
->data
.ops
* sizeof(Op
));
910 map
<coll_t
, __le32
>::iterator coll_index_p
;
911 for (coll_index_p
= t
->coll_index
.begin();
912 coll_index_p
!= t
->coll_index
.end();
914 colls
[coll_index_p
->second
] = coll_index_p
->first
;
917 map
<ghobject_t
, __le32
>::iterator object_index_p
;
918 for (object_index_p
= t
->object_index
.begin();
919 object_index_p
!= t
->object_index
.end();
921 objects
[object_index_p
->second
] = object_index_p
->first
;
925 friend class Transaction
;
935 Op
* op
= reinterpret_cast<Op
*>(op_buffer_p
);
936 op_buffer_p
+= sizeof(Op
);
941 string
decode_string() {
943 ::decode(s
, data_bl_p
);
946 void decode_bp(bufferptr
& bp
) {
947 ::decode(bp
, data_bl_p
);
949 void decode_bl(bufferlist
& bl
) {
950 ::decode(bl
, data_bl_p
);
952 void decode_attrset(map
<string
,bufferptr
>& aset
) {
953 ::decode(aset
, data_bl_p
);
955 void decode_attrset(map
<string
,bufferlist
>& aset
) {
956 ::decode(aset
, data_bl_p
);
958 void decode_attrset_bl(bufferlist
*pbl
) {
959 decode_str_str_map_to_bl(data_bl_p
, pbl
);
961 void decode_keyset(set
<string
> &keys
){
962 ::decode(keys
, data_bl_p
);
964 void decode_keyset_bl(bufferlist
*pbl
){
965 decode_str_set_to_bl(data_bl_p
, pbl
);
968 const ghobject_t
&get_oid(__le32 oid_id
) {
969 assert(oid_id
< objects
.size());
970 return objects
[oid_id
];
972 const coll_t
&get_cid(__le32 cid_id
) {
973 assert(cid_id
< colls
.size());
974 return colls
[cid_id
];
976 uint32_t get_fadvise_flags() const {
977 return t
->get_fadvise_flags();
982 return iterator(this);
986 void _build_actions_from_tbl();
989 * Helper functions to encode the various mutation elements of a
990 * transaction. These are 1:1 with the operation codes (see
991 * enumeration above). These routines ensure that the
992 * encoder/creator of a transaction gets the right data in the
993 * right place. Sadly, there's no corresponding version nor any
994 * form of seat belts for the decoder.
997 if (op_ptr
.length() == 0 || op_ptr
.offset() >= op_ptr
.length()) {
998 op_ptr
= bufferptr(sizeof(Op
) * OPS_PER_PTR
);
1000 bufferptr
ptr(op_ptr
, 0, sizeof(Op
));
1003 op_ptr
.set_offset(op_ptr
.offset() + sizeof(Op
));
1005 char* p
= ptr
.c_str();
1006 memset(p
, 0, sizeof(Op
));
1007 return reinterpret_cast<Op
*>(p
);
1009 __le32
_get_coll_id(const coll_t
& coll
) {
1010 map
<coll_t
, __le32
>::iterator c
= coll_index
.find(coll
);
1011 if (c
!= coll_index
.end())
1014 __le32 index_id
= coll_id
++;
1015 coll_index
[coll
] = index_id
;
1018 __le32
_get_object_id(const ghobject_t
& oid
) {
1019 map
<ghobject_t
, __le32
>::iterator o
= object_index
.find(oid
);
1020 if (o
!= object_index
.end())
1023 __le32 index_id
= object_id
++;
1024 object_index
[oid
] = index_id
;
1029 /// Commence a global file system sync operation.
1031 Op
* _op
= _get_next_op();
1032 _op
->op
= OP_STARTSYNC
;
1037 Op
* _op
= _get_next_op();
1044 * Ensure the existence of an object in a collection. Create an
1045 * empty object if necessary
1047 void touch(const coll_t
& cid
, const ghobject_t
& oid
) {
1048 Op
* _op
= _get_next_op();
1050 _op
->cid
= _get_coll_id(cid
);
1051 _op
->oid
= _get_object_id(oid
);
1055 * Write data to an offset within an object. If the object is too
1056 * small, it is expanded as needed. It is possible to specify an
1057 * offset beyond the current end of an object and it will be
1058 * expanded as needed. Simple implementations of ObjectStore will
1059 * just zero the data between the old end of the object and the
1060 * newly provided data. More sophisticated implementations of
1061 * ObjectStore will omit the untouched data and store it as a
1062 * "hole" in the file.
1064 * Note that a 0-length write does not affect the size of the object.
1066 void write(const coll_t
& cid
, const ghobject_t
& oid
, uint64_t off
, uint64_t len
,
1067 const bufferlist
& write_data
, uint32_t flags
= 0) {
1068 uint32_t orig_len
= data_bl
.length();
1069 Op
* _op
= _get_next_op();
1071 _op
->cid
= _get_coll_id(cid
);
1072 _op
->oid
= _get_object_id(oid
);
1075 ::encode(write_data
, data_bl
);
1077 assert(len
== write_data
.length());
1078 data
.fadvise_flags
= data
.fadvise_flags
| flags
;
1079 if (write_data
.length() > data
.largest_data_len
) {
1080 data
.largest_data_len
= write_data
.length();
1081 data
.largest_data_off
= off
;
1082 data
.largest_data_off_in_data_bl
= orig_len
+ sizeof(__u32
); // we are about to
1087 * zero out the indicated byte range within an object. Some
1088 * ObjectStore instances may optimize this to release the
1089 * underlying storage space.
1091 * If the zero range extends beyond the end of the object, the object
1092 * size is extended, just as if we were writing a buffer full of zeros.
1093 * EXCEPT if the length is 0, in which case (just like a 0-length write)
1094 * we do not adjust the object size.
1096 void zero(const coll_t
& cid
, const ghobject_t
& oid
, uint64_t off
, uint64_t len
) {
1097 Op
* _op
= _get_next_op();
1099 _op
->cid
= _get_coll_id(cid
);
1100 _op
->oid
= _get_object_id(oid
);
1105 /// Discard all data in the object beyond the specified size.
1106 void truncate(const coll_t
& cid
, const ghobject_t
& oid
, uint64_t off
) {
1107 Op
* _op
= _get_next_op();
1108 _op
->op
= OP_TRUNCATE
;
1109 _op
->cid
= _get_coll_id(cid
);
1110 _op
->oid
= _get_object_id(oid
);
1114 /// Remove an object. All four parts of the object are removed.
1115 void remove(const coll_t
& cid
, const ghobject_t
& oid
) {
1116 Op
* _op
= _get_next_op();
1117 _op
->op
= OP_REMOVE
;
1118 _op
->cid
= _get_coll_id(cid
);
1119 _op
->oid
= _get_object_id(oid
);
1122 /// Set an xattr of an object
1123 void setattr(const coll_t
& cid
, const ghobject_t
& oid
, const char* name
, bufferlist
& val
) {
1125 setattr(cid
, oid
, n
, val
);
1127 /// Set an xattr of an object
1128 void setattr(const coll_t
& cid
, const ghobject_t
& oid
, const string
& s
, bufferlist
& val
) {
1129 Op
* _op
= _get_next_op();
1130 _op
->op
= OP_SETATTR
;
1131 _op
->cid
= _get_coll_id(cid
);
1132 _op
->oid
= _get_object_id(oid
);
1133 ::encode(s
, data_bl
);
1134 ::encode(val
, data_bl
);
1137 /// Set multiple xattrs of an object
1138 void setattrs(const coll_t
& cid
, const ghobject_t
& oid
, const map
<string
,bufferptr
>& attrset
) {
1139 Op
* _op
= _get_next_op();
1140 _op
->op
= OP_SETATTRS
;
1141 _op
->cid
= _get_coll_id(cid
);
1142 _op
->oid
= _get_object_id(oid
);
1143 ::encode(attrset
, data_bl
);
1146 /// Set multiple xattrs of an object
// bufferlist-valued variant: same op code (OP_SETATTRS) as the bufferptr
// overload; the `attrset` map is encoded into data_bl.
1147 void setattrs(const coll_t
& cid
, const ghobject_t
& oid
, const map
<string
,bufferlist
>& attrset
) {
1148 Op
* _op
= _get_next_op();
1149 _op
->op
= OP_SETATTRS
;
1150 _op
->cid
= _get_coll_id(cid
);
1151 _op
->oid
= _get_object_id(oid
);
1152 ::encode(attrset
, data_bl
);
1155 /// remove an xattr from an object
// Overload taking the attribute name as a C string; forwards to another
// rmattr overload with `n` in place of `name`.
// NOTE(review): the declaration of `n` is not visible in this excerpt;
// presumably a string built from `name` -- confirm.
1156 void rmattr(const coll_t
& cid
, const ghobject_t
& oid
, const char *name
) {
1158 rmattr(cid
, oid
, n
);
1160 /// remove an xattr from an object
// Fills the next Op slot as OP_RMATTR for (cid, oid) and encodes the
// attribute name `s` into data_bl.
1161 void rmattr(const coll_t
& cid
, const ghobject_t
& oid
, const string
& s
) {
1162 Op
* _op
= _get_next_op();
1163 _op
->op
= OP_RMATTR
;
1164 _op
->cid
= _get_coll_id(cid
);
1165 _op
->oid
= _get_object_id(oid
);
1166 ::encode(s
, data_bl
);
1169 /// remove all xattrs from an object
// Fills the next Op slot as OP_RMATTRS for (cid, oid). No payload is
// written to data_bl in the visible statements.
1170 void rmattrs(const coll_t
& cid
, const ghobject_t
& oid
) {
1171 Op
* _op
= _get_next_op();
1172 _op
->op
= OP_RMATTRS
;
1173 _op
->cid
= _get_coll_id(cid
);
1174 _op
->oid
= _get_object_id(oid
);
1178 * Clone an object into another object.
1180 * Low-cost (e.g., O(1)) cloning (if supported) is best, but
1181 * fallback to an O(n) copy is allowed. All four parts of the
1182 * object are cloned (data, xattrs, omap header, omap
1185 * The destination named object may already exist, in
1186 * which case its previous contents are discarded.
// Records a clone request on the next Op slot: `oid` is stored as the op's
// object and `noid` as its dest_oid, both within collection `cid`.
// NOTE(review): the statement assigning the op code is not visible in this
// excerpt.
1188 void clone(const coll_t
& cid
, const ghobject_t
& oid
,
1189 const ghobject_t
& noid
) {
1190 Op
* _op
= _get_next_op();
1192 _op
->cid
= _get_coll_id(cid
);
1193 _op
->oid
= _get_object_id(oid
);
1194 _op
->dest_oid
= _get_object_id(noid
);
1198 * Clone a byte range from one object to another.
1200 * The data portion of the destination object receives a copy of a
1201 * portion of the data from the source object. None of the other
1202 * three parts of an object is copied from the source.
1204 * The destination object size may be extended to the dstoff + len.
1206 * The source range *must* overlap with the source object data. If it does
1207 * not the result is undefined.
// Fills the next Op slot as OP_CLONERANGE2: source object `oid`, destination
// object `noid` (stored as dest_oid), destination offset `dstoff` (stored as
// dest_off), all within collection `cid`.
// NOTE(review): no use of `srcoff` or `srclen` is visible in this excerpt;
// confirm which Op fields carry them.
1209 void clone_range(const coll_t
& cid
, const ghobject_t
& oid
,
1210 const ghobject_t
& noid
,
1211 uint64_t srcoff
, uint64_t srclen
, uint64_t dstoff
) {
1212 Op
* _op
= _get_next_op();
1213 _op
->op
= OP_CLONERANGE2
;
1214 _op
->cid
= _get_coll_id(cid
);
1215 _op
->oid
= _get_object_id(oid
);
1216 _op
->dest_oid
= _get_object_id(noid
);
1219 _op
->dest_off
= dstoff
;
1223 /// Create the collection
// Fills the next Op slot as OP_MKCOLL for collection `cid`; `bits` is stored
// in the op's split_bits field.
1224 void create_collection(const coll_t
& cid
, int bits
) {
1225 Op
* _op
= _get_next_op();
1226 _op
->op
= OP_MKCOLL
;
1227 _op
->cid
= _get_coll_id(cid
);
1228 _op
->split_bits
= bits
;
1233 * Give the collection a hint.
1235 * @param cid - collection id.
1236 * @param type - hint type.
1237 * @param hint - the hint payload, which contains the customized
1238 * data along with the hint type.
// Fills the next Op slot as OP_COLL_HINT for collection `cid`: `type` goes
// into the op's hint_type field and the `hint` payload is encoded into
// data_bl.
1240 void collection_hint(const coll_t
& cid
, uint32_t type
, const bufferlist
& hint
) {
1241 Op
* _op
= _get_next_op();
1242 _op
->op
= OP_COLL_HINT
;
1243 _op
->cid
= _get_coll_id(cid
);
1244 _op
->hint_type
= type
;
1245 ::encode(hint
, data_bl
);
1249 /// remove the collection, the collection must be empty
// Fills the next Op slot as OP_RMCOLL for collection `cid`.
1250 void remove_collection(const coll_t
& cid
) {
1251 Op
* _op
= _get_next_op();
1252 _op
->op
= OP_RMCOLL
;
1253 _op
->cid
= _get_coll_id(cid
);
// Deprecated (see the attribute on the declaration). Emits two consecutive
// ops for moving `oid` from `oldcid` to `cid`:
//   1. OP_COLL_ADD    with cid = oldcid, oid, dest_cid = cid
//   2. OP_COLL_REMOVE with cid = oldcid, oid
// Note that `cid` is the destination and `oldcid` the source.
1256 void collection_move(const coll_t
& cid
, coll_t oldcid
, const ghobject_t
& oid
)
1257 __attribute__ ((deprecated
)) {
1258 // NOTE: we encode this as a fixed combo of ADD + REMOVE. they
1259 // always appear together, so this is effectively a single MOVE.
1260 Op
* _op
= _get_next_op();
1261 _op
->op
= OP_COLL_ADD
;
1262 _op
->cid
= _get_coll_id(oldcid
);
1263 _op
->oid
= _get_object_id(oid
);
1264 _op
->dest_cid
= _get_coll_id(cid
);
// Second op of the pair: reuse the local pointer for a fresh Op slot.
1267 _op
= _get_next_op();
1268 _op
->op
= OP_COLL_REMOVE
;
1269 _op
->cid
= _get_coll_id(oldcid
);
1270 _op
->oid
= _get_object_id(oid
);
// Fills the next Op slot as OP_COLL_MOVE_RENAME. The source is
// (oldcid, oldoid), stored in the op's cid/oid fields; the destination is
// (cid, oid), stored in dest_cid/dest_oid.
1273 void collection_move_rename(const coll_t
& oldcid
, const ghobject_t
& oldoid
,
1274 coll_t cid
, const ghobject_t
& oid
) {
1275 Op
* _op
= _get_next_op();
1276 _op
->op
= OP_COLL_MOVE_RENAME
;
1277 _op
->cid
= _get_coll_id(oldcid
);
1278 _op
->oid
= _get_object_id(oldoid
);
1279 _op
->dest_cid
= _get_coll_id(cid
);
1280 _op
->dest_oid
= _get_object_id(oid
);
// Fills the next Op slot as OP_TRY_RENAME within a single collection `cid`:
// `oldoid` is stored as the op's oid and `oid` as its dest_oid.
1283 void try_rename(coll_t cid
, const ghobject_t
& oldoid
,
1284 const ghobject_t
& oid
) {
1285 Op
* _op
= _get_next_op();
1286 _op
->op
= OP_TRY_RENAME
;
1287 _op
->cid
= _get_coll_id(cid
);
1288 _op
->oid
= _get_object_id(oldoid
);
1289 _op
->dest_oid
= _get_object_id(oid
);
1293 /// Remove omap from oid
// Fills the next Op slot as OP_OMAP_CLEAR for (cid, oid).
// NOTE(review): the line carrying the method name is not visible in this
// excerpt.
1295 coll_t cid
, ///< [in] Collection containing oid
1296 const ghobject_t
&oid
///< [in] Object from which to remove omap
1298 Op
* _op
= _get_next_op();
1299 _op
->op
= OP_OMAP_CLEAR
;
1300 _op
->cid
= _get_coll_id(cid
);
1301 _op
->oid
= _get_object_id(oid
);
1304 /// Set keys on oid omap. Replaces duplicate keys.
// Map variant: fills the next Op slot as OP_OMAP_SETKEYS for (cid, oid) and
// encodes the `attrset` key/value map into data_bl.
// NOTE(review): the line carrying the method name is not visible in this
// excerpt.
1306 const coll_t
& cid
, ///< [in] Collection containing oid
1307 const ghobject_t
&oid
, ///< [in] Object to update
1308 const map
<string
, bufferlist
> &attrset
///< [in] Replacement keys and values
1310 Op
* _op
= _get_next_op();
1311 _op
->op
= OP_OMAP_SETKEYS
;
1312 _op
->cid
= _get_coll_id(cid
);
1313 _op
->oid
= _get_object_id(oid
);
1314 ::encode(attrset
, data_bl
);
1318 /// Set keys on an oid omap (bufferlist variant).
// bufferlist variant: same op code (OP_OMAP_SETKEYS) as the map variant, but
// `attrset_bl` is appended to data_bl as-is rather than passed through
// ::encode.
// NOTE(review): presumably the caller supplies an already-encoded map --
// confirm. The line carrying the method name is not visible in this excerpt.
1320 coll_t cid
, ///< [in] Collection containing oid
1321 const ghobject_t
&oid
, ///< [in] Object to update
1322 const bufferlist
&attrset_bl
///< [in] Replacement keys and values
1324 Op
* _op
= _get_next_op();
1325 _op
->op
= OP_OMAP_SETKEYS
;
1326 _op
->cid
= _get_coll_id(cid
);
1327 _op
->oid
= _get_object_id(oid
);
1328 data_bl
.append(attrset_bl
);
1332 /// Remove keys from oid omap
// Set variant: fills the next Op slot as OP_OMAP_RMKEYS for (cid, oid) and
// encodes the `keys` set into data_bl.
// NOTE(review): the line carrying the method name is not visible in this
// excerpt.
1334 coll_t cid
, ///< [in] Collection containing oid
1335 const ghobject_t
&oid
, ///< [in] Object from which to remove the omap
1336 const set
<string
> &keys
///< [in] Keys to clear
1338 Op
* _op
= _get_next_op();
1339 _op
->op
= OP_OMAP_RMKEYS
;
1340 _op
->cid
= _get_coll_id(cid
);
1341 _op
->oid
= _get_object_id(oid
);
1342 ::encode(keys
, data_bl
);
1346 /// Remove keys from oid omap
// bufferlist variant: same op code (OP_OMAP_RMKEYS) as the set variant, but
// `keys_bl` is appended to data_bl as-is rather than passed through ::encode.
// NOTE(review): presumably the caller supplies an already-encoded key set --
// confirm. The line carrying the method name is not visible in this excerpt.
1348 coll_t cid
, ///< [in] Collection containing oid
1349 const ghobject_t
&oid
, ///< [in] Object from which to remove the omap
1350 const bufferlist
&keys_bl
///< [in] Keys to clear
1352 Op
* _op
= _get_next_op();
1353 _op
->op
= OP_OMAP_RMKEYS
;
1354 _op
->cid
= _get_coll_id(cid
);
1355 _op
->oid
= _get_object_id(oid
);
1356 data_bl
.append(keys_bl
);
1360 /// Remove key range from oid omap
// Fills the next Op slot as OP_OMAP_RMKEYRANGE for (cid, oid). The range
// bounds are encoded into data_bl in order: `first`, then `last`.
1361 void omap_rmkeyrange(
1362 coll_t cid
, ///< [in] Collection containing oid
1363 const ghobject_t
&oid
, ///< [in] Object from which to remove the omap keys
1364 const string
& first
, ///< [in] first key in range
1365 const string
& last
///< [in] first key past range, range is [first,last)
1367 Op
* _op
= _get_next_op();
1368 _op
->op
= OP_OMAP_RMKEYRANGE
;
1369 _op
->cid
= _get_coll_id(cid
);
1370 _op
->oid
= _get_object_id(oid
);
1371 ::encode(first
, data_bl
);
1372 ::encode(last
, data_bl
);
// Fills the next Op slot as OP_OMAP_SETHEADER for (cid, oid) and encodes the
// header value `bl` into data_bl.
1377 void omap_setheader(
1378 coll_t cid
, ///< [in] Collection containing oid
1379 const ghobject_t
&oid
, ///< [in] Object
1380 const bufferlist
&bl
///< [in] Header value
1382 Op
* _op
= _get_next_op();
1383 _op
->op
= OP_OMAP_SETHEADER
;
1384 _op
->cid
= _get_coll_id(cid
);
1385 _op
->oid
= _get_object_id(oid
);
1386 ::encode(bl
, data_bl
);
1390 /// Split collection based on given prefixes, objects matching the specified bits/rem are
1391 /// moved to the new collection
// Fills the next Op slot as OP_SPLIT_COLLECTION2: source collection `cid`,
// destination collection `destination` (stored as dest_cid), with `bits` and
// `rem` stored in split_bits and split_rem.
// NOTE(review): the declarations of parameters `cid`, `bits` and `rem` are
// not visible in this excerpt.
1392 void split_collection(
1396 coll_t destination
) {
1397 Op
* _op
= _get_next_op();
1398 _op
->op
= OP_SPLIT_COLLECTION2
;
1399 _op
->cid
= _get_coll_id(cid
);
1400 _op
->dest_cid
= _get_coll_id(destination
);
1401 _op
->split_bits
= bits
;
1402 _op
->split_rem
= rem
;
// Fills the next Op slot as OP_COLL_SET_BITS for collection `cid`, storing
// `bits` in the op's split_bits field.
// NOTE(review): the parameter list is not visible in this excerpt.
1406 void collection_set_bits(
1409 Op
* _op
= _get_next_op();
1410 _op
->op
= OP_COLL_SET_BITS
;
1411 _op
->cid
= _get_coll_id(cid
);
1412 _op
->split_bits
= bits
;
1416 /// Set allocation hint for an object
1417 /// A value of 0 for expected_object_size or expected_write_size is a no-op for all implementations.
// Fills the next Op slot as OP_SETALLOCHINT for (cid, oid). The three hint
// values are copied into the op's expected_object_size, expected_write_size
// and alloc_hint_flags fields.
// NOTE(review): the declarations of parameters `cid` and `flags` are not
// visible in this excerpt.
1418 void set_alloc_hint(
1420 const ghobject_t
&oid
,
1421 uint64_t expected_object_size
,
1422 uint64_t expected_write_size
,
1425 Op
* _op
= _get_next_op();
1426 _op
->op
= OP_SETALLOCHINT
;
1427 _op
->cid
= _get_coll_id(cid
);
1428 _op
->oid
= _get_object_id(oid
);
1429 _op
->expected_object_size
= expected_object_size
;
1430 _op
->expected_write_size
= expected_write_size
;
1431 _op
->alloc_hint_flags
= flags
;
// Serializes the transaction into `bl`. ENCODE_START is invoked with 9 for
// both of its numeric arguments. Fields are written in a fixed order that
// decode() must mirror: data_bl, op_bl, coll_index, object_index.
// NOTE(review): the layout comment below also lists a trailing `data`
// member, but its encode call and the closing of the ENCODE_START block are
// not visible in this excerpt.
1435 void encode(bufferlist
& bl
) const {
1436 //layout: data_bl + op_bl + coll_index + object_index + data
1437 ENCODE_START(9, 9, bl
);
1438 ::encode(data_bl
, bl
);
1439 ::encode(op_bl
, bl
);
1440 ::encode(coll_index
, bl
);
1441 ::encode(object_index
, bl
);
// Deserializes a transaction from `bl`. DECODE_START is invoked with 9.
// Fields are read in the same order encode() writes them: data_bl, op_bl,
// coll_index, object_index.
// NOTE(review): lines between the visible statements are missing from this
// excerpt.
1446 void decode(bufferlist::iterator
&bl
) {
1447 DECODE_START(9, bl
);
1450 ::decode(data_bl
, bl
);
1451 ::decode(op_bl
, bl
);
1452 ::decode(coll_index
, bl
);
1453 ::decode(object_index
, bl
);
// Reset the id counters to the sizes of the decoded index maps.
1455 coll_id
= coll_index
.size();
1456 object_id
= object_index
.size();
// Declaration only: writes this transaction to the given Formatter.
1461 void dump(ceph::Formatter
*f
);
// Declaration only: takes the list `o` of Transaction pointers by non-const
// reference (presumably filled with sample instances, per the name --
// confirm in the implementation).
1462 static void generate_test_instances(list
<Transaction
*>& o
);
1465 // synchronous wrappers
// Single-transaction convenience wrapper: moves `t` into a one-element
// vector and forwards to apply_transactions(). `ondisk` defaults to 0
// (no callback).
1466 unsigned apply_transaction(Sequencer
*osr
, Transaction
&& t
, Context
*ondisk
=0) {
1467 vector
<Transaction
> tls
;
1468 tls
.push_back(std::move(t
));
1469 return apply_transactions(osr
, tls
, ondisk
);
// Declaration only (defined elsewhere). Takes the transaction vector by
// non-const reference; `ondisk` defaults to 0.
1471 unsigned apply_transactions(Sequencer
*osr
, vector
<Transaction
>& tls
, Context
*ondisk
=0);
// Single-transaction convenience wrapper: moves `t` into a one-element
// vector and forwards to the vector-taking queue_transactions() with the
// same callbacks.
// NOTE(review): the tail of the forwarded argument list is not visible in
// this excerpt.
1473 int queue_transaction(Sequencer
*osr
, Transaction
&& t
, Context
*onreadable
, Context
*ondisk
=0,
1474 Context
*onreadable_sync
=0,
1475 TrackedOpRef op
= TrackedOpRef(),
1476 ThreadPool::TPHandle
*handle
= NULL
) {
1477 vector
<Transaction
> tls
;
1478 tls
.push_back(std::move(t
));
1479 return queue_transactions(osr
, tls
, onreadable
, ondisk
, onreadable_sync
,
// Registers the three completion callbacks on the LAST transaction in `tls`
// and forwards to the (osr, tls, op, handle) overload:
//   onreadable      -> register_on_applied
//   ondisk          -> register_on_commit
//   onreadable_sync -> register_on_applied_sync
// `tls` must be non-empty (asserted), since tls.back() is used.
1483 int queue_transactions(Sequencer
*osr
, vector
<Transaction
>& tls
,
1484 Context
*onreadable
, Context
*ondisk
=0,
1485 Context
*onreadable_sync
=0,
1486 TrackedOpRef op
= TrackedOpRef(),
1487 ThreadPool::TPHandle
*handle
= NULL
) {
1488 assert(!tls
.empty());
1489 tls
.back().register_on_applied(onreadable
);
1490 tls
.back().register_on_commit(ondisk
);
1491 tls
.back().register_on_applied_sync(onreadable_sync
);
1492 return queue_transactions(osr
, tls
, op
, handle
);
// Pure virtual: the overload that concrete stores implement. The callback-
// taking queue_transactions overload forwards here after registering its
// callbacks on the last transaction.
1495 virtual int queue_transactions(
1496 Sequencer
*osr
, vector
<Transaction
>& tls
,
1497 TrackedOpRef op
= TrackedOpRef(),
1498 ThreadPool::TPHandle
*handle
= NULL
) = 0;
// Overload that additionally accepts an `oncomplete` callback.
// NOTE(review): several parameter lines and the end of this declaration are
// not visible in this excerpt; whether it has an inline body cannot be told
// from here.
1501 int queue_transactions(
1503 vector
<Transaction
>& tls
,
1504 Context
*onreadable
,
1506 Context
*onreadable_sync
,
1507 Context
*oncomplete
,
// Single-transaction wrapper for the `oncomplete`-taking overload: moves `t`
// into a one-element vector and forwards to queue_transactions().
// NOTE(review): the declarations of `osr`, `t`, `oncommit` and `op` are not
// visible in this excerpt.
1510 int queue_transaction(
1513 Context
*onreadable
,
1515 Context
*onreadable_sync
,
1516 Context
*oncomplete
,
1519 vector
<Transaction
> tls
;
1520 tls
.push_back(std::move(t
));
1521 return queue_transactions(
1522 osr
, tls
, onreadable
, oncommit
, onreadable_sync
, oncomplete
, op
);
// Stores the given path and CephContext pointer in the `path` and `cct`
// members; the constructor body is empty.
1526 ObjectStore(CephContext
* cct
,
1527 const std::string
& path_
) : path(path_
), cct(cct
) {}
// Virtual destructor with an empty body.
1528 virtual ~ObjectStore() {}
1531 explicit ObjectStore(const ObjectStore
& o
) = delete;
1532 const ObjectStore
& operator=(const ObjectStore
& o
) = delete;
// Virtual hook with an inline default implementation.
// NOTE(review): the body is not visible in this excerpt, so the default
// return value cannot be stated here.
1535 virtual int upgrade() {
// Optional hooks: each default implementation is an empty body, so a store
// that does not override them does nothing when they are called.
1539 virtual void get_db_statistics(Formatter
*f
) { }
1540 virtual void generate_db_histogram(Formatter
*f
) { }
1541 virtual void flush_cache() { }
1542 virtual void dump_perf_counters(Formatter
*f
) {}
// Pure virtuals every concrete store must implement: a type string getter,
// a mount-in-use test, and mount/umount.
1544 virtual string
get_type() = 0;
1547 virtual bool test_mount_in_use() = 0;
1548 virtual int mount() = 0;
1549 virtual int umount() = 0;
// fsck / repair: virtual hooks with inline default implementations, each
// taking a `deep` flag.
// NOTE(review): neither body is visible in this excerpt, so the default
// return values cannot be stated here.
1550 virtual int fsck(bool deep
) {
1553 virtual int repair(bool deep
) {
// Optional hook; the default implementation ignores `num` (empty body).
1557 virtual void set_cache_shards(unsigned num
) { }
1560 * Returns 0 if the hobject is valid, -error otherwise
1563 * -ENAMETOOLONG: locator/namespace/name too large
// Pure virtual, const-qualified: validates the key of `obj`.
1565 virtual int validate_hobject_key(const hobject_t
&obj
) const = 0;
// Pure virtuals: attribute-name length limit, store/journal creation, and
// three journal capability queries.
1567 virtual unsigned get_max_attr_name_length() = 0;
1568 virtual int mkfs() = 0; // wipe
1569 virtual int mkjournal() = 0; // journal only
1570 virtual bool needs_journal() = 0; ///< requires a journal
1571 virtual bool wants_journal() = 0; ///< prefers a journal
1572 virtual bool allows_journal() = 0; ///< allows a journal
1577 * Check whether store is backed by a rotational (HDD) or non-rotational
1580 * This must be usable *before* the store is mounted.
1582 * @return true for HDD, false for SSD
// Virtual with an inline default; get_default_device_class() maps a true
// result to "hdd" and false to "ssd".
// NOTE(review): the body is not visible in this excerpt, so the default
// value cannot be stated here.
1584 virtual bool is_rotational() {
1589 * is_journal_rotational
1591 * Check whether journal is backed by a rotational (HDD) or non-rotational
1595 * @return true for HDD, false for SSD
// Virtual with an inline default.
// NOTE(review): the body is not visible in this excerpt, so the default
// value cannot be stated here.
1597 virtual bool is_journal_rotational() {
// Derives the device class string from is_rotational(): "hdd" when it
// returns true, "ssd" otherwise.
1601 virtual string
get_default_device_class() {
1602 return is_rotational() ? "hdd" : "ssd";
// Capability query; the default answer is false, so a backend must override
// this to advertise support.
1605 virtual bool can_sort_nibblewise() {
1606 return false; // assume a backend cannot, unless it says otherwise
// Pure virtual: takes a pointer to a store_statfs_t (presumably filled in by
// the implementation, per the name -- confirm) and returns an int status.
1609 virtual int statfs(struct store_statfs_t
*buf
) = 0;
// Optional hook; the default implementation leaves `pm` untouched (empty
// body).
1611 virtual void collect_metadata(map
<string
,string
> *pm
) { }
1614 * write_meta - write a simple configuration key out-of-band
1616 * Write a simple key/value pair for basic store configuration
1617 * (e.g., a uuid or magic number) to an unopened/unmounted store.
1618 * The default implementation writes this to a plaintext file in the
1621 * A newline is appended.
1623 * @param key key name (e.g., "fsid")
1624 * @param value value (e.g., a uuid rendered as a string)
1625 * @returns 0 for success, or an error code
// Declaration only: virtual but not pure, so a default implementation is
// defined elsewhere.
1627 virtual int write_meta(const std::string
& key
,
1628 const std::string
& value
);
1631 * read_meta - read a simple configuration key out-of-band
1633 * Read a simple key value from an unopened/mounted store.
1635 * Trailing whitespace is stripped off.
1637 * @param key key name
1638 * @param value pointer to value string
1639 * @returns 0 for success, or an error code
// Declaration only: virtual but not pure, so a default implementation is
// defined elsewhere. The result is returned through the `value` pointer.
1641 virtual int read_meta(const std::string
& key
,
1642 std::string
*value
);
1645 * get ideal max value for collection_list()
1647 * default to some arbitrary values; the implementation will override.
// Default returns the fixed value 64; implementations may override.
1649 virtual int get_ideal_list_max() { return 64; }
1653 * get a collection handle
1655 * Provide a trivial handle as a default to avoid converting legacy
// Default implementation: allocates a new CompatCollectionHandle for `cid`
// and returns it as a CollectionHandle.
// NOTE(review): ownership of the allocated object depends on the
// CollectionHandle type, which is not visible here.
1658 virtual CollectionHandle
open_collection(const coll_t
&cid
) {
1659 return new CompatCollectionHandle(cid
);
1664 * Synchronous read operations
1668 * exists -- Test for existence of object
1670 * @param cid collection for object
1671 * @param oid oid of object
1672 * @returns true if object exists, false otherwise
// Pure virtual, collection-id form: implemented by concrete stores.
1674 virtual bool exists(const coll_t
& cid
, const ghobject_t
& oid
) = 0; // useful?
// Handle form: by default forwards to the collection-id overload using
// c->get_cid().
1675 virtual bool exists(CollectionHandle
& c
, const ghobject_t
& oid
) {
1676 return exists(c
->get_cid(), oid
);
1679 * set_collection_opts -- set pool options for a collection
1681 * @param cid collection
1682 * @param opts new collection options
1683 * @returns 0 on success, negative error code on failure.
// Pure virtual.
// NOTE(review): the first parameter line (the collection argument) is not
// visible in this excerpt.
1685 virtual int set_collection_opts(
1687 const pool_opts_t
& opts
) = 0;
1690 * stat -- get information for an object
1692 * @param cid collection for object
1693 * @param oid oid of object
1694 * @param st output information for the object
1695 * @param allow_eio if false, assert on -EIO operation failure
1696 * @returns 0 on success, negative error code on failure.
// Two stat overloads. The first is pure virtual (ends in `= 0`); the second
// takes a CollectionHandle and forwards to the first using c->get_cid().
// `allow_eio` defaults to false in both.
// NOTE(review): the lines carrying the function name, the collection
// argument of the first overload and the `st` parameter are not visible in
// this excerpt.
1700 const ghobject_t
& oid
,
1702 bool allow_eio
= false) = 0; // struct stat?
1704 CollectionHandle
&c
,
1705 const ghobject_t
& oid
,
1707 bool allow_eio
= false) {
1708 return stat(c
->get_cid(), oid
, st
, allow_eio
);
1712 * read -- read a byte range of data from an object
1714 * Note: if reading from an offset past the end of the object, we
1715 * return 0 (not, say, -EINVAL).
1717 * @param cid collection for object
1718 * @param oid oid of object
1719 * @param offset location offset of first byte to be read
1720 * @param len number of bytes to be read
1721 * @param bl output bufferlist
1722 * @param op_flags is CEPH_OSD_OP_FLAG_*
1723 * @param allow_eio if false, assert on -EIO operation failure
1724 * @returns number of bytes read on success, or negative error code on failure.
// Two read overloads. The first is pure virtual (ends in `= 0`); the second
// takes a CollectionHandle and forwards to the first using c->get_cid().
// `op_flags` defaults to 0 in both.
// NOTE(review): the lines carrying the function name and the `offset`, `len`
// and `bl` parameters are not visible in this excerpt.
1728 const ghobject_t
& oid
,
1732 uint32_t op_flags
= 0) = 0;
1734 CollectionHandle
&c
,
1735 const ghobject_t
& oid
,
1739 uint32_t op_flags
= 0) {
1740 return read(c
->get_cid(), oid
, offset
, len
, bl
, op_flags
);
1744 * fiemap -- get extent map of data of an object
1746 * Returns an encoded map of the extents of an object's data portion
1747 * (map<offset,size>).
1749 * A non-enlightened implementation is free to return the extent (offset, len)
1750 * as the sole extent.
1752 * @param cid collection for object
1753 * @param oid oid of object
1754 * @param offset location offset of first byte to be read
1755 * @param len number of bytes to be read
1756 * @param bl output bufferlist for extent map information.
1757 * @returns 0 on success, negative error code on failure.
// Pure virtual, collection-id forms. Two output flavours exist: one writes
// into a bufferlist `bl`, the other into a map<uint64_t, uint64_t> `destmap`.
1759 virtual int fiemap(const coll_t
& cid
, const ghobject_t
& oid
,
1760 uint64_t offset
, size_t len
, bufferlist
& bl
) = 0;
1761 virtual int fiemap(const coll_t
& cid
, const ghobject_t
& oid
,
1762 uint64_t offset
, size_t len
,
1763 map
<uint64_t, uint64_t>& destmap
) = 0;
// Handle forms: by default each forwards to the matching collection-id
// overload using c->get_cid(). First the bufferlist flavour, then the map
// flavour.
1764 virtual int fiemap(CollectionHandle
& c
, const ghobject_t
& oid
,
1765 uint64_t offset
, size_t len
, bufferlist
& bl
) {
1766 return fiemap(c
->get_cid(), oid
, offset
, len
, bl
);
1768 virtual int fiemap(CollectionHandle
& c
, const ghobject_t
& oid
,
1769 uint64_t offset
, size_t len
, map
<uint64_t, uint64_t>& destmap
) {
1770 return fiemap(c
->get_cid(), oid
, offset
, len
, destmap
);
1774 * getattr -- get an xattr of an object
1776 * @param cid collection for object
1777 * @param oid oid of object
1778 * @param name name of attr to read
1779 * @param value place to put output result.
1780 * @returns 0 on success, negative error code on failure.
// Pure virtual, collection-id form: the attribute is returned in a bufferptr.
1782 virtual int getattr(const coll_t
& cid
, const ghobject_t
& oid
,
1783 const char *name
, bufferptr
& value
) = 0;
// Handle form: by default forwards to the collection-id overload using
// c->get_cid().
1784 virtual int getattr(CollectionHandle
&c
, const ghobject_t
& oid
,
1785 const char *name
, bufferptr
& value
) {
1786 return getattr(c
->get_cid(), oid
, name
, value
);
1790 * getattr -- get an xattr of an object
1792 * @param cid collection for object
1793 * @param oid oid of object
1794 * @param name name of attr to read
1795 * @param value place to put output result.
1796 * @returns 0 on success, negative error code on failure.
// Non-virtual convenience overload returning the attribute in a bufferlist:
// calls the bufferptr getattr into `bp`, then pushes `bp` onto `value`.
// NOTE(review): the declaration of `bp`, any guard around push_back and the
// return statement are not visible in this excerpt.
1798 int getattr(const coll_t
& cid
, const ghobject_t
& oid
, const char *name
, bufferlist
& value
) {
1800 int r
= getattr(cid
, oid
, name
, bp
);
1802 value
.push_back(bp
);
// std::string-name overload, collection-id form: converts `name` with
// c_str(), calls the bufferptr getattr into `bp`, then pushes `bp` onto
// `value`.
// NOTE(review): the line carrying the function name, the declaration of `bp`
// and the return statement are not visible in this excerpt.
1806 coll_t cid
, const ghobject_t
& oid
,
1807 const string
& name
, bufferlist
& value
) {
1809 int r
= getattr(cid
, oid
, name
.c_str(), bp
);
1810 value
.push_back(bp
);
// std::string-name overload, handle form: converts `name` with c_str(),
// calls the handle-taking bufferptr getattr into `bp`, then pushes `bp` onto
// `value`.
// NOTE(review): the line carrying the function name, the declaration of `bp`
// and the return statement are not visible in this excerpt.
1814 CollectionHandle
&c
, const ghobject_t
& oid
,
1815 const string
& name
, bufferlist
& value
) {
1817 int r
= getattr(c
, oid
, name
.c_str(), bp
);
1818 value
.push_back(bp
);
1823 * getattrs -- get all of the xattrs of an object
1825 * @param cid collection for object
1826 * @param oid oid of object
1827 * @param aset place to put output result.
1828 * @returns 0 on success, negative error code on failure.
// Pure virtual, collection-id form: attributes are returned as a
// name -> bufferptr map.
1830 virtual int getattrs(const coll_t
& cid
, const ghobject_t
& oid
,
1831 map
<string
,bufferptr
>& aset
) = 0;
// Handle form: by default forwards to the collection-id overload using
// c->get_cid().
1832 virtual int getattrs(CollectionHandle
&c
, const ghobject_t
& oid
,
1833 map
<string
,bufferptr
>& aset
) {
1834 return getattrs(c
->get_cid(), oid
, aset
);
1838 * getattrs -- get all of the xattrs of an object
1840 * @param cid collection for object
1841 * @param oid oid of object
1842 * @param aset place to put output result.
1843 * @returns 0 on success, negative error code on failure.
// Non-virtual convenience overload returning bufferlist values: fetches the
// attributes into a temporary name -> bufferptr map, then appends each
// value to aset[name]. Because append is used, an entry already present in
// `aset` is extended rather than replaced.
// NOTE(review): the loop condition/increment and the return statement are
// not visible in this excerpt.
1845 int getattrs(const coll_t
& cid
, const ghobject_t
& oid
, map
<string
,bufferlist
>& aset
) {
1846 map
<string
,bufferptr
> bmap
;
1847 int r
= getattrs(cid
, oid
, bmap
);
1848 for (map
<string
,bufferptr
>::iterator i
= bmap
.begin();
1851 aset
[i
->first
].append(i
->second
);
// Handle form of the bufferlist-valued getattrs: fetches the attributes via
// the handle-taking bufferptr overload into a temporary map, then appends
// each value to aset[name].
// NOTE(review): the loop condition/increment and the return statement are
// not visible in this excerpt.
1855 int getattrs(CollectionHandle
&c
, const ghobject_t
& oid
,
1856 map
<string
,bufferlist
>& aset
) {
1857 map
<string
,bufferptr
> bmap
;
1858 int r
= getattrs(c
, oid
, bmap
);
1859 for (map
<string
,bufferptr
>::iterator i
= bmap
.begin();
1862 aset
[i
->first
].append(i
->second
);
1871 * list_collections -- get all of the collections known to this ObjectStore
1873 * @param ls list of the collections in sorted order.
1874 * @returns 0 on success, negative error code on failure.
// Pure virtual: the result is returned through the `ls` vector.
1876 virtual int list_collections(vector
<coll_t
>& ls
) = 0;
1879 * does a collection exist?
1881 * @param c collection
1882 * @returns true if it exists, false otherwise
// Pure virtual: boolean existence test for collection `c`.
1884 virtual bool collection_exists(const coll_t
& c
) = 0;
1887 * is a collection empty?
1889 * @param c collection
1890 * @param empty true if the specified collection is empty, false otherwise
1891 * @returns 0 on success, negative error code on failure.
// Pure virtual: the answer is written through `empty`; the int return value
// is a status code.
1893 virtual int collection_empty(const coll_t
& c
, bool *empty
) = 0;
1896 * return the number of significant bits of the coll_t::pgid.
1898 * This should return what the last create_collection or split_collection
1899 * set. A legacy backend may return -EAGAIN if the value is unavailable
1900 * (because we upgraded from an older version, e.g., FileStore).
// Pure virtual: the bit count is returned directly as the int result.
1902 virtual int collection_bits(const coll_t
& c
) = 0;
1906 * list the contents of a collection that fall in the range [start, end), returning no more than the specified number of results
1908 * @param c collection
1909 * @param start list objects that sort >= this value
1910 * @param end list objects that sort < this value
1911 * @param max return no more than this many results
1912 * @param seq return no objects with snap < seq
1913 * @param ls [out] result
1914 * @param next [out] next item sorts >= this value
1915 * @return zero on success, or negative error
// Pure virtual, collection-id form.
// NOTE(review): the line declaring the `max` parameter is not visible in
// this excerpt (in either overload).
1917 virtual int collection_list(const coll_t
& c
,
1918 const ghobject_t
& start
, const ghobject_t
& end
,
1920 vector
<ghobject_t
> *ls
, ghobject_t
*next
) = 0;
// Handle form: by default forwards to the collection-id overload using
// c->get_cid().
1921 virtual int collection_list(CollectionHandle
&c
,
1922 const ghobject_t
& start
, const ghobject_t
& end
,
1924 vector
<ghobject_t
> *ls
, ghobject_t
*next
) {
1925 return collection_list(c
->get_cid(), start
, end
, max
, ls
, next
);
1930 /// Get omap contents
// Collection-id form, followed by the handle form, which by default
// forwards to the collection-id overload using c->get_cid().
// NOTE(review): the end of the first declaration is not visible in this
// excerpt.
1931 virtual int omap_get(
1932 const coll_t
& c
, ///< [in] Collection containing oid
1933 const ghobject_t
&oid
, ///< [in] Object containing omap
1934 bufferlist
*header
, ///< [out] omap header
1935 map
<string
, bufferlist
> *out
///< [out] Key to value map
1937 virtual int omap_get(
1938 CollectionHandle
&c
, ///< [in] Collection containing oid
1939 const ghobject_t
&oid
, ///< [in] Object containing omap
1940 bufferlist
*header
, ///< [out] omap header
1941 map
<string
, bufferlist
> *out
///< [out] Key to value map
1943 return omap_get(c
->get_cid(), oid
, header
, out
);
// Collection-id form, followed by the handle form, which by default
// forwards to the collection-id overload using c->get_cid(). `allow_eio`
// defaults to false in both.
// NOTE(review): the end of the first declaration is not visible in this
// excerpt.
1947 virtual int omap_get_header(
1948 const coll_t
& c
, ///< [in] Collection containing oid
1949 const ghobject_t
&oid
, ///< [in] Object containing omap
1950 bufferlist
*header
, ///< [out] omap header
1951 bool allow_eio
= false ///< [in] don't assert on eio
1953 virtual int omap_get_header(
1954 CollectionHandle
&c
, ///< [in] Collection containing oid
1955 const ghobject_t
&oid
, ///< [in] Object containing omap
1956 bufferlist
*header
, ///< [out] omap header
1957 bool allow_eio
= false ///< [in] don't assert on eio
1959 return omap_get_header(c
->get_cid(), oid
, header
, allow_eio
);
1962 /// Get keys defined on oid
// Collection-id form, followed by the handle form, which by default
// forwards to the collection-id overload using c->get_cid().
// NOTE(review): the end of the first declaration is not visible in this
// excerpt.
1963 virtual int omap_get_keys(
1964 const coll_t
& c
, ///< [in] Collection containing oid
1965 const ghobject_t
&oid
, ///< [in] Object containing omap
1966 set
<string
> *keys
///< [out] Keys defined on oid
1968 virtual int omap_get_keys(
1969 CollectionHandle
&c
, ///< [in] Collection containing oid
1970 const ghobject_t
&oid
, ///< [in] Object containing omap
1971 set
<string
> *keys
///< [out] Keys defined on oid
1973 return omap_get_keys(c
->get_cid(), oid
, keys
);
// Collection-id form, followed by the handle form, which by default
// forwards to the collection-id overload using c->get_cid().
// NOTE(review): the end of the first declaration is not visible in this
// excerpt.
1977 virtual int omap_get_values(
1978 const coll_t
& c
, ///< [in] Collection containing oid
1979 const ghobject_t
&oid
, ///< [in] Object containing omap
1980 const set
<string
> &keys
, ///< [in] Keys to get
1981 map
<string
, bufferlist
> *out
///< [out] Returned keys and values
1983 virtual int omap_get_values(
1984 CollectionHandle
&c
, ///< [in] Collection containing oid
1985 const ghobject_t
&oid
, ///< [in] Object containing omap
1986 const set
<string
> &keys
, ///< [in] Keys to get
1987 map
<string
, bufferlist
> *out
///< [out] Returned keys and values
1989 return omap_get_values(c
->get_cid(), oid
, keys
, out
);
1992 /// Filters keys into out which are defined on oid
// Collection-id form, followed by the handle form, which by default
// forwards to the collection-id overload using c->get_cid().
// NOTE(review): the end of the first declaration is not visible in this
// excerpt.
1993 virtual int omap_check_keys(
1994 const coll_t
& c
, ///< [in] Collection containing oid
1995 const ghobject_t
&oid
, ///< [in] Object containing omap
1996 const set
<string
> &keys
, ///< [in] Keys to check
1997 set
<string
> *out
///< [out] Subset of keys defined on oid
1999 virtual int omap_check_keys(
2000 CollectionHandle
&c
, ///< [in] Collection containing oid
2001 const ghobject_t
&oid
, ///< [in] Object containing omap
2002 const set
<string
> &keys
, ///< [in] Keys to check
2003 set
<string
> *out
///< [out] Subset of keys defined on oid
2005 return omap_check_keys(c
->get_cid(), oid
, keys
, out
);
2009 * Returns an object map iterator
2011 * Warning! The returned iterator is an implicit lock on filestore
2012 * operations in c. Do not use filestore methods on c while the returned
2013 * iterator is live. (Filling in a transaction is no problem).
2015 * @return iterator, null on error
// Collection-id form, followed by the handle form, which by default
// forwards to the collection-id overload using c->get_cid(). Both return an
// ObjectMap::ObjectMapIterator.
// NOTE(review): the end of the first declaration is not visible in this
// excerpt.
2017 virtual ObjectMap::ObjectMapIterator
get_omap_iterator(
2018 const coll_t
& c
, ///< [in] collection
2019 const ghobject_t
&oid
///< [in] object
2021 virtual ObjectMap::ObjectMapIterator
get_omap_iterator(
2022 CollectionHandle
&c
, ///< [in] collection
2023 const ghobject_t
&oid
///< [in] object
2025 return get_omap_iterator(c
->get_cid(), oid
);
// Optional operations: each default implementation returns -EOPNOTSUPP, so
// a store that does not override them reports the operation as unsupported.
2028 virtual int flush_journal() { return -EOPNOTSUPP
; }
2030 virtual int dump_journal(ostream
& out
) { return -EOPNOTSUPP
; }
2032 virtual int snapshot(const string
& name
) { return -EOPNOTSUPP
; }
2035 * Set and get internal fsid for this instance. No external data is modified
// Pure virtual setter/getter pair for the store's uuid_d fsid; the setter
// takes the uuid by value.
2037 virtual void set_fsid(uuid_d u
) = 0;
2038 virtual uuid_d
get_fsid() = 0;
2041 * Estimates the additional disk space used by the specified number of objects, caused by file allocation granularity and the metadata store
2042 * - num_objects - total object count (including whiteouts) to measure used space for.
// Pure virtual: takes an object count and returns a uint64_t estimate.
2044 virtual uint64_t estimate_objects_overhead(uint64_t num_objects
) = 0;
// Optional hooks: each default implementation is an empty body, so a store
// that does not override them does nothing when they are called.
2048 virtual void inject_data_error(const ghobject_t
&oid
) {}
2049 virtual void inject_mdata_error(const ghobject_t
&oid
) {}
2051 virtual void compact() {}
// Const capability query with an inline default.
// NOTE(review): the body is not visible in this excerpt, so the default
// value cannot be stated here.
2052 virtual bool has_builtin_csum() const {
// Invokes the WRITE_CLASS_ENCODER macro for Transaction and for its nested
// TransactionData type.
// NOTE(review): the macro is defined outside this file; presumably it
// generates free encode/decode wrappers around the member functions --
// confirm.
2056 WRITE_CLASS_ENCODER(ObjectStore::Transaction
)
2057 WRITE_CLASS_ENCODER(ObjectStore::Transaction::TransactionData
)
// Free functions named intrusive_ptr_add_ref / intrusive_ptr_release for
// ObjectStore::Sequencer_impl.
// NOTE(review): presumably the customization points used by
// boost::intrusive_ptr -- confirm. Neither body is visible in this excerpt.
2059 static inline void intrusive_ptr_add_ref(ObjectStore::Sequencer_impl
*s
) {
2062 static inline void intrusive_ptr_release(ObjectStore::Sequencer_impl
*s
) {
// Stream-output operator declarations for Sequencer and Transaction; the
// definitions live elsewhere.
2066 ostream
& operator<<(ostream
& out
, const ObjectStore::Sequencer
& s
);
2067 ostream
& operator<<(ostream
& out
, const ObjectStore::Transaction
& tx
);