1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
8 #include "include/Context.h"
9 #include "include/int_types.h"
10 #include "include/buffer.h"
12 #include "osd/osd_types.h"
14 #define OPS_PER_PTR 32
16 void decode_str_str_map_to_bl(ceph::buffer::list::const_iterator
& p
, ceph::buffer::list
*out
);
17 void decode_str_set_to_bl(ceph::buffer::list::const_iterator
& p
, ceph::buffer::list
*out
);
20 /*********************************
23 * A Transaction represents a sequence of primitive mutation
26 * Three events in the life of a Transaction result in
27 * callbacks. Any Transaction can contain any number of callback
28 * objects (Context) for any combination of the three classes of
31 * on_applied_sync, on_applied, and on_commit.
33 * The "on_applied" and "on_applied_sync" callbacks are invoked when
34 * the modifications requested by the Transaction are visible to
35 * subsequent ObjectStore operations, i.e., the results are
36 * readable. The only conceptual difference between on_applied and
37 * on_applied_sync is the specific thread and locking environment in
38 * which the callbacks operate. "on_applied_sync" is called
39 * directly by an ObjectStore execution thread. It is expected to
40 * execute quickly and must not acquire any locks of the calling
41 * environment. Conversely, "on_applied" is called from the separate
42 * Finisher thread, meaning that it can contend for calling
43 * environment locks. NB, on_applied and on_applied_sync are
44 * sometimes called on_readable and on_readable_sync.
46 * The "on_commit" callback is also called from the Finisher thread
47 * and indicates that all of the mutations have been durably
48 * committed to stable storage (i.e., are now software/hardware
51 * At the implementation level, each mutation primitive (and its
52 * associated data) can be serialized to a single buffer. That
53 * serialization, however, does not copy any data, but (using the
54 * ceph::buffer::list library) will reference the original buffers. This
55 * implies that the buffer that contains the data being submitted
56 * must remain stable until the on_commit callback completes. In
57 * practice, ceph::buffer::list handles all of this for you and this
58 * subtlety is only relevant if you are referencing an existing
59 * buffer via buffer::raw_static.
61 * Some implementations of ObjectStore choose to implement their own
62 * form of journaling that uses the serialized form of a
63 * Transaction. This requires that the encode/decode logic properly
64 * version itself and handle version upgrades that might change the
65 * format of the encoded Transaction. This has already happened a
66 * couple of times and the Transaction object contains some helper
67 * variables that aid in this legacy decoding:
69 * sobject_encoding detects an older/simpler version of oid
70 * present in pre-bobtail versions of ceph. use_pool_override
71 * also detects a situation where the pool of an oid can be
72 * overridden for legacy operations/buffers. For non-legacy
73 * implementations of ObjectStore, neither of these fields is
77 * TRANSACTION ISOLATION
79 * Except as noted above, isolation is the responsibility of the
80 * caller. In other words, if any storage element (storage element
81 * == any of the four portions of an object as described above) is
82 * altered by a transaction (including deletion), the caller
83 * promises not to attempt to read that element while the
84 * transaction is pending (here pending means from the time of
85 * issuance until the "on_applied_sync" callback has been
86 * received). Violations of isolation need not be detected by
87 * ObjectStore and there is no corresponding error mechanism for
88 * reporting an isolation violation (crashing would be the
89 * appropriate way to report an isolation violation if detected).
91 * Enumeration operations may violate transaction isolation as
92 * described above when a storage element is being created or
93 * deleted as part of a transaction. In this case, ObjectStore is
94 * allowed to consider the enumeration operation to either precede
95 * or follow the violating transaction element. In other words, the
96 * presence/absence of the mutated element in the enumeration is
97 * entirely at the discretion of ObjectStore. The arbitrary ordering
98 * applies independently to each transaction element. For example,
99 * if a transaction contains two mutating elements "create A" and
100 * "delete B", and an enumeration operation is performed while this
101 * transaction is pending, it is permissible for ObjectStore to
102 * report any of the four possible combinations of the existence of
111 OP_CREATE
= 7, // cid, oid
112 OP_TOUCH
= 9, // cid, oid
113 OP_WRITE
= 10, // cid, oid, offset, len, bl
114 OP_ZERO
= 11, // cid, oid, offset, len
115 OP_TRUNCATE
= 12, // cid, oid, len
116 OP_REMOVE
= 13, // cid, oid
117 OP_SETATTR
= 14, // cid, oid, attrname, bl
118 OP_SETATTRS
= 15, // cid, oid, attrset
119 OP_RMATTR
= 16, // cid, oid, attrname
120 OP_CLONE
= 17, // cid, oid, newoid
121 OP_CLONERANGE
= 18, // cid, oid, newoid, offset, len
122 OP_CLONERANGE2
= 30, // cid, oid, newoid, srcoff, len, dstoff
124 OP_TRIMCACHE
= 19, // cid, oid, offset, len **DEPRECATED**
126 OP_MKCOLL
= 20, // cid
127 OP_RMCOLL
= 21, // cid
128 OP_COLL_ADD
= 22, // cid, oldcid, oid
129 OP_COLL_REMOVE
= 23, // cid, oid
130 OP_COLL_SETATTR
= 24, // cid, attrname, bl
131 OP_COLL_RMATTR
= 25, // cid, attrname
132 OP_COLL_SETATTRS
= 26, // cid, attrset
133 OP_COLL_MOVE
= 8, // newcid, oldcid, oid
135 OP_RMATTRS
= 28, // cid, oid
136 OP_COLL_RENAME
= 29, // cid, newcid
138 OP_OMAP_CLEAR
= 31, // cid
139 OP_OMAP_SETKEYS
= 32, // cid, attrset
140 OP_OMAP_RMKEYS
= 33, // cid, keyset
141 OP_OMAP_SETHEADER
= 34, // cid, header
142 OP_SPLIT_COLLECTION
= 35, // cid, bits, destination
143 OP_SPLIT_COLLECTION2
= 36, /* cid, bits, destination
144 doesn't create the destination */
145 OP_OMAP_RMKEYRANGE
= 37, // cid, oid, firstkey, lastkey
146 OP_COLL_MOVE_RENAME
= 38, // oldcid, oldoid, newcid, newoid
148 OP_SETALLOCHINT
= 39, // cid, oid, object_size, write_size
149 OP_COLL_HINT
= 40, // cid, type, bl
151 OP_TRY_RENAME
= 41, // oldcid, oldoid, newoid
153 OP_COLL_SET_BITS
= 42, // cid, bits
155 OP_MERGE_COLLECTION
= 43, // cid, destination
158 // Transaction hint type
160 COLL_HINT_EXPECTED_NUM_OBJECTS
= 1,
170 ceph_le32 dest_oid
; //OP_CLONE, OP_CLONERANGE
171 ceph_le64 dest_off
; //OP_CLONERANGE
172 ceph_le32 hint
; //OP_COLL_HINT,OP_SETALLOCHINT
173 ceph_le64 expected_object_size
; //OP_SETALLOCHINT
174 ceph_le64 expected_write_size
; //OP_SETALLOCHINT
175 ceph_le32 split_bits
; //OP_SPLIT_COLLECTION2,OP_COLL_SET_BITS,
177 ceph_le32 split_rem
; //OP_SPLIT_COLLECTION2
178 } __attribute__ ((packed
)) ;
180 struct TransactionData
{
182 ceph_le32 largest_data_len
;
183 ceph_le32 largest_data_off
;
184 ceph_le32 largest_data_off_in_data_bl
;
185 ceph_le32 fadvise_flags
;
187 TransactionData() noexcept
:
191 largest_data_off_in_data_bl(0),
194 // override default move operations to reset default values
195 TransactionData(TransactionData
&& other
) noexcept
:
197 largest_data_len(other
.largest_data_len
),
198 largest_data_off(other
.largest_data_off
),
199 largest_data_off_in_data_bl(other
.largest_data_off_in_data_bl
),
200 fadvise_flags(other
.fadvise_flags
) {
202 other
.largest_data_len
= 0;
203 other
.largest_data_off
= 0;
204 other
.largest_data_off_in_data_bl
= 0;
205 other
.fadvise_flags
= 0;
207 TransactionData
& operator=(TransactionData
&& other
) noexcept
{
209 largest_data_len
= other
.largest_data_len
;
210 largest_data_off
= other
.largest_data_off
;
211 largest_data_off_in_data_bl
= other
.largest_data_off_in_data_bl
;
212 fadvise_flags
= other
.fadvise_flags
;
214 other
.largest_data_len
= 0;
215 other
.largest_data_off
= 0;
216 other
.largest_data_off_in_data_bl
= 0;
217 other
.fadvise_flags
= 0;
221 TransactionData(const TransactionData
& other
) = default;
222 TransactionData
& operator=(const TransactionData
& other
) = default;
224 void encode(ceph::buffer::list
& bl
) const {
225 bl
.append((char*)this, sizeof(TransactionData
));
227 void decode(ceph::buffer::list::const_iterator
&bl
) {
228 bl
.copy(sizeof(TransactionData
), (char*)this);
230 } __attribute__ ((packed
)) ;
233 TransactionData data
;
235 std::map
<coll_t
, uint32_t> coll_index
;
236 std::map
<ghobject_t
, uint32_t> object_index
;
238 uint32_t coll_id
= 0;
239 uint32_t object_id
= 0;
241 ceph::buffer::list data_bl
;
242 ceph::buffer::list op_bl
;
244 std::list
<Context
*> on_applied
;
245 std::list
<Context
*> on_commit
;
246 std::list
<Context
*> on_applied_sync
;
249 Transaction() = default;
251 explicit Transaction(ceph::buffer::list::const_iterator
&dp
) {
254 explicit Transaction(ceph::buffer::list
&nbl
) {
255 auto dp
= nbl
.cbegin();
259 // override default move operations to reset default values
260 Transaction(Transaction
&& other
) noexcept
:
261 data(std::move(other
.data
)),
262 coll_index(std::move(other
.coll_index
)),
263 object_index(std::move(other
.object_index
)),
264 coll_id(other
.coll_id
),
265 object_id(other
.object_id
),
266 data_bl(std::move(other
.data_bl
)),
267 op_bl(std::move(other
.op_bl
)),
268 on_applied(std::move(other
.on_applied
)),
269 on_commit(std::move(other
.on_commit
)),
270 on_applied_sync(std::move(other
.on_applied_sync
)) {
275 Transaction
& operator=(Transaction
&& other
) noexcept
{
276 data
= std::move(other
.data
);
277 coll_index
= std::move(other
.coll_index
);
278 object_index
= std::move(other
.object_index
);
279 coll_id
= other
.coll_id
;
280 object_id
= other
.object_id
;
281 data_bl
= std::move(other
.data_bl
);
282 op_bl
= std::move(other
.op_bl
);
283 on_applied
= std::move(other
.on_applied
);
284 on_commit
= std::move(other
.on_commit
);
285 on_applied_sync
= std::move(other
.on_applied_sync
);
291 Transaction(const Transaction
& other
) = default;
292 Transaction
& operator=(const Transaction
& other
) = default;
294 // expose object_index for FileStore::Op's benefit
295 const std::map
<ghobject_t
, uint32_t>& get_object_index() const {
299 /* Operations on callback contexts */
300 void register_on_applied(Context
*c
) {
302 on_applied
.push_back(c
);
304 void register_on_commit(Context
*c
) {
306 on_commit
.push_back(c
);
308 void register_on_applied_sync(Context
*c
) {
310 on_applied_sync
.push_back(c
);
312 void register_on_complete(Context
*c
) {
314 RunOnDeleteRef
_complete (std::make_shared
<RunOnDelete
>(c
));
315 register_on_applied(new ContainerContext
<RunOnDeleteRef
>(_complete
));
316 register_on_commit(new ContainerContext
<RunOnDeleteRef
>(_complete
));
318 bool has_contexts() const {
320 !on_commit
.empty() ||
321 !on_applied
.empty() ||
322 !on_applied_sync
.empty();
325 static void collect_contexts(
326 std::vector
<Transaction
>& t
,
327 Context
**out_on_applied
,
328 Context
**out_on_commit
,
329 Context
**out_on_applied_sync
) {
330 ceph_assert(out_on_applied
);
331 ceph_assert(out_on_commit
);
332 ceph_assert(out_on_applied_sync
);
333 std::list
<Context
*> on_applied
, on_commit
, on_applied_sync
;
335 on_applied
.splice(on_applied
.end(), i
.on_applied
);
336 on_commit
.splice(on_commit
.end(), i
.on_commit
);
337 on_applied_sync
.splice(on_applied_sync
.end(), i
.on_applied_sync
);
339 *out_on_applied
= C_Contexts::list_to_context(on_applied
);
340 *out_on_commit
= C_Contexts::list_to_context(on_commit
);
341 *out_on_applied_sync
= C_Contexts::list_to_context(on_applied_sync
);
343 static void collect_contexts(
344 std::vector
<Transaction
>& t
,
345 std::list
<Context
*> *out_on_applied
,
346 std::list
<Context
*> *out_on_commit
,
347 std::list
<Context
*> *out_on_applied_sync
) {
348 ceph_assert(out_on_applied
);
349 ceph_assert(out_on_commit
);
350 ceph_assert(out_on_applied_sync
);
352 out_on_applied
->splice(out_on_applied
->end(), i
.on_applied
);
353 out_on_commit
->splice(out_on_commit
->end(), i
.on_commit
);
354 out_on_applied_sync
->splice(out_on_applied_sync
->end(),
358 static Context
*collect_all_contexts(
360 std::list
<Context
*> contexts
;
361 contexts
.splice(contexts
.end(), t
.on_applied
);
362 contexts
.splice(contexts
.end(), t
.on_commit
);
363 contexts
.splice(contexts
.end(), t
.on_applied_sync
);
364 return C_Contexts::list_to_context(contexts
);
367 Context
*get_on_applied() {
368 return C_Contexts::list_to_context(on_applied
);
370 Context
*get_on_commit() {
371 return C_Contexts::list_to_context(on_commit
);
373 Context
*get_on_applied_sync() {
374 return C_Contexts::list_to_context(on_applied_sync
);
377 void set_fadvise_flags(uint32_t flags
) {
378 data
.fadvise_flags
= flags
;
380 void set_fadvise_flag(uint32_t flag
) {
381 data
.fadvise_flags
= data
.fadvise_flags
| flag
;
/// Returns the fadvise flags stored in data.fadvise_flags. Read-only, hence const.
383 uint32_t get_fadvise_flags() const { return data
.fadvise_flags
; }
385 void swap(Transaction
& other
) noexcept
{
386 std::swap(data
, other
.data
);
387 std::swap(on_applied
, other
.on_applied
);
388 std::swap(on_commit
, other
.on_commit
);
389 std::swap(on_applied_sync
, other
.on_applied_sync
);
391 std::swap(coll_index
, other
.coll_index
);
392 std::swap(object_index
, other
.object_index
);
393 std::swap(coll_id
, other
.coll_id
);
394 std::swap(object_id
, other
.object_id
);
395 op_bl
.swap(other
.op_bl
);
396 data_bl
.swap(other
.data_bl
);
399 void _update_op(Op
* op
,
400 std::vector
<uint32_t> &cm
,
401 std::vector
<uint32_t> &om
) {
416 case OP_OMAP_SETKEYS
:
418 case OP_OMAP_RMKEYRANGE
:
419 case OP_OMAP_SETHEADER
:
423 case OP_SETALLOCHINT
:
424 ceph_assert(op
->cid
< cm
.size());
425 ceph_assert(op
->oid
< om
.size());
426 op
->cid
= cm
[op
->cid
];
427 op
->oid
= om
[op
->oid
];
432 ceph_assert(op
->cid
< cm
.size());
433 ceph_assert(op
->oid
< om
.size());
434 ceph_assert(op
->dest_oid
< om
.size());
435 op
->cid
= cm
[op
->cid
];
436 op
->oid
= om
[op
->oid
];
437 op
->dest_oid
= om
[op
->dest_oid
];
442 case OP_COLL_SETATTR
:
444 case OP_COLL_SETATTRS
:
446 case OP_COLL_SET_BITS
:
447 ceph_assert(op
->cid
< cm
.size());
448 op
->cid
= cm
[op
->cid
];
// Remap cid/dest_cid through the collection map (cm) and oid through the
// object map (om). dest_cid indexes cm below, so it is bounds-checked
// against cm.size() (it was previously checked against om.size()).
452 ceph_assert(op
->cid
< cm
.size());
453 ceph_assert(op
->oid
< om
.size());
454 ceph_assert(op
->dest_cid
< cm
.size());
455 op
->cid
= cm
[op
->cid
];
456 op
->dest_cid
= cm
[op
->dest_cid
];
457 op
->oid
= om
[op
->oid
];
460 case OP_COLL_MOVE_RENAME
:
461 ceph_assert(op
->cid
< cm
.size());
462 ceph_assert(op
->oid
< om
.size());
463 ceph_assert(op
->dest_cid
< cm
.size());
464 ceph_assert(op
->dest_oid
< om
.size());
465 op
->cid
= cm
[op
->cid
];
466 op
->oid
= om
[op
->oid
];
467 op
->dest_cid
= cm
[op
->dest_cid
];
468 op
->dest_oid
= om
[op
->dest_oid
];
472 ceph_assert(op
->cid
< cm
.size());
473 ceph_assert(op
->oid
< om
.size());
474 ceph_assert(op
->dest_oid
< om
.size());
475 op
->cid
= cm
[op
->cid
];
476 op
->oid
= om
[op
->oid
];
477 op
->dest_oid
= om
[op
->dest_oid
];
480 case OP_SPLIT_COLLECTION2
:
481 ceph_assert(op
->cid
< cm
.size());
482 ceph_assert(op
->dest_cid
< cm
.size());
483 op
->cid
= cm
[op
->cid
];
484 op
->dest_cid
= cm
[op
->dest_cid
];
487 case OP_MERGE_COLLECTION
:
488 ceph_assert(op
->cid
< cm
.size());
489 ceph_assert(op
->dest_cid
< cm
.size());
490 op
->cid
= cm
[op
->cid
];
491 op
->dest_cid
= cm
[op
->dest_cid
];
495 ceph_abort_msg("Unknown OP");
499 ceph::buffer::list
& bl
,
500 std::vector
<uint32_t> &cm
,
501 std::vector
<uint32_t> &om
) {
502 for (auto& bp
: bl
.buffers()) {
503 ceph_assert(bp
.length() % sizeof(Op
) == 0);
505 char* raw_p
= const_cast<char*>(bp
.c_str());
506 char* raw_end
= raw_p
+ bp
.length();
507 while (raw_p
< raw_end
) {
508 _update_op(reinterpret_cast<Op
*>(raw_p
), cm
, om
);
513 /// Append the operations of the parameter to this Transaction. Those operations are removed from the parameter Transaction
514 void append(Transaction
& other
) {
516 data
.ops
= data
.ops
+ other
.data
.ops
;
517 if (other
.data
.largest_data_len
> data
.largest_data_len
) {
518 data
.largest_data_len
= other
.data
.largest_data_len
;
519 data
.largest_data_off
= other
.data
.largest_data_off
;
520 data
.largest_data_off_in_data_bl
= data_bl
.length() + other
.data
.largest_data_off_in_data_bl
;
522 data
.fadvise_flags
= data
.fadvise_flags
| other
.data
.fadvise_flags
;
523 on_applied
.splice(on_applied
.end(), other
.on_applied
);
524 on_commit
.splice(on_commit
.end(), other
.on_commit
);
525 on_applied_sync
.splice(on_applied_sync
.end(), other
.on_applied_sync
);
527 //append coll_index & object_index
528 std::vector
<uint32_t> cm(other
.coll_index
.size());
529 std::map
<coll_t
, uint32_t>::iterator coll_index_p
;
530 for (coll_index_p
= other
.coll_index
.begin();
531 coll_index_p
!= other
.coll_index
.end();
533 cm
[coll_index_p
->second
] = _get_coll_id(coll_index_p
->first
);
536 std::vector
<uint32_t> om(other
.object_index
.size());
537 std::map
<ghobject_t
, uint32_t>::iterator object_index_p
;
538 for (object_index_p
= other
.object_index
.begin();
539 object_index_p
!= other
.object_index
.end();
541 om
[object_index_p
->second
] = _get_object_id(object_index_p
->first
);
544 //the other.op_bl SHOULD NOT be changed during append operation,
545 //we use additional ceph::buffer::list to avoid this problem
546 ceph::buffer::list other_op_bl
;
548 ceph::buffer::ptr
other_op_bl_ptr(other
.op_bl
.length());
549 other
.op_bl
.begin().copy(other
.op_bl
.length(), other_op_bl_ptr
.c_str());
550 other_op_bl
.append(std::move(other_op_bl_ptr
));
553 //update other_op_bl with cm & om
554 //When the other is appended to current transaction, all coll_index and
555 //object_index in other.op_buffer should be updated by new index of the
556 //combined transaction
557 _update_op_bl(other_op_bl
, cm
, om
);
560 op_bl
.append(other_op_bl
);
562 data_bl
.append(other
.data_bl
);
565 /** Inquires about the Transaction as a whole. */
567 /// How big is the encoded Transaction buffer?
568 uint64_t get_encoded_bytes() {
569 //layout: data_bl + op_bl + coll_index + object_index + data
571 // coll_index size, object_index size and sizeof(transaction_data)
572 // all here, so they may be computed at compile-time
573 size_t final_size
= sizeof(__u32
) * 2 + sizeof(data
);
575 // coll_index second and object_index second
576 final_size
+= (coll_index
.size() + object_index
.size()) * sizeof(__u32
);
579 for (auto p
= coll_index
.begin(); p
!= coll_index
.end(); ++p
) {
580 final_size
+= p
->first
.encoded_size();
583 // object_index first
584 for (auto p
= object_index
.begin(); p
!= object_index
.end(); ++p
) {
585 final_size
+= p
->first
.encoded_size();
588 return data_bl
.length() +
593 /// Retain old version for regression testing purposes
594 uint64_t get_encoded_bytes_test() {
596 //layout: data_bl + op_bl + coll_index + object_index + data
597 ceph::buffer::list bl
;
598 encode(coll_index
, bl
);
599 encode(object_index
, bl
);
601 return data_bl
.length() +
607 uint64_t get_num_bytes() {
608 return get_encoded_bytes();
610 /// Size of largest data buffer to the "write" operation encountered so far
611 uint32_t get_data_length() {
612 return data
.largest_data_len
;
614 /// offset within the encoded buffer to the start of the largest data buffer that's encoded
615 uint32_t get_data_offset() {
616 if (data
.largest_data_off_in_data_bl
) {
617 return data
.largest_data_off_in_data_bl
+
618 sizeof(__u8
) + // encode struct_v
619 sizeof(__u8
) + // encode compat_v
620 sizeof(__u32
) + // encode len
621 sizeof(__u32
); // data_bl len
625 /// offset of buffer as aligned to destination within object.
626 int get_data_alignment() {
627 if (!data
.largest_data_len
)
629 return (0 - get_data_offset()) & ~CEPH_PAGE_MASK
;
631 /// Is the Transaction empty (no operations)
635 /// Number of operations in the transaction
643 * Helper object to parse Transactions.
645 * ObjectStore instances use this object to step down the encoded
646 * buffer decoding operation codes and parameters as we go.
655 ceph::buffer::list::const_iterator data_bl_p
;
658 std::vector
<coll_t
> colls
;
659 std::vector
<ghobject_t
> objects
;
662 explicit iterator(Transaction
*t
)
664 data_bl_p(t
->data_bl
.cbegin()),
665 colls(t
->coll_index
.size()),
666 objects(t
->object_index
.size()) {
669 op_buffer_p
= t
->op_bl
.c_str();
671 std::map
<coll_t
, uint32_t>::iterator coll_index_p
;
672 for (coll_index_p
= t
->coll_index
.begin();
673 coll_index_p
!= t
->coll_index
.end();
675 colls
[coll_index_p
->second
] = coll_index_p
->first
;
678 std::map
<ghobject_t
, uint32_t>::iterator object_index_p
;
679 for (object_index_p
= t
->object_index
.begin();
680 object_index_p
!= t
->object_index
.end();
682 objects
[object_index_p
->second
] = object_index_p
->first
;
686 friend class Transaction
;
694 ceph_assert(ops
> 0);
696 Op
* op
= reinterpret_cast<Op
*>(op_buffer_p
);
697 op_buffer_p
+= sizeof(Op
);
702 std::string
decode_string() {
705 decode(s
, data_bl_p
);
708 void decode_bp(ceph::buffer::ptr
& bp
) {
710 decode(bp
, data_bl_p
);
712 void decode_bl(ceph::buffer::list
& bl
) {
714 decode(bl
, data_bl_p
);
716 void decode_attrset(std::map
<std::string
,ceph::buffer::ptr
>& aset
) {
718 decode(aset
, data_bl_p
);
720 void decode_attrset(std::map
<std::string
,ceph::buffer::list
>& aset
) {
722 decode(aset
, data_bl_p
);
724 void decode_attrset_bl(ceph::buffer::list
*pbl
) {
725 decode_str_str_map_to_bl(data_bl_p
, pbl
);
727 void decode_keyset(std::set
<std::string
> &keys
){
729 decode(keys
, data_bl_p
);
731 void decode_keyset_bl(ceph::buffer::list
*pbl
){
732 decode_str_set_to_bl(data_bl_p
, pbl
);
735 const ghobject_t
&get_oid(uint32_t oid_id
) {
736 ceph_assert(oid_id
< objects
.size());
737 return objects
[oid_id
];
739 const coll_t
&get_cid(uint32_t cid_id
) {
740 ceph_assert(cid_id
< colls
.size());
741 return colls
[cid_id
];
743 uint32_t get_fadvise_flags() const {
744 return t
->get_fadvise_flags();
747 const std::vector
<ghobject_t
> &get_objects() const {
753 return iterator(this);
757 void _build_actions_from_tbl();
760 * Helper functions to encode the various mutation elements of a
761 * transaction. These are 1:1 with the operation codes (see
762 * enumeration above). These routines ensure that the
763 * encoder/creator of a transaction gets the right data in the
764 * right place. Sadly, there's no corresponding version nor any
765 * form of seat belts for the decoder.
768 if (op_bl
.get_append_buffer_unused_tail_length() < sizeof(Op
)) {
769 op_bl
.reserve(sizeof(Op
) * OPS_PER_PTR
);
771 // append_hole ensures bptr merging. Even huge number of ops
772 // shouldn't result in overpopulating bl::_buffers.
773 char* const p
= op_bl
.append_hole(sizeof(Op
)).c_str();
774 memset(p
, 0, sizeof(Op
));
775 return reinterpret_cast<Op
*>(p
);
777 uint32_t _get_coll_id(const coll_t
& coll
) {
778 std::map
<coll_t
, uint32_t>::iterator c
= coll_index
.find(coll
);
779 if (c
!= coll_index
.end())
782 uint32_t index_id
= coll_id
++;
783 coll_index
[coll
] = index_id
;
786 uint32_t _get_object_id(const ghobject_t
& oid
) {
787 std::map
<ghobject_t
, uint32_t>::iterator o
= object_index
.find(oid
);
788 if (o
!= object_index
.end())
791 uint32_t index_id
= object_id
++;
792 object_index
[oid
] = index_id
;
799 Op
* _op
= _get_next_op();
801 data
.ops
= data
.ops
+ 1;
806 * create an object that does not yet exist
807 * (behavior is undefined if the object already exists)
809 void create(const coll_t
& cid
, const ghobject_t
& oid
) {
810 Op
* _op
= _get_next_op();
812 _op
->cid
= _get_coll_id(cid
);
813 _op
->oid
= _get_object_id(oid
);
814 data
.ops
= data
.ops
+ 1;
819 * Ensure the existence of an object in a collection. Create an
820 * empty object if necessary
822 void touch(const coll_t
& cid
, const ghobject_t
& oid
) {
823 Op
* _op
= _get_next_op();
825 _op
->cid
= _get_coll_id(cid
);
826 _op
->oid
= _get_object_id(oid
);
827 data
.ops
= data
.ops
+ 1;
830 * Write data to an offset within an object. If the object is too
831 * small, it is expanded as needed. It is possible to specify an
832 * offset beyond the current end of an object and it will be
833 * expanded as needed. Simple implementations of ObjectStore will
834 * just zero the data between the old end of the object and the
835 * newly provided data. More sophisticated implementations of
836 * ObjectStore will omit the untouched data and store it as a
837 * "hole" in the file.
839 * Note that a 0-length write does not affect the size of the object.
841 void write(const coll_t
& cid
, const ghobject_t
& oid
, uint64_t off
, uint64_t len
,
842 const ceph::buffer::list
& write_data
, uint32_t flags
= 0) {
844 uint32_t orig_len
= data_bl
.length();
845 Op
* _op
= _get_next_op();
847 _op
->cid
= _get_coll_id(cid
);
848 _op
->oid
= _get_object_id(oid
);
851 encode(write_data
, data_bl
);
853 ceph_assert(len
== write_data
.length());
854 data
.fadvise_flags
= data
.fadvise_flags
| flags
;
855 if (write_data
.length() > data
.largest_data_len
) {
856 data
.largest_data_len
= write_data
.length();
857 data
.largest_data_off
= off
;
858 data
.largest_data_off_in_data_bl
= orig_len
+ sizeof(__u32
); // we are about to
860 data
.ops
= data
.ops
+ 1;
863 * zero out the indicated byte range within an object. Some
864 * ObjectStore instances may optimize this to release the
865 * underlying storage space.
867 * If the zero range extends beyond the end of the object, the object
868 * size is extended, just as if we were writing a buffer full of zeros.
869 * EXCEPT if the length is 0, in which case (just like a 0-length write)
870 * we do not adjust the object size.
872 void zero(const coll_t
& cid
, const ghobject_t
& oid
, uint64_t off
, uint64_t len
) {
873 Op
* _op
= _get_next_op();
875 _op
->cid
= _get_coll_id(cid
);
876 _op
->oid
= _get_object_id(oid
);
879 data
.ops
= data
.ops
+ 1;
881 /// Discard all data in the object beyond the specified size.
882 void truncate(const coll_t
& cid
, const ghobject_t
& oid
, uint64_t off
) {
883 Op
* _op
= _get_next_op();
884 _op
->op
= OP_TRUNCATE
;
885 _op
->cid
= _get_coll_id(cid
);
886 _op
->oid
= _get_object_id(oid
);
888 data
.ops
= data
.ops
+ 1;
890 /// Remove an object. All four parts of the object are removed.
891 void remove(const coll_t
& cid
, const ghobject_t
& oid
) {
892 Op
* _op
= _get_next_op();
894 _op
->cid
= _get_coll_id(cid
);
895 _op
->oid
= _get_object_id(oid
);
896 data
.ops
= data
.ops
+ 1;
898 /// Set an xattr of an object
899 void setattr(const coll_t
& cid
, const ghobject_t
& oid
, const char* name
, ceph::buffer::list
& val
) {
901 setattr(cid
, oid
, n
, val
);
903 /// Set an xattr of an object
904 void setattr(const coll_t
& cid
, const ghobject_t
& oid
, const std::string
& s
, ceph::buffer::list
& val
) {
906 Op
* _op
= _get_next_op();
907 _op
->op
= OP_SETATTR
;
908 _op
->cid
= _get_coll_id(cid
);
909 _op
->oid
= _get_object_id(oid
);
911 encode(val
, data_bl
);
912 data
.ops
= data
.ops
+ 1;
914 /// Set multiple xattrs of an object
915 void setattrs(const coll_t
& cid
,
916 const ghobject_t
& oid
,
917 const std::map
<std::string
,ceph::buffer::ptr
,std::less
<>>& attrset
) {
919 Op
* _op
= _get_next_op();
920 _op
->op
= OP_SETATTRS
;
921 _op
->cid
= _get_coll_id(cid
);
922 _op
->oid
= _get_object_id(oid
);
923 encode(attrset
, data_bl
);
924 data
.ops
= data
.ops
+ 1;
926 /// Set multiple xattrs of an object
927 void setattrs(const coll_t
& cid
,
928 const ghobject_t
& oid
,
929 const std::map
<std::string
,ceph::buffer::list
,std::less
<>>& attrset
) {
931 Op
* _op
= _get_next_op();
932 _op
->op
= OP_SETATTRS
;
933 _op
->cid
= _get_coll_id(cid
);
934 _op
->oid
= _get_object_id(oid
);
935 encode(attrset
, data_bl
);
936 data
.ops
= data
.ops
+ 1;
938 /// remove an xattr from an object
939 void rmattr(const coll_t
& cid
, const ghobject_t
& oid
, const char *name
) {
943 /// remove an xattr from an object
944 void rmattr(const coll_t
& cid
, const ghobject_t
& oid
, const std::string
& s
) {
946 Op
* _op
= _get_next_op();
948 _op
->cid
= _get_coll_id(cid
);
949 _op
->oid
= _get_object_id(oid
);
951 data
.ops
= data
.ops
+ 1;
953 /// remove all xattrs from an object
954 void rmattrs(const coll_t
& cid
, const ghobject_t
& oid
) {
955 Op
* _op
= _get_next_op();
956 _op
->op
= OP_RMATTRS
;
957 _op
->cid
= _get_coll_id(cid
);
958 _op
->oid
= _get_object_id(oid
);
959 data
.ops
= data
.ops
+ 1;
962 * Clone an object into another object.
964 * Low-cost (e.g., O(1)) cloning (if supported) is best, but
965 * fallback to an O(n) copy is allowed. All four parts of the
966 * object are cloned (data, xattrs, omap header, omap
969 * The destination named object may already exist, in
970 * which case its previous contents are discarded.
972 void clone(const coll_t
& cid
, const ghobject_t
& oid
,
973 const ghobject_t
& noid
) {
974 Op
* _op
= _get_next_op();
976 _op
->cid
= _get_coll_id(cid
);
977 _op
->oid
= _get_object_id(oid
);
978 _op
->dest_oid
= _get_object_id(noid
);
979 data
.ops
= data
.ops
+ 1;
982 * Clone a byte range from one object to another.
984 * The data portion of the destination object receives a copy of a
985 * portion of the data from the source object. None of the other
986 * three parts of an object is copied from the source.
988 * The destination object size may be extended to the dstoff + len.
990 * The source range *must* overlap with the source object data. If it does
991 * not the result is undefined.
993 void clone_range(const coll_t
& cid
, const ghobject_t
& oid
,
994 const ghobject_t
& noid
,
995 uint64_t srcoff
, uint64_t srclen
, uint64_t dstoff
) {
996 Op
* _op
= _get_next_op();
997 _op
->op
= OP_CLONERANGE2
;
998 _op
->cid
= _get_coll_id(cid
);
999 _op
->oid
= _get_object_id(oid
);
1000 _op
->dest_oid
= _get_object_id(noid
);
1003 _op
->dest_off
= dstoff
;
1004 data
.ops
= data
.ops
+ 1;
1007 /// Create the collection
1008 void create_collection(const coll_t
& cid
, int bits
) {
1009 Op
* _op
= _get_next_op();
1010 _op
->op
= OP_MKCOLL
;
1011 _op
->cid
= _get_coll_id(cid
);
1012 _op
->split_bits
= bits
;
1013 data
.ops
= data
.ops
+ 1;
1017 * Give the collection a hint.
1019 * @param cid - collection id.
1020 * @param type - hint type.
1021 * @param hint - the hint payload, which contains the customized
1022 * data along with the hint type.
1024 void collection_hint(const coll_t
& cid
, uint32_t type
, const ceph::buffer::list
& hint
) {
1026 Op
* _op
= _get_next_op();
1027 _op
->op
= OP_COLL_HINT
;
1028 _op
->cid
= _get_coll_id(cid
);
1030 encode(hint
, data_bl
);
1031 data
.ops
= data
.ops
+ 1;
1034 /// remove the collection, the collection must be empty
1035 void remove_collection(const coll_t
& cid
) {
1036 Op
* _op
= _get_next_op();
1037 _op
->op
= OP_RMCOLL
;
1038 _op
->cid
= _get_coll_id(cid
);
1039 data
.ops
= data
.ops
+ 1;
1041 void collection_move(const coll_t
& cid
, const coll_t
&oldcid
, const ghobject_t
& oid
)
1042 __attribute__ ((deprecated
)) {
1043 // NOTE: we encode this as a fixed combo of ADD + REMOVE. they
1044 // always appear together, so this is effectively a single MOVE.
1045 Op
* _op
= _get_next_op();
1046 _op
->op
= OP_COLL_ADD
;
1047 _op
->cid
= _get_coll_id(oldcid
);
1048 _op
->oid
= _get_object_id(oid
);
1049 _op
->dest_cid
= _get_coll_id(cid
);
1050 data
.ops
= data
.ops
+ 1;
1052 _op
= _get_next_op();
1053 _op
->op
= OP_COLL_REMOVE
;
1054 _op
->cid
= _get_coll_id(oldcid
);
1055 _op
->oid
= _get_object_id(oid
);
1056 data
.ops
= data
.ops
+ 1;
1058 void collection_move_rename(const coll_t
& oldcid
, const ghobject_t
& oldoid
,
1059 const coll_t
&cid
, const ghobject_t
& oid
) {
1060 Op
* _op
= _get_next_op();
1061 _op
->op
= OP_COLL_MOVE_RENAME
;
1062 _op
->cid
= _get_coll_id(oldcid
);
1063 _op
->oid
= _get_object_id(oldoid
);
1064 _op
->dest_cid
= _get_coll_id(cid
);
1065 _op
->dest_oid
= _get_object_id(oid
);
1066 data
.ops
= data
.ops
+ 1;
1068 void try_rename(const coll_t
&cid
, const ghobject_t
& oldoid
,
1069 const ghobject_t
& oid
) {
1070 Op
* _op
= _get_next_op();
1071 _op
->op
= OP_TRY_RENAME
;
1072 _op
->cid
= _get_coll_id(cid
);
1073 _op
->oid
= _get_object_id(oldoid
);
1074 _op
->dest_oid
= _get_object_id(oid
);
1075 data
.ops
= data
.ops
+ 1;
1078 /// Remove omap from oid
///
/// Queues an OP_OMAP_CLEAR op for object `oid` in collection `cid`; no
/// payload is written to data_bl. data.ops is bumped by one.
1080 const coll_t
&cid
, ///< [in] Collection containing oid
1081 const ghobject_t
&oid
///< [in] Object from which to remove omap
1083 Op
* _op
= _get_next_op();
1084 _op
->op
= OP_OMAP_CLEAR
;
1085 _op
->cid
= _get_coll_id(cid
);
1086 _op
->oid
= _get_object_id(oid
);
1087 data
.ops
= data
.ops
+ 1;
1089 /// Set keys on oid omap. Replaces duplicate keys.
///
/// Queues an OP_OMAP_SETKEYS op for object `oid` in collection `cid` and
/// encodes the whole `attrset` map into data_bl. data.ops is bumped by one.
1091 const coll_t
& cid
, ///< [in] Collection containing oid
1092 const ghobject_t
&oid
, ///< [in] Object to update
1093 const std::map
<std::string
, ceph::buffer::list
> &attrset
///< [in] Replacement keys and values
1096 Op
* _op
= _get_next_op();
1097 _op
->op
= OP_OMAP_SETKEYS
;
1098 _op
->cid
= _get_coll_id(cid
);
1099 _op
->oid
= _get_object_id(oid
);
1100 encode(attrset
, data_bl
);
1101 data
.ops
= data
.ops
+ 1;
1104 /// Set keys on an oid omap (ceph::buffer::list variant).
///
/// Queues an OP_OMAP_SETKEYS op for object `oid` in collection `cid`.
/// `attrset_bl` is appended to data_bl as-is; no encode() call is made here.
/// NOTE(review): presumably the caller supplies bytes in the same form that
/// encoding a std::map<std::string, ceph::buffer::list> would produce --
/// confirm against callers.
1106 const coll_t
&cid
, ///< [in] Collection containing oid
1107 const ghobject_t
&oid
, ///< [in] Object to update
1108 const ceph::buffer::list
&attrset_bl
///< [in] Replacement keys and values
1110 Op
* _op
= _get_next_op();
1111 _op
->op
= OP_OMAP_SETKEYS
;
1112 _op
->cid
= _get_coll_id(cid
);
1113 _op
->oid
= _get_object_id(oid
);
1114 data_bl
.append(attrset_bl
);
1115 data
.ops
= data
.ops
+ 1;
1118 /// Remove keys from oid omap
///
/// Queues an OP_OMAP_RMKEYS op for object `oid` in collection `cid` and
/// encodes the `keys` set into data_bl. data.ops is bumped by one.
1120 const coll_t
&cid
, ///< [in] Collection containing oid
1121 const ghobject_t
&oid
, ///< [in] Object from which to remove the omap
1122 const std::set
<std::string
> &keys
///< [in] Keys to clear
1125 Op
* _op
= _get_next_op();
1126 _op
->op
= OP_OMAP_RMKEYS
;
1127 _op
->cid
= _get_coll_id(cid
);
1128 _op
->oid
= _get_object_id(oid
);
1129 encode(keys
, data_bl
);
1130 data
.ops
= data
.ops
+ 1;
1133 /// Remove key from oid omap
///
/// Queues an OP_OMAP_RMKEYS op (the same opcode as the multi-key variants)
/// for object `oid` in collection `cid`, with a payload of a uint32_t count
/// of 1 followed by the encoded `key`. data.ops is bumped by one.
1135 const coll_t
&cid
, ///< [in] Collection containing oid
1136 const ghobject_t
&oid
, ///< [in] Object from which to remove the omap
1137 const std::string
& key
///< [in] Key to clear
1139 Op
* _op
= _get_next_op();
1140 _op
->op
= OP_OMAP_RMKEYS
;
1141 _op
->cid
= _get_coll_id(cid
);
1142 _op
->oid
= _get_object_id(oid
);
// Hand-written payload: element count (1), then the single key.
// NOTE(review): presumably chosen to match the encoding of a one-element
// std::set<std::string> -- confirm against the set encoder.
1144 encode((uint32_t)1, data_bl
);
1145 encode(key
, data_bl
);
1146 data
.ops
= data
.ops
+ 1;
1149 /// Remove keys from oid omap
///
/// Buffer-list variant: queues an OP_OMAP_RMKEYS op for object `oid` in
/// collection `cid` and appends `keys_bl` to data_bl as-is (no encode() call
/// is made here). data.ops is bumped by one.
/// NOTE(review): presumably `keys_bl` already holds an encoded key set --
/// confirm against callers.
1151 const coll_t
&cid
, ///< [in] Collection containing oid
1152 const ghobject_t
&oid
, ///< [in] Object from which to remove the omap
1153 const ceph::buffer::list
&keys_bl
///< [in] Keys to clear
1155 Op
* _op
= _get_next_op();
1156 _op
->op
= OP_OMAP_RMKEYS
;
1157 _op
->cid
= _get_coll_id(cid
);
1158 _op
->oid
= _get_object_id(oid
);
1159 data_bl
.append(keys_bl
);
1160 data
.ops
= data
.ops
+ 1;
1163 /// Remove key range from oid omap
///
/// Queues an OP_OMAP_RMKEYRANGE op for object `oid` in collection `cid`.
/// The payload written to data_bl is `first` followed by `last`, each
/// encoded separately and in that order. data.ops is bumped by one.
1164 void omap_rmkeyrange(
1165 const coll_t
&cid
, ///< [in] Collection containing oid
1166 const ghobject_t
&oid
, ///< [in] Object from which to remove the omap keys
1167 const std::string
& first
, ///< [in] first key in range
1168 const std::string
& last
///< [in] first key past range, range is [first,last)
1171 Op
* _op
= _get_next_op();
1172 _op
->op
= OP_OMAP_RMKEYRANGE
;
1173 _op
->cid
= _get_coll_id(cid
);
1174 _op
->oid
= _get_object_id(oid
);
1175 encode(first
, data_bl
);
1176 encode(last
, data_bl
);
1177 data
.ops
= data
.ops
+ 1;
1180 /// Remove key range from oid omap
1181 void omap_rmkeyrange(
1182 const coll_t cid
, ///< [in] Collection containing oid
1183 const ghobject_t
&oid
, ///< [in] Object from which to remove the omap keys
1184 const bufferlist
&keys_bl
///< [in] range of keys to clear
1186 Op
* _op
= _get_next_op();
1187 _op
->op
= OP_OMAP_RMKEYRANGE
;
1188 _op
->cid
= _get_coll_id(cid
);
1189 _op
->oid
= _get_object_id(oid
);
1190 data_bl
.append(keys_bl
);
1191 data
.ops
= data
.ops
+ 1;
/// Queue an OP_OMAP_SETHEADER op for object `oid` in collection `cid`.
///
/// The header value `bl` is encoded into data_bl (via encode(), not a raw
/// append). data.ops is bumped by one.
1195 void omap_setheader(
1196 const coll_t
&cid
, ///< [in] Collection containing oid
1197 const ghobject_t
&oid
, ///< [in] Object
1198 const ceph::buffer::list
&bl
///< [in] Header value
1201 Op
* _op
= _get_next_op();
1202 _op
->op
= OP_OMAP_SETHEADER
;
1203 _op
->cid
= _get_coll_id(cid
);
1204 _op
->oid
= _get_object_id(oid
);
1205 encode(bl
, data_bl
);
1206 data
.ops
= data
.ops
+ 1;
1209 /// Split collection based on given prefixes, objects matching the specified bits/rem are
1210 /// moved to the new collection
///
/// Queues an OP_SPLIT_COLLECTION2 op with cid = id of `cid`, dest_cid = id of
/// `destination`, split_bits = `bits` and split_rem = `rem`. data.ops is
/// bumped by one.
/// NOTE(review): `cid`, `bits` and `rem` are used in the body but are not
/// declared in the parameter list visible here -- confirm the signature.
1211 void split_collection(
1215 const coll_t
&destination
) {
1216 Op
* _op
= _get_next_op();
1217 _op
->op
= OP_SPLIT_COLLECTION2
;
1218 _op
->cid
= _get_coll_id(cid
);
1219 _op
->dest_cid
= _get_coll_id(destination
);
1220 _op
->split_bits
= bits
;
1221 _op
->split_rem
= rem
;
1222 data
.ops
= data
.ops
+ 1;
1225 /// Merge collection into another.
///
/// Queues an OP_MERGE_COLLECTION op with cid = id of `cid`, dest_cid = id of
/// `destination` and split_bits = `bits` (the split_bits field is reused to
/// carry the bit count for this op). data.ops is bumped by one.
/// NOTE(review): `cid`, `destination` and `bits` are used in the body but
/// their parameter declarations are not visible here -- confirm the signature.
1226 void merge_collection(
1230 Op
* _op
= _get_next_op();
1231 _op
->op
= OP_MERGE_COLLECTION
;
1232 _op
->cid
= _get_coll_id(cid
);
1233 _op
->dest_cid
= _get_coll_id(destination
);
1234 _op
->split_bits
= bits
;
1235 data
.ops
= data
.ops
+ 1;
/// Queue an OP_COLL_SET_BITS op for collection `cid`, storing `bits` in the
/// op's split_bits field. data.ops is bumped by one.
/// NOTE(review): `cid` and `bits` are used in the body but their parameter
/// declarations are not visible here -- confirm the signature.
1238 void collection_set_bits(
1241 Op
* _op
= _get_next_op();
1242 _op
->op
= OP_COLL_SET_BITS
;
1243 _op
->cid
= _get_coll_id(cid
);
1244 _op
->split_bits
= bits
;
1245 data
.ops
= data
.ops
+ 1;
1248 /// Set allocation hint for an object
1249 /// make 0 values(expected_object_size, expected_write_size) noops for all implementations
///
/// Queues an OP_SETALLOCHINT op for object `oid` in collection `cid`, copying
/// `expected_object_size` and `expected_write_size` into the op's fields of
/// the same names. data.ops is bumped by one. The zero-value rule above is
/// not enforced in this function; values are stored as given.
/// NOTE(review): `cid` is used in the body but not declared in the parameter
/// list visible here, and the trailing comma after `expected_write_size`
/// implies a further parameter that is also not visible -- confirm the
/// signature and whether that parameter is stored in the op.
1250 void set_alloc_hint(
1252 const ghobject_t
&oid
,
1253 uint64_t expected_object_size
,
1254 uint64_t expected_write_size
,
1257 Op
* _op
= _get_next_op();
1258 _op
->op
= OP_SETALLOCHINT
;
1259 _op
->cid
= _get_coll_id(cid
);
1260 _op
->oid
= _get_object_id(oid
);
1261 _op
->expected_object_size
= expected_object_size
;
1262 _op
->expected_write_size
= expected_write_size
;
1264 data
.ops
= data
.ops
+ 1;
/// Serialize this transaction into `bl`.
///
/// Opens a versioned section with ENCODE_START(9, 9, bl) and then encodes
/// data_bl, coll_index and object_index into `bl`, in that order.
/// NOTE(review): the layout comment below also lists op_bl and data, but no
/// encode call for either is visible here -- confirm they are written in the
/// stated order.
1267 void encode(ceph::buffer::list
& bl
) const {
1268 //layout: data_bl + op_bl + coll_index + object_index + data
1269 ENCODE_START(9, 9, bl
);
1270 encode(data_bl
, bl
);
1272 encode(coll_index
, bl
);
1273 encode(object_index
, bl
);
/// Deserialize a transaction from the iterator `bl`.
///
/// Opens a versioned section with DECODE_START(9, bl), decodes data_bl,
/// coll_index and object_index in that order, then sets coll_id and
/// object_id to the sizes of the decoded coll_index and object_index.
/// NOTE(review): no decode of op_bl or data is visible here -- confirm they
/// are read in the positions the encoder writes them.
1278 void decode(ceph::buffer::list::const_iterator
&bl
) {
1279 DECODE_START(9, bl
);
1282 decode(data_bl
, bl
);
1284 decode(coll_index
, bl
);
1285 decode(object_index
, bl
);
// Derive the id counters from the sizes of the indexes just decoded.
1287 coll_id
= coll_index
.size();
1288 object_id
= object_index
.size();
/// Declaration only; the definition is not in this part of the file.
/// NOTE(review): presumably writes a description of this transaction through
/// the formatter `f` -- confirm against the definition.
1293 void dump(ceph::Formatter
*f
);
/// Declaration only; the definition is not in this part of the file.
/// NOTE(review): presumably fills `o` with sample Transaction objects for
/// encoding tests; ownership of the pointers is not visible here -- confirm
/// against the definition.
1294 static void generate_test_instances(std::list
<Transaction
*>& o
);
// NOTE(review): WRITE_CLASS_ENCODER is defined elsewhere; presumably it
// generates free encode()/decode() wrappers that forward to the member
// functions of the named class -- confirm against the macro definition.
1296 WRITE_CLASS_ENCODER(Transaction
)
1297 WRITE_CLASS_ENCODER(Transaction::TransactionData
)
/// Stream-insertion operator for Transaction (declaration only; the
/// definition and the output format are not visible in this part of the file).
1299 std::ostream
& operator<<(std::ostream
& out
, const Transaction
& tx
);