1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2016 Red Hat
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
14 #ifndef PGTRANSACTION_H
15 #define PGTRANSACTION_H
21 #include "common/hobject.h"
22 #include "osd/osd_types.h"
23 #include "osd/osd_internal_types.h"
24 #include "common/interval_map.h"
25 #include "common/inline_variant.h"
28 * This class represents transactions which can be submitted to
29 * a PGBackend. For expediency, there are some constraints on
30 * the operations submitted:
31 * 1) Rename sources may only be referenced prior to the rename
32 * operation to the destination.
33 * 2) The graph formed by edges of source->destination for clones
34 * (Create) and Renames must be acyclic.
35 * 3) clone_range sources must not be modified by the same
40 std::map
<hobject_t
, ObjectContextRef
> obc_map
;
42 class ObjectOperation
{
52 hobject_t source
; // must be temp object
55 using InitType
= boost::variant
<
61 InitType init_type
= Init::None();
62 bool delete_first
= false;
65 * is_none() && is_delete() indicates that we are deleting an
66 * object which already exists and not recreating it. delete_first means
67 * that the transaction logically removes the object.
69 * There are really 4 cases:
71 * 1) We are modifying an existing object (is_none() &&
73 * a) If it's an append, we just write into the log entry the old size
74 * b) If it's an actual overwrite, we save the old versions of the
75 * extents being overwritten and write those offsets into the log
77 * 2) We are removing and then recreating an object (!is_none() && is_delete())
79 * 3) We are removing an object (is_none() && is_delete()) -- stash
80 * 4) We are creating an object (!is_none() && !is_delete()) -- create (no
83 * Create, Clone, Rename are the three ways we can recreate it.
84 * ECBackend transaction planning needs this context
85 * to figure out how to perform the transaction.
87 bool deletes_first() const {
90 bool is_delete() const {
91 return boost::get
<Init::None
>(&init_type
) != nullptr && delete_first
;
93 bool is_none() const {
94 return boost::get
<Init::None
>(&init_type
) != nullptr && !delete_first
;
96 bool is_fresh_object() const {
97 return boost::get
<Init::None
>(&init_type
) == nullptr;
99 bool is_rename() const {
100 return boost::get
<Init::Rename
>(&init_type
) != nullptr;
102 bool has_source(hobject_t
*source
= nullptr) const {
105 [&](const Init::Clone
&op
) -> bool {
110 [&](const Init::Rename
&op
) -> bool {
115 [&](const Init::None
&) -> bool { return false; },
116 [&](const Init::Create
&) -> bool { return false; });
119 bool clear_omap
= false;
125 * truncate is represented as a pair because in the event of
126 * multiple truncates within a single transaction we need to
127 * remember the lowest truncate and the final object size
128 * (the last truncate). We also adjust the buffers map
129 * to account for truncates overriding previous writes */
130 std::optional
<std::pair
<uint64_t, uint64_t> > truncate
= std::nullopt
;
132 std::map
<std::string
, std::optional
<ceph::buffer::list
> > attr_updates
;
134 enum class OmapUpdateType
{Remove
, Insert
, RemoveRange
};
135 std::vector
<std::pair
<OmapUpdateType
, ceph::buffer::list
> > omap_updates
;
137 std::optional
<ceph::buffer::list
> omap_header
;
139 /// (old, new) -- only valid with no truncate or buffer updates
140 std::optional
<std::pair
<std::set
<snapid_t
>, std::set
<snapid_t
>>> updated_snaps
;
142 struct alloc_hint_t
{
143 uint64_t expected_object_size
;
144 uint64_t expected_write_size
;
147 std::optional
<alloc_hint_t
> alloc_hint
;
149 struct BufferUpdate
{
151 ceph::buffer::list buffer
;
152 uint32_t fadvise_flags
;
163 using BufferUpdateType
= boost::variant
<
166 BufferUpdate::CloneRange
>;
170 BufferUpdateType
split(
173 const BufferUpdateType
&bu
) const {
176 [&](const BufferUpdate::Write
&w
) -> BufferUpdateType
{
177 ceph::buffer::list bl
;
178 bl
.substr_of(w
.buffer
, offset
, len
);
179 return BufferUpdate::Write
{bl
, w
.fadvise_flags
};
181 [&](const BufferUpdate::Zero
&) -> BufferUpdateType
{
182 return BufferUpdate::Zero
{len
};
184 [&](const BufferUpdate::CloneRange
&c
) -> BufferUpdateType
{
185 return BufferUpdate::CloneRange
{c
.from
, c
.offset
+ offset
, len
};
189 const BufferUpdateType
&left
) const {
192 [&](const BufferUpdate::Write
&w
) -> uint64_t {
193 return w
.buffer
.length();
195 [&](const BufferUpdate::Zero
&z
) -> uint64_t {
198 [&](const BufferUpdate::CloneRange
&c
) -> uint64_t {
203 const BufferUpdateType
&left
,
204 const BufferUpdateType
&right
) const {
207 [&](const BufferUpdate::Write
&w
) -> bool {
208 auto r
= boost::get
<BufferUpdate::Write
>(&right
);
209 return r
!= nullptr && (w
.fadvise_flags
== r
->fadvise_flags
);
211 [&](const BufferUpdate::Zero
&) -> bool {
212 auto r
= boost::get
<BufferUpdate::Zero
>(&right
);
215 [&](const BufferUpdate::CloneRange
&c
) -> bool {
219 BufferUpdateType
merge(
220 BufferUpdateType
&&left
,
221 BufferUpdateType
&&right
) const {
224 [&](const BufferUpdate::Write
&w
) -> BufferUpdateType
{
225 auto r
= boost::get
<BufferUpdate::Write
>(&right
);
226 ceph_assert(r
&& w
.fadvise_flags
== r
->fadvise_flags
);
227 ceph::buffer::list bl
= w
.buffer
;
228 bl
.append(r
->buffer
);
229 return BufferUpdate::Write
{bl
, w
.fadvise_flags
};
231 [&](const BufferUpdate::Zero
&z
) -> BufferUpdateType
{
232 auto r
= boost::get
<BufferUpdate::Zero
>(&right
);
234 return BufferUpdate::Zero
{z
.len
+ r
->len
};
236 [&](const BufferUpdate::CloneRange
&c
) -> BufferUpdateType
{
237 ceph_abort_msg("violates can_merge condition");
243 using buffer_update_type
= interval_map
<
244 uint64_t, BufferUpdateType
, SplitMerger
>;
245 buffer_update_type buffer_updates
;
247 friend class PGTransaction
;
249 std::map
<hobject_t
, ObjectOperation
> op_map
;
251 ObjectOperation
&get_object_op_for_modify(const hobject_t
&hoid
) {
252 auto &op
= op_map
[hoid
];
253 ceph_assert(!op
.is_delete());
256 ObjectOperation
&get_object_op(const hobject_t
&hoid
) {
261 ObjectContextRef obc
) {
263 obc_map
[obc
->obs
.oi
.soid
] = obc
;
265 /// Sets up state for new object
267 const hobject_t
&hoid
269 auto &op
= op_map
[hoid
];
270 ceph_assert(op
.is_none() || op
.is_delete());
271 op
.init_type
= ObjectOperation::Init::Create();
274 /// Sets up state for target cloned from source
276 const hobject_t
&target
, ///< [in] obj to clone to
277 const hobject_t
&source
///< [in] obj to clone from
279 auto &op
= op_map
[target
];
280 ceph_assert(op
.is_none() || op
.is_delete());
281 op
.init_type
= ObjectOperation::Init::Clone
{source
};
284 /// Sets up state for target renamed from source
286 const hobject_t
&target
, ///< [in] to, must not exist, be non-temp
287 const hobject_t
&source
///< [in] source (must be a temp object)
289 ceph_assert(source
.is_temp());
290 ceph_assert(!target
.is_temp());
291 auto &op
= op_map
[target
];
292 ceph_assert(op
.is_none() || op
.is_delete());
294 bool del_first
= op
.is_delete();
295 auto iter
= op_map
.find(source
);
296 if (iter
!= op_map
.end()) {
299 op
.delete_first
= del_first
;
302 op
.init_type
= ObjectOperation::Init::Rename
{source
};
305 /// Remove -- must not be called on rename target
307 const hobject_t
&hoid
///< [in] obj to remove
309 auto &op
= get_object_op_for_modify(hoid
);
310 if (!op
.is_fresh_object()) {
311 ceph_assert(!op
.updated_snaps
);
312 op
= ObjectOperation();
313 op
.delete_first
= true;
315 ceph_assert(!op
.is_rename());
316 op_map
.erase(hoid
); // make it a noop if it's a fresh object
321 const hobject_t
&hoid
, ///< [in] object for snaps
322 const std::set
<snapid_t
> &old_snaps
,///< [in] old snaps value
323 const std::set
<snapid_t
> &new_snaps
///< [in] new snaps value
325 auto &op
= get_object_op(hoid
);
326 ceph_assert(!op
.updated_snaps
);
327 ceph_assert(op
.buffer_updates
.empty());
328 ceph_assert(!op
.truncate
);
329 op
.updated_snaps
= make_pair(
334 /// Clears, truncates
336 const hobject_t
&hoid
///< [in] object to clear omap
338 auto &op
= get_object_op_for_modify(hoid
);
339 op
.clear_omap
= true;
340 op
.omap_updates
.clear();
341 op
.omap_header
= std::nullopt
;
344 const hobject_t
&hoid
, ///< [in] object
345 uint64_t off
///< [in] offset to truncate to
347 auto &op
= get_object_op_for_modify(hoid
);
348 ceph_assert(!op
.updated_snaps
);
349 op
.buffer_updates
.erase(
351 std::numeric_limits
<uint64_t>::max() - off
);
352 if (!op
.truncate
|| off
< op
.truncate
->first
) {
353 op
.truncate
= std::pair
<uint64_t, uint64_t>(off
, off
);
355 op
.truncate
->second
= off
;
361 const hobject_t
&hoid
, ///< [in] object to write
362 std::map
<std::string
, ceph::buffer::list
> &attrs
///< [in] attrs, may be cleared
364 auto &op
= get_object_op_for_modify(hoid
);
365 for (auto &&i
: attrs
) {
366 auto& d
= op
.attr_updates
[i
.first
];
372 const hobject_t
&hoid
, ///< [in] object to write
373 const std::string
&attrname
, ///< [in] attr to write
374 ceph::buffer::list
&bl
///< [in] val to write, may be claimed
376 auto &op
= get_object_op_for_modify(hoid
);
377 auto& d
= op
.attr_updates
[attrname
];
382 const hobject_t
&hoid
, ///< [in] object to write
383 const std::string
&attrname
///< [in] attr to remove
385 auto &op
= get_object_op_for_modify(hoid
);
386 op
.attr_updates
[attrname
] = std::nullopt
;
391 const hobject_t
&hoid
, ///< [in] object (must exist)
392 uint64_t expected_object_size
, ///< [in]
393 uint64_t expected_write_size
,
396 auto &op
= get_object_op_for_modify(hoid
);
397 op
.alloc_hint
= ObjectOperation::alloc_hint_t
{
398 expected_object_size
, expected_write_size
, flags
};
403 const hobject_t
&hoid
, ///< [in] object to write
404 uint64_t off
, ///< [in] off at which to write
405 uint64_t len
, ///< [in] len to write from bl
406 ceph::buffer::list
&bl
, ///< [in] bl to write will be claimed to len
407 uint32_t fadvise_flags
= 0 ///< [in] fadvise hint
409 auto &op
= get_object_op_for_modify(hoid
);
410 ceph_assert(!op
.updated_snaps
);
411 ceph_assert(len
> 0);
412 ceph_assert(len
== bl
.length());
413 op
.buffer_updates
.insert(
416 ObjectOperation::BufferUpdate::Write
{bl
, fadvise_flags
});
419 const hobject_t
&from
, ///< [in] from
420 const hobject_t
&to
, ///< [in] to
421 uint64_t fromoff
, ///< [in] offset
422 uint64_t len
, ///< [in] len
423 uint64_t tooff
///< [in] offset
425 auto &op
= get_object_op_for_modify(to
);
426 ceph_assert(!op
.updated_snaps
);
427 op
.buffer_updates
.insert(
430 ObjectOperation::BufferUpdate::CloneRange
{from
, fromoff
, len
});
433 const hobject_t
&hoid
, ///< [in] object
434 uint64_t off
, ///< [in] offset to start zeroing at
435 uint64_t len
///< [in] amount to zero
437 auto &op
= get_object_op_for_modify(hoid
);
438 ceph_assert(!op
.updated_snaps
);
439 op
.buffer_updates
.insert(
442 ObjectOperation::BufferUpdate::Zero
{len
});
447 const hobject_t
&hoid
, ///< [in] object to write
448 ceph::buffer::list
&keys_bl
///< [in] encoded map<string, ceph::buffer::list>
450 auto &op
= get_object_op_for_modify(hoid
);
451 op
.omap_updates
.emplace_back(
453 ObjectOperation::OmapUpdateType::Insert
,
457 const hobject_t
&hoid
, ///< [in] object to write
458 std::map
<std::string
, ceph::buffer::list
> &keys
///< [in] omap keys, may be cleared
461 ceph::buffer::list bl
;
463 omap_setkeys(hoid
, bl
);
466 const hobject_t
&hoid
, ///< [in] object to write
467 ceph::buffer::list
&keys_bl
///< [in] encode set<string>
469 auto &op
= get_object_op_for_modify(hoid
);
470 op
.omap_updates
.emplace_back(
472 ObjectOperation::OmapUpdateType::Remove
,
476 const hobject_t
&hoid
, ///< [in] object to write
477 std::set
<std::string
> &keys
///< [in] omap keys, may be cleared
480 ceph::buffer::list bl
;
482 omap_rmkeys(hoid
, bl
);
484 void omap_rmkeyrange(
485 const hobject_t
&hoid
, ///< [in] object to write
486 ceph::buffer::list
&range_bl
///< [in] encode string[2]
488 auto &op
= get_object_op_for_modify(hoid
);
489 op
.omap_updates
.emplace_back(
491 ObjectOperation::OmapUpdateType::RemoveRange
,
494 void omap_rmkeyrange(
495 const hobject_t
&hoid
, ///< [in] object to write
496 std::string
& key_begin
, ///< [in] first key in range
497 std::string
& key_end
///< [in] first key past range, range is [first,last)
499 ceph::buffer::list bl
;
500 ::encode(key_begin
, bl
);
501 ::encode(key_end
, bl
);
502 omap_rmkeyrange(hoid
, bl
);
505 const hobject_t
&hoid
, ///< [in] object to write
506 ceph::buffer::list
&header
///< [in] header
508 auto &op
= get_object_op_for_modify(hoid
);
509 op
.omap_header
= header
;
513 return op_map
.empty();
516 uint64_t get_bytes_written() const {
518 for (auto &&i
: op_map
) {
519 for (auto &&j
: i
.second
.buffer_updates
) {
527 const hobject_t
&hoid
///< [in] obj to which we are doing nothing
529 get_object_op_for_modify(hoid
);
532 /* Calls t() on all pair<hobject_t, ObjectOperation> & such that clone/rename
533 * sinks are always called before clone sources
535 * TODO: add a fast path for the single object case and possibly the single
536 * object clone from source case (make_writeable made a clone).
538 * This structure only requires that the source->sink graph be acyclic.
539 * This is much more general than is actually required by PrimaryLogPG.
540 * Only 4 flavors of multi-object transactions actually happen:
541 * 1) rename temp -> object for copyfrom
542 * 2) clone head -> clone, modify head for make_writeable on normal head write
543 * 3) clone clone -> head for rollback
546 * We can bypass the below logic for single object transactions trivially
547 * (including case 1 above since temp doesn't show up again).
548 * For 2-3, we could add something ad-hoc to ensure that they happen in the
549 * right order, but it actually seems easier to just do the graph construction.
551 template <typename T
>
552 void safe_create_traverse(T
&&t
) {
553 std::map
<hobject_t
, std::list
<hobject_t
>> dgraph
;
554 std::list
<hobject_t
> stack
;
556 // Populate stack with roots, dgraph with edges
557 for (auto &&opair
: op_map
) {
559 if (opair
.second
.has_source(&source
)) {
560 auto &l
= dgraph
[source
];
561 if (l
.empty() && !op_map
.count(source
)) {
562 /* Source oids not in op_map need to be added as roots
563 * (but only once!) */
564 stack
.push_back(source
);
566 l
.push_back(opair
.first
);
568 stack
.push_back(opair
.first
);
572 /* Why don't we need to worry about accessing the same node
573 * twice? dgraph nodes always have in-degree at most 1 because
574 * the inverse graph nodes (source->dest) can have out-degree
575 * at most 1 (only one possible source). We do a post-order
576 * depth-first traversal here to ensure we call f on children
579 while (!stack
.empty()) {
580 hobject_t
&cur
= stack
.front();
581 auto diter
= dgraph
.find(cur
);
582 if (diter
== dgraph
.end()) {
583 /* Leaf: pop and call t() */
584 auto opiter
= op_map
.find(cur
);
585 if (opiter
!= op_map
.end())
589 /* Internal node: push children onto stack, remove edge,
590 * recurse. When this node is encountered again, it'll
592 ceph_assert(!diter
->second
.empty());
593 stack
.splice(stack
.begin(), diter
->second
);
599 using PGTransactionUPtr
= std::unique_ptr
<PGTransaction
>;