1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include "librbd/io/ObjectRequest.h"
5 #include "common/ceph_context.h"
6 #include "common/dout.h"
7 #include "common/errno.h"
8 #include "common/ceph_mutex.h"
9 #include "include/Context.h"
10 #include "include/err.h"
11 #include "include/neorados/RADOS.hpp"
12 #include "osd/osd_types.h"
13 #include "librados/snap_set_diff.h"
14 #include "librbd/AsioEngine.h"
15 #include "librbd/ExclusiveLock.h"
16 #include "librbd/ImageCtx.h"
17 #include "librbd/ObjectMap.h"
18 #include "librbd/Utils.h"
19 #include "librbd/asio/Utils.h"
20 #include "librbd/io/AioCompletion.h"
21 #include "librbd/io/CopyupRequest.h"
22 #include "librbd/io/ImageRequest.h"
23 #include "librbd/io/Utils.h"
25 #include <boost/optional.hpp>
27 #define dout_subsys ceph_subsys_rbd
29 #define dout_prefix *_dout << "librbd::io::ObjectRequest: " << this \
30 << " " << __func__ << ": " \
31 << data_object_name(this->m_ictx, \
32 this->m_object_no) << " "
37 using librbd::util::data_object_name
;
38 using librbd::util::create_context_callback
;
39 using librbd::util::create_trace
;
// Decide whether a read qualifies for copy-on-read. Holds a shared lock on
// ictx->image_lock for the duration of the check and returns true only when
// all of the following hold: clone_copy_on_read is set, the image is not
// read_only, the IOContext read snap is CEPH_NOSNAP (or unset), and the
// exclusive lock is either not in use (nullptr) or owned by this client.
44 inline bool is_copy_on_read(I
*ictx
, const IOContext
& io_context
) {
45 std::shared_lock image_locker
{ictx
->image_lock
};
46 return (ictx
->clone_copy_on_read
&& !ictx
->read_only
&&
47 io_context
->read_snap().value_or(CEPH_NOSNAP
) == CEPH_NOSNAP
&&
48 (ictx
->exclusive_lock
== nullptr ||
49 ictx
->exclusive_lock
->is_lock_owner()));
// Copy a snap set of type S into the structure pointed to by dst_snap_set,
// field by field: seq first, then for every source clone its cloneid, snaps,
// overlap and size. The destination clones container is reserved up front to
// the number of source clones before elements are appended.
52 template <typename S
, typename D
>
53 void convert_snap_set(const S
& src_snap_set
,
55 dst_snap_set
->seq
= src_snap_set
.seq
;
56 dst_snap_set
->clones
.reserve(src_snap_set
.clones
.size());
57 for (auto& src_clone
: src_snap_set
.clones
) {
58 dst_snap_set
->clones
.emplace_back();
59 auto& dst_clone
= dst_snap_set
->clones
.back();
60 dst_clone
.cloneid
= src_clone
.cloneid
;
61 dst_clone
.snaps
= src_clone
.snaps
;
62 dst_clone
.overlap
= src_clone
.overlap
;
63 dst_clone
.size
= src_clone
.size
;
67 } // anonymous namespace
// Factory for write requests: returns a new ObjectWriteRequest<I> constructed
// from the given arguments. The data bufferlist is taken by rvalue reference
// and moved into the new request; all other arguments are passed through.
71 ObjectRequest
<I
>::create_write(
72 I
*ictx
, uint64_t object_no
, uint64_t object_off
, ceph::bufferlist
&& data
,
73 IOContext io_context
, int op_flags
, int write_flags
,
74 std::optional
<uint64_t> assert_version
,
75 const ZTracer::Trace
&parent_trace
, Context
*completion
) {
76 return new ObjectWriteRequest
<I
>(ictx
, object_no
, object_off
,
77 std::move(data
), io_context
, op_flags
,
78 write_flags
, assert_version
,
79 parent_trace
, completion
);
// Factory for discard requests: returns a new ObjectDiscardRequest<I>
// constructed from the given object range, IO context, discard flags, parent
// trace and completion.
84 ObjectRequest
<I
>::create_discard(
85 I
*ictx
, uint64_t object_no
, uint64_t object_off
, uint64_t object_len
,
86 IOContext io_context
, int discard_flags
,
87 const ZTracer::Trace
&parent_trace
, Context
*completion
) {
88 return new ObjectDiscardRequest
<I
>(ictx
, object_no
, object_off
,
89 object_len
, io_context
, discard_flags
,
90 parent_trace
, completion
);
// Factory for write-same requests: returns a new ObjectWriteSameRequest<I>
// constructed from the given object range and arguments. The data bufferlist
// is taken by rvalue reference and moved into the new request.
95 ObjectRequest
<I
>::create_write_same(
96 I
*ictx
, uint64_t object_no
, uint64_t object_off
, uint64_t object_len
,
97 ceph::bufferlist
&& data
, IOContext io_context
, int op_flags
,
98 const ZTracer::Trace
&parent_trace
, Context
*completion
) {
99 return new ObjectWriteSameRequest
<I
>(ictx
, object_no
, object_off
,
100 object_len
, std::move(data
), io_context
,
101 op_flags
, parent_trace
, completion
);
// Factory for compare-and-write requests: returns a new
// ObjectCompareAndWriteRequest<I>. Both bufferlists are taken by rvalue
// reference; write_data is moved into the new request. mismatch_offset is an
// output pointer handed through to the request.
104 template <typename I
>
106 ObjectRequest
<I
>::create_compare_and_write(
107 I
*ictx
, uint64_t object_no
, uint64_t object_off
,
108 ceph::bufferlist
&& cmp_data
, ceph::bufferlist
&& write_data
,
109 IOContext io_context
, uint64_t *mismatch_offset
, int op_flags
,
110 const ZTracer::Trace
&parent_trace
, Context
*completion
) {
111 return new ObjectCompareAndWriteRequest
<I
>(ictx
, object_no
, object_off
,
113 std::move(write_data
), io_context
,
114 mismatch_offset
, op_flags
,
115 parent_trace
, completion
);
// Base constructor for all object requests. Stores the image context, object
// number, IO context and completion, and derives m_trace from the supplied
// parent trace via create_trace with an empty name. Asserts that the image
// data_ctx is valid. When the trace is valid it is renamed to trace_name
// followed by a space and the data object name, and a start event is
// recorded on it.
118 template <typename I
>
119 ObjectRequest
<I
>::ObjectRequest(
120 I
*ictx
, uint64_t objectno
, IOContext io_context
,
121 const char *trace_name
, const ZTracer::Trace
&trace
, Context
*completion
)
122 : m_ictx(ictx
), m_object_no(objectno
), m_io_context(io_context
),
123 m_completion(completion
),
124 m_trace(create_trace(*ictx
, "", trace
)) {
125 ceph_assert(m_ictx
->data_ctx
.is_valid());
126 if (m_trace
.valid()) {
127 m_trace
.copy_name(trace_name
+ std::string(" ") +
128 data_object_name(ictx
, objectno
));
129 m_trace
.event("start");
// Attach an allocation hint to the write op wr based on image settings.
// The image alloc_hint_flags value is cast to the neorados alloc_hint_t type.
// When enable_alloc_hint is set, the image object size is passed for both
// size arguments of set_alloc_hint. Otherwise a hint is only added when
// alloc_hint_flags is non-zero, using zero for both sizes plus the flags.
133 template <typename I
>
134 void ObjectRequest
<I
>::add_write_hint(I
& image_ctx
, neorados::WriteOp
* wr
) {
135 auto alloc_hint_flags
= static_cast<neorados::alloc_hint::alloc_hint_t
>(
136 image_ctx
.alloc_hint_flags
);
137 if (image_ctx
.enable_alloc_hint
) {
138 wr
->set_alloc_hint(image_ctx
.get_object_size(),
139 image_ctx
.get_object_size(),
141 } else if (image_ctx
.alloc_hint_flags
!= 0U) {
142 wr
->set_alloc_hint(0, 0, alloc_hint_flags
);
// Compute the parent image extents that back this object.
//
// The caller must hold image_lock (asserted). m_has_parent is reset to false
// and *parent_extents is cleared before anything else happens. The parent
// overlap is looked up for the IO context read snap (CEPH_NOSNAP when unset);
// a lookup failure is logged as an error. When read_request is false and the
// image has non-empty migration_info, migration_info.overlap replaces the
// looked-up overlap. The whole object range (offset 0, layout.object_size) is
// mapped through extent_to_file and then handed to prune_parent_extents;
// m_has_parent is set from whether the resulting extent list is non-empty,
// and only when the returned object_overlap is greater than zero.
146 template <typename I
>
147 bool ObjectRequest
<I
>::compute_parent_extents(Extents
*parent_extents
,
149 ceph_assert(ceph_mutex_is_locked(m_ictx
->image_lock
));
151 m_has_parent
= false;
152 parent_extents
->clear();
154 uint64_t parent_overlap
;
155 int r
= m_ictx
->get_parent_overlap(
156 m_io_context
->read_snap().value_or(CEPH_NOSNAP
), &parent_overlap
);
158 // NOTE: it's possible for a snapshot to be deleted while we are
159 // still reading from it
160 lderr(m_ictx
->cct
) << "failed to retrieve parent overlap: "
161 << cpp_strerror(r
) << dendl
;
165 if (!read_request
&& !m_ictx
->migration_info
.empty()) {
166 parent_overlap
= m_ictx
->migration_info
.overlap
;
169 if (parent_overlap
== 0) {
173 io::util::extent_to_file(m_ictx
, m_object_no
, 0, m_ictx
->layout
.object_size
,
175 uint64_t object_overlap
= m_ictx
->prune_parent_extents(*parent_extents
,
177 if (object_overlap
> 0) {
178 ldout(m_ictx
->cct
, 20) << "overlap " << parent_overlap
<< " "
179 << "extents " << *parent_extents
<< dendl
;
180 m_has_parent
= !parent_extents
->empty();
// Finish the request asynchronously: logs r at level 20 and posts a call to
// finish(r) onto the image asio engine instead of invoking finish inline.
// The posted lambda captures this, so the request must stay alive until the
// posted work has run.
186 template <typename I
>
187 void ObjectRequest
<I
>::async_finish(int r
) {
188 ldout(m_ictx
->cct
, 20) << "r=" << r
<< dendl
;
189 m_ictx
->asio_engine
->post([this, r
]() { finish(r
); });
// Finish the request: logs r at level 20 and completes m_completion with r.
192 template <typename I
>
193 void ObjectRequest
<I
>::finish(int r
) {
194 ldout(m_ictx
->cct
, 20) << "r=" << r
<< dendl
;
195 m_completion
->complete(r
);
// Construct a read request. Forwards ictx, objectno, io_context and
// parent_trace to the ObjectRequest base with the trace name read, and
// stores the extents pointer, op flags and read flags in members.
201 template <typename I
>
202 ObjectReadRequest
<I
>::ObjectReadRequest(
203 I
*ictx
, uint64_t objectno
, ReadExtents
* extents
,
204 IOContext io_context
, int op_flags
, int read_flags
,
205 const ZTracer::Trace
&parent_trace
, uint64_t* version
,
207 : ObjectRequest
<I
>(ictx
, objectno
, io_context
, "read", parent_trace
,
209 m_extents(extents
), m_op_flags(op_flags
),m_read_flags(read_flags
),
// Entry point of the read request state machine; emits a level 20 log line
// on the image context.
213 template <typename I
>
214 void ObjectReadRequest
<I
>::send() {
215 I
*image_ctx
= this->m_ictx
;
216 ldout(image_ctx
->cct
, 20) << dendl
;
// Issue the read against the data object.
//
// Under a shared image_lock, checks whether the read targets the snapshot the
// image is currently opened at and the object map reports that the object
// cannot exist; in that case read_parent is posted onto the asio engine.
// After releasing the lock, a neorados ReadOp is assembled with one operation
// per extent: sparse_read when the extent length is at least
// sparse_read_threshold_bytes, plain read otherwise, each targeting the
// extent bufferlist. Op flags are applied and the op is executed against the
// data object name, with handle_read_object as the completion callback and
// m_version passed as the version output.
221 template <typename I
>
222 void ObjectReadRequest
<I
>::read_object() {
223 I
*image_ctx
= this->m_ictx
;
225 std::shared_lock image_locker
{image_ctx
->image_lock
};
226 auto read_snap_id
= this->m_io_context
->read_snap().value_or(CEPH_NOSNAP
);
227 if (read_snap_id
== image_ctx
->snap_id
&&
228 image_ctx
->object_map
!= nullptr &&
229 !image_ctx
->object_map
->object_may_exist(this->m_object_no
)) {
230 image_ctx
->asio_engine
->post([this]() { read_parent(); });
233 image_locker
.unlock();
235 ldout(image_ctx
->cct
, 20) << "snap_id=" << read_snap_id
<< dendl
;
237 neorados::ReadOp read_op
;
238 for (auto& extent
: *this->m_extents
) {
239 if (extent
.length
>= image_ctx
->sparse_read_threshold_bytes
) {
240 read_op
.sparse_read(extent
.offset
, extent
.length
, &extent
.bl
,
243 read_op
.read(extent
.offset
, extent
.length
, &extent
.bl
);
246 util::apply_op_flags(
247 m_op_flags
, image_ctx
->get_read_flags(read_snap_id
), &read_op
);
249 image_ctx
->rados_api
.execute(
250 {data_object_name(this->m_ictx
, this->m_object_no
)},
251 *this->m_io_context
, std::move(read_op
), nullptr,
252 librbd::asio::util::get_callback_adapter(
253 [this](int r
) { handle_read_object(r
); }), m_version
,
254 (this->m_trace
.valid() ? this->m_trace
.get_info() : nullptr));
// Completion handler for the object read. Logs r, and additionally logs the
// object version when m_version is non-null. A failed read is reported
// through lderr together with the decoded errno string.
257 template <typename I
>
258 void ObjectReadRequest
<I
>::handle_read_object(int r
) {
259 I
*image_ctx
= this->m_ictx
;
260 ldout(image_ctx
->cct
, 20) << "r=" << r
<< dendl
;
261 if (m_version
!= nullptr) {
262 ldout(image_ctx
->cct
, 20) << "version=" << *m_version
<< dendl
;
269 lderr(image_ctx
->cct
) << "failed to read from object: "
270 << cpp_strerror(r
) << dendl
;
// Read the requested extents from the parent image.
//
// If READ_FLAG_DISABLE_READ_FROM_PARENT is set in m_read_flags the request is
// finished with -ENOENT. Otherwise a context bound to handle_read_parent is
// created and io::util::read_parent is invoked with the object number, the
// extents, the read snap (CEPH_NOSNAP when unset) and the trace.
278 template <typename I
>
279 void ObjectReadRequest
<I
>::read_parent() {
280 if ((m_read_flags
& READ_FLAG_DISABLE_READ_FROM_PARENT
) != 0) {
281 this->finish(-ENOENT
);
285 I
*image_ctx
= this->m_ictx
;
286 ldout(image_ctx
->cct
, 20) << dendl
;
288 auto ctx
= create_context_callback
<
289 ObjectReadRequest
<I
>, &ObjectReadRequest
<I
>::handle_read_parent
>(this);
291 io::util::read_parent
<I
>(
292 image_ctx
, this->m_object_no
, this->m_extents
,
293 this->m_io_context
->read_snap().value_or(CEPH_NOSNAP
), this->m_trace
,
// Completion handler for the parent read. Logs r; a failed parent read is
// reported through lderr together with the decoded errno string.
297 template <typename I
>
298 void ObjectReadRequest
<I
>::handle_read_parent(int r
) {
299 I
*image_ctx
= this->m_ictx
;
300 ldout(image_ctx
->cct
, 20) << "r=" << r
<< dendl
;
306 lderr(image_ctx
->cct
) << "failed to read parent extents: "
307 << cpp_strerror(r
) << dendl
;
// Kick off a copy-on-read copyup for this object when eligible.
//
// Eligibility is first checked through is_copy_on_read. Then owner_lock and
// image_lock are taken shared, in that order, using manual lock/unlock calls
// (every exit path must release both). Parent extents are computed with the
// second argument set to true; if that fails, or an exclusive lock exists
// but is not owned, both locks are released. Otherwise, under
// copyup_list_lock, the copyup_list is searched for this object number:
// when absent, a new CopyupRequest is created from the parent extents and
// registered in the list; when present, nothing new is created. Locks are
// released in the order copyup_list_lock, image_lock, owner_lock.
315 template <typename I
>
316 void ObjectReadRequest
<I
>::copyup() {
317 I
*image_ctx
= this->m_ictx
;
318 if (!is_copy_on_read(image_ctx
, this->m_io_context
)) {
323 image_ctx
->owner_lock
.lock_shared();
324 image_ctx
->image_lock
.lock_shared();
325 Extents parent_extents
;
326 if (!this->compute_parent_extents(&parent_extents
, true) ||
327 (image_ctx
->exclusive_lock
!= nullptr &&
328 !image_ctx
->exclusive_lock
->is_lock_owner())) {
329 image_ctx
->image_lock
.unlock_shared();
330 image_ctx
->owner_lock
.unlock_shared();
335 ldout(image_ctx
->cct
, 20) << dendl
;
337 image_ctx
->copyup_list_lock
.lock();
338 auto it
= image_ctx
->copyup_list
.find(this->m_object_no
);
339 if (it
== image_ctx
->copyup_list
.end()) {
340 // create and kick off a CopyupRequest
341 auto new_req
= CopyupRequest
<I
>::create(
342 image_ctx
, this->m_object_no
, std::move(parent_extents
), this->m_trace
);
344 image_ctx
->copyup_list
[this->m_object_no
] = new_req
;
345 image_ctx
->copyup_list_lock
.unlock();
346 image_ctx
->image_lock
.unlock_shared();
349 image_ctx
->copyup_list_lock
.unlock();
350 image_ctx
->image_lock
.unlock_shared();
353 image_ctx
->owner_lock
.unlock_shared();
// Construct the common state for write-type requests.
//
// Stores the object offset and length, and marks the request as covering the
// full object when the offset is 0 and the length equals the image object
// size. Parent information is computed via compute_parent_info. Finally,
// under a shared image_lock, m_guarding_migration_write is set when the image
// has non-empty migration_info.
359 template <typename I
>
360 AbstractObjectWriteRequest
<I
>::AbstractObjectWriteRequest(
361 I
*ictx
, uint64_t object_no
, uint64_t object_off
, uint64_t len
,
362 IOContext io_context
, const char *trace_name
,
363 const ZTracer::Trace
&parent_trace
, Context
*completion
)
364 : ObjectRequest
<I
>(ictx
, object_no
, io_context
, trace_name
, parent_trace
,
366 m_object_off(object_off
), m_object_len(len
)
368 if (this->m_object_off
== 0 &&
369 this->m_object_len
== ictx
->get_object_size()) {
370 m_full_object
= true;
373 compute_parent_info();
375 ictx
->image_lock
.lock_shared();
376 if (!ictx
->migration_info
.empty()) {
377 m_guarding_migration_write
= true;
379 ictx
->image_lock
.unlock_shared();
// Refresh m_parent_extents and decide whether copyup stays enabled.
// Runs under a shared image_lock. Parent extents are computed with the second
// argument set to false. m_copyup_enabled is cleared when the request has no
// parent, or when the combined condition involving a missing write snap
// context and is_post_copyup_write_required being false holds.
382 template <typename I
>
383 void AbstractObjectWriteRequest
<I
>::compute_parent_info() {
384 I
*image_ctx
= this->m_ictx
;
385 std::shared_lock image_locker
{image_ctx
->image_lock
};
387 this->compute_parent_extents(&m_parent_extents
, false);
389 if (!this->has_parent() ||
391 !this->m_io_context
->write_snap_context() &&
392 !is_post_copyup_write_required())) {
393 m_copyup_enabled
= false;
// Conditionally add an allocation hint to the write op. Under a shared
// image_lock, the base ObjectRequest add_write_hint is called when there is
// no object map, when m_object_may_exist is false, or when the image has
// non-zero alloc_hint_flags.
397 template <typename I
>
398 void AbstractObjectWriteRequest
<I
>::add_write_hint(
399 neorados::WriteOp
*wr
) {
400 I
*image_ctx
= this->m_ictx
;
401 std::shared_lock image_locker
{image_ctx
->image_lock
};
402 if (image_ctx
->object_map
== nullptr || !this->m_object_may_exist
||
403 image_ctx
->alloc_hint_flags
!= 0U) {
404 ObjectRequest
<I
>::add_write_hint(*image_ctx
, wr
);
// Entry point of the write request state machine.
//
// Logs the op type and object range. Under a shared image_lock,
// m_object_may_exist is set to true when there is no object map; otherwise
// exclusive lock ownership is asserted and the value is taken from the object
// map. If the object cannot exist and the operation is a no-op for a
// nonexistent object, the request is completed asynchronously with 0.
// Otherwise processing continues with pre_write_object_map_update.
408 template <typename I
>
409 void AbstractObjectWriteRequest
<I
>::send() {
410 I
*image_ctx
= this->m_ictx
;
411 ldout(image_ctx
->cct
, 20) << this->get_op_type() << " "
412 << this->m_object_off
<< "~" << this->m_object_len
415 std::shared_lock image_lock
{image_ctx
->image_lock
};
416 if (image_ctx
->object_map
== nullptr) {
417 m_object_may_exist
= true;
419 // should have been flushed prior to releasing lock
420 ceph_assert(image_ctx
->exclusive_lock
->is_lock_owner());
421 m_object_may_exist
= image_ctx
->object_map
->object_may_exist(
426 if (!m_object_may_exist
&& is_no_op_for_nonexistent_object()) {
427 ldout(image_ctx
->cct
, 20) << "skipping no-op on nonexistent object"
429 this->async_finish(0);
433 pre_write_object_map_update();
// Update the object map before the write is issued.
//
// image_lock is taken shared with a manual lock_shared call and must be
// released on every path. The update is skipped when there is no object map
// or object map updates are disabled, and also when the object cannot exist
// while copyup is enabled. Otherwise the state returned by
// get_pre_write_object_map_state is submitted through aio_update for
// CEPH_NOSNAP with handle_pre_write_object_map_update as the callback.
436 template <typename I
>
437 void AbstractObjectWriteRequest
<I
>::pre_write_object_map_update() {
438 I
*image_ctx
= this->m_ictx
;
440 image_ctx
->image_lock
.lock_shared();
441 if (image_ctx
->object_map
== nullptr || !is_object_map_update_enabled()) {
442 image_ctx
->image_lock
.unlock_shared();
447 if (!m_object_may_exist
&& m_copyup_enabled
) {
448 // optimization: copyup required
449 image_ctx
->image_lock
.unlock_shared();
454 uint8_t new_state
= this->get_pre_write_object_map_state();
455 ldout(image_ctx
->cct
, 20) << this->m_object_off
<< "~" << this->m_object_len
458 if (image_ctx
->object_map
->template aio_update
<
459 AbstractObjectWriteRequest
<I
>,
460 &AbstractObjectWriteRequest
<I
>::handle_pre_write_object_map_update
>(
461 CEPH_NOSNAP
, this->m_object_no
, new_state
, {}, this->m_trace
, false,
463 image_ctx
->image_lock
.unlock_shared();
467 image_ctx
->image_lock
.unlock_shared();
// Completion handler for the pre-write object map update. Logs r; a failed
// update is reported through lderr together with the decoded errno string.
471 template <typename I
>
472 void AbstractObjectWriteRequest
<I
>::handle_pre_write_object_map_update(int r
) {
473 I
*image_ctx
= this->m_ictx
;
474 ldout(image_ctx
->cct
, 20) << "r=" << r
<< dendl
;
476 lderr(image_ctx
->cct
) << "failed to update object map: "
477 << cpp_strerror(r
) << dendl
;
// Build and submit the write op for this object.
//
// When copyup is enabled the op is guarded: with m_guarding_migration_write
// set, a snap sequence assertion is added using the write snap context
// sequence (0 when there is no snap context); the other visible guard is
// assert_exists. The allocation hint and the subclass write operations are
// then appended, and the op is asserted to be non-empty before it is executed
// against the data object name with handle_write_object as the callback.
485 template <typename I
>
486 void AbstractObjectWriteRequest
<I
>::write_object() {
487 I
*image_ctx
= this->m_ictx
;
488 ldout(image_ctx
->cct
, 20) << dendl
;
490 neorados::WriteOp write_op
;
491 if (m_copyup_enabled
) {
492 if (m_guarding_migration_write
) {
493 auto snap_seq
= (this->m_io_context
->write_snap_context() ?
494 this->m_io_context
->write_snap_context()->first
: 0);
495 ldout(image_ctx
->cct
, 20) << "guarding write: snap_seq=" << snap_seq
498 cls_client::assert_snapc_seq(
499 &write_op
, snap_seq
, cls::rbd::ASSERT_SNAPC_SEQ_LE_SNAPSET_SEQ
);
501 ldout(image_ctx
->cct
, 20) << "guarding write" << dendl
;
502 write_op
.assert_exists();
506 add_write_hint(&write_op
);
507 add_write_ops(&write_op
);
508 ceph_assert(write_op
.size() != 0);
510 image_ctx
->rados_api
.execute(
511 {data_object_name(this->m_ictx
, this->m_object_no
)},
512 *this->m_io_context
, std::move(write_op
),
513 librbd::asio::util::get_callback_adapter(
514 [this](int r
) { handle_write_object(r
); }), nullptr,
515 (this->m_trace
.valid() ? this->m_trace
.get_info() : nullptr));
// Completion handler for the object write.
//
// The raw result is first passed through filter_write_result. Visible
// handling then branches on the filtered value: a path taken when copyup is
// enabled; a path for -ERANGE while guarding a migration write, which
// re-reads migration_info under a shared image_lock and, if the migration is
// no longer active, recomputes parent info before restarting; a path for
// -EILSEQ logged at level 10; and a generic failure logged through lderr.
// Otherwise processing continues with post_write_object_map_update.
518 template <typename I
>
519 void AbstractObjectWriteRequest
<I
>::handle_write_object(int r
) {
520 I
*image_ctx
= this->m_ictx
;
521 ldout(image_ctx
->cct
, 20) << "r=" << r
<< dendl
;
523 r
= filter_write_result(r
);
525 if (m_copyup_enabled
) {
529 } else if (r
== -ERANGE
&& m_guarding_migration_write
) {
530 image_ctx
->image_lock
.lock_shared();
531 m_guarding_migration_write
= !image_ctx
->migration_info
.empty();
532 image_ctx
->image_lock
.unlock_shared();
534 if (m_guarding_migration_write
) {
537 ldout(image_ctx
->cct
, 10) << "migration parent gone, restart io" << dendl
;
538 compute_parent_info();
542 } else if (r
== -EILSEQ
) {
543 ldout(image_ctx
->cct
, 10) << "failed to write object" << dendl
;
547 lderr(image_ctx
->cct
) << "failed to write object: " << cpp_strerror(r
)
553 post_write_object_map_update();
// Attach this write to a copyup of the object from the parent.
//
// Asserts that no copyup is already in progress for this request and sets
// m_copyup_in_progress. Under copyup_list_lock, looks up this object number
// in copyup_list: when absent, a new CopyupRequest is created from
// m_parent_extents (which is moved from and then cleared), this request is
// appended to it and it is registered in the list; when present, this request
// is appended to the existing CopyupRequest.
//
// NOTE(review): std::move is applied to the result of
// get_copyup_overwrite_extents(); this is redundant if that function returns
// by value -- confirm against its declaration.
556 template <typename I
>
557 void AbstractObjectWriteRequest
<I
>::copyup() {
558 I
*image_ctx
= this->m_ictx
;
559 ldout(image_ctx
->cct
, 20) << dendl
;
561 ceph_assert(!m_copyup_in_progress
);
562 m_copyup_in_progress
= true;
564 image_ctx
->copyup_list_lock
.lock();
565 auto it
= image_ctx
->copyup_list
.find(this->m_object_no
);
566 if (it
== image_ctx
->copyup_list
.end()) {
567 auto new_req
= CopyupRequest
<I
>::create(
568 image_ctx
, this->m_object_no
, std::move(this->m_parent_extents
),
570 this->m_parent_extents
.clear();
572 // make sure to wait on this CopyupRequest
573 new_req
->append_request(this, std::move(get_copyup_overwrite_extents()));
574 image_ctx
->copyup_list
[this->m_object_no
] = new_req
;
576 image_ctx
->copyup_list_lock
.unlock();
579 it
->second
->append_request(this, std::move(get_copyup_overwrite_extents()));
580 image_ctx
->copyup_list_lock
.unlock();
// Completion handler for the copyup. Asserts that a copyup was in progress
// and clears the flag. Any negative result other than -ERESTART is logged as
// a copyup failure. A result of -ERESTART, or is_post_copyup_write_required
// returning true, selects one follow-up path; otherwise processing continues
// with post_write_object_map_update.
584 template <typename I
>
585 void AbstractObjectWriteRequest
<I
>::handle_copyup(int r
) {
586 I
*image_ctx
= this->m_ictx
;
587 ldout(image_ctx
->cct
, 20) << "r=" << r
<< dendl
;
589 ceph_assert(m_copyup_in_progress
);
590 m_copyup_in_progress
= false;
592 if (r
< 0 && r
!= -ERESTART
) {
593 lderr(image_ctx
->cct
) << "failed to copyup object: " << cpp_strerror(r
)
599 if (r
== -ERESTART
|| is_post_copyup_write_required()) {
604 post_write_object_map_update();
// Update the object map after the write has completed.
//
// image_lock is taken shared with a manual lock_shared call and must be
// released on every path. The update is skipped when there is no object map,
// when object map updates are disabled, or when
// is_non_existent_post_write_object_map_state returns false. Otherwise
// exclusive lock ownership is asserted and aio_update is issued for
// CEPH_NOSNAP to move the object from OBJECT_PENDING to OBJECT_NONEXISTENT,
// with handle_post_write_object_map_update as the callback.
607 template <typename I
>
608 void AbstractObjectWriteRequest
<I
>::post_write_object_map_update() {
609 I
*image_ctx
= this->m_ictx
;
611 image_ctx
->image_lock
.lock_shared();
612 if (image_ctx
->object_map
== nullptr || !is_object_map_update_enabled() ||
613 !is_non_existent_post_write_object_map_state()) {
614 image_ctx
->image_lock
.unlock_shared();
619 ldout(image_ctx
->cct
, 20) << dendl
;
621 // should have been flushed prior to releasing lock
622 ceph_assert(image_ctx
->exclusive_lock
->is_lock_owner());
623 if (image_ctx
->object_map
->template aio_update
<
624 AbstractObjectWriteRequest
<I
>,
625 &AbstractObjectWriteRequest
<I
>::handle_post_write_object_map_update
>(
626 CEPH_NOSNAP
, this->m_object_no
, OBJECT_NONEXISTENT
, OBJECT_PENDING
,
627 this->m_trace
, false, this)) {
628 image_ctx
->image_lock
.unlock_shared();
632 image_ctx
->image_lock
.unlock_shared();
// Completion handler for the post-write object map update. Logs r; a failed
// update is reported through lderr together with the decoded errno string.
636 template <typename I
>
637 void AbstractObjectWriteRequest
<I
>::handle_post_write_object_map_update(int r
) {
638 I
*image_ctx
= this->m_ictx
;
639 ldout(image_ctx
->cct
, 20) << "r=" << r
<< dendl
;
641 lderr(image_ctx
->cct
) << "failed to update object map: "
642 << cpp_strerror(r
) << dendl
;
// Add write preconditions and then the allocation hint. A dedicated branch
// handles OBJECT_WRITE_FLAG_CREATE_EXCLUSIVE; otherwise, when m_assert_version
// holds a value, an assert_version guard for that value is added to the op.
// The base class add_write_hint is then invoked.
650 template <typename I
>
651 void ObjectWriteRequest
<I
>::add_write_hint(neorados::WriteOp
* wr
) {
652 if ((m_write_flags
& OBJECT_WRITE_FLAG_CREATE_EXCLUSIVE
) != 0) {
654 } else if (m_assert_version
.has_value()) {
655 wr
->assert_version(m_assert_version
.value());
657 AbstractObjectWriteRequest
<I
>::add_write_hint(wr
);
// Append the data write to the op: write_full when the request covers the
// whole object, otherwise a write at m_object_off. In both cases a copy of
// m_write_data is passed, leaving the member intact. Op flags are applied
// afterwards.
660 template <typename I
>
661 void ObjectWriteRequest
<I
>::add_write_ops(neorados::WriteOp
* wr
) {
662 if (this->m_full_object
) {
663 wr
->write_full(bufferlist
{m_write_data
});
665 wr
->write(this->m_object_off
, bufferlist
{m_write_data
});
667 util::apply_op_flags(m_op_flags
, 0U, wr
);
// Append the discard operation selected by m_discard_action. The visible
// cases are: DISCARD_ACTION_REMOVE and DISCARD_ACTION_REMOVE_TRUNCATE,
// DISCARD_ACTION_TRUNCATE which truncates the object at m_object_off, and
// DISCARD_ACTION_ZERO which zeroes m_object_len bytes from m_object_off.
670 template <typename I
>
671 void ObjectDiscardRequest
<I
>::add_write_ops(neorados::WriteOp
* wr
) {
672 switch (m_discard_action
) {
673 case DISCARD_ACTION_REMOVE
:
676 case DISCARD_ACTION_REMOVE_TRUNCATE
:
679 case DISCARD_ACTION_TRUNCATE
:
680 wr
->truncate(this->m_object_off
);
682 case DISCARD_ACTION_ZERO
:
683 wr
->zero(this->m_object_off
, this->m_object_len
);
// Append a writesame operation covering m_object_len bytes from m_object_off,
// passing a copy of m_write_data as the pattern, then apply the op flags.
691 template <typename I
>
692 void ObjectWriteSameRequest
<I
>::add_write_ops(neorados::WriteOp
* wr
) {
693 wr
->writesame(this->m_object_off
, this->m_object_len
,
694 bufferlist
{m_write_data
});
695 util::apply_op_flags(m_op_flags
, 0U, wr
);
// Append the compare step followed by the write. cmpext compares the object
// content at m_object_off against a copy of m_cmp_bl. The write is a
// write_full when the request covers the whole object, otherwise a write at
// m_object_off, using a copy of m_write_bl. Op flags are applied afterwards.
698 template <typename I
>
699 void ObjectCompareAndWriteRequest
<I
>::add_write_ops(neorados::WriteOp
* wr
) {
700 wr
->cmpext(this->m_object_off
, bufferlist
{m_cmp_bl
}, nullptr);
702 if (this->m_full_object
) {
703 wr
->write_full(bufferlist
{m_write_bl
});
705 wr
->write(this->m_object_off
, bufferlist
{m_write_bl
});
707 util::apply_op_flags(m_op_flags
, 0U, wr
);
// Translate a compare mismatch result into an image-level mismatch offset.
//
// A result r at or below -MAX_ERRNO encodes a mismatch position: the object
// offset is recovered as -MAX_ERRNO - r. That offset is mapped to image
// extents through extent_to_file (exactly one extent is asserted), and the
// start of that extent is stored in *m_mismatch_offset when the pointer is
// non-null.
710 template <typename I
>
711 int ObjectCompareAndWriteRequest
<I
>::filter_write_result(int r
) const {
712 if (r
<= -MAX_ERRNO
) {
713 I
*image_ctx
= this->m_ictx
;
714 Extents image_extents
;
716 // object extent compare mismatch
717 uint64_t offset
= -MAX_ERRNO
- r
;
718 io::util::extent_to_file(image_ctx
, this->m_object_no
, offset
,
719 this->m_object_len
, image_extents
);
720 ceph_assert(image_extents
.size() == 1);
722 if (m_mismatch_offset
) {
723 *m_mismatch_offset
= image_extents
[0].first
;
// Construct a list-snaps request. The base request is given a duplicate of
// the image data IO context (so it can be modified privately) and the trace
// name snap_list. Object extents and snap ids are moved into members. The
// read snap of the private IO context is then set to CEPH_SNAPDIR.
730 template <typename I
>
731 ObjectListSnapsRequest
<I
>::ObjectListSnapsRequest(
732 I
*ictx
, uint64_t objectno
, Extents
&& object_extents
, SnapIds
&& snap_ids
,
733 int list_snaps_flags
, const ZTracer::Trace
&parent_trace
,
734 SnapshotDelta
* snapshot_delta
, Context
*completion
)
736 ictx
, objectno
, ictx
->duplicate_data_io_context(), "snap_list",
737 parent_trace
, completion
),
738 m_object_extents(std::move(object_extents
)),
739 m_snap_ids(std::move(snap_ids
)), m_list_snaps_flags(list_snaps_flags
),
740 m_snapshot_delta(snapshot_delta
) {
741 this->m_io_context
->read_snap(CEPH_SNAPDIR
);
// Entry point of the list-snaps request. Fewer than two snap ids is treated
// as invalid input: an error is logged and the request is completed
// asynchronously with -EINVAL.
744 template <typename I
>
745 void ObjectListSnapsRequest
<I
>::send() {
746 I
*image_ctx
= this->m_ictx
;
747 ldout(image_ctx
->cct
, 20) << dendl
;
749 if (m_snap_ids
.size() < 2) {
750 lderr(image_ctx
->cct
) << "invalid snap ids: " << m_snap_ids
<< dendl
;
751 this->async_finish(-EINVAL
);
// Issue a list_snaps read op against the data object. Results are written to
// m_snap_set and the per-op error code to m_ec; handle_list_snaps is invoked
// on completion.
758 template <typename I
>
759 void ObjectListSnapsRequest
<I
>::list_snaps() {
760 I
*image_ctx
= this->m_ictx
;
761 ldout(image_ctx
->cct
, 20) << dendl
;
763 neorados::ReadOp read_op
;
764 read_op
.list_snaps(&m_snap_set
, &m_ec
);
766 image_ctx
->rados_api
.execute(
767 {data_object_name(this->m_ictx
, this->m_object_no
)},
768 *this->m_io_context
, std::move(read_op
), nullptr,
769 librbd::asio::util::get_callback_adapter(
770 [this](int r
) { handle_list_snaps(r
); }), nullptr,
771 (this->m_trace
.valid() ? this->m_trace
.get_info() : nullptr));
// Completion handler for list_snaps: converts the object snap set into the
// per-snapshot delta stored in *m_snapshot_delta.
//
// Outline of the visible logic:
//  - *m_snapshot_delta is cleared and the first and last requested snap ids
//    are read from m_snap_ids (asserted non-empty).
//  - A missing object is recorded via zero_extent(first_snap_id, true); other
//    failures are logged as errors.
//  - m_snap_set is converted to the legacy librados snap_set_t because
//    calc_snap_set_diff requires it.
//  - m_object_extents is folded into object_interval.
//  - For each requested snap id, calc_snap_set_diff produces the changed
//    interval, end size, existence and clone end snap id relative to the
//    previous snap id. A whole-object read replaces the diff with the full
//    object range. clone_end_snap_id is clamped to the last requested snap id.
//    The diff is clipped to object_interval; a shrinking end size yields a
//    zero interval, and any overlap between the zero and data intervals is
//    removed from the data interval.
//  - Snap ids at or before first_snap_id are only used to track existence and
//    size. For later ones, data intervals are inserted under the key
//    {end_snap_id, clone_end_snap_id} and, unless
//    LIST_SNAPS_FLAG_IGNORE_ZEROED_EXTENTS is set, zero intervals under
//    {end_snap_id, end_snap_id}.
//  - If no initial extents were seen, zero_extent is called for
//    first_snap_id, flagged as DNE only when first_snap_id > 0.
774 template <typename I
>
775 void ObjectListSnapsRequest
<I
>::handle_list_snaps(int r
) {
776 I
*image_ctx
= this->m_ictx
;
777 auto cct
= image_ctx
->cct
;
783 ldout(cct
, 20) << "r=" << r
<< dendl
;
785 m_snapshot_delta
->clear();
786 auto& snapshot_delta
= *m_snapshot_delta
;
788 ceph_assert(!m_snap_ids
.empty());
789 librados::snap_t start_snap_id
= 0;
790 librados::snap_t first_snap_id
= *m_snap_ids
.begin();
791 librados::snap_t last_snap_id
= *m_snap_ids
.rbegin();
794 // the object does not exist -- mark the missing extents
795 zero_extent(first_snap_id
, true);
799 lderr(cct
) << "failed to retrieve object snapshot list: " << cpp_strerror(r
)
805 // helper function requires the librados legacy data structure
806 librados::snap_set_t snap_set
;
807 convert_snap_set(m_snap_set
, &snap_set
);
809 bool initial_extents_written
= false;
811 interval_set
<uint64_t> object_interval
;
812 for (auto& object_extent
: m_object_extents
) {
813 object_interval
.insert(object_extent
.first
, object_extent
.second
);
815 ldout(cct
, 20) << "object_interval=" << object_interval
<< dendl
;
817 // loop through all expected snapshots and build interval sets for
818 // data and zeroed ranges for each snapshot
819 uint64_t prev_end_size
= 0;
820 interval_set
<uint64_t> initial_written_extents
;
821 for (auto end_snap_id
: m_snap_ids
) {
822 if (start_snap_id
== end_snap_id
) {
824 } else if (end_snap_id
> last_snap_id
) {
828 interval_set
<uint64_t> diff
;
831 librados::snap_t clone_end_snap_id
;
832 bool read_whole_object
;
833 calc_snap_set_diff(cct
, snap_set
, start_snap_id
,
834 end_snap_id
, &diff
, &end_size
, &exists
,
835 &clone_end_snap_id
, &read_whole_object
);
837 if (read_whole_object
||
839 ((m_list_snaps_flags
& LIST_SNAPS_FLAG_WHOLE_OBJECT
) != 0))) {
840 ldout(cct
, 1) << "need to read full object" << dendl
;
842 diff
.insert(0, image_ctx
->layout
.object_size
);
843 end_size
= image_ctx
->layout
.object_size
;
844 clone_end_snap_id
= end_snap_id
;
845 } else if (!exists
) {
850 // reads should be issued against the newest (existing) snapshot within
851 // the associated snapshot object clone. writes should be issued
852 // against the oldest snapshot in the snap_map.
853 ceph_assert(clone_end_snap_id
>= end_snap_id
);
854 if (clone_end_snap_id
> last_snap_id
) {
855 // do not read past the copy point snapshot
856 clone_end_snap_id
= last_snap_id
;
860 // clip diff to current object extent
861 interval_set
<uint64_t> diff_interval
;
862 diff_interval
.intersection_of(object_interval
, diff
);
864 // clip diff to size of object (in case it was truncated)
865 interval_set
<uint64_t> zero_interval
;
866 if (end_size
< prev_end_size
) {
867 zero_interval
.insert(end_size
, prev_end_size
- end_size
);
868 zero_interval
.intersection_of(object_interval
);
870 interval_set
<uint64_t> trunc_interval
;
871 trunc_interval
.intersection_of(zero_interval
, diff_interval
);
872 if (!trunc_interval
.empty()) {
873 diff_interval
.subtract(trunc_interval
);
874 ldout(cct
, 20) << "clearing truncate diff: " << trunc_interval
<< dendl
;
878 ldout(cct
, 20) << "start_snap_id=" << start_snap_id
<< ", "
879 << "end_snap_id=" << end_snap_id
<< ", "
880 << "clone_end_snap_id=" << clone_end_snap_id
<< ", "
881 << "diff=" << diff
<< ", "
882 << "diff_interval=" << diff_interval
<< ", "
883 << "zero_interval=" << zero_interval
<< ", "
884 << "end_size=" << end_size
<< ", "
885 << "prev_end_size=" << prev_end_size
<< ", "
886 << "exists=" << exists
<< ", "
887 << "whole_object=" << read_whole_object
<< dendl
;
889 // check if object exists prior to start of incremental snap delta so that
890 // we don't DNE the object if no additional deltas exist
891 if (exists
&& start_snap_id
== 0 &&
892 (!diff_interval
.empty() || !zero_interval
.empty())) {
893 ldout(cct
, 20) << "object exists at snap id " << end_snap_id
<< dendl
;
894 initial_extents_written
= true;
897 prev_end_size
= end_size
;
898 start_snap_id
= end_snap_id
;
900 if (end_snap_id
<= first_snap_id
) {
901 // don't include deltas from the starting snapshots, but we iterate over
902 // it to track its existence and size
903 ldout(cct
, 20) << "skipping prior snapshot " << dendl
;
908 for (auto& interval
: diff_interval
) {
909 snapshot_delta
[{end_snap_id
, clone_end_snap_id
}].insert(
910 interval
.first
, interval
.second
,
911 SparseExtent(SPARSE_EXTENT_STATE_DATA
, interval
.second
));
914 zero_interval
.union_of(diff_interval
);
917 if ((m_list_snaps_flags
& LIST_SNAPS_FLAG_IGNORE_ZEROED_EXTENTS
) == 0) {
918 for (auto& interval
: zero_interval
) {
919 snapshot_delta
[{end_snap_id
, end_snap_id
}].insert(
920 interval
.first
, interval
.second
,
921 SparseExtent(SPARSE_EXTENT_STATE_ZEROED
, interval
.second
));
926 bool snapshot_delta_empty
= snapshot_delta
.empty();
927 if (!initial_extents_written
) {
928 zero_extent(first_snap_id
, first_snap_id
> 0);
930 ldout(cct
, 20) << "snapshot_delta=" << snapshot_delta
<< dendl
;
932 if (snapshot_delta_empty
) {
// Continue the snapshot listing against the parent image when applicable.
//
// Under an exclusive image_lock, the parent is skipped when the first
// requested snap id is greater than 0, when the image has no parent, or when
// LIST_SNAPS_FLAG_DISABLE_LIST_FROM_PARENT is set. Otherwise the requested
// object extents are mapped to image extents, the parent overlap at the last
// requested snap id is looked up and the extents are pruned against it; a
// zero object overlap also skips the parent. For a non-zero overlap an
// ImageListSnapsRequest is built against the parent for the snap range
// {0, parent snap_id}, with LIST_SNAPS_FLAG_IGNORE_ZEROED_EXTENTS added to
// the flags, writing into m_parent_snapshot_delta and completing through
// handle_list_from_parent.
940 template <typename I
>
941 void ObjectListSnapsRequest
<I
>::list_from_parent() {
942 I
*image_ctx
= this->m_ictx
;
943 auto cct
= image_ctx
->cct
;
945 ceph_assert(!m_snap_ids
.empty());
946 librados::snap_t snap_id_start
= *m_snap_ids
.begin();
947 librados::snap_t snap_id_end
= *m_snap_ids
.rbegin();
949 std::unique_lock image_locker
{image_ctx
->image_lock
};
950 if ((snap_id_start
> 0) || (image_ctx
->parent
== nullptr) ||
951 ((m_list_snaps_flags
& LIST_SNAPS_FLAG_DISABLE_LIST_FROM_PARENT
) != 0)) {
952 image_locker
.unlock();
958 // calculate reverse mapping onto the parent image
959 Extents parent_image_extents
;
960 for (auto [object_off
, object_len
]: m_object_extents
) {
961 io::util::extent_to_file(image_ctx
, this->m_object_no
, object_off
,
962 object_len
, parent_image_extents
);
965 uint64_t parent_overlap
= 0;
966 uint64_t object_overlap
= 0;
967 int r
= image_ctx
->get_parent_overlap(snap_id_end
, &parent_overlap
);
969 object_overlap
= image_ctx
->prune_parent_extents(parent_image_extents
,
973 if (object_overlap
== 0) {
974 image_locker
.unlock();
980 auto ctx
= create_context_callback
<
981 ObjectListSnapsRequest
<I
>,
982 &ObjectListSnapsRequest
<I
>::handle_list_from_parent
>(this);
983 auto aio_comp
= AioCompletion::create_and_start(
984 ctx
, librbd::util::get_image_ctx(image_ctx
->parent
), AIO_TYPE_GENERIC
);
985 ldout(cct
, 20) << "aio_comp=" << aio_comp
<< ", "
986 << "parent_image_extents " << parent_image_extents
<< dendl
;
988 auto list_snaps_flags
= (
989 m_list_snaps_flags
| LIST_SNAPS_FLAG_IGNORE_ZEROED_EXTENTS
);
991 ImageListSnapsRequest
<I
> req(
992 *image_ctx
->parent
, aio_comp
, std::move(parent_image_extents
),
993 {0, image_ctx
->parent
->snap_id
}, list_snaps_flags
, &m_parent_snapshot_delta
,
// Completion handler for the parent listing. An empty parent delta is
// handled as a special case. Otherwise *m_snapshot_delta is reset and every
// image extent from m_parent_snapshot_delta is mapped back to object extents
// through file_to_extents (each is asserted to belong to this object) and
// recorded, with its original state, under the INITIAL_WRITE_READ_SNAP_IDS
// key.
998 template <typename I
>
999 void ObjectListSnapsRequest
<I
>::handle_list_from_parent(int r
) {
1000 I
*image_ctx
= this->m_ictx
;
1001 auto cct
= image_ctx
->cct
;
1003 ldout(cct
, 20) << "r=" << r
<< ", "
1004 << "parent_snapshot_delta=" << m_parent_snapshot_delta
1007 // ignore special-case of fully empty dataset (we ignore zeroes)
1008 if (m_parent_snapshot_delta
.empty()) {
1013 // the write/read snapshot id key is not useful for parent images so
1014 // map to the special-case INITIAL_WRITE_READ_SNAP_IDS key
1015 *m_snapshot_delta
= {};
1016 auto& intervals
= (*m_snapshot_delta
)[INITIAL_WRITE_READ_SNAP_IDS
];
1017 for (auto& [key
, image_extents
] : m_parent_snapshot_delta
) {
1018 for (auto image_extent
: image_extents
) {
1019 auto state
= image_extent
.get_val().state
;
1021 // map image-extents back to this object
1022 striper::LightweightObjectExtents object_extents
;
1023 io::util::file_to_extents(image_ctx
, image_extent
.get_off(),
1024 image_extent
.get_len(), 0, &object_extents
);
1025 for (auto& object_extent
: object_extents
) {
1026 ceph_assert(object_extent
.object_no
== this->m_object_no
);
1028 object_extent
.offset
, object_extent
.length
,
1029 {state
, object_extent
.length
});
1034 ldout(cct
, 20) << "snapshot_delta=" << *m_snapshot_delta
<< dendl
;
// Record the requested object extents as missing for the given snapshot.
// Nothing is recorded when LIST_SNAPS_FLAG_IGNORE_ZEROED_EXTENTS is set.
// Otherwise m_object_extents is folded into an interval set and every
// resulting interval is inserted into *m_snapshot_delta under the key
// {snap_id, snap_id}, with state SPARSE_EXTENT_STATE_DNE when dne is true and
// SPARSE_EXTENT_STATE_ZEROED when it is false.
1038 template <typename I
>
1039 void ObjectListSnapsRequest
<I
>::zero_extent(uint64_t snap_id
, bool dne
) {
1040 I
*image_ctx
= this->m_ictx
;
1041 auto cct
= image_ctx
->cct
;
1043 // the object does not exist or is (partially) under whiteout -- mark the
1044 // missing extents which would be any portion of the object that does not
1045 // have data in the initial snapshot set
1046 if ((m_list_snaps_flags
& LIST_SNAPS_FLAG_IGNORE_ZEROED_EXTENTS
) == 0) {
1047 interval_set
<uint64_t> interval
;
1048 for (auto [object_offset
, object_length
] : m_object_extents
) {
1049 interval
.insert(object_offset
, object_length
);
1052 for (auto [offset
, length
] : interval
) {
1053 ldout(cct
, 20) << "snapshot " << snap_id
<< ": "
1054 << (dne
? "DNE" : "zeroed") << " extent "
1055 << offset
<< "~" << length
<< dendl
;
1056 (*m_snapshot_delta
)[{snap_id
, snap_id
}].insert(
1059 (dne
? SPARSE_EXTENT_STATE_DNE
: SPARSE_EXTENT_STATE_ZEROED
),
1066 } // namespace librbd
// Explicit instantiations of every request template in this file for
// librbd::ImageCtx.
1068 template class librbd::io::ObjectRequest
<librbd::ImageCtx
>;
1069 template class librbd::io::ObjectReadRequest
<librbd::ImageCtx
>;
1070 template class librbd::io::AbstractObjectWriteRequest
<librbd::ImageCtx
>;
1071 template class librbd::io::ObjectWriteRequest
<librbd::ImageCtx
>;
1072 template class librbd::io::ObjectDiscardRequest
<librbd::ImageCtx
>;
1073 template class librbd::io::ObjectWriteSameRequest
<librbd::ImageCtx
>;
1074 template class librbd::io::ObjectCompareAndWriteRequest
<librbd::ImageCtx
>;
1075 template class librbd::io::ObjectListSnapsRequest
<librbd::ImageCtx
>;