]> git.proxmox.com Git - ceph.git/blame - ceph/src/librbd/io/ObjectRequest.cc
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / librbd / io / ObjectRequest.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
4#include "librbd/io/ObjectRequest.h"
5#include "common/ceph_context.h"
6#include "common/dout.h"
7#include "common/errno.h"
9f95a23c 8#include "common/ceph_mutex.h"
7c673cae 9#include "include/Context.h"
c07f9fc5 10#include "include/err.h"
f67539c2 11#include "include/neorados/RADOS.hpp"
11fdf7f2 12#include "osd/osd_types.h"
f67539c2
TL
13#include "librados/snap_set_diff.h"
14#include "librbd/AsioEngine.h"
7c673cae
FG
15#include "librbd/ExclusiveLock.h"
16#include "librbd/ImageCtx.h"
17#include "librbd/ObjectMap.h"
18#include "librbd/Utils.h"
f67539c2 19#include "librbd/asio/Utils.h"
7c673cae
FG
20#include "librbd/io/AioCompletion.h"
21#include "librbd/io/CopyupRequest.h"
f67539c2 22#include "librbd/io/ImageRequest.h"
f91f0fd5 23#include "librbd/io/Utils.h"
7c673cae 24
7c673cae
FG
25#include <boost/optional.hpp>
26
27#define dout_subsys ceph_subsys_rbd
28#undef dout_prefix
9f95a23c
TL
29#define dout_prefix *_dout << "librbd::io::ObjectRequest: " << this \
30 << " " << __func__ << ": " \
31 << data_object_name(this->m_ictx, \
32 this->m_object_no) << " "
7c673cae
FG
33
34namespace librbd {
35namespace io {
36
9f95a23c 37using librbd::util::data_object_name;
f91f0fd5 38using librbd::util::create_context_callback;
f91f0fd5 39using librbd::util::create_trace;
9f95a23c 40
b32b8144 41namespace {
7c673cae
FG
42
43template <typename I>
f67539c2 44inline bool is_copy_on_read(I *ictx, const IOContext& io_context) {
9f95a23c 45 std::shared_lock image_locker{ictx->image_lock};
f67539c2
TL
46 return (ictx->clone_copy_on_read && !ictx->read_only &&
47 io_context->read_snap().value_or(CEPH_NOSNAP) == CEPH_NOSNAP &&
b32b8144
FG
48 (ictx->exclusive_lock == nullptr ||
49 ictx->exclusive_lock->is_lock_owner()));
7c673cae
FG
50}
51
f67539c2
TL
/**
 * Copy a snap set from one representation into another.
 *
 * Both types only need a `seq` member and a `clones` sequence whose
 * elements expose `cloneid`, `snaps`, `overlap` and `size`. Clones are
 * appended to whatever @p dst_snap_set->clones already holds.
 */
template <typename S, typename D>
void convert_snap_set(const S& src_snap_set,
                      D* dst_snap_set) {
  dst_snap_set->seq = src_snap_set.seq;

  auto& out_clones = dst_snap_set->clones;
  out_clones.reserve(src_snap_set.clones.size());
  for (auto& in_clone : src_snap_set.clones) {
    // default-construct in place, then copy the fields one by one
    out_clones.emplace_back();
    auto& out_clone = out_clones.back();
    out_clone.cloneid = in_clone.cloneid;
    out_clone.snaps = in_clone.snaps;
    out_clone.overlap = in_clone.overlap;
    out_clone.size = in_clone.size;
  }
}
66
b32b8144 67} // anonymous namespace
3efd9988 68
7c673cae
FG
69template <typename I>
70ObjectRequest<I>*
9f95a23c
TL
71ObjectRequest<I>::create_write(
72 I *ictx, uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data,
f67539c2
TL
73 IOContext io_context, int op_flags, int write_flags,
74 std::optional<uint64_t> assert_version,
9f95a23c
TL
75 const ZTracer::Trace &parent_trace, Context *completion) {
76 return new ObjectWriteRequest<I>(ictx, object_no, object_off,
f67539c2
TL
77 std::move(data), io_context, op_flags,
78 write_flags, assert_version,
11fdf7f2 79 parent_trace, completion);
7c673cae
FG
80}
81
82template <typename I>
83ObjectRequest<I>*
9f95a23c
TL
84ObjectRequest<I>::create_discard(
85 I *ictx, uint64_t object_no, uint64_t object_off, uint64_t object_len,
f67539c2 86 IOContext io_context, int discard_flags,
9f95a23c
TL
87 const ZTracer::Trace &parent_trace, Context *completion) {
88 return new ObjectDiscardRequest<I>(ictx, object_no, object_off,
f67539c2 89 object_len, io_context, discard_flags,
11fdf7f2 90 parent_trace, completion);
7c673cae
FG
91}
92
93template <typename I>
94ObjectRequest<I>*
9f95a23c
TL
95ObjectRequest<I>::create_write_same(
96 I *ictx, uint64_t object_no, uint64_t object_off, uint64_t object_len,
f67539c2 97 ceph::bufferlist&& data, IOContext io_context, int op_flags,
9f95a23c
TL
98 const ZTracer::Trace &parent_trace, Context *completion) {
99 return new ObjectWriteSameRequest<I>(ictx, object_no, object_off,
f67539c2 100 object_len, std::move(data), io_context,
11fdf7f2 101 op_flags, parent_trace, completion);
7c673cae
FG
102}
103
c07f9fc5
FG
104template <typename I>
105ObjectRequest<I>*
9f95a23c
TL
106ObjectRequest<I>::create_compare_and_write(
107 I *ictx, uint64_t object_no, uint64_t object_off,
108 ceph::bufferlist&& cmp_data, ceph::bufferlist&& write_data,
f67539c2 109 IOContext io_context, uint64_t *mismatch_offset, int op_flags,
9f95a23c
TL
110 const ZTracer::Trace &parent_trace, Context *completion) {
111 return new ObjectCompareAndWriteRequest<I>(ictx, object_no, object_off,
11fdf7f2 112 std::move(cmp_data),
f67539c2 113 std::move(write_data), io_context,
b32b8144
FG
114 mismatch_offset, op_flags,
115 parent_trace, completion);
c07f9fc5
FG
116}
117
7c673cae 118template <typename I>
9f95a23c 119ObjectRequest<I>::ObjectRequest(
f67539c2
TL
120 I *ictx, uint64_t objectno, IOContext io_context,
121 const char *trace_name, const ZTracer::Trace &trace, Context *completion)
122 : m_ictx(ictx), m_object_no(objectno), m_io_context(io_context),
123 m_completion(completion),
f91f0fd5 124 m_trace(create_trace(*ictx, "", trace)) {
eafe8130 125 ceph_assert(m_ictx->data_ctx.is_valid());
31f18b77 126 if (m_trace.valid()) {
9f95a23c
TL
127 m_trace.copy_name(trace_name + std::string(" ") +
128 data_object_name(ictx, objectno));
31f18b77
FG
129 m_trace.event("start");
130 }
7c673cae
FG
131}
132
133template <typename I>
f67539c2
TL
134void ObjectRequest<I>::add_write_hint(I& image_ctx, neorados::WriteOp* wr) {
135 auto alloc_hint_flags = static_cast<neorados::alloc_hint::alloc_hint_t>(
136 image_ctx.alloc_hint_flags);
b32b8144 137 if (image_ctx.enable_alloc_hint) {
f67539c2
TL
138 wr->set_alloc_hint(image_ctx.get_object_size(),
139 image_ctx.get_object_size(),
140 alloc_hint_flags);
92f5a8d4 141 } else if (image_ctx.alloc_hint_flags != 0U) {
f67539c2 142 wr->set_alloc_hint(0, 0, alloc_hint_flags);
7c673cae
FG
143 }
144}
145
146template <typename I>
11fdf7f2 147bool ObjectRequest<I>::compute_parent_extents(Extents *parent_extents,
1e59de90 148 ImageArea *area,
11fdf7f2 149 bool read_request) {
9f95a23c 150 ceph_assert(ceph_mutex_is_locked(m_ictx->image_lock));
7c673cae 151
b32b8144
FG
152 m_has_parent = false;
153 parent_extents->clear();
1e59de90 154 *area = ImageArea::DATA;
b32b8144 155
1e59de90 156 uint64_t raw_overlap;
f67539c2 157 int r = m_ictx->get_parent_overlap(
1e59de90 158 m_io_context->read_snap().value_or(CEPH_NOSNAP), &raw_overlap);
7c673cae
FG
159 if (r < 0) {
160 // NOTE: it's possible for a snapshot to be deleted while we are
161 // still reading from it
162 lderr(m_ictx->cct) << "failed to retrieve parent overlap: "
b32b8144
FG
163 << cpp_strerror(r) << dendl;
164 return false;
11fdf7f2 165 }
1e59de90
TL
166 bool migration_write = !read_request && !m_ictx->migration_info.empty();
167 if (migration_write) {
168 raw_overlap = m_ictx->migration_info.overlap;
11fdf7f2 169 }
1e59de90 170 if (raw_overlap == 0) {
7c673cae
FG
171 return false;
172 }
173
1e59de90
TL
174 std::tie(*parent_extents, *area) = io::util::object_to_area_extents(
175 m_ictx, m_object_no, {{0, m_ictx->layout.object_size}});
176 uint64_t object_overlap = m_ictx->prune_parent_extents(
177 *parent_extents, *area, raw_overlap, migration_write);
7c673cae 178 if (object_overlap > 0) {
1e59de90 179 m_has_parent = true;
7c673cae
FG
180 return true;
181 }
182 return false;
183}
184
b32b8144
FG
185template <typename I>
186void ObjectRequest<I>::async_finish(int r) {
187 ldout(m_ictx->cct, 20) << "r=" << r << dendl;
f67539c2 188 m_ictx->asio_engine->post([this, r]() { finish(r); });
b32b8144
FG
189}
190
191template <typename I>
192void ObjectRequest<I>::finish(int r) {
193 ldout(m_ictx->cct, 20) << "r=" << r << dendl;
194 m_completion->complete(r);
195 delete this;
7c673cae
FG
196}
197
198/** read **/
199
200template <typename I>
9f95a23c 201ObjectReadRequest<I>::ObjectReadRequest(
f67539c2
TL
202 I *ictx, uint64_t objectno, ReadExtents* extents,
203 IOContext io_context, int op_flags, int read_flags,
204 const ZTracer::Trace &parent_trace, uint64_t* version,
205 Context *completion)
206 : ObjectRequest<I>(ictx, objectno, io_context, "read", parent_trace,
207 completion),
208 m_extents(extents), m_op_flags(op_flags),m_read_flags(read_flags),
209 m_version(version) {
7c673cae
FG
210}
211
212template <typename I>
b32b8144
FG
213void ObjectReadRequest<I>::send() {
214 I *image_ctx = this->m_ictx;
215 ldout(image_ctx->cct, 20) << dendl;
7c673cae 216
11fdf7f2 217 read_object();
b32b8144
FG
218}
219
220template <typename I>
221void ObjectReadRequest<I>::read_object() {
222 I *image_ctx = this->m_ictx;
b32b8144 223
f67539c2
TL
224 std::shared_lock image_locker{image_ctx->image_lock};
225 auto read_snap_id = this->m_io_context->read_snap().value_or(CEPH_NOSNAP);
226 if (read_snap_id == image_ctx->snap_id &&
227 image_ctx->object_map != nullptr &&
228 !image_ctx->object_map->object_may_exist(this->m_object_no)) {
229 image_ctx->asio_engine->post([this]() { read_parent(); });
230 return;
7c673cae 231 }
f67539c2 232 image_locker.unlock();
7c673cae 233
f67539c2 234 ldout(image_ctx->cct, 20) << "snap_id=" << read_snap_id << dendl;
7c673cae 235
f67539c2
TL
236 neorados::ReadOp read_op;
237 for (auto& extent: *this->m_extents) {
238 if (extent.length >= image_ctx->sparse_read_threshold_bytes) {
239 read_op.sparse_read(extent.offset, extent.length, &extent.bl,
240 &extent.extent_map);
241 } else {
242 read_op.read(extent.offset, extent.length, &extent.bl);
243 }
244 }
245 util::apply_op_flags(
246 m_op_flags, image_ctx->get_read_flags(read_snap_id), &read_op);
247
248 image_ctx->rados_api.execute(
249 {data_object_name(this->m_ictx, this->m_object_no)},
250 *this->m_io_context, std::move(read_op), nullptr,
251 librbd::asio::util::get_callback_adapter(
252 [this](int r) { handle_read_object(r); }), m_version,
253 (this->m_trace.valid() ? this->m_trace.get_info() : nullptr));
7c673cae
FG
254}
255
256template <typename I>
b32b8144
FG
257void ObjectReadRequest<I>::handle_read_object(int r) {
258 I *image_ctx = this->m_ictx;
259 ldout(image_ctx->cct, 20) << "r=" << r << dendl;
f67539c2
TL
260 if (m_version != nullptr) {
261 ldout(image_ctx->cct, 20) << "version=" << *m_version << dendl;
262 }
b32b8144
FG
263
264 if (r == -ENOENT) {
265 read_parent();
266 return;
267 } else if (r < 0) {
268 lderr(image_ctx->cct) << "failed to read from object: "
269 << cpp_strerror(r) << dendl;
270 this->finish(r);
271 return;
272 }
7c673cae 273
b32b8144
FG
274 this->finish(0);
275}
276
277template <typename I>
278void ObjectReadRequest<I>::read_parent() {
f67539c2
TL
279 if ((m_read_flags & READ_FLAG_DISABLE_READ_FROM_PARENT) != 0) {
280 this->finish(-ENOENT);
281 return;
282 }
283
b32b8144 284 I *image_ctx = this->m_ictx;
b32b8144
FG
285 ldout(image_ctx->cct, 20) << dendl;
286
f91f0fd5
TL
287 auto ctx = create_context_callback<
288 ObjectReadRequest<I>, &ObjectReadRequest<I>::handle_read_parent>(this);
289
f67539c2
TL
290 io::util::read_parent<I>(
291 image_ctx, this->m_object_no, this->m_extents,
292 this->m_io_context->read_snap().value_or(CEPH_NOSNAP), this->m_trace,
293 ctx);
b32b8144
FG
294}
295
296template <typename I>
297void ObjectReadRequest<I>::handle_read_parent(int r) {
298 I *image_ctx = this->m_ictx;
299 ldout(image_ctx->cct, 20) << "r=" << r << dendl;
300
301 if (r == -ENOENT) {
302 this->finish(r);
303 return;
304 } else if (r < 0) {
305 lderr(image_ctx->cct) << "failed to read parent extents: "
306 << cpp_strerror(r) << dendl;
307 this->finish(r);
308 return;
309 }
310
311 copyup();
312}
313
314template <typename I>
315void ObjectReadRequest<I>::copyup() {
316 I *image_ctx = this->m_ictx;
f67539c2 317 if (!is_copy_on_read(image_ctx, this->m_io_context)) {
b32b8144
FG
318 this->finish(0);
319 return;
320 }
321
9f95a23c
TL
322 image_ctx->owner_lock.lock_shared();
323 image_ctx->image_lock.lock_shared();
b32b8144 324 Extents parent_extents;
1e59de90
TL
325 ImageArea area;
326 if (!this->compute_parent_extents(&parent_extents, &area, true) ||
b32b8144
FG
327 (image_ctx->exclusive_lock != nullptr &&
328 !image_ctx->exclusive_lock->is_lock_owner())) {
9f95a23c
TL
329 image_ctx->image_lock.unlock_shared();
330 image_ctx->owner_lock.unlock_shared();
b32b8144
FG
331 this->finish(0);
332 return;
333 }
334
335 ldout(image_ctx->cct, 20) << dendl;
336
9f95a23c 337 image_ctx->copyup_list_lock.lock();
b32b8144 338 auto it = image_ctx->copyup_list.find(this->m_object_no);
7c673cae
FG
339 if (it == image_ctx->copyup_list.end()) {
340 // create and kick off a CopyupRequest
b32b8144 341 auto new_req = CopyupRequest<I>::create(
1e59de90
TL
342 image_ctx, this->m_object_no, std::move(parent_extents), area,
343 this->m_trace);
7c673cae
FG
344
345 image_ctx->copyup_list[this->m_object_no] = new_req;
9f95a23c
TL
346 image_ctx->copyup_list_lock.unlock();
347 image_ctx->image_lock.unlock_shared();
7c673cae 348 new_req->send();
11fdf7f2 349 } else {
9f95a23c
TL
350 image_ctx->copyup_list_lock.unlock();
351 image_ctx->image_lock.unlock_shared();
7c673cae 352 }
7c673cae 353
9f95a23c 354 image_ctx->owner_lock.unlock_shared();
b32b8144 355 this->finish(0);
7c673cae
FG
356}
357
358/** write **/
359
b32b8144
FG
360template <typename I>
361AbstractObjectWriteRequest<I>::AbstractObjectWriteRequest(
9f95a23c 362 I *ictx, uint64_t object_no, uint64_t object_off, uint64_t len,
f67539c2 363 IOContext io_context, const char *trace_name,
b32b8144 364 const ZTracer::Trace &parent_trace, Context *completion)
f67539c2
TL
365 : ObjectRequest<I>(ictx, object_no, io_context, trace_name, parent_trace,
366 completion),
367 m_object_off(object_off), m_object_len(len)
7c673cae 368{
b32b8144
FG
369 if (this->m_object_off == 0 &&
370 this->m_object_len == ictx->get_object_size()) {
371 m_full_object = true;
372 }
7c673cae 373
11fdf7f2 374 compute_parent_info();
81eedcae 375
9f95a23c 376 ictx->image_lock.lock_shared();
81eedcae
TL
377 if (!ictx->migration_info.empty()) {
378 m_guarding_migration_write = true;
379 }
9f95a23c 380 ictx->image_lock.unlock_shared();
11fdf7f2
TL
381}
382
383template <typename I>
384void AbstractObjectWriteRequest<I>::compute_parent_info() {
385 I *image_ctx = this->m_ictx;
9f95a23c 386 std::shared_lock image_locker{image_ctx->image_lock};
11fdf7f2 387
1e59de90 388 this->compute_parent_extents(&m_parent_extents, &m_image_area, false);
11fdf7f2 389
b32b8144 390 if (!this->has_parent() ||
f67539c2
TL
391 (m_full_object &&
392 !this->m_io_context->write_snap_context() &&
393 !is_post_copyup_write_required())) {
11fdf7f2 394 m_copyup_enabled = false;
7c673cae 395 }
b32b8144 396}
7c673cae 397
b32b8144
FG
398template <typename I>
399void AbstractObjectWriteRequest<I>::add_write_hint(
f67539c2 400 neorados::WriteOp *wr) {
b32b8144 401 I *image_ctx = this->m_ictx;
9f95a23c 402 std::shared_lock image_locker{image_ctx->image_lock};
f67539c2
TL
403 if (image_ctx->object_map == nullptr || !this->m_object_may_exist ||
404 image_ctx->alloc_hint_flags != 0U) {
b32b8144
FG
405 ObjectRequest<I>::add_write_hint(*image_ctx, wr);
406 }
7c673cae
FG
407}
408
b32b8144
FG
409template <typename I>
410void AbstractObjectWriteRequest<I>::send() {
411 I *image_ctx = this->m_ictx;
9f95a23c 412 ldout(image_ctx->cct, 20) << this->get_op_type() << " "
b32b8144
FG
413 << this->m_object_off << "~" << this->m_object_len
414 << dendl;
7c673cae 415 {
9f95a23c 416 std::shared_lock image_lock{image_ctx->image_lock};
b32b8144
FG
417 if (image_ctx->object_map == nullptr) {
418 m_object_may_exist = true;
7c673cae
FG
419 } else {
420 // should have been flushed prior to releasing lock
11fdf7f2 421 ceph_assert(image_ctx->exclusive_lock->is_lock_owner());
b32b8144
FG
422 m_object_may_exist = image_ctx->object_map->object_may_exist(
423 this->m_object_no);
7c673cae
FG
424 }
425 }
426
b32b8144
FG
427 if (!m_object_may_exist && is_no_op_for_nonexistent_object()) {
428 ldout(image_ctx->cct, 20) << "skipping no-op on nonexistent object"
429 << dendl;
430 this->async_finish(0);
431 return;
7c673cae
FG
432 }
433
b32b8144 434 pre_write_object_map_update();
7c673cae
FG
435}
436
b32b8144
FG
437template <typename I>
438void AbstractObjectWriteRequest<I>::pre_write_object_map_update() {
439 I *image_ctx = this->m_ictx;
7c673cae 440
9f95a23c 441 image_ctx->image_lock.lock_shared();
b32b8144 442 if (image_ctx->object_map == nullptr || !is_object_map_update_enabled()) {
9f95a23c 443 image_ctx->image_lock.unlock_shared();
b32b8144
FG
444 write_object();
445 return;
7c673cae
FG
446 }
447
b32b8144
FG
448 if (!m_object_may_exist && m_copyup_enabled) {
449 // optimization: copyup required
9f95a23c 450 image_ctx->image_lock.unlock_shared();
b32b8144
FG
451 copyup();
452 return;
453 }
7c673cae 454
b32b8144 455 uint8_t new_state = this->get_pre_write_object_map_state();
9f95a23c
TL
456 ldout(image_ctx->cct, 20) << this->m_object_off << "~" << this->m_object_len
457 << dendl;
7c673cae 458
b32b8144
FG
459 if (image_ctx->object_map->template aio_update<
460 AbstractObjectWriteRequest<I>,
461 &AbstractObjectWriteRequest<I>::handle_pre_write_object_map_update>(
91327a77
AA
462 CEPH_NOSNAP, this->m_object_no, new_state, {}, this->m_trace, false,
463 this)) {
9f95a23c 464 image_ctx->image_lock.unlock_shared();
b32b8144 465 return;
7c673cae
FG
466 }
467
9f95a23c 468 image_ctx->image_lock.unlock_shared();
b32b8144 469 write_object();
7c673cae
FG
470}
471
b32b8144
FG
472template <typename I>
473void AbstractObjectWriteRequest<I>::handle_pre_write_object_map_update(int r) {
474 I *image_ctx = this->m_ictx;
475 ldout(image_ctx->cct, 20) << "r=" << r << dendl;
11fdf7f2
TL
476 if (r < 0) {
477 lderr(image_ctx->cct) << "failed to update object map: "
478 << cpp_strerror(r) << dendl;
479 this->finish(r);
480 return;
481 }
7c673cae 482
b32b8144 483 write_object();
7c673cae
FG
484}
485
b32b8144
FG
486template <typename I>
487void AbstractObjectWriteRequest<I>::write_object() {
488 I *image_ctx = this->m_ictx;
489 ldout(image_ctx->cct, 20) << dendl;
7c673cae 490
f67539c2 491 neorados::WriteOp write_op;
b32b8144 492 if (m_copyup_enabled) {
81eedcae 493 if (m_guarding_migration_write) {
f67539c2
TL
494 auto snap_seq = (this->m_io_context->write_snap_context() ?
495 this->m_io_context->write_snap_context()->first : 0);
496 ldout(image_ctx->cct, 20) << "guarding write: snap_seq=" << snap_seq
497 << dendl;
498
11fdf7f2 499 cls_client::assert_snapc_seq(
f67539c2 500 &write_op, snap_seq, cls::rbd::ASSERT_SNAPC_SEQ_LE_SNAPSET_SEQ);
11fdf7f2 501 } else {
f67539c2
TL
502 ldout(image_ctx->cct, 20) << "guarding write" << dendl;
503 write_op.assert_exists();
11fdf7f2 504 }
7c673cae
FG
505 }
506
f67539c2
TL
507 add_write_hint(&write_op);
508 add_write_ops(&write_op);
509 ceph_assert(write_op.size() != 0);
7c673cae 510
f67539c2
TL
511 image_ctx->rados_api.execute(
512 {data_object_name(this->m_ictx, this->m_object_no)},
513 *this->m_io_context, std::move(write_op),
514 librbd::asio::util::get_callback_adapter(
515 [this](int r) { handle_write_object(r); }), nullptr,
516 (this->m_trace.valid() ? this->m_trace.get_info() : nullptr));
7c673cae 517}
7c673cae 518
b32b8144
FG
519template <typename I>
520void AbstractObjectWriteRequest<I>::handle_write_object(int r) {
521 I *image_ctx = this->m_ictx;
522 ldout(image_ctx->cct, 20) << "r=" << r << dendl;
523
524 r = filter_write_result(r);
525 if (r == -ENOENT) {
526 if (m_copyup_enabled) {
527 copyup();
528 return;
529 }
11fdf7f2 530 } else if (r == -ERANGE && m_guarding_migration_write) {
9f95a23c 531 image_ctx->image_lock.lock_shared();
81eedcae 532 m_guarding_migration_write = !image_ctx->migration_info.empty();
9f95a23c 533 image_ctx->image_lock.unlock_shared();
81eedcae
TL
534
535 if (m_guarding_migration_write) {
11fdf7f2
TL
536 copyup();
537 } else {
538 ldout(image_ctx->cct, 10) << "migration parent gone, restart io" << dendl;
11fdf7f2
TL
539 compute_parent_info();
540 write_object();
541 }
542 return;
b32b8144
FG
543 } else if (r == -EILSEQ) {
544 ldout(image_ctx->cct, 10) << "failed to write object" << dendl;
545 this->finish(r);
546 return;
547 } else if (r < 0) {
548 lderr(image_ctx->cct) << "failed to write object: " << cpp_strerror(r)
549 << dendl;
550 this->finish(r);
551 return;
7c673cae
FG
552 }
553
b32b8144 554 post_write_object_map_update();
7c673cae
FG
555}
556
b32b8144
FG
557template <typename I>
558void AbstractObjectWriteRequest<I>::copyup() {
559 I *image_ctx = this->m_ictx;
560 ldout(image_ctx->cct, 20) << dendl;
7c673cae 561
11fdf7f2 562 ceph_assert(!m_copyup_in_progress);
b32b8144 563 m_copyup_in_progress = true;
7c673cae 564
9f95a23c 565 image_ctx->copyup_list_lock.lock();
b32b8144
FG
566 auto it = image_ctx->copyup_list.find(this->m_object_no);
567 if (it == image_ctx->copyup_list.end()) {
568 auto new_req = CopyupRequest<I>::create(
1e59de90
TL
569 image_ctx, this->m_object_no, std::move(this->m_parent_extents),
570 m_image_area, this->m_trace);
b32b8144
FG
571 this->m_parent_extents.clear();
572
573 // make sure to wait on this CopyupRequest
f67539c2 574 new_req->append_request(this, std::move(get_copyup_overwrite_extents()));
b32b8144
FG
575 image_ctx->copyup_list[this->m_object_no] = new_req;
576
9f95a23c 577 image_ctx->copyup_list_lock.unlock();
b32b8144 578 new_req->send();
31f18b77 579 } else {
f67539c2 580 it->second->append_request(this, std::move(get_copyup_overwrite_extents()));
9f95a23c 581 image_ctx->copyup_list_lock.unlock();
31f18b77 582 }
7c673cae
FG
583}
584
b32b8144
FG
585template <typename I>
586void AbstractObjectWriteRequest<I>::handle_copyup(int r) {
587 I *image_ctx = this->m_ictx;
588 ldout(image_ctx->cct, 20) << "r=" << r << dendl;
589
11fdf7f2 590 ceph_assert(m_copyup_in_progress);
b32b8144
FG
591 m_copyup_in_progress = false;
592
81eedcae 593 if (r < 0 && r != -ERESTART) {
b32b8144
FG
594 lderr(image_ctx->cct) << "failed to copyup object: " << cpp_strerror(r)
595 << dendl;
596 this->finish(r);
597 return;
31f18b77 598 }
31f18b77 599
81eedcae 600 if (r == -ERESTART || is_post_copyup_write_required()) {
b32b8144
FG
601 write_object();
602 return;
7c673cae 603 }
b32b8144
FG
604
605 post_write_object_map_update();
7c673cae
FG
606}
607
b32b8144
FG
608template <typename I>
609void AbstractObjectWriteRequest<I>::post_write_object_map_update() {
610 I *image_ctx = this->m_ictx;
611
9f95a23c 612 image_ctx->image_lock.lock_shared();
b32b8144
FG
613 if (image_ctx->object_map == nullptr || !is_object_map_update_enabled() ||
614 !is_non_existent_post_write_object_map_state()) {
9f95a23c 615 image_ctx->image_lock.unlock_shared();
b32b8144
FG
616 this->finish(0);
617 return;
7c673cae
FG
618 }
619
b32b8144 620 ldout(image_ctx->cct, 20) << dendl;
7c673cae 621
b32b8144 622 // should have been flushed prior to releasing lock
11fdf7f2 623 ceph_assert(image_ctx->exclusive_lock->is_lock_owner());
b32b8144
FG
624 if (image_ctx->object_map->template aio_update<
625 AbstractObjectWriteRequest<I>,
626 &AbstractObjectWriteRequest<I>::handle_post_write_object_map_update>(
627 CEPH_NOSNAP, this->m_object_no, OBJECT_NONEXISTENT, OBJECT_PENDING,
91327a77 628 this->m_trace, false, this)) {
9f95a23c 629 image_ctx->image_lock.unlock_shared();
b32b8144 630 return;
7c673cae
FG
631 }
632
9f95a23c 633 image_ctx->image_lock.unlock_shared();
b32b8144 634 this->finish(0);
7c673cae
FG
635}
636
b32b8144
FG
637template <typename I>
638void AbstractObjectWriteRequest<I>::handle_post_write_object_map_update(int r) {
639 I *image_ctx = this->m_ictx;
640 ldout(image_ctx->cct, 20) << "r=" << r << dendl;
11fdf7f2
TL
641 if (r < 0) {
642 lderr(image_ctx->cct) << "failed to update object map: "
643 << cpp_strerror(r) << dendl;
644 this->finish(r);
645 return;
646 }
c07f9fc5 647
b32b8144
FG
648 this->finish(0);
649}
c07f9fc5 650
b32b8144 651template <typename I>
f67539c2
TL
652void ObjectWriteRequest<I>::add_write_hint(neorados::WriteOp* wr) {
653 if ((m_write_flags & OBJECT_WRITE_FLAG_CREATE_EXCLUSIVE) != 0) {
654 wr->create(true);
655 } else if (m_assert_version.has_value()) {
656 wr->assert_version(m_assert_version.value());
657 }
658 AbstractObjectWriteRequest<I>::add_write_hint(wr);
659}
660
661template <typename I>
662void ObjectWriteRequest<I>::add_write_ops(neorados::WriteOp* wr) {
b32b8144 663 if (this->m_full_object) {
f67539c2 664 wr->write_full(bufferlist{m_write_data});
c07f9fc5 665 } else {
f67539c2 666 wr->write(this->m_object_off, bufferlist{m_write_data});
c07f9fc5 667 }
f67539c2 668 util::apply_op_flags(m_op_flags, 0U, wr);
c07f9fc5
FG
669}
670
b32b8144 671template <typename I>
f67539c2
TL
672void ObjectDiscardRequest<I>::add_write_ops(neorados::WriteOp* wr) {
673 switch (m_discard_action) {
674 case DISCARD_ACTION_REMOVE:
675 wr->remove();
676 break;
677 case DISCARD_ACTION_REMOVE_TRUNCATE:
678 wr->create(false);
679 // fall through
680 case DISCARD_ACTION_TRUNCATE:
681 wr->truncate(this->m_object_off);
682 break;
683 case DISCARD_ACTION_ZERO:
684 wr->zero(this->m_object_off, this->m_object_len);
685 break;
686 default:
687 ceph_abort();
688 break;
689 }
c07f9fc5
FG
690}
691
b32b8144 692template <typename I>
f67539c2
TL
693void ObjectWriteSameRequest<I>::add_write_ops(neorados::WriteOp* wr) {
694 wr->writesame(this->m_object_off, this->m_object_len,
695 bufferlist{m_write_data});
696 util::apply_op_flags(m_op_flags, 0U, wr);
697}
698
699template <typename I>
700void ObjectCompareAndWriteRequest<I>::add_write_ops(neorados::WriteOp* wr) {
701 wr->cmpext(this->m_object_off, bufferlist{m_cmp_bl}, nullptr);
c07f9fc5 702
b32b8144 703 if (this->m_full_object) {
f67539c2 704 wr->write_full(bufferlist{m_write_bl});
b32b8144 705 } else {
f67539c2 706 wr->write(this->m_object_off, bufferlist{m_write_bl});
b32b8144 707 }
f67539c2 708 util::apply_op_flags(m_op_flags, 0U, wr);
b32b8144 709}
c07f9fc5 710
b32b8144
FG
711template <typename I>
712int ObjectCompareAndWriteRequest<I>::filter_write_result(int r) const {
713 if (r <= -MAX_ERRNO) {
714 I *image_ctx = this->m_ictx;
b32b8144
FG
715
716 // object extent compare mismatch
717 uint64_t offset = -MAX_ERRNO - r;
1e59de90
TL
718 auto [image_extents, _] = io::util::object_to_area_extents(
719 image_ctx, this->m_object_no, {{offset, this->m_object_len}});
11fdf7f2 720 ceph_assert(image_extents.size() == 1);
b32b8144
FG
721
722 if (m_mismatch_offset) {
723 *m_mismatch_offset = image_extents[0].first;
c07f9fc5 724 }
b32b8144 725 r = -EILSEQ;
c07f9fc5 726 }
b32b8144 727 return r;
c07f9fc5
FG
728}
729
f67539c2
TL
730template <typename I>
731ObjectListSnapsRequest<I>::ObjectListSnapsRequest(
732 I *ictx, uint64_t objectno, Extents&& object_extents, SnapIds&& snap_ids,
733 int list_snaps_flags, const ZTracer::Trace &parent_trace,
734 SnapshotDelta* snapshot_delta, Context *completion)
735 : ObjectRequest<I>(
736 ictx, objectno, ictx->duplicate_data_io_context(), "snap_list",
737 parent_trace, completion),
738 m_object_extents(std::move(object_extents)),
739 m_snap_ids(std::move(snap_ids)), m_list_snaps_flags(list_snaps_flags),
740 m_snapshot_delta(snapshot_delta) {
741 this->m_io_context->read_snap(CEPH_SNAPDIR);
742}
743
744template <typename I>
745void ObjectListSnapsRequest<I>::send() {
746 I *image_ctx = this->m_ictx;
747 ldout(image_ctx->cct, 20) << dendl;
748
749 if (m_snap_ids.size() < 2) {
750 lderr(image_ctx->cct) << "invalid snap ids: " << m_snap_ids << dendl;
751 this->async_finish(-EINVAL);
752 return;
753 }
754
755 list_snaps();
756}
757
758template <typename I>
759void ObjectListSnapsRequest<I>::list_snaps() {
760 I *image_ctx = this->m_ictx;
761 ldout(image_ctx->cct, 20) << dendl;
762
763 neorados::ReadOp read_op;
764 read_op.list_snaps(&m_snap_set, &m_ec);
765
766 image_ctx->rados_api.execute(
767 {data_object_name(this->m_ictx, this->m_object_no)},
768 *this->m_io_context, std::move(read_op), nullptr,
769 librbd::asio::util::get_callback_adapter(
770 [this](int r) { handle_list_snaps(r); }), nullptr,
771 (this->m_trace.valid() ? this->m_trace.get_info() : nullptr));
772}
773
774template <typename I>
775void ObjectListSnapsRequest<I>::handle_list_snaps(int r) {
776 I *image_ctx = this->m_ictx;
777 auto cct = image_ctx->cct;
778
779 if (r >= 0) {
780 r = -m_ec.value();
781 }
782
783 ldout(cct, 20) << "r=" << r << dendl;
784
785 m_snapshot_delta->clear();
786 auto& snapshot_delta = *m_snapshot_delta;
787
788 ceph_assert(!m_snap_ids.empty());
789 librados::snap_t start_snap_id = 0;
790 librados::snap_t first_snap_id = *m_snap_ids.begin();
791 librados::snap_t last_snap_id = *m_snap_ids.rbegin();
792
793 if (r == -ENOENT) {
794 // the object does not exist -- mark the missing extents
795 zero_extent(first_snap_id, true);
796 list_from_parent();
797 return;
798 } else if (r < 0) {
799 lderr(cct) << "failed to retrieve object snapshot list: " << cpp_strerror(r)
800 << dendl;
801 this->finish(r);
802 return;
803 }
804
805 // helper function requires the librados legacy data structure
806 librados::snap_set_t snap_set;
807 convert_snap_set(m_snap_set, &snap_set);
808
809 bool initial_extents_written = false;
810
811 interval_set<uint64_t> object_interval;
812 for (auto& object_extent : m_object_extents) {
813 object_interval.insert(object_extent.first, object_extent.second);
814 }
815 ldout(cct, 20) << "object_interval=" << object_interval << dendl;
816
817 // loop through all expected snapshots and build interval sets for
818 // data and zeroed ranges for each snapshot
819 uint64_t prev_end_size = 0;
820 interval_set<uint64_t> initial_written_extents;
821 for (auto end_snap_id : m_snap_ids) {
822 if (start_snap_id == end_snap_id) {
823 continue;
824 } else if (end_snap_id > last_snap_id) {
825 break;
826 }
827
828 interval_set<uint64_t> diff;
829 uint64_t end_size;
830 bool exists;
831 librados::snap_t clone_end_snap_id;
832 bool read_whole_object;
833 calc_snap_set_diff(cct, snap_set, start_snap_id,
834 end_snap_id, &diff, &end_size, &exists,
835 &clone_end_snap_id, &read_whole_object);
836
837 if (read_whole_object ||
838 (!diff.empty() &&
839 ((m_list_snaps_flags & LIST_SNAPS_FLAG_WHOLE_OBJECT) != 0))) {
840 ldout(cct, 1) << "need to read full object" << dendl;
841 diff.clear();
842 diff.insert(0, image_ctx->layout.object_size);
843 end_size = image_ctx->layout.object_size;
844 clone_end_snap_id = end_snap_id;
845 } else if (!exists) {
846 end_size = 0;
847 }
848
849 if (exists) {
850 // reads should be issued against the newest (existing) snapshot within
851 // the associated snapshot object clone. writes should be issued
852 // against the oldest snapshot in the snap_map.
853 ceph_assert(clone_end_snap_id >= end_snap_id);
854 if (clone_end_snap_id > last_snap_id) {
855 // do not read past the copy point snapshot
856 clone_end_snap_id = last_snap_id;
857 }
858 }
859
860 // clip diff to current object extent
861 interval_set<uint64_t> diff_interval;
862 diff_interval.intersection_of(object_interval, diff);
863
864 // clip diff to size of object (in case it was truncated)
865 interval_set<uint64_t> zero_interval;
866 if (end_size < prev_end_size) {
867 zero_interval.insert(end_size, prev_end_size - end_size);
868 zero_interval.intersection_of(object_interval);
869
870 interval_set<uint64_t> trunc_interval;
871 trunc_interval.intersection_of(zero_interval, diff_interval);
872 if (!trunc_interval.empty()) {
873 diff_interval.subtract(trunc_interval);
874 ldout(cct, 20) << "clearing truncate diff: " << trunc_interval << dendl;
875 }
876 }
877
878 ldout(cct, 20) << "start_snap_id=" << start_snap_id << ", "
879 << "end_snap_id=" << end_snap_id << ", "
880 << "clone_end_snap_id=" << clone_end_snap_id << ", "
881 << "diff=" << diff << ", "
882 << "diff_interval=" << diff_interval<< ", "
883 << "zero_interval=" << zero_interval<< ", "
884 << "end_size=" << end_size << ", "
885 << "prev_end_size=" << prev_end_size << ", "
886 << "exists=" << exists << ", "
887 << "whole_object=" << read_whole_object << dendl;
888
889 // check if object exists prior to start of incremental snap delta so that
890 // we don't DNE the object if no additional deltas exist
891 if (exists && start_snap_id == 0 &&
892 (!diff_interval.empty() || !zero_interval.empty())) {
893 ldout(cct, 20) << "object exists at snap id " << end_snap_id << dendl;
894 initial_extents_written = true;
895 }
896
897 prev_end_size = end_size;
898 start_snap_id = end_snap_id;
899
900 if (end_snap_id <= first_snap_id) {
901 // don't include deltas from the starting snapshots, but we iterate over
902 // it to track its existence and size
903 ldout(cct, 20) << "skipping prior snapshot " << dendl;
904 continue;
905 }
906
907 if (exists) {
908 for (auto& interval : diff_interval) {
909 snapshot_delta[{end_snap_id, clone_end_snap_id}].insert(
910 interval.first, interval.second,
911 SparseExtent(SPARSE_EXTENT_STATE_DATA, interval.second));
912 }
913 } else {
914 zero_interval.union_of(diff_interval);
915 }
916
917 if ((m_list_snaps_flags & LIST_SNAPS_FLAG_IGNORE_ZEROED_EXTENTS) == 0) {
918 for (auto& interval : zero_interval) {
919 snapshot_delta[{end_snap_id, end_snap_id}].insert(
920 interval.first, interval.second,
921 SparseExtent(SPARSE_EXTENT_STATE_ZEROED, interval.second));
922 }
923 }
924 }
925
926 bool snapshot_delta_empty = snapshot_delta.empty();
927 if (!initial_extents_written) {
928 zero_extent(first_snap_id, first_snap_id > 0);
929 }
930 ldout(cct, 20) << "snapshot_delta=" << snapshot_delta << dendl;
931
932 if (snapshot_delta_empty) {
933 list_from_parent();
934 return;
935 }
936
937 this->finish(0);
938}
939
940template <typename I>
941void ObjectListSnapsRequest<I>::list_from_parent() {
942 I *image_ctx = this->m_ictx;
943 auto cct = image_ctx->cct;
944
945 ceph_assert(!m_snap_ids.empty());
946 librados::snap_t snap_id_start = *m_snap_ids.begin();
947 librados::snap_t snap_id_end = *m_snap_ids.rbegin();
948
949 std::unique_lock image_locker{image_ctx->image_lock};
950 if ((snap_id_start > 0) || (image_ctx->parent == nullptr) ||
951 ((m_list_snaps_flags & LIST_SNAPS_FLAG_DISABLE_LIST_FROM_PARENT) != 0)) {
952 image_locker.unlock();
953
954 this->finish(0);
955 return;
956 }
957
1e59de90
TL
958 Extents parent_extents;
959 uint64_t raw_overlap = 0;
f67539c2 960 uint64_t object_overlap = 0;
1e59de90
TL
961 image_ctx->get_parent_overlap(snap_id_end, &raw_overlap);
962 if (raw_overlap > 0) {
963 // calculate reverse mapping onto the parent image
964 std::tie(parent_extents, m_image_area) = io::util::object_to_area_extents(
965 image_ctx, this->m_object_no, m_object_extents);
966 object_overlap = image_ctx->prune_parent_extents(
967 parent_extents, m_image_area, raw_overlap, false);
f67539c2 968 }
f67539c2
TL
969 if (object_overlap == 0) {
970 image_locker.unlock();
971
972 this->finish(0);
973 return;
974 }
975
976 auto ctx = create_context_callback<
977 ObjectListSnapsRequest<I>,
978 &ObjectListSnapsRequest<I>::handle_list_from_parent>(this);
979 auto aio_comp = AioCompletion::create_and_start(
980 ctx, librbd::util::get_image_ctx(image_ctx->parent), AIO_TYPE_GENERIC);
1e59de90
TL
981 ldout(cct, 20) << "completion=" << aio_comp
982 << " parent_extents=" << parent_extents
983 << " area=" << m_image_area << dendl;
f67539c2
TL
984
985 auto list_snaps_flags = (
986 m_list_snaps_flags | LIST_SNAPS_FLAG_IGNORE_ZEROED_EXTENTS);
987
988 ImageListSnapsRequest<I> req(
1e59de90 989 *image_ctx->parent, aio_comp, std::move(parent_extents), m_image_area,
f67539c2
TL
990 {0, image_ctx->parent->snap_id}, list_snaps_flags, &m_parent_snapshot_delta,
991 this->m_trace);
992 req.send();
993}
994
995template <typename I>
996void ObjectListSnapsRequest<I>::handle_list_from_parent(int r) {
997 I *image_ctx = this->m_ictx;
998 auto cct = image_ctx->cct;
999
1000 ldout(cct, 20) << "r=" << r << ", "
1001 << "parent_snapshot_delta=" << m_parent_snapshot_delta
1002 << dendl;
1003
1004 // ignore special-case of fully empty dataset (we ignore zeroes)
1005 if (m_parent_snapshot_delta.empty()) {
1006 this->finish(0);
1007 return;
1008 }
1009
1010 // the write/read snapshot id key is not useful for parent images so
1011 // map the the special-case INITIAL_WRITE_READ_SNAP_IDS key
1012 *m_snapshot_delta = {};
1013 auto& intervals = (*m_snapshot_delta)[INITIAL_WRITE_READ_SNAP_IDS];
1014 for (auto& [key, image_extents] : m_parent_snapshot_delta) {
1015 for (auto image_extent : image_extents) {
1016 auto state = image_extent.get_val().state;
1017
1018 // map image-extents back to this object
1019 striper::LightweightObjectExtents object_extents;
1e59de90
TL
1020 io::util::area_to_object_extents(image_ctx, image_extent.get_off(),
1021 image_extent.get_len(), m_image_area, 0,
1022 &object_extents);
f67539c2
TL
1023 for (auto& object_extent : object_extents) {
1024 ceph_assert(object_extent.object_no == this->m_object_no);
1025 intervals.insert(
1026 object_extent.offset, object_extent.length,
1027 {state, object_extent.length});
1028 }
1029 }
1030 }
1031
1032 ldout(cct, 20) << "snapshot_delta=" << *m_snapshot_delta << dendl;
1033 this->finish(0);
1034}
1035
1036template <typename I>
1037void ObjectListSnapsRequest<I>::zero_extent(uint64_t snap_id, bool dne) {
1038 I *image_ctx = this->m_ictx;
1039 auto cct = image_ctx->cct;
1040
1041 // the object does not exist or is (partially) under whiteout -- mark the
1042 // missing extents which would be any portion of the object that does not
1043 // have data in the initial snapshot set
1044 if ((m_list_snaps_flags & LIST_SNAPS_FLAG_IGNORE_ZEROED_EXTENTS) == 0) {
1045 interval_set<uint64_t> interval;
1046 for (auto [object_offset, object_length] : m_object_extents) {
1047 interval.insert(object_offset, object_length);
1048 }
1049
1050 for (auto [offset, length] : interval) {
1051 ldout(cct, 20) << "snapshot " << snap_id << ": "
1052 << (dne ? "DNE" : "zeroed") << " extent "
1053 << offset << "~" << length << dendl;
1054 (*m_snapshot_delta)[{snap_id, snap_id}].insert(
1055 offset, length,
1056 SparseExtent(
1057 (dne ? SPARSE_EXTENT_STATE_DNE : SPARSE_EXTENT_STATE_ZEROED),
1058 length));
1059 }
1060 }
1061}
1062
7c673cae
FG
1063} // namespace io
1064} // namespace librbd
1065
1066template class librbd::io::ObjectRequest<librbd::ImageCtx>;
1067template class librbd::io::ObjectReadRequest<librbd::ImageCtx>;
b32b8144
FG
1068template class librbd::io::AbstractObjectWriteRequest<librbd::ImageCtx>;
1069template class librbd::io::ObjectWriteRequest<librbd::ImageCtx>;
1070template class librbd::io::ObjectDiscardRequest<librbd::ImageCtx>;
1071template class librbd::io::ObjectWriteSameRequest<librbd::ImageCtx>;
1072template class librbd::io::ObjectCompareAndWriteRequest<librbd::ImageCtx>;
f67539c2 1073template class librbd::io::ObjectListSnapsRequest<librbd::ImageCtx>;