]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | ||
4 | #include "ObjectCopyRequest.h" | |
5 | #include "librados/snap_set_diff.h" | |
31f18b77 | 6 | #include "librbd/ExclusiveLock.h" |
7c673cae FG |
7 | #include "librbd/ObjectMap.h" |
8 | #include "librbd/Utils.h" | |
9 | #include "common/errno.h" | |
10 | ||
11 | #define dout_context g_ceph_context | |
12 | #define dout_subsys ceph_subsys_rbd_mirror | |
13 | #undef dout_prefix | |
14 | #define dout_prefix *_dout << "rbd::mirror::image_sync::ObjectCopyRequest: " \ | |
15 | << this << " " << __func__ | |
16 | ||
17 | namespace librados { | |
18 | ||
19 | bool operator==(const clone_info_t& rhs, const clone_info_t& lhs) { | |
20 | return (rhs.cloneid == lhs.cloneid && | |
21 | rhs.snaps == lhs.snaps && | |
22 | rhs.overlap == lhs.overlap && | |
23 | rhs.size == lhs.size); | |
24 | } | |
25 | ||
26 | bool operator==(const snap_set_t& rhs, const snap_set_t& lhs) { | |
27 | return (rhs.clones == lhs.clones && | |
28 | rhs.seq == lhs.seq); | |
29 | } | |
30 | ||
31 | } // namespace librados | |
32 | ||
33 | namespace rbd { | |
34 | namespace mirror { | |
35 | namespace image_sync { | |
36 | ||
37 | using librbd::util::create_context_callback; | |
38 | using librbd::util::create_rados_callback; | |
39 | ||
40 | template <typename I> | |
41 | ObjectCopyRequest<I>::ObjectCopyRequest(I *local_image_ctx, I *remote_image_ctx, | |
42 | const SnapMap *snap_map, | |
43 | uint64_t object_number, | |
44 | Context *on_finish) | |
45 | : m_local_image_ctx(local_image_ctx), m_remote_image_ctx(remote_image_ctx), | |
46 | m_snap_map(snap_map), m_object_number(object_number), | |
47 | m_on_finish(on_finish) { | |
48 | assert(!snap_map->empty()); | |
49 | ||
50 | m_local_io_ctx.dup(m_local_image_ctx->data_ctx); | |
51 | m_local_oid = m_local_image_ctx->get_object_name(object_number); | |
52 | ||
53 | m_remote_io_ctx.dup(m_remote_image_ctx->data_ctx); | |
54 | m_remote_oid = m_remote_image_ctx->get_object_name(object_number); | |
55 | ||
56 | dout(20) << ": " | |
57 | << "remote_oid=" << m_remote_oid << ", " | |
58 | << "local_oid=" << m_local_oid << dendl; | |
59 | } | |
60 | ||
61 | template <typename I> | |
62 | void ObjectCopyRequest<I>::send() { | |
63 | send_list_snaps(); | |
64 | } | |
65 | ||
66 | template <typename I> | |
67 | void ObjectCopyRequest<I>::send_list_snaps() { | |
68 | dout(20) << dendl; | |
69 | ||
70 | librados::AioCompletion *rados_completion = create_rados_callback< | |
71 | ObjectCopyRequest<I>, &ObjectCopyRequest<I>::handle_list_snaps>(this); | |
72 | ||
73 | librados::ObjectReadOperation op; | |
74 | m_snap_set = {}; | |
75 | m_snap_ret = 0; | |
76 | op.list_snaps(&m_snap_set, &m_snap_ret); | |
77 | ||
78 | m_remote_io_ctx.snap_set_read(CEPH_SNAPDIR); | |
79 | int r = m_remote_io_ctx.aio_operate(m_remote_oid, rados_completion, &op, | |
80 | nullptr); | |
81 | assert(r == 0); | |
82 | rados_completion->release(); | |
83 | } | |
84 | ||
85 | template <typename I> | |
86 | void ObjectCopyRequest<I>::handle_list_snaps(int r) { | |
87 | if (r == 0 && m_snap_ret < 0) { | |
88 | r = m_snap_ret; | |
89 | } | |
90 | ||
91 | dout(20) << ": r=" << r << dendl; | |
92 | ||
93 | if (r == -ENOENT) { | |
94 | finish(0); | |
95 | return; | |
96 | } | |
97 | ||
98 | if (r < 0) { | |
99 | derr << ": failed to list snaps: " << cpp_strerror(r) << dendl; | |
100 | finish(r); | |
101 | return; | |
102 | } | |
103 | ||
104 | if (m_retry_missing_read) { | |
105 | if (m_snap_set == m_retry_snap_set) { | |
106 | derr << ": read encountered missing object using up-to-date snap set" | |
107 | << dendl; | |
108 | finish(-ENOENT); | |
109 | return; | |
110 | } | |
111 | ||
112 | dout(20) << ": retrying using updated snap set" << dendl; | |
113 | m_retry_missing_read = false; | |
114 | m_retry_snap_set = {}; | |
115 | } | |
116 | ||
117 | compute_diffs(); | |
118 | send_read_object(); | |
119 | } | |
120 | ||
121 | template <typename I> | |
122 | void ObjectCopyRequest<I>::send_read_object() { | |
123 | if (m_snap_sync_ops.empty()) { | |
124 | // no more snapshot diffs to read from remote | |
125 | finish(0); | |
126 | return; | |
127 | } | |
128 | ||
129 | // build the read request | |
130 | auto &sync_ops = m_snap_sync_ops.begin()->second; | |
131 | assert(!sync_ops.empty()); | |
132 | ||
133 | bool read_required = false; | |
134 | librados::ObjectReadOperation op; | |
135 | for (auto &sync_op : sync_ops) { | |
136 | switch (sync_op.type) { | |
137 | case SYNC_OP_TYPE_WRITE: | |
138 | if (!read_required) { | |
139 | // map the sync op start snap id back to the necessary read snap id | |
140 | librados::snap_t remote_snap_seq = | |
141 | m_snap_sync_ops.begin()->first.second; | |
142 | m_remote_io_ctx.snap_set_read(remote_snap_seq); | |
143 | ||
144 | dout(20) << ": remote_snap_seq=" << remote_snap_seq << dendl; | |
145 | read_required = true; | |
146 | } | |
147 | dout(20) << ": read op: " << sync_op.offset << "~" << sync_op.length | |
148 | << dendl; | |
149 | op.sparse_read(sync_op.offset, sync_op.length, &sync_op.extent_map, | |
150 | &sync_op.out_bl, nullptr); | |
151 | op.set_op_flags2(LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL | | |
152 | LIBRADOS_OP_FLAG_FADVISE_NOCACHE); | |
153 | break; | |
154 | default: | |
155 | break; | |
156 | } | |
157 | } | |
158 | ||
159 | if (!read_required) { | |
160 | // nothing written to this object for this snapshot (must be trunc/remove) | |
161 | send_write_object(); | |
162 | return; | |
163 | } | |
164 | ||
165 | librados::AioCompletion *comp = create_rados_callback< | |
166 | ObjectCopyRequest<I>, &ObjectCopyRequest<I>::handle_read_object>(this); | |
167 | int r = m_remote_io_ctx.aio_operate(m_remote_oid, comp, &op, nullptr); | |
168 | assert(r == 0); | |
169 | comp->release(); | |
170 | } | |
171 | ||
172 | template <typename I> | |
173 | void ObjectCopyRequest<I>::handle_read_object(int r) { | |
174 | dout(20) << ": r=" << r << dendl; | |
175 | ||
176 | if (r == -ENOENT) { | |
177 | m_retry_snap_set = m_snap_set; | |
178 | m_retry_missing_read = true; | |
179 | ||
180 | dout(5) << ": object missing potentially due to removed snapshot" << dendl; | |
181 | send_list_snaps(); | |
182 | return; | |
183 | } | |
184 | ||
185 | if (r < 0) { | |
186 | derr << ": failed to read from remote object: " << cpp_strerror(r) | |
187 | << dendl; | |
188 | finish(r); | |
189 | return; | |
190 | } | |
191 | ||
192 | send_write_object(); | |
193 | } | |
194 | ||
195 | template <typename I> | |
196 | void ObjectCopyRequest<I>::send_write_object() { | |
197 | // retrieve the local snap context for the op | |
198 | SnapIds local_snap_ids; | |
199 | librados::snap_t local_snap_seq = 0; | |
200 | librados::snap_t remote_snap_seq = m_snap_sync_ops.begin()->first.first; | |
201 | if (remote_snap_seq != 0) { | |
202 | auto snap_map_it = m_snap_map->find(remote_snap_seq); | |
203 | assert(snap_map_it != m_snap_map->end()); | |
204 | ||
205 | // write snapshot context should be before actual snapshot | |
206 | if (snap_map_it != m_snap_map->begin()) { | |
207 | --snap_map_it; | |
208 | assert(!snap_map_it->second.empty()); | |
209 | local_snap_seq = snap_map_it->second.front(); | |
210 | local_snap_ids = snap_map_it->second; | |
211 | } | |
212 | } | |
213 | ||
31f18b77 FG |
214 | Context *finish_op_ctx; |
215 | { | |
216 | RWLock::RLocker owner_locker(m_local_image_ctx->owner_lock); | |
217 | finish_op_ctx = start_local_op(m_local_image_ctx->owner_lock); | |
218 | } | |
219 | if (finish_op_ctx == nullptr) { | |
220 | derr << ": lost exclusive lock" << dendl; | |
221 | finish(-EROFS); | |
222 | return; | |
223 | } | |
224 | ||
7c673cae FG |
225 | dout(20) << ": " |
226 | << "local_snap_seq=" << local_snap_seq << ", " | |
227 | << "local_snaps=" << local_snap_ids << dendl; | |
228 | ||
229 | auto &sync_ops = m_snap_sync_ops.begin()->second; | |
230 | assert(!sync_ops.empty()); | |
231 | uint64_t object_offset; | |
232 | uint64_t buffer_offset; | |
233 | librados::ObjectWriteOperation op; | |
234 | for (auto &sync_op : sync_ops) { | |
235 | switch (sync_op.type) { | |
236 | case SYNC_OP_TYPE_WRITE: | |
237 | object_offset = sync_op.offset; | |
238 | buffer_offset = 0; | |
239 | for (auto it : sync_op.extent_map) { | |
240 | if (object_offset < it.first) { | |
241 | dout(20) << ": zero op: " << object_offset << "~" | |
242 | << it.first - object_offset << dendl; | |
243 | op.zero(object_offset, it.first - object_offset); | |
244 | } | |
245 | dout(20) << ": write op: " << it.first << "~" << it.second << dendl; | |
246 | bufferlist tmpbl; | |
247 | tmpbl.substr_of(sync_op.out_bl, buffer_offset, it.second); | |
248 | op.write(it.first, tmpbl); | |
249 | op.set_op_flags2(LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL | | |
250 | LIBRADOS_OP_FLAG_FADVISE_NOCACHE); | |
251 | buffer_offset += it.second; | |
252 | object_offset = it.first + it.second; | |
253 | } | |
254 | if (object_offset < sync_op.offset + sync_op.length) { | |
255 | uint64_t sync_op_end = sync_op.offset + sync_op.length; | |
256 | assert(sync_op_end <= m_snap_object_sizes[remote_snap_seq]); | |
257 | if (sync_op_end == m_snap_object_sizes[remote_snap_seq]) { | |
258 | dout(20) << ": trunc op: " << object_offset << dendl; | |
259 | op.truncate(object_offset); | |
260 | m_snap_object_sizes[remote_snap_seq] = object_offset; | |
261 | } else { | |
262 | dout(20) << ": zero op: " << object_offset << "~" | |
263 | << sync_op_end - object_offset << dendl; | |
264 | op.zero(object_offset, sync_op_end - object_offset); | |
265 | } | |
266 | } | |
267 | break; | |
268 | case SYNC_OP_TYPE_TRUNC: | |
269 | if (sync_op.offset > m_snap_object_sizes[remote_snap_seq]) { | |
270 | // skip (must have been updated in WRITE op case issuing trunc op) | |
271 | break; | |
272 | } | |
273 | dout(20) << ": trunc op: " << sync_op.offset << dendl; | |
274 | op.truncate(sync_op.offset); | |
275 | break; | |
276 | case SYNC_OP_TYPE_REMOVE: | |
277 | dout(20) << ": remove op" << dendl; | |
278 | op.remove(); | |
279 | break; | |
280 | default: | |
281 | assert(false); | |
282 | } | |
283 | } | |
284 | ||
31f18b77 FG |
285 | auto ctx = new FunctionContext([this, finish_op_ctx](int r) { |
286 | handle_write_object(r); | |
287 | finish_op_ctx->complete(0); | |
288 | }); | |
289 | librados::AioCompletion *comp = create_rados_callback(ctx); | |
7c673cae FG |
290 | int r = m_local_io_ctx.aio_operate(m_local_oid, comp, &op, local_snap_seq, |
291 | local_snap_ids); | |
292 | assert(r == 0); | |
293 | comp->release(); | |
294 | } | |
295 | ||
296 | template <typename I> | |
297 | void ObjectCopyRequest<I>::handle_write_object(int r) { | |
298 | dout(20) << ": r=" << r << dendl; | |
299 | ||
300 | if (r == -ENOENT) { | |
301 | r = 0; | |
302 | } | |
303 | if (r < 0) { | |
304 | derr << ": failed to write to local object: " << cpp_strerror(r) | |
305 | << dendl; | |
306 | finish(r); | |
307 | return; | |
308 | } | |
309 | ||
310 | m_snap_sync_ops.erase(m_snap_sync_ops.begin()); | |
311 | if (!m_snap_sync_ops.empty()) { | |
312 | send_read_object(); | |
313 | return; | |
314 | } | |
315 | ||
316 | send_update_object_map(); | |
317 | } | |
318 | ||
319 | template <typename I> | |
320 | void ObjectCopyRequest<I>::send_update_object_map() { | |
31f18b77 | 321 | m_local_image_ctx->owner_lock.get_read(); |
7c673cae FG |
322 | m_local_image_ctx->snap_lock.get_read(); |
323 | if (!m_local_image_ctx->test_features(RBD_FEATURE_OBJECT_MAP, | |
324 | m_local_image_ctx->snap_lock) || | |
325 | m_snap_object_states.empty()) { | |
326 | m_local_image_ctx->snap_lock.put_read(); | |
31f18b77 | 327 | m_local_image_ctx->owner_lock.put_read(); |
7c673cae FG |
328 | finish(0); |
329 | return; | |
330 | } else if (m_local_image_ctx->object_map == nullptr) { | |
331 | // possible that exclusive lock was lost in background | |
332 | derr << ": object map is not initialized" << dendl; | |
333 | ||
334 | m_local_image_ctx->snap_lock.put_read(); | |
31f18b77 | 335 | m_local_image_ctx->owner_lock.put_read(); |
7c673cae FG |
336 | finish(-EINVAL); |
337 | return; | |
338 | } | |
339 | ||
340 | assert(m_local_image_ctx->object_map != nullptr); | |
341 | ||
342 | auto snap_object_state = *m_snap_object_states.begin(); | |
343 | m_snap_object_states.erase(m_snap_object_states.begin()); | |
344 | ||
345 | dout(20) << ": " | |
346 | << "local_snap_id=" << snap_object_state.first << ", " | |
347 | << "object_state=" << static_cast<uint32_t>(snap_object_state.second) | |
348 | << dendl; | |
349 | ||
31f18b77 FG |
350 | auto finish_op_ctx = start_local_op(m_local_image_ctx->owner_lock); |
351 | if (finish_op_ctx == nullptr) { | |
352 | derr << ": lost exclusive lock" << dendl; | |
353 | m_local_image_ctx->snap_lock.put_read(); | |
354 | m_local_image_ctx->owner_lock.put_read(); | |
355 | finish(-EROFS); | |
356 | return; | |
357 | } | |
358 | ||
359 | auto ctx = new FunctionContext([this, finish_op_ctx](int r) { | |
360 | handle_update_object_map(r); | |
361 | finish_op_ctx->complete(0); | |
362 | }); | |
363 | ||
7c673cae FG |
364 | RWLock::WLocker object_map_locker(m_local_image_ctx->object_map_lock); |
365 | bool sent = m_local_image_ctx->object_map->template aio_update< | |
31f18b77 | 366 | Context, &Context::complete>( |
7c673cae | 367 | snap_object_state.first, m_object_number, snap_object_state.second, {}, |
31f18b77 | 368 | {}, ctx); |
7c673cae FG |
369 | assert(sent); |
370 | m_local_image_ctx->snap_lock.put_read(); | |
31f18b77 | 371 | m_local_image_ctx->owner_lock.put_read(); |
7c673cae FG |
372 | } |
373 | ||
374 | template <typename I> | |
375 | void ObjectCopyRequest<I>::handle_update_object_map(int r) { | |
376 | dout(20) << ": r=" << r << dendl; | |
377 | ||
378 | assert(r == 0); | |
379 | if (!m_snap_object_states.empty()) { | |
380 | send_update_object_map(); | |
381 | return; | |
382 | } | |
383 | finish(0); | |
384 | } | |
385 | ||
31f18b77 FG |
386 | template <typename I> |
387 | Context *ObjectCopyRequest<I>::start_local_op(RWLock &owner_lock) { | |
388 | assert(m_local_image_ctx->owner_lock.is_locked()); | |
389 | if (m_local_image_ctx->exclusive_lock == nullptr) { | |
390 | return nullptr; | |
391 | } | |
392 | return m_local_image_ctx->exclusive_lock->start_op(); | |
393 | } | |
394 | ||
7c673cae FG |
395 | template <typename I> |
396 | void ObjectCopyRequest<I>::compute_diffs() { | |
397 | CephContext *cct = m_local_image_ctx->cct; | |
398 | ||
399 | m_snap_sync_ops = {}; | |
400 | m_snap_object_states = {}; | |
401 | m_snap_object_sizes = {}; | |
402 | ||
403 | librados::snap_t remote_sync_pont_snap_id = m_snap_map->rbegin()->first; | |
404 | uint64_t prev_end_size = 0; | |
405 | bool prev_exists = false; | |
406 | librados::snap_t start_remote_snap_id = 0; | |
407 | for (auto &pair : *m_snap_map) { | |
408 | assert(!pair.second.empty()); | |
409 | librados::snap_t end_remote_snap_id = pair.first; | |
410 | librados::snap_t end_local_snap_id = pair.second.front(); | |
411 | ||
412 | interval_set<uint64_t> diff; | |
413 | uint64_t end_size; | |
414 | bool exists; | |
415 | librados::snap_t clone_end_snap_id; | |
416 | calc_snap_set_diff(cct, m_snap_set, start_remote_snap_id, | |
417 | end_remote_snap_id, &diff, &end_size, &exists, | |
418 | &clone_end_snap_id); | |
419 | ||
420 | dout(20) << ": " | |
421 | << "start_remote_snap=" << start_remote_snap_id << ", " | |
422 | << "end_remote_snap_id=" << end_remote_snap_id << ", " | |
423 | << "clone_end_snap_id=" << clone_end_snap_id << ", " | |
424 | << "end_local_snap_id=" << end_local_snap_id << ", " | |
425 | << "diff=" << diff << ", " | |
426 | << "end_size=" << end_size << ", " | |
427 | << "exists=" << exists << dendl; | |
428 | if (exists) { | |
429 | // clip diff to size of object (in case it was truncated) | |
430 | if (end_size < prev_end_size) { | |
431 | interval_set<uint64_t> trunc; | |
432 | trunc.insert(end_size, prev_end_size); | |
433 | trunc.intersection_of(diff); | |
434 | diff.subtract(trunc); | |
435 | dout(20) << ": clearing truncate diff: " << trunc << dendl; | |
436 | } | |
437 | ||
438 | // prepare the object map state | |
439 | { | |
440 | RWLock::RLocker snap_locker(m_local_image_ctx->snap_lock); | |
441 | uint8_t object_state = OBJECT_EXISTS; | |
442 | if (m_local_image_ctx->test_features(RBD_FEATURE_FAST_DIFF, | |
443 | m_local_image_ctx->snap_lock) && | |
444 | prev_exists && diff.empty() && end_size == prev_end_size) { | |
445 | object_state = OBJECT_EXISTS_CLEAN; | |
446 | } | |
447 | m_snap_object_states[end_local_snap_id] = object_state; | |
448 | } | |
449 | ||
450 | // reads should be issued against the newest (existing) snapshot within | |
451 | // the associated snapshot object clone. writes should be issued | |
452 | // against the oldest snapshot in the snap_map. | |
453 | assert(clone_end_snap_id >= end_remote_snap_id); | |
454 | if (clone_end_snap_id > remote_sync_pont_snap_id) { | |
455 | // do not read past the sync point snapshot | |
456 | clone_end_snap_id = remote_sync_pont_snap_id; | |
457 | } | |
458 | ||
459 | // object write/zero, or truncate | |
460 | // NOTE: a single snapshot clone might represent multiple snapshots, but | |
461 | // the write/zero and truncate ops will only be associated with the first | |
462 | // snapshot encountered within the clone since the diff will be empty for | |
463 | // subsequent snapshots and the size will remain constant for a clone. | |
464 | for (auto it = diff.begin(); it != diff.end(); ++it) { | |
465 | dout(20) << ": read/write op: " << it.get_start() << "~" | |
466 | << it.get_len() << dendl; | |
467 | m_snap_sync_ops[{end_remote_snap_id, clone_end_snap_id}].emplace_back( | |
468 | SYNC_OP_TYPE_WRITE, it.get_start(), it.get_len()); | |
469 | } | |
470 | if (end_size < prev_end_size) { | |
471 | dout(20) << ": trunc op: " << end_size << dendl; | |
472 | m_snap_sync_ops[{end_remote_snap_id, clone_end_snap_id}].emplace_back( | |
473 | SYNC_OP_TYPE_TRUNC, end_size, 0U); | |
474 | } | |
475 | m_snap_object_sizes[end_remote_snap_id] = end_size; | |
476 | } else { | |
477 | if (prev_exists) { | |
478 | // object remove | |
479 | dout(20) << ": remove op" << dendl; | |
480 | m_snap_sync_ops[{end_remote_snap_id, end_remote_snap_id}].emplace_back( | |
481 | SYNC_OP_TYPE_REMOVE, 0U, 0U); | |
482 | } | |
483 | } | |
484 | ||
485 | prev_end_size = end_size; | |
486 | prev_exists = exists; | |
487 | start_remote_snap_id = end_remote_snap_id; | |
488 | } | |
489 | } | |
490 | ||
491 | template <typename I> | |
492 | void ObjectCopyRequest<I>::finish(int r) { | |
493 | dout(20) << ": r=" << r << dendl; | |
494 | ||
495 | // ensure IoCtxs are closed prior to proceeding | |
496 | auto on_finish = m_on_finish; | |
497 | delete this; | |
498 | ||
499 | on_finish->complete(r); | |
500 | } | |
501 | ||
502 | } // namespace image_sync | |
503 | } // namespace mirror | |
504 | } // namespace rbd | |
505 | ||
506 | template class rbd::mirror::image_sync::ObjectCopyRequest<librbd::ImageCtx>; |