]> git.proxmox.com Git - ceph.git/blame - ceph/src/librbd/internal.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / librbd / internal.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3#include "include/int_types.h"
4
5#include <errno.h>
6#include <limits.h>
7
8#include "include/types.h"
9#include "include/uuid.h"
10#include "common/ceph_context.h"
11#include "common/dout.h"
12#include "common/errno.h"
13#include "common/Throttle.h"
14#include "common/event_socket.h"
11fdf7f2
TL
15#include "common/perf_counters.h"
16#include "osdc/Striper.h"
7c673cae
FG
17#include "include/stringify.h"
18
11fdf7f2 19#include "cls/lock/cls_lock_client.h"
7c673cae
FG
20#include "cls/rbd/cls_rbd.h"
21#include "cls/rbd/cls_rbd_types.h"
22#include "cls/rbd/cls_rbd_client.h"
23#include "cls/journal/cls_journal_types.h"
24#include "cls/journal/cls_journal_client.h"
25
f67539c2 26#include "librbd/AsioEngine.h"
7c673cae
FG
27#include "librbd/ExclusiveLock.h"
28#include "librbd/ImageCtx.h"
29#include "librbd/ImageState.h"
30#include "librbd/internal.h"
31#include "librbd/Journal.h"
32#include "librbd/ObjectMap.h"
33#include "librbd/Operations.h"
f67539c2 34#include "librbd/PluginRegistry.h"
7c673cae
FG
35#include "librbd/Types.h"
36#include "librbd/Utils.h"
11fdf7f2 37#include "librbd/api/Config.h"
7c673cae 38#include "librbd/api/Image.h"
f67539c2
TL
39#include "librbd/api/Io.h"
40#include "librbd/cache/Utils.h"
7c673cae
FG
41#include "librbd/exclusive_lock/AutomaticPolicy.h"
42#include "librbd/exclusive_lock/StandardPolicy.h"
9f95a23c 43#include "librbd/deep_copy/MetadataCopyRequest.h"
7c673cae
FG
44#include "librbd/image/CloneRequest.h"
45#include "librbd/image/CreateRequest.h"
9f95a23c 46#include "librbd/image/GetMetadataRequest.h"
1911f103 47#include "librbd/image/Types.h"
7c673cae 48#include "librbd/io/AioCompletion.h"
f67539c2
TL
49#include "librbd/io/ImageDispatchSpec.h"
50#include "librbd/io/ImageDispatcherInterface.h"
51#include "librbd/io/ObjectDispatcherInterface.h"
7c673cae
FG
52#include "librbd/io/ObjectRequest.h"
53#include "librbd/io/ReadResult.h"
54#include "librbd/journal/Types.h"
55#include "librbd/managed_lock/Types.h"
56#include "librbd/mirror/EnableRequest.h"
57#include "librbd/operation/TrimRequest.h"
58
59#include "journal/Journaler.h"
60
61#include <boost/scope_exit.hpp>
62#include <boost/variant.hpp>
11fdf7f2 63#include "include/ceph_assert.h"
7c673cae
FG
64
65#define dout_subsys ceph_subsys_rbd
66#undef dout_prefix
67#define dout_prefix *_dout << "librbd: "
68
69#define rbd_howmany(x, y) (((x) + (y) - 1) / (y))
70
20effc67 71using std::istringstream;
7c673cae
FG
72using std::map;
73using std::pair;
74using std::set;
75using std::string;
76using std::vector;
77// list binds to list() here, so std::list is explicitly used below
78
79using ceph::bufferlist;
80using librados::snap_t;
81using librados::IoCtx;
82using librados::Rados;
83
84namespace librbd {
85
86namespace {
87
88int validate_pool(IoCtx &io_ctx, CephContext *cct) {
11fdf7f2 89 if (!cct->_conf.get_val<bool>("rbd_validate_pool")) {
7c673cae
FG
90 return 0;
91 }
92
93 int r = io_ctx.stat(RBD_DIRECTORY, NULL, NULL);
94 if (r == 0) {
95 return 0;
96 } else if (r < 0 && r != -ENOENT) {
97 lderr(cct) << "failed to stat RBD directory: " << cpp_strerror(r) << dendl;
98 return r;
99 }
100
101 // allocate a self-managed snapshot id if this a new pool to force
102 // self-managed snapshot mode
103 uint64_t snap_id;
104 r = io_ctx.selfmanaged_snap_create(&snap_id);
105 if (r == -EINVAL) {
106 lderr(cct) << "pool not configured for self-managed RBD snapshot support"
107 << dendl;
108 return r;
109 } else if (r < 0) {
110 lderr(cct) << "failed to allocate self-managed snapshot: "
111 << cpp_strerror(r) << dendl;
112 return r;
113 }
114
115 r = io_ctx.selfmanaged_snap_remove(snap_id);
116 if (r < 0) {
117 lderr(cct) << "failed to release self-managed snapshot " << snap_id
118 << ": " << cpp_strerror(r) << dendl;
119 }
120 return 0;
121}
122
7c673cae
FG
123} // anonymous namespace
124
125 int detect_format(IoCtx &io_ctx, const string &name,
126 bool *old_format, uint64_t *size)
127 {
128 CephContext *cct = (CephContext *)io_ctx.cct();
129 if (old_format)
130 *old_format = true;
131 int r = io_ctx.stat(util::old_header_name(name), size, NULL);
132 if (r == -ENOENT) {
133 if (old_format)
134 *old_format = false;
135 r = io_ctx.stat(util::id_obj_name(name), size, NULL);
136 if (r < 0)
137 return r;
138 } else if (r < 0) {
139 return r;
140 }
141
142 ldout(cct, 20) << "detect format of " << name << " : "
143 << (old_format ? (*old_format ? "old" : "new") :
144 "don't care") << dendl;
145 return 0;
146 }
147
// Returns true when a parent pool is set (parent_pool_id != -1) and `off`
// does not exceed `overlap`.
bool has_parent(int64_t parent_pool_id, uint64_t off, uint64_t overlap)
{
  if (parent_pool_id == -1) {
    return false;
  }
  return off <= overlap;
}
152
153 void init_rbd_header(struct rbd_obj_header_ondisk& ondisk,
154 uint64_t size, int order, uint64_t bid)
155 {
156 uint32_t hi = bid >> 32;
157 uint32_t lo = bid & 0xFFFFFFFF;
158 uint32_t extra = rand() % 0xFFFFFFFF;
92f5a8d4 159 // FIPS zeroization audit 20191117: this memset is not security related.
7c673cae
FG
160 memset(&ondisk, 0, sizeof(ondisk));
161
162 memcpy(&ondisk.text, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT));
163 memcpy(&ondisk.signature, RBD_HEADER_SIGNATURE,
164 sizeof(RBD_HEADER_SIGNATURE));
165 memcpy(&ondisk.version, RBD_HEADER_VERSION, sizeof(RBD_HEADER_VERSION));
166
167 snprintf(ondisk.block_name, sizeof(ondisk.block_name), "rb.%x.%x.%x",
168 hi, lo, extra);
169
170 ondisk.image_size = size;
171 ondisk.options.order = order;
172 ondisk.options.crypt_type = RBD_CRYPT_NONE;
173 ondisk.options.comp_type = RBD_COMP_NONE;
174 ondisk.snap_seq = 0;
175 ondisk.snap_count = 0;
176 ondisk.reserved = 0;
177 ondisk.snap_names_len = 0;
178 }
179
180 void image_info(ImageCtx *ictx, image_info_t& info, size_t infosize)
181 {
182 int obj_order = ictx->order;
9f95a23c
TL
183 {
184 std::shared_lock locker{ictx->image_lock};
f67539c2 185 info.size = ictx->get_effective_image_size(ictx->snap_id);
9f95a23c 186 }
7c673cae
FG
187 info.obj_size = 1ULL << obj_order;
188 info.num_objs = Striper::get_num_objects(ictx->layout, info.size);
189 info.order = obj_order;
190 strncpy(info.block_name_prefix, ictx->object_prefix.c_str(),
191 RBD_MAX_BLOCK_NAME_SIZE);
192 info.block_name_prefix[RBD_MAX_BLOCK_NAME_SIZE - 1] = '\0';
193
194 // clear deprecated fields
195 info.parent_pool = -1L;
196 info.parent_name[0] = '\0';
197 }
198
// Extracts the object number from an object name of the form
// "<object_prefix><separator><hex digits>".
//
// The first object_prefix.length() + 1 characters are skipped (prefix plus
// a single separator character) and the remainder is parsed as a
// hexadecimal number.
//
// Returns 0 when `oid` is too short to contain anything after the prefix:
// in that case the stream is already at EOF, the extraction never runs, and
// the result would otherwise be left uninitialized.
uint64_t oid_to_object_no(const std::string& oid,
                          const std::string& object_prefix)
{
  std::istringstream iss(oid);
  // skip object prefix and separator
  iss.ignore(object_prefix.length() + 1);

  // must be initialized: operator>> does not touch the target when the
  // stream is not good() before the extraction starts
  uint64_t num = 0;
  iss >> std::hex >> num;
  return num;
}
208
209 void trim_image(ImageCtx *ictx, uint64_t newsize, ProgressContext& prog_ctx)
210 {
9f95a23c 211 ceph_assert(ceph_mutex_is_locked(ictx->owner_lock));
11fdf7f2
TL
212 ceph_assert(ictx->exclusive_lock == nullptr ||
213 ictx->exclusive_lock->is_lock_owner());
7c673cae
FG
214
215 C_SaferCond ctx;
9f95a23c 216 ictx->image_lock.lock_shared();
7c673cae
FG
217 operation::TrimRequest<> *req = operation::TrimRequest<>::create(
218 *ictx, &ctx, ictx->size, newsize, prog_ctx);
9f95a23c 219 ictx->image_lock.unlock_shared();
7c673cae
FG
220 req->send();
221
222 int r = ctx.wait();
223 if (r < 0) {
224 lderr(ictx->cct) << "warning: failed to remove some object(s): "
225 << cpp_strerror(r) << dendl;
226 }
227 }
228
229 int read_header_bl(IoCtx& io_ctx, const string& header_oid,
230 bufferlist& header, uint64_t *ver)
231 {
232 int r;
233 uint64_t off = 0;
234#define READ_SIZE 4096
235 do {
236 bufferlist bl;
237 r = io_ctx.read(header_oid, bl, READ_SIZE, off);
238 if (r < 0)
239 return r;
240 header.claim_append(bl);
241 off += r;
242 } while (r == READ_SIZE);
243
11fdf7f2
TL
244 static_assert(sizeof(RBD_HEADER_TEXT) == sizeof(RBD_MIGRATE_HEADER_TEXT),
245 "length of rbd headers must be the same");
246
7c673cae 247 if (header.length() < sizeof(RBD_HEADER_TEXT) ||
11fdf7f2
TL
248 (memcmp(RBD_HEADER_TEXT, header.c_str(),
249 sizeof(RBD_HEADER_TEXT)) != 0 &&
250 memcmp(RBD_MIGRATE_HEADER_TEXT, header.c_str(),
251 sizeof(RBD_MIGRATE_HEADER_TEXT)) != 0)) {
7c673cae
FG
252 CephContext *cct = (CephContext *)io_ctx.cct();
253 lderr(cct) << "unrecognized header format" << dendl;
254 return -ENXIO;
255 }
256
257 if (ver)
258 *ver = io_ctx.get_last_version();
259
260 return 0;
261 }
262
263 int read_header(IoCtx& io_ctx, const string& header_oid,
264 struct rbd_obj_header_ondisk *header, uint64_t *ver)
265 {
266 bufferlist header_bl;
267 int r = read_header_bl(io_ctx, header_oid, header_bl, ver);
268 if (r < 0)
269 return r;
270 if (header_bl.length() < (int)sizeof(*header))
271 return -EIO;
272 memcpy(header, header_bl.c_str(), sizeof(*header));
273
274 return 0;
275 }
276
277 int tmap_set(IoCtx& io_ctx, const string& imgname)
278 {
279 bufferlist cmdbl, emptybl;
280 __u8 c = CEPH_OSD_TMAP_SET;
11fdf7f2
TL
281 encode(c, cmdbl);
282 encode(imgname, cmdbl);
283 encode(emptybl, cmdbl);
7c673cae
FG
284 return io_ctx.tmap_update(RBD_DIRECTORY, cmdbl);
285 }
286
287 int tmap_rm(IoCtx& io_ctx, const string& imgname)
288 {
289 bufferlist cmdbl;
290 __u8 c = CEPH_OSD_TMAP_RM;
11fdf7f2
TL
291 encode(c, cmdbl);
292 encode(imgname, cmdbl);
7c673cae
FG
293 return io_ctx.tmap_update(RBD_DIRECTORY, cmdbl);
294 }
295
296 typedef boost::variant<std::string,uint64_t> image_option_value_t;
297 typedef std::map<int,image_option_value_t> image_options_t;
298 typedef std::shared_ptr<image_options_t> image_options_ref;
299
300 enum image_option_type_t {
301 STR,
302 UINT64,
303 };
304
305 const std::map<int, image_option_type_t> IMAGE_OPTIONS_TYPE_MAPPING = {
306 {RBD_IMAGE_OPTION_FORMAT, UINT64},
307 {RBD_IMAGE_OPTION_FEATURES, UINT64},
308 {RBD_IMAGE_OPTION_ORDER, UINT64},
309 {RBD_IMAGE_OPTION_STRIPE_UNIT, UINT64},
310 {RBD_IMAGE_OPTION_STRIPE_COUNT, UINT64},
311 {RBD_IMAGE_OPTION_JOURNAL_ORDER, UINT64},
312 {RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH, UINT64},
313 {RBD_IMAGE_OPTION_JOURNAL_POOL, STR},
314 {RBD_IMAGE_OPTION_FEATURES_SET, UINT64},
315 {RBD_IMAGE_OPTION_FEATURES_CLEAR, UINT64},
316 {RBD_IMAGE_OPTION_DATA_POOL, STR},
11fdf7f2 317 {RBD_IMAGE_OPTION_FLATTEN, UINT64},
92f5a8d4 318 {RBD_IMAGE_OPTION_CLONE_FORMAT, UINT64},
1911f103 319 {RBD_IMAGE_OPTION_MIRROR_IMAGE_MODE, UINT64},
7c673cae
FG
320 };
321
322 std::string image_option_name(int optname) {
323 switch (optname) {
324 case RBD_IMAGE_OPTION_FORMAT:
325 return "format";
326 case RBD_IMAGE_OPTION_FEATURES:
327 return "features";
328 case RBD_IMAGE_OPTION_ORDER:
329 return "order";
330 case RBD_IMAGE_OPTION_STRIPE_UNIT:
331 return "stripe_unit";
332 case RBD_IMAGE_OPTION_STRIPE_COUNT:
333 return "stripe_count";
334 case RBD_IMAGE_OPTION_JOURNAL_ORDER:
335 return "journal_order";
336 case RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH:
337 return "journal_splay_width";
338 case RBD_IMAGE_OPTION_JOURNAL_POOL:
339 return "journal_pool";
340 case RBD_IMAGE_OPTION_FEATURES_SET:
341 return "features_set";
342 case RBD_IMAGE_OPTION_FEATURES_CLEAR:
343 return "features_clear";
344 case RBD_IMAGE_OPTION_DATA_POOL:
345 return "data_pool";
11fdf7f2
TL
346 case RBD_IMAGE_OPTION_FLATTEN:
347 return "flatten";
92f5a8d4
TL
348 case RBD_IMAGE_OPTION_CLONE_FORMAT:
349 return "clone_format";
1911f103
TL
350 case RBD_IMAGE_OPTION_MIRROR_IMAGE_MODE:
351 return "mirror_image_mode";
7c673cae
FG
352 default:
353 return "unknown (" + stringify(optname) + ")";
354 }
355 }
356
7c673cae
FG
357 void image_options_create(rbd_image_options_t* opts)
358 {
359 image_options_ref* opts_ = new image_options_ref(new image_options_t());
360
361 *opts = static_cast<rbd_image_options_t>(opts_);
362 }
363
364 void image_options_create_ref(rbd_image_options_t* opts,
365 rbd_image_options_t orig)
366 {
367 image_options_ref* orig_ = static_cast<image_options_ref*>(orig);
368 image_options_ref* opts_ = new image_options_ref(*orig_);
369
370 *opts = static_cast<rbd_image_options_t>(opts_);
371 }
372
373 void image_options_copy(rbd_image_options_t* opts,
374 const ImageOptions &orig)
375 {
376 image_options_ref* opts_ = new image_options_ref(new image_options_t());
377
378 *opts = static_cast<rbd_image_options_t>(opts_);
379
380 std::string str_val;
381 uint64_t uint64_val;
382 for (auto &i : IMAGE_OPTIONS_TYPE_MAPPING) {
383 switch (i.second) {
384 case STR:
385 if (orig.get(i.first, &str_val) == 0) {
386 image_options_set(*opts, i.first, str_val);
387 }
388 continue;
389 case UINT64:
390 if (orig.get(i.first, &uint64_val) == 0) {
391 image_options_set(*opts, i.first, uint64_val);
392 }
393 continue;
394 }
395 }
396 }
397
398 void image_options_destroy(rbd_image_options_t opts)
399 {
400 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
401
402 delete opts_;
403 }
404
405 int image_options_set(rbd_image_options_t opts, int optname,
406 const std::string& optval)
407 {
408 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
409
410 std::map<int, image_option_type_t>::const_iterator i =
411 IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
412
413 if (i == IMAGE_OPTIONS_TYPE_MAPPING.end() || i->second != STR) {
414 return -EINVAL;
415 }
416
417 (*opts_->get())[optname] = optval;
418 return 0;
419 }
420
421 int image_options_set(rbd_image_options_t opts, int optname, uint64_t optval)
422 {
423 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
424
425 std::map<int, image_option_type_t>::const_iterator i =
426 IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
427
428 if (i == IMAGE_OPTIONS_TYPE_MAPPING.end() || i->second != UINT64) {
429 return -EINVAL;
430 }
431
432 (*opts_->get())[optname] = optval;
433 return 0;
434 }
435
436 int image_options_get(rbd_image_options_t opts, int optname,
437 std::string* optval)
438 {
439 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
440
441 std::map<int, image_option_type_t>::const_iterator i =
442 IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
443
444 if (i == IMAGE_OPTIONS_TYPE_MAPPING.end() || i->second != STR) {
445 return -EINVAL;
446 }
447
448 image_options_t::const_iterator j = (*opts_)->find(optname);
449
450 if (j == (*opts_)->end()) {
451 return -ENOENT;
452 }
453
454 *optval = boost::get<std::string>(j->second);
455 return 0;
456 }
457
458 int image_options_get(rbd_image_options_t opts, int optname, uint64_t* optval)
459 {
460 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
461
462 std::map<int, image_option_type_t>::const_iterator i =
463 IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
464
465 if (i == IMAGE_OPTIONS_TYPE_MAPPING.end() || i->second != UINT64) {
466 return -EINVAL;
467 }
468
469 image_options_t::const_iterator j = (*opts_)->find(optname);
470
471 if (j == (*opts_)->end()) {
472 return -ENOENT;
473 }
474
475 *optval = boost::get<uint64_t>(j->second);
476 return 0;
477 }
478
479 int image_options_is_set(rbd_image_options_t opts, int optname,
480 bool* is_set)
481 {
482 if (IMAGE_OPTIONS_TYPE_MAPPING.find(optname) ==
483 IMAGE_OPTIONS_TYPE_MAPPING.end()) {
484 return -EINVAL;
485 }
486
487 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
488 *is_set = ((*opts_)->find(optname) != (*opts_)->end());
489 return 0;
490 }
491
492 int image_options_unset(rbd_image_options_t opts, int optname)
493 {
494 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
495
496 std::map<int, image_option_type_t>::const_iterator i =
497 IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
498
499 if (i == IMAGE_OPTIONS_TYPE_MAPPING.end()) {
11fdf7f2 500 ceph_assert((*opts_)->find(optname) == (*opts_)->end());
7c673cae
FG
501 return -EINVAL;
502 }
503
504 image_options_t::const_iterator j = (*opts_)->find(optname);
505
506 if (j == (*opts_)->end()) {
507 return -ENOENT;
508 }
509
510 (*opts_)->erase(j);
511 return 0;
512 }
513
514 void image_options_clear(rbd_image_options_t opts)
515 {
516 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
517
518 (*opts_)->clear();
519 }
520
521 bool image_options_is_empty(rbd_image_options_t opts)
522 {
523 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
524
525 return (*opts_)->empty();
526 }
527
7c673cae
FG
528 int create_v1(IoCtx& io_ctx, const char *imgname, uint64_t size, int order)
529 {
530 CephContext *cct = (CephContext *)io_ctx.cct();
531
532 ldout(cct, 20) << __func__ << " " << &io_ctx << " name = " << imgname
533 << " size = " << size << " order = " << order << dendl;
534 int r = validate_pool(io_ctx, cct);
535 if (r < 0) {
536 return r;
537 }
538
11fdf7f2
TL
539 if (!io_ctx.get_namespace().empty()) {
540 lderr(cct) << "attempting to add v1 image to namespace" << dendl;
541 return -EINVAL;
542 }
543
7c673cae
FG
544 ldout(cct, 2) << "adding rbd image to directory..." << dendl;
545 r = tmap_set(io_ctx, imgname);
546 if (r < 0) {
547 lderr(cct) << "error adding image to directory: " << cpp_strerror(r)
548 << dendl;
549 return r;
550 }
551
552 Rados rados(io_ctx);
553 uint64_t bid = rados.get_instance_id();
554
555 ldout(cct, 2) << "creating rbd image..." << dendl;
556 struct rbd_obj_header_ondisk header;
557 init_rbd_header(header, size, order, bid);
558
559 bufferlist bl;
560 bl.append((const char *)&header, sizeof(header));
561
562 string header_oid = util::old_header_name(imgname);
563 r = io_ctx.write(header_oid, bl, bl.length(), 0);
564 if (r < 0) {
565 lderr(cct) << "Error writing image header: " << cpp_strerror(r)
566 << dendl;
567 int remove_r = tmap_rm(io_ctx, imgname);
568 if (remove_r < 0) {
569 lderr(cct) << "Could not remove image from directory after "
570 << "header creation failed: "
571 << cpp_strerror(remove_r) << dendl;
572 }
573 return r;
574 }
575
576 ldout(cct, 2) << "done." << dendl;
577 return 0;
578 }
579
580 int create(librados::IoCtx& io_ctx, const char *imgname, uint64_t size,
581 int *order)
582 {
583 uint64_t order_ = *order;
584 ImageOptions opts;
585
586 int r = opts.set(RBD_IMAGE_OPTION_ORDER, order_);
11fdf7f2 587 ceph_assert(r == 0);
7c673cae
FG
588
589 r = create(io_ctx, imgname, "", size, opts, "", "", false);
590
591 int r1 = opts.get(RBD_IMAGE_OPTION_ORDER, &order_);
11fdf7f2 592 ceph_assert(r1 == 0);
7c673cae
FG
593 *order = order_;
594
595 return r;
596 }
597
598 int create(IoCtx& io_ctx, const char *imgname, uint64_t size,
599 bool old_format, uint64_t features, int *order,
600 uint64_t stripe_unit, uint64_t stripe_count)
601 {
602 if (!order)
603 return -EINVAL;
604
605 uint64_t order_ = *order;
606 uint64_t format = old_format ? 1 : 2;
607 ImageOptions opts;
608 int r;
609
610 r = opts.set(RBD_IMAGE_OPTION_FORMAT, format);
11fdf7f2 611 ceph_assert(r == 0);
7c673cae 612 r = opts.set(RBD_IMAGE_OPTION_FEATURES, features);
11fdf7f2 613 ceph_assert(r == 0);
7c673cae 614 r = opts.set(RBD_IMAGE_OPTION_ORDER, order_);
11fdf7f2 615 ceph_assert(r == 0);
7c673cae 616 r = opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit);
11fdf7f2 617 ceph_assert(r == 0);
7c673cae 618 r = opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count);
11fdf7f2 619 ceph_assert(r == 0);
7c673cae
FG
620
621 r = create(io_ctx, imgname, "", size, opts, "", "", false);
622
623 int r1 = opts.get(RBD_IMAGE_OPTION_ORDER, &order_);
11fdf7f2 624 ceph_assert(r1 == 0);
7c673cae
FG
625 *order = order_;
626
627 return r;
628 }
629
630 int create(IoCtx& io_ctx, const std::string &image_name,
631 const std::string &image_id, uint64_t size,
632 ImageOptions& opts,
633 const std::string &non_primary_global_image_id,
634 const std::string &primary_mirror_uuid,
635 bool skip_mirror_enable)
636 {
637 std::string id(image_id);
638 if (id.empty()) {
639 id = util::generate_image_id(io_ctx);
640 }
641
642 CephContext *cct = (CephContext *)io_ctx.cct();
92f5a8d4
TL
643 uint64_t option;
644 if (opts.get(RBD_IMAGE_OPTION_FLATTEN, &option) == 0) {
11fdf7f2
TL
645 lderr(cct) << "create does not support 'flatten' image option" << dendl;
646 return -EINVAL;
647 }
92f5a8d4
TL
648 if (opts.get(RBD_IMAGE_OPTION_CLONE_FORMAT, &option) == 0) {
649 lderr(cct) << "create does not support 'clone_format' image option"
650 << dendl;
651 return -EINVAL;
652 }
11fdf7f2 653
7c673cae
FG
654 ldout(cct, 10) << __func__ << " name=" << image_name << ", "
655 << "id= " << id << ", "
656 << "size=" << size << ", opts=" << opts << dendl;
657
658 uint64_t format;
659 if (opts.get(RBD_IMAGE_OPTION_FORMAT, &format) != 0)
11fdf7f2 660 format = cct->_conf.get_val<uint64_t>("rbd_default_format");
7c673cae
FG
661 bool old_format = format == 1;
662
663 // make sure it doesn't already exist, in either format
664 int r = detect_format(io_ctx, image_name, NULL, NULL);
665 if (r != -ENOENT) {
666 if (r) {
667 lderr(cct) << "Could not tell if " << image_name << " already exists"
668 << dendl;
669 return r;
670 }
671 lderr(cct) << "rbd image " << image_name << " already exists" << dendl;
672 return -EEXIST;
673 }
674
675 uint64_t order = 0;
676 if (opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0 || order == 0) {
11fdf7f2 677 order = cct->_conf.get_val<uint64_t>("rbd_default_order");
7c673cae
FG
678 }
679 r = image::CreateRequest<>::validate_order(cct, order);
680 if (r < 0) {
681 return r;
682 }
683
684 if (old_format) {
11fdf7f2
TL
685 if ( !getenv("RBD_FORCE_ALLOW_V1") ) {
686 lderr(cct) << "Format 1 image creation unsupported. " << dendl;
687 return -EINVAL;
688 }
689 lderr(cct) << "Forced V1 image creation. " << dendl;
7c673cae
FG
690 r = create_v1(io_ctx, image_name.c_str(), size, order);
691 } else {
f67539c2 692 AsioEngine asio_engine(io_ctx);
7c673cae 693
11fdf7f2
TL
694 ConfigProxy config{cct->_conf};
695 api::Config<>::apply_pool_overrides(io_ctx, &config);
696
1911f103
TL
697 uint32_t create_flags = 0U;
698 uint64_t mirror_image_mode = RBD_MIRROR_IMAGE_MODE_JOURNAL;
699 if (skip_mirror_enable) {
700 create_flags = image::CREATE_FLAG_SKIP_MIRROR_ENABLE;
701 } else if (opts.get(RBD_IMAGE_OPTION_MIRROR_IMAGE_MODE,
702 &mirror_image_mode) == 0) {
703 create_flags = image::CREATE_FLAG_FORCE_MIRROR_ENABLE;
704 }
705
7c673cae
FG
706 C_SaferCond cond;
707 image::CreateRequest<> *req = image::CreateRequest<>::create(
1911f103
TL
708 config, io_ctx, image_name, id, size, opts, create_flags,
709 static_cast<cls::rbd::MirrorImageMode>(mirror_image_mode),
f67539c2
TL
710 non_primary_global_image_id, primary_mirror_uuid,
711 asio_engine.get_work_queue(), &cond);
7c673cae
FG
712 req->send();
713
714 r = cond.wait();
715 }
716
717 int r1 = opts.set(RBD_IMAGE_OPTION_ORDER, order);
11fdf7f2 718 ceph_assert(r1 == 0);
7c673cae
FG
719
720 return r;
721 }
722
723 /*
724 * Parent may be in different pool, hence different IoCtx
725 */
726 int clone(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name,
727 IoCtx& c_ioctx, const char *c_name,
728 uint64_t features, int *c_order,
729 uint64_t stripe_unit, int stripe_count)
730 {
731 uint64_t order = *c_order;
732
733 ImageOptions opts;
734 opts.set(RBD_IMAGE_OPTION_FEATURES, features);
735 opts.set(RBD_IMAGE_OPTION_ORDER, order);
736 opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit);
737 opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count);
738
11fdf7f2
TL
739 int r = clone(p_ioctx, nullptr, p_name, p_snap_name, c_ioctx, nullptr,
740 c_name, opts, "", "");
7c673cae
FG
741 opts.get(RBD_IMAGE_OPTION_ORDER, &order);
742 *c_order = order;
743 return r;
744 }
745
11fdf7f2
TL
746 int clone(IoCtx& p_ioctx, const char *p_id, const char *p_name,
747 const char *p_snap_name, IoCtx& c_ioctx, const char *c_id,
748 const char *c_name, ImageOptions& c_opts,
749 const std::string &non_primary_global_image_id,
750 const std::string &primary_mirror_uuid)
7c673cae 751 {
11fdf7f2
TL
752 ceph_assert((p_id == nullptr) ^ (p_name == nullptr));
753
7c673cae 754 CephContext *cct = (CephContext *)p_ioctx.cct();
11fdf7f2 755 if (p_snap_name == nullptr) {
7c673cae
FG
756 lderr(cct) << "image to be cloned must be a snapshot" << dendl;
757 return -EINVAL;
758 }
759
11fdf7f2
TL
760 uint64_t flatten;
761 if (c_opts.get(RBD_IMAGE_OPTION_FLATTEN, &flatten) == 0) {
762 lderr(cct) << "clone does not support 'flatten' image option" << dendl;
763 return -EINVAL;
7c673cae
FG
764 }
765
11fdf7f2
TL
766 int r;
767 std::string parent_id;
768 if (p_id == nullptr) {
769 r = cls_client::dir_get_id(&p_ioctx, RBD_DIRECTORY, p_name,
770 &parent_id);
771 if (r < 0) {
772 if (r != -ENOENT) {
773 lderr(cct) << "failed to retrieve parent image id: "
774 << cpp_strerror(r) << dendl;
775 }
776 return r;
777 }
778 } else {
779 parent_id = p_id;
7c673cae 780 }
7c673cae 781
11fdf7f2
TL
782 std::string clone_id;
783 if (c_id == nullptr) {
784 clone_id = util::generate_image_id(c_ioctx);
785 } else {
786 clone_id = c_id;
7c673cae
FG
787 }
788
7c673cae
FG
789 ldout(cct, 10) << __func__ << " "
790 << "c_name=" << c_name << ", "
11fdf7f2 791 << "c_id= " << clone_id << ", "
7c673cae
FG
792 << "c_opts=" << c_opts << dendl;
793
11fdf7f2
TL
794 ConfigProxy config{reinterpret_cast<CephContext *>(c_ioctx.cct())->_conf};
795 api::Config<>::apply_pool_overrides(c_ioctx, &config);
796
f67539c2 797 AsioEngine asio_engine(p_ioctx);
7c673cae
FG
798
799 C_SaferCond cond;
800 auto *req = image::CloneRequest<>::create(
9f95a23c
TL
801 config, p_ioctx, parent_id, p_snap_name,
802 {cls::rbd::UserSnapshotNamespace{}}, CEPH_NOSNAP, c_ioctx, c_name,
803 clone_id, c_opts, cls::rbd::MIRROR_IMAGE_MODE_JOURNAL,
f67539c2
TL
804 non_primary_global_image_id, primary_mirror_uuid,
805 asio_engine.get_work_queue(), &cond);
7c673cae
FG
806 req->send();
807
11fdf7f2
TL
808 r = cond.wait();
809 if (r < 0) {
810 return r;
811 }
812
813 return 0;
7c673cae
FG
814 }
815
816 int rename(IoCtx& io_ctx, const char *srcname, const char *dstname)
817 {
818 CephContext *cct = (CephContext *)io_ctx.cct();
819 ldout(cct, 20) << "rename " << &io_ctx << " " << srcname << " -> "
820 << dstname << dendl;
821
822 ImageCtx *ictx = new ImageCtx(srcname, "", "", io_ctx, false);
11fdf7f2 823 int r = ictx->state->open(0);
7c673cae 824 if (r < 0) {
181888fb 825 lderr(cct) << "error opening source image: " << cpp_strerror(r) << dendl;
7c673cae
FG
826 return r;
827 }
828 BOOST_SCOPE_EXIT((ictx)) {
829 ictx->state->close();
830 } BOOST_SCOPE_EXIT_END
831
832 return ictx->operations->rename(dstname);
833 }
834
835 int info(ImageCtx *ictx, image_info_t& info, size_t infosize)
836 {
837 ldout(ictx->cct, 20) << "info " << ictx << dendl;
838
839 int r = ictx->state->refresh_if_required();
840 if (r < 0)
841 return r;
842
843 image_info(ictx, info, infosize);
844 return 0;
845 }
846
847 int get_old_format(ImageCtx *ictx, uint8_t *old)
848 {
849 int r = ictx->state->refresh_if_required();
850 if (r < 0)
851 return r;
852 *old = ictx->old_format;
853 return 0;
854 }
855
856 int get_size(ImageCtx *ictx, uint64_t *size)
857 {
858 int r = ictx->state->refresh_if_required();
859 if (r < 0)
860 return r;
9f95a23c 861 std::shared_lock l2{ictx->image_lock};
f67539c2 862 *size = ictx->get_effective_image_size(ictx->snap_id);
7c673cae
FG
863 return 0;
864 }
865
866 int get_features(ImageCtx *ictx, uint64_t *features)
867 {
868 int r = ictx->state->refresh_if_required();
869 if (r < 0)
870 return r;
9f95a23c 871 std::shared_lock l{ictx->image_lock};
7c673cae
FG
872 *features = ictx->features;
873 return 0;
874 }
875
876 int get_overlap(ImageCtx *ictx, uint64_t *overlap)
877 {
878 int r = ictx->state->refresh_if_required();
879 if (r < 0)
880 return r;
9f95a23c 881 std::shared_lock image_locker{ictx->image_lock};
7c673cae
FG
882 return ictx->get_parent_overlap(ictx->snap_id, overlap);
883 }
884
7c673cae
FG
885 int get_flags(ImageCtx *ictx, uint64_t *flags)
886 {
887 int r = ictx->state->refresh_if_required();
888 if (r < 0) {
889 return r;
890 }
891
9f95a23c 892 std::shared_lock l2{ictx->image_lock};
7c673cae
FG
893 return ictx->get_flags(ictx->snap_id, flags);
894 }
895
896 int set_image_notification(ImageCtx *ictx, int fd, int type)
897 {
898 CephContext *cct = ictx->cct;
899 ldout(cct, 20) << __func__ << " " << ictx << " fd " << fd << " type" << type << dendl;
900
901 int r = ictx->state->refresh_if_required();
902 if (r < 0) {
903 return r;
904 }
905
906 if (ictx->event_socket.is_valid())
907 return -EINVAL;
908 return ictx->event_socket.init(fd, type);
909 }
910
911 int is_exclusive_lock_owner(ImageCtx *ictx, bool *is_owner)
912 {
91327a77
AA
913 CephContext *cct = ictx->cct;
914 ldout(cct, 20) << __func__ << ": ictx=" << ictx << dendl;
7c673cae
FG
915 *is_owner = false;
916
9f95a23c 917 std::shared_lock owner_locker{ictx->owner_lock};
91327a77 918 if (ictx->exclusive_lock == nullptr) {
7c673cae
FG
919 return 0;
920 }
921
f67539c2 922 // might have been blocklisted by peer -- ensure we still own
7c673cae
FG
923 // the lock by pinging the OSD
924 int r = ictx->exclusive_lock->assert_header_locked();
31f18b77
FG
925 if (r == -EBUSY || r == -ENOENT) {
926 return 0;
927 } else if (r < 0) {
7c673cae
FG
928 return r;
929 }
930
931 *is_owner = true;
932 return 0;
933 }
934
935 int lock_acquire(ImageCtx *ictx, rbd_lock_mode_t lock_mode)
936 {
937 CephContext *cct = ictx->cct;
938 ldout(cct, 20) << __func__ << ": ictx=" << ictx << ", "
939 << "lock_mode=" << lock_mode << dendl;
940
941 if (lock_mode != RBD_LOCK_MODE_EXCLUSIVE) {
942 return -EOPNOTSUPP;
943 }
944
945 C_SaferCond lock_ctx;
946 {
9f95a23c 947 std::unique_lock l{ictx->owner_lock};
7c673cae
FG
948
949 if (ictx->exclusive_lock == nullptr) {
950 lderr(cct) << "exclusive-lock feature is not enabled" << dendl;
951 return -EINVAL;
952 }
953
954 if (ictx->get_exclusive_lock_policy()->may_auto_request_lock()) {
955 ictx->set_exclusive_lock_policy(
956 new exclusive_lock::StandardPolicy(ictx));
957 }
958
959 if (ictx->exclusive_lock->is_lock_owner()) {
960 return 0;
961 }
962
963 ictx->exclusive_lock->acquire_lock(&lock_ctx);
964 }
965
966 int r = lock_ctx.wait();
967 if (r < 0) {
968 lderr(cct) << "failed to request exclusive lock: " << cpp_strerror(r)
969 << dendl;
970 return r;
971 }
972
9f95a23c 973 std::shared_lock l{ictx->owner_lock};
91327a77
AA
974 if (ictx->exclusive_lock == nullptr) {
975 return -EINVAL;
976 } else if (!ictx->exclusive_lock->is_lock_owner()) {
7c673cae 977 lderr(cct) << "failed to acquire exclusive lock" << dendl;
91327a77 978 return ictx->exclusive_lock->get_unlocked_op_error();
7c673cae
FG
979 }
980
981 return 0;
982 }
983
984 int lock_release(ImageCtx *ictx)
985 {
986 CephContext *cct = ictx->cct;
987 ldout(cct, 20) << __func__ << ": ictx=" << ictx << dendl;
988
989 C_SaferCond lock_ctx;
990 {
9f95a23c 991 std::unique_lock l{ictx->owner_lock};
7c673cae
FG
992
993 if (ictx->exclusive_lock == nullptr ||
994 !ictx->exclusive_lock->is_lock_owner()) {
995 lderr(cct) << "not exclusive lock owner" << dendl;
996 return -EINVAL;
997 }
998
999 ictx->exclusive_lock->release_lock(&lock_ctx);
1000 }
1001
1002 int r = lock_ctx.wait();
1003 if (r < 0) {
1004 lderr(cct) << "failed to release exclusive lock: " << cpp_strerror(r)
1005 << dendl;
1006 return r;
1007 }
1008 return 0;
1009 }
1010
1011 int lock_get_owners(ImageCtx *ictx, rbd_lock_mode_t *lock_mode,
1012 std::list<std::string> *lock_owners)
1013 {
1014 CephContext *cct = ictx->cct;
1015 ldout(cct, 20) << __func__ << ": ictx=" << ictx << dendl;
1016
7c673cae
FG
1017 managed_lock::Locker locker;
1018 C_SaferCond get_owner_ctx;
9f95a23c
TL
1019 {
1020 std::shared_lock owner_locker{ictx->owner_lock};
1021
1022 if (ictx->exclusive_lock == nullptr) {
1023 lderr(cct) << "exclusive-lock feature is not enabled" << dendl;
1024 return -EINVAL;
1025 }
1026
1027 ictx->exclusive_lock->get_locker(&locker, &get_owner_ctx);
1028 }
1029
7c673cae
FG
1030 int r = get_owner_ctx.wait();
1031 if (r == -ENOENT) {
1032 return r;
1033 } else if (r < 0) {
1034 lderr(cct) << "failed to determine current lock owner: "
1035 << cpp_strerror(r) << dendl;
1036 return r;
1037 }
1038
1039 *lock_mode = RBD_LOCK_MODE_EXCLUSIVE;
1040 lock_owners->clear();
1041 lock_owners->emplace_back(locker.address);
1042 return 0;
1043 }
1044
1045 int lock_break(ImageCtx *ictx, rbd_lock_mode_t lock_mode,
11fdf7f2 1046 const std::string &lock_owner) {
7c673cae
FG
1047 CephContext *cct = ictx->cct;
1048 ldout(cct, 20) << __func__ << ": ictx=" << ictx << ", "
1049 << "lock_mode=" << lock_mode << ", "
1050 << "lock_owner=" << lock_owner << dendl;
1051
1052 if (lock_mode != RBD_LOCK_MODE_EXCLUSIVE) {
1053 return -EOPNOTSUPP;
1054 }
1055
1056 if (ictx->read_only) {
1057 return -EROFS;
1058 }
1059
1060 managed_lock::Locker locker;
1061 C_SaferCond get_owner_ctx;
1062 {
9f95a23c 1063 std::shared_lock l{ictx->owner_lock};
7c673cae
FG
1064
1065 if (ictx->exclusive_lock == nullptr) {
1066 lderr(cct) << "exclusive-lock feature is not enabled" << dendl;
1067 return -EINVAL;
1068 }
1069
1070 ictx->exclusive_lock->get_locker(&locker, &get_owner_ctx);
1071 }
1072 int r = get_owner_ctx.wait();
1073 if (r == -ENOENT) {
1074 return r;
1075 } else if (r < 0) {
1076 lderr(cct) << "failed to determine current lock owner: "
1077 << cpp_strerror(r) << dendl;
1078 return r;
1079 }
1080
1081 if (locker.address != lock_owner) {
1082 return -EBUSY;
1083 }
1084
1085 C_SaferCond break_ctx;
1086 {
9f95a23c 1087 std::shared_lock l{ictx->owner_lock};
7c673cae
FG
1088
1089 if (ictx->exclusive_lock == nullptr) {
1090 lderr(cct) << "exclusive-lock feature is not enabled" << dendl;
1091 return -EINVAL;
1092 }
1093
1094 ictx->exclusive_lock->break_lock(locker, true, &break_ctx);
1095 }
1096 r = break_ctx.wait();
1097 if (r == -ENOENT) {
1098 return r;
1099 } else if (r < 0) {
1100 lderr(cct) << "failed to break lock: " << cpp_strerror(r) << dendl;
1101 return r;
1102 }
1103 return 0;
1104 }
1105
7c673cae
FG
1106 int copy(ImageCtx *src, IoCtx& dest_md_ctx, const char *destname,
1107 ImageOptions& opts, ProgressContext &prog_ctx, size_t sparse_size)
1108 {
1109 CephContext *cct = (CephContext *)dest_md_ctx.cct();
92f5a8d4
TL
1110 uint64_t option;
1111 if (opts.get(RBD_IMAGE_OPTION_FLATTEN, &option) == 0) {
11fdf7f2
TL
1112 lderr(cct) << "copy does not support 'flatten' image option" << dendl;
1113 return -EINVAL;
1114 }
92f5a8d4
TL
1115 if (opts.get(RBD_IMAGE_OPTION_CLONE_FORMAT, &option) == 0) {
1116 lderr(cct) << "copy does not support 'clone_format' image option"
1117 << dendl;
1118 return -EINVAL;
1119 }
11fdf7f2 1120
7c673cae
FG
1121 ldout(cct, 20) << "copy " << src->name
1122 << (src->snap_name.length() ? "@" + src->snap_name : "")
1123 << " -> " << destname << " opts = " << opts << dendl;
1124
9f95a23c 1125 src->image_lock.lock_shared();
7c673cae
FG
1126 uint64_t features = src->features;
1127 uint64_t src_size = src->get_image_size(src->snap_id);
9f95a23c 1128 src->image_lock.unlock_shared();
e306af50 1129 uint64_t format = 2;
7c673cae
FG
1130 if (opts.get(RBD_IMAGE_OPTION_FORMAT, &format) != 0) {
1131 opts.set(RBD_IMAGE_OPTION_FORMAT, format);
1132 }
1133 uint64_t stripe_unit = src->stripe_unit;
1134 if (opts.get(RBD_IMAGE_OPTION_STRIPE_UNIT, &stripe_unit) != 0) {
1135 opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit);
1136 }
1137 uint64_t stripe_count = src->stripe_count;
1138 if (opts.get(RBD_IMAGE_OPTION_STRIPE_COUNT, &stripe_count) != 0) {
1139 opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count);
1140 }
1141 uint64_t order = src->order;
1142 if (opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0) {
1143 opts.set(RBD_IMAGE_OPTION_ORDER, order);
1144 }
1145 if (opts.get(RBD_IMAGE_OPTION_FEATURES, &features) != 0) {
1146 opts.set(RBD_IMAGE_OPTION_FEATURES, features);
1147 }
1148 if (features & ~RBD_FEATURES_ALL) {
1149 lderr(cct) << "librbd does not support requested features" << dendl;
1150 return -ENOSYS;
1151 }
1152
1153 int r = create(dest_md_ctx, destname, "", src_size, opts, "", "", false);
1154 if (r < 0) {
1155 lderr(cct) << "header creation failed" << dendl;
1156 return r;
1157 }
1158 opts.set(RBD_IMAGE_OPTION_ORDER, static_cast<uint64_t>(order));
1159
11fdf7f2
TL
1160 ImageCtx *dest = new librbd::ImageCtx(destname, "", nullptr, dest_md_ctx,
1161 false);
1162 r = dest->state->open(0);
7c673cae
FG
1163 if (r < 0) {
1164 lderr(cct) << "failed to read newly created header" << dendl;
1165 return r;
1166 }
1167
1168 r = copy(src, dest, prog_ctx, sparse_size);
1169
1170 int close_r = dest->state->close();
1171 if (r == 0 && close_r < 0) {
1172 r = close_r;
1173 }
1174 return r;
1175 }
1176
1177 class C_CopyWrite : public Context {
1178 public:
1179 C_CopyWrite(bufferlist *bl, Context* ctx)
1180 : m_bl(bl), m_ctx(ctx) {}
1181 void finish(int r) override {
1182 delete m_bl;
1183 m_ctx->complete(r);
1184 }
1185 private:
1186 bufferlist *m_bl;
1187 Context *m_ctx;
1188 };
1189
1190 class C_CopyRead : public Context {
1191 public:
1192 C_CopyRead(SimpleThrottle *throttle, ImageCtx *dest, uint64_t offset,
1193 bufferlist *bl, size_t sparse_size)
1194 : m_throttle(throttle), m_dest(dest), m_offset(offset), m_bl(bl),
1195 m_sparse_size(sparse_size) {
1196 m_throttle->start_op();
1197 }
1198 void finish(int r) override {
1199 if (r < 0) {
1200 lderr(m_dest->cct) << "error reading from source image at offset "
1201 << m_offset << ": " << cpp_strerror(r) << dendl;
1202 delete m_bl;
1203 m_throttle->end_op(r);
1204 return;
1205 }
11fdf7f2 1206 ceph_assert(m_bl->length() == (size_t)r);
7c673cae
FG
1207
1208 if (m_bl->is_zero()) {
1209 delete m_bl;
1210 m_throttle->end_op(r);
1211 return;
1212 }
1213
1214 if (!m_sparse_size) {
1215 m_sparse_size = (1 << m_dest->order);
1216 }
1217
1218 auto *throttle = m_throttle;
9f95a23c 1219 auto *end_op_ctx = new LambdaContext([throttle](int r) {
7c673cae
FG
1220 throttle->end_op(r);
1221 });
1222 auto gather_ctx = new C_Gather(m_dest->cct, end_op_ctx);
1223
11fdf7f2 1224 m_bl->rebuild(buffer::ptr_node::create(m_bl->length()));
7c673cae
FG
1225 size_t write_offset = 0;
1226 size_t write_length = 0;
1227 size_t offset = 0;
1228 size_t length = m_bl->length();
11fdf7f2 1229 const auto& m_ptr = m_bl->front();
7c673cae
FG
1230 while (offset < length) {
1231 if (util::calc_sparse_extent(m_ptr,
1232 m_sparse_size,
1233 length,
1234 &write_offset,
1235 &write_length,
1236 &offset)) {
7c673cae 1237 bufferlist *write_bl = new bufferlist();
11fdf7f2
TL
1238 write_bl->push_back(
1239 buffer::ptr_node::create(m_ptr, write_offset, write_length));
7c673cae
FG
1240 Context *ctx = new C_CopyWrite(write_bl, gather_ctx->new_sub());
1241 auto comp = io::AioCompletion::create(ctx);
1242
1243 // coordinate through AIO WQ to ensure lock is acquired if needed
f67539c2
TL
1244 api::Io<>::aio_write(*m_dest, comp, m_offset + write_offset,
1245 write_length, std::move(*write_bl),
1246 LIBRADOS_OP_FLAG_FADVISE_DONTNEED,
1247 std::move(read_trace));
7c673cae
FG
1248 write_offset = offset;
1249 write_length = 0;
1250 }
1251 }
1252 delete m_bl;
11fdf7f2 1253 ceph_assert(gather_ctx->get_sub_created_count() > 0);
7c673cae
FG
1254 gather_ctx->activate();
1255 }
1256
31f18b77
FG
1257 ZTracer::Trace read_trace;
1258
7c673cae
FG
1259 private:
1260 SimpleThrottle *m_throttle;
1261 ImageCtx *m_dest;
1262 uint64_t m_offset;
1263 bufferlist *m_bl;
1264 size_t m_sparse_size;
1265 };
1266
1267 int copy(ImageCtx *src, ImageCtx *dest, ProgressContext &prog_ctx, size_t sparse_size)
1268 {
9f95a23c 1269 src->image_lock.lock_shared();
7c673cae 1270 uint64_t src_size = src->get_image_size(src->snap_id);
9f95a23c 1271 src->image_lock.unlock_shared();
7c673cae 1272
9f95a23c 1273 dest->image_lock.lock_shared();
7c673cae 1274 uint64_t dest_size = dest->get_image_size(dest->snap_id);
9f95a23c 1275 dest->image_lock.unlock_shared();
7c673cae
FG
1276
1277 CephContext *cct = src->cct;
1278 if (dest_size < src_size) {
1279 lderr(cct) << " src size " << src_size << " > dest size "
1280 << dest_size << dendl;
1281 return -EINVAL;
1282 }
b32b8144 1283
20effc67
TL
1284 // ensure previous writes are visible to dest
1285 C_SaferCond flush_ctx;
1286 {
1287 auto aio_comp = io::AioCompletion::create_and_start(&flush_ctx, src,
1288 io::AIO_TYPE_FLUSH);
1289 auto req = io::ImageDispatchSpec::create_flush(
1290 *src, io::IMAGE_DISPATCH_LAYER_INTERNAL_START,
1291 aio_comp, io::FLUSH_SOURCE_INTERNAL, {});
1292 req->send();
1293 }
1294 int r = flush_ctx.wait();
1295 if (r < 0) {
1296 return r;
1297 }
1298
9f95a23c
TL
1299 C_SaferCond ctx;
1300 auto req = deep_copy::MetadataCopyRequest<>::create(
1301 src, dest, &ctx);
1302 req->send();
b32b8144 1303
20effc67 1304 r = ctx.wait();
9f95a23c
TL
1305 if (r < 0) {
1306 lderr(cct) << "failed to copy metadata: " << cpp_strerror(r) << dendl;
1307 return r;
7c673cae
FG
1308 }
1309
31f18b77 1310 ZTracer::Trace trace;
181888fb 1311 if (src->blkin_trace_all) {
31f18b77
FG
1312 trace.init("copy", &src->trace_endpoint);
1313 }
1314
11fdf7f2 1315 SimpleThrottle throttle(src->config.get_val<uint64_t>("rbd_concurrent_management_ops"), false);
7c673cae 1316 uint64_t period = src->get_stripe_period();
31f18b77
FG
1317 unsigned fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL |
1318 LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
11fdf7f2 1319 uint64_t object_id = 0;
7c673cae
FG
1320 for (uint64_t offset = 0; offset < src_size; offset += period) {
1321 if (throttle.pending_error()) {
1322 return throttle.wait_for_ret();
1323 }
1324
11fdf7f2 1325 {
9f95a23c 1326 std::shared_lock image_locker{src->image_lock};
11fdf7f2
TL
1327 if (src->object_map != nullptr) {
1328 bool skip = true;
1329 // each period is related to src->stripe_count objects, check them all
1330 for (uint64_t i=0; i < src->stripe_count; i++) {
1331 if (object_id < src->object_map->size() &&
1332 src->object_map->object_may_exist(object_id)) {
1333 skip = false;
1334 }
1335 ++object_id;
1336 }
1337
1338 if (skip) continue;
1339 } else {
1340 object_id += src->stripe_count;
1341 }
1342 }
1343
20effc67 1344 uint64_t len = std::min(period, src_size - offset);
7c673cae 1345 bufferlist *bl = new bufferlist();
31f18b77
FG
1346 auto ctx = new C_CopyRead(&throttle, dest, offset, bl, sparse_size);
1347 auto comp = io::AioCompletion::create_and_start<Context>(
1348 ctx, src, io::AIO_TYPE_READ);
f67539c2
TL
1349 auto req = io::ImageDispatchSpec::create_read(
1350 *src, io::IMAGE_DISPATCH_LAYER_NONE, comp,
1351 {{offset, len}}, io::ReadResult{bl},
1352 src->get_data_io_context(), fadvise_flags, 0, trace);
31f18b77 1353
f67539c2
TL
1354 ctx->read_trace = trace;
1355 req->send();
31f18b77 1356
7c673cae
FG
1357 prog_ctx.update_progress(offset, src_size);
1358 }
1359
1360 r = throttle.wait_for_ret();
1361 if (r >= 0)
1362 prog_ctx.update_progress(src_size, src_size);
1363 return r;
1364 }
1365
7c673cae
FG
1366 int list_lockers(ImageCtx *ictx,
1367 std::list<locker_t> *lockers,
1368 bool *exclusive,
1369 string *tag)
1370 {
1371 ldout(ictx->cct, 20) << "list_locks on image " << ictx << dendl;
1372
1373 int r = ictx->state->refresh_if_required();
1374 if (r < 0)
1375 return r;
1376
9f95a23c 1377 std::shared_lock locker{ictx->image_lock};
7c673cae
FG
1378 if (exclusive)
1379 *exclusive = ictx->exclusive_locked;
1380 if (tag)
1381 *tag = ictx->lock_tag;
1382 if (lockers) {
1383 lockers->clear();
1384 map<rados::cls::lock::locker_id_t,
1385 rados::cls::lock::locker_info_t>::const_iterator it;
1386 for (it = ictx->lockers.begin(); it != ictx->lockers.end(); ++it) {
1387 locker_t locker;
1388 locker.client = stringify(it->first.locker);
1389 locker.cookie = it->first.cookie;
11fdf7f2 1390 locker.address = it->second.addr.get_legacy_str();
7c673cae
FG
1391 lockers->push_back(locker);
1392 }
1393 }
1394
1395 return 0;
1396 }
1397
1398 int lock(ImageCtx *ictx, bool exclusive, const string& cookie,
1399 const string& tag)
1400 {
1401 ldout(ictx->cct, 20) << "lock image " << ictx << " exclusive=" << exclusive
1402 << " cookie='" << cookie << "' tag='" << tag << "'"
1403 << dendl;
1404
1405 int r = ictx->state->refresh_if_required();
1406 if (r < 0)
1407 return r;
1408
1409 /**
1410 * If we wanted we could do something more intelligent, like local
1411 * checks that we think we will succeed. But for now, let's not
1412 * duplicate that code.
1413 */
1414 {
9f95a23c 1415 std::shared_lock locker{ictx->image_lock};
7c673cae 1416 r = rados::cls::lock::lock(&ictx->md_ctx, ictx->header_oid, RBD_LOCK_NAME,
f67539c2 1417 exclusive ? ClsLockType::EXCLUSIVE : ClsLockType::SHARED,
7c673cae
FG
1418 cookie, tag, "", utime_t(), 0);
1419 if (r < 0) {
1420 return r;
1421 }
1422 }
1423
1424 ictx->notify_update();
1425 return 0;
1426 }
1427
1428 int unlock(ImageCtx *ictx, const string& cookie)
1429 {
1430 ldout(ictx->cct, 20) << "unlock image " << ictx
1431 << " cookie='" << cookie << "'" << dendl;
1432
1433 int r = ictx->state->refresh_if_required();
1434 if (r < 0)
1435 return r;
1436
1437 {
9f95a23c 1438 std::shared_lock locker{ictx->image_lock};
7c673cae
FG
1439 r = rados::cls::lock::unlock(&ictx->md_ctx, ictx->header_oid,
1440 RBD_LOCK_NAME, cookie);
1441 if (r < 0) {
1442 return r;
1443 }
1444 }
1445
1446 ictx->notify_update();
1447 return 0;
1448 }
1449
1450 int break_lock(ImageCtx *ictx, const string& client,
1451 const string& cookie)
1452 {
1453 ldout(ictx->cct, 20) << "break_lock image " << ictx << " client='" << client
1454 << "' cookie='" << cookie << "'" << dendl;
1455
1456 int r = ictx->state->refresh_if_required();
1457 if (r < 0)
1458 return r;
1459
1460 entity_name_t lock_client;
1461 if (!lock_client.parse(client)) {
1462 lderr(ictx->cct) << "Unable to parse client '" << client
1463 << "'" << dendl;
1464 return -EINVAL;
1465 }
1466
f67539c2 1467 if (ictx->config.get_val<bool>("rbd_blocklist_on_break_lock")) {
7c673cae
FG
1468 typedef std::map<rados::cls::lock::locker_id_t,
1469 rados::cls::lock::locker_info_t> Lockers;
1470 Lockers lockers;
1471 ClsLockType lock_type;
1472 std::string lock_tag;
1473 r = rados::cls::lock::get_lock_info(&ictx->md_ctx, ictx->header_oid,
1474 RBD_LOCK_NAME, &lockers, &lock_type,
1475 &lock_tag);
1476 if (r < 0) {
1477 lderr(ictx->cct) << "unable to retrieve lock info: " << cpp_strerror(r)
1478 << dendl;
1479 return r;
1480 }
1481
1482 std::string client_address;
1483 for (Lockers::iterator it = lockers.begin();
1484 it != lockers.end(); ++it) {
1485 if (it->first.locker == lock_client) {
11fdf7f2 1486 client_address = it->second.addr.get_legacy_str();
7c673cae
FG
1487 break;
1488 }
1489 }
1490 if (client_address.empty()) {
1491 return -ENOENT;
1492 }
1493
7c673cae 1494 librados::Rados rados(ictx->md_ctx);
f67539c2 1495 r = rados.blocklist_add(
11fdf7f2 1496 client_address,
f67539c2 1497 ictx->config.get_val<uint64_t>("rbd_blocklist_expire_seconds"));
7c673cae 1498 if (r < 0) {
f67539c2 1499 lderr(ictx->cct) << "unable to blocklist client: " << cpp_strerror(r)
7c673cae
FG
1500 << dendl;
1501 return r;
1502 }
1503 }
1504
1505 r = rados::cls::lock::break_lock(&ictx->md_ctx, ictx->header_oid,
1506 RBD_LOCK_NAME, cookie, lock_client);
1507 if (r < 0)
1508 return r;
1509 ictx->notify_update();
1510 return 0;
1511 }
1512
1513 void rbd_ctx_cb(completion_t cb, void *arg)
1514 {
1515 Context *ctx = reinterpret_cast<Context *>(arg);
1516 auto comp = reinterpret_cast<io::AioCompletion *>(cb);
1517 ctx->complete(comp->get_return_value());
1518 comp->release();
1519 }
1520
1521 int64_t read_iterate(ImageCtx *ictx, uint64_t off, uint64_t len,
1522 int (*cb)(uint64_t, size_t, const char *, void *),
1523 void *arg)
1524 {
11fdf7f2
TL
1525 coarse_mono_time start_time;
1526 ceph::timespan elapsed;
7c673cae
FG
1527
1528 ldout(ictx->cct, 20) << "read_iterate " << ictx << " off = " << off
1529 << " len = " << len << dendl;
1530
1531 int r = ictx->state->refresh_if_required();
1532 if (r < 0)
1533 return r;
1534
1535 uint64_t mylen = len;
9f95a23c 1536 ictx->image_lock.lock_shared();
7c673cae 1537 r = clip_io(ictx, off, &mylen);
9f95a23c 1538 ictx->image_lock.unlock_shared();
7c673cae
FG
1539 if (r < 0)
1540 return r;
1541
1542 int64_t total_read = 0;
1543 uint64_t period = ictx->get_stripe_period();
1544 uint64_t left = mylen;
1545
31f18b77 1546 ZTracer::Trace trace;
181888fb 1547 if (ictx->blkin_trace_all) {
31f18b77
FG
1548 trace.init("read_iterate", &ictx->trace_endpoint);
1549 }
1550
9f95a23c 1551 std::shared_lock owner_locker{ictx->owner_lock};
11fdf7f2 1552 start_time = coarse_mono_clock::now();
7c673cae
FG
1553 while (left > 0) {
1554 uint64_t period_off = off - (off % period);
20effc67 1555 uint64_t read_len = std::min(period_off + period - off, left);
7c673cae
FG
1556
1557 bufferlist bl;
1558
1559 C_SaferCond ctx;
1560 auto c = io::AioCompletion::create_and_start(&ctx, ictx,
1561 io::AIO_TYPE_READ);
f67539c2
TL
1562 auto req = io::ImageDispatchSpec::create_read(
1563 *ictx, io::IMAGE_DISPATCH_LAYER_NONE, c,
1564 {{off, read_len}}, io::ReadResult{&bl},
1565 ictx->get_data_io_context(), 0, 0, trace);
1566 req->send();
7c673cae
FG
1567
1568 int ret = ctx.wait();
1569 if (ret < 0) {
1570 return ret;
1571 }
1572
1573 r = cb(total_read, ret, bl.c_str(), arg);
1574 if (r < 0) {
1575 return r;
1576 }
1577
1578 total_read += ret;
1579 left -= ret;
1580 off += ret;
1581 }
1582
11fdf7f2 1583 elapsed = coarse_mono_clock::now() - start_time;
7c673cae
FG
1584 ictx->perfcounter->tinc(l_librbd_rd_latency, elapsed);
1585 ictx->perfcounter->inc(l_librbd_rd);
1586 ictx->perfcounter->inc(l_librbd_rd_bytes, mylen);
1587 return total_read;
1588 }
1589
1590 // validate extent against image size; clip to image size if necessary
1591 int clip_io(ImageCtx *ictx, uint64_t off, uint64_t *len)
1592 {
9f95a23c 1593 ceph_assert(ceph_mutex_is_locked(ictx->image_lock));
7c673cae 1594
f67539c2
TL
1595 if (ictx->snap_id != CEPH_NOSNAP &&
1596 ictx->get_snap_info(ictx->snap_id) == nullptr) {
7c673cae 1597 return -ENOENT;
f67539c2
TL
1598 }
1599 uint64_t image_size = ictx->get_effective_image_size(ictx->snap_id);
7c673cae
FG
1600
1601 // special-case "len == 0" requests: always valid
1602 if (*len == 0)
1603 return 0;
1604
1605 // can't start past end
1606 if (off >= image_size)
1607 return -EINVAL;
1608
1609 // clip requests that extend past end to just end
1610 if ((off + *len) > image_size)
1611 *len = (size_t)(image_size - off);
1612
1613 return 0;
1614 }
1615
11fdf7f2 1616 int invalidate_cache(ImageCtx *ictx)
7c673cae
FG
1617 {
1618 CephContext *cct = ictx->cct;
11fdf7f2 1619 ldout(cct, 20) << "invalidate_cache " << ictx << dendl;
7c673cae
FG
1620
1621 int r = ictx->state->refresh_if_required();
1622 if (r < 0) {
1623 return r;
1624 }
1625
7c673cae
FG
1626 C_SaferCond ctx;
1627 {
f67539c2 1628 ictx->io_image_dispatcher->invalidate_cache(&ctx);
7c673cae
FG
1629 }
1630 r = ctx.wait();
f67539c2
TL
1631
1632 if (r < 0) {
1633 ldout(cct, 20) << "failed to invalidate image cache" << dendl;
1634 return r;
1635 }
1636
7c673cae 1637 ictx->perfcounter->inc(l_librbd_invalidate_cache);
f67539c2
TL
1638
1639 // Delete writeback cache if it is not initialized
1640 if ((!ictx->exclusive_lock ||
1641 !ictx->exclusive_lock->is_lock_owner()) &&
1642 ictx->test_features(RBD_FEATURE_DIRTY_CACHE)) {
1643 C_SaferCond ctx3;
1644 ictx->plugin_registry->discard(&ctx3);
1645 r = ctx3.wait();
1646 }
7c673cae
FG
1647 return r;
1648 }
1649
1650 int poll_io_events(ImageCtx *ictx, io::AioCompletion **comps, int numcomp)
1651 {
1652 if (numcomp <= 0)
1653 return -EINVAL;
1654 CephContext *cct = ictx->cct;
1655 ldout(cct, 20) << __func__ << " " << ictx << " numcomp = " << numcomp
1656 << dendl;
1657 int i = 0;
9f95a23c
TL
1658 while (i < numcomp && ictx->event_socket_completions.pop(comps[i])) {
1659 ++i;
7c673cae 1660 }
9f95a23c 1661
7c673cae
FG
1662 return i;
1663 }
1664
1665 int metadata_get(ImageCtx *ictx, const string &key, string *value)
1666 {
1667 CephContext *cct = ictx->cct;
1668 ldout(cct, 20) << "metadata_get " << ictx << " key=" << key << dendl;
1669
1670 int r = ictx->state->refresh_if_required();
1671 if (r < 0) {
1672 return r;
1673 }
1674
1675 return cls_client::metadata_get(&ictx->md_ctx, ictx->header_oid, key, value);
1676 }
1677
1678 int metadata_list(ImageCtx *ictx, const string &start, uint64_t max, map<string, bufferlist> *pairs)
1679 {
1680 CephContext *cct = ictx->cct;
1681 ldout(cct, 20) << "metadata_list " << ictx << dendl;
1682
1683 int r = ictx->state->refresh_if_required();
1684 if (r < 0) {
1685 return r;
1686 }
1687
9f95a23c
TL
1688 C_SaferCond ctx;
1689 auto req = image::GetMetadataRequest<>::create(
1690 ictx->md_ctx, ictx->header_oid, false, "", start, max, pairs, &ctx);
1691 req->send();
7c673cae 1692
9f95a23c 1693 return ctx.wait();
7c673cae
FG
1694 }
1695
11fdf7f2
TL
1696 int list_watchers(ImageCtx *ictx,
1697 std::list<librbd::image_watcher_t> &watchers)
1698 {
1699 int r;
1700 std::string header_oid;
1701 std::list<obj_watch_t> obj_watchers;
1702
1703 if (ictx->old_format) {
1704 header_oid = util::old_header_name(ictx->name);
1705 } else {
1706 header_oid = util::header_name(ictx->id);
1707 }
1708
1709 r = ictx->md_ctx.list_watchers(header_oid, &obj_watchers);
1710 if (r < 0) {
1711 return r;
1712 }
1713
1714 for (auto i = obj_watchers.begin(); i != obj_watchers.end(); ++i) {
1715 librbd::image_watcher_t watcher;
1716 watcher.addr = i->addr;
1717 watcher.id = i->watcher_id;
1718 watcher.cookie = i->cookie;
1719
1720 watchers.push_back(watcher);
1721 }
1722
1723 return 0;
1724 }
1725
1726}
1727
1728std::ostream &operator<<(std::ostream &os, const librbd::ImageOptions &opts) {
1729 os << "[";
1730
1731 const char *delimiter = "";
1732 for (auto &i : librbd::IMAGE_OPTIONS_TYPE_MAPPING) {
1733 if (i.second == librbd::STR) {
1734 std::string val;
1735 if (opts.get(i.first, &val) == 0) {
1736 os << delimiter << librbd::image_option_name(i.first) << "=" << val;
1737 delimiter = ", ";
1738 }
1739 } else if (i.second == librbd::UINT64) {
1740 uint64_t val;
1741 if (opts.get(i.first, &val) == 0) {
1742 os << delimiter << librbd::image_option_name(i.first) << "=" << val;
1743 delimiter = ", ";
1744 }
1745 }
1746 }
7c673cae 1747
11fdf7f2 1748 os << "]";
7c673cae 1749
11fdf7f2 1750 return os;
7c673cae 1751}