]> git.proxmox.com Git - ceph.git/blob - ceph/src/librbd/internal.cc
bump version to 19.2.0-pve1
[ceph.git] / ceph / src / librbd / internal.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 #include "include/int_types.h"
4
5 #include <errno.h>
6 #include <limits.h>
7
8 #include "include/types.h"
9 #include "include/uuid.h"
10 #include "common/ceph_context.h"
11 #include "common/dout.h"
12 #include "common/errno.h"
13 #include "common/Throttle.h"
14 #include "common/event_socket.h"
15 #include "common/perf_counters.h"
16 #include "osdc/Striper.h"
17 #include "include/stringify.h"
18
19 #include "cls/lock/cls_lock_client.h"
20 #include "cls/rbd/cls_rbd.h"
21 #include "cls/rbd/cls_rbd_types.h"
22 #include "cls/rbd/cls_rbd_client.h"
23 #include "cls/journal/cls_journal_types.h"
24 #include "cls/journal/cls_journal_client.h"
25
26 #include "librbd/AsioEngine.h"
27 #include "librbd/ExclusiveLock.h"
28 #include "librbd/ImageCtx.h"
29 #include "librbd/ImageState.h"
30 #include "librbd/internal.h"
31 #include "librbd/Journal.h"
32 #include "librbd/ObjectMap.h"
33 #include "librbd/Operations.h"
34 #include "librbd/PluginRegistry.h"
35 #include "librbd/Types.h"
36 #include "librbd/Utils.h"
37 #include "librbd/api/Config.h"
38 #include "librbd/api/Image.h"
39 #include "librbd/api/Io.h"
40 #include "librbd/cache/Utils.h"
41 #include "librbd/exclusive_lock/AutomaticPolicy.h"
42 #include "librbd/exclusive_lock/StandardPolicy.h"
43 #include "librbd/deep_copy/MetadataCopyRequest.h"
44 #include "librbd/image/CloneRequest.h"
45 #include "librbd/image/CreateRequest.h"
46 #include "librbd/image/GetMetadataRequest.h"
47 #include "librbd/image/Types.h"
48 #include "librbd/io/AioCompletion.h"
49 #include "librbd/io/ImageDispatchSpec.h"
50 #include "librbd/io/ImageDispatcherInterface.h"
51 #include "librbd/io/ObjectDispatcherInterface.h"
52 #include "librbd/io/ObjectRequest.h"
53 #include "librbd/io/ReadResult.h"
54 #include "librbd/journal/Types.h"
55 #include "librbd/managed_lock/Types.h"
56 #include "librbd/mirror/EnableRequest.h"
57 #include "librbd/operation/TrimRequest.h"
58
59 #include "journal/Journaler.h"
60
61 #include <boost/scope_exit.hpp>
62 #include <boost/variant.hpp>
63 #include "include/ceph_assert.h"
64
65 #define dout_subsys ceph_subsys_rbd
66 #undef dout_prefix
67 #define dout_prefix *_dout << "librbd: "
68
69 #define rbd_howmany(x, y) (((x) + (y) - 1) / (y))
70
71 using std::istringstream;
72 using std::map;
73 using std::pair;
74 using std::set;
75 using std::string;
76 using std::vector;
77 // list binds to list() here, so std::list is explicitly used below
78
79 using ceph::bufferlist;
80 using librados::snap_t;
81 using librados::IoCtx;
82 using librados::Rados;
83
84 namespace librbd {
85
86 namespace {
87
88 int validate_pool(IoCtx &io_ctx, CephContext *cct) {
89 if (!cct->_conf.get_val<bool>("rbd_validate_pool")) {
90 return 0;
91 }
92
93 int r = io_ctx.stat(RBD_DIRECTORY, NULL, NULL);
94 if (r == 0) {
95 return 0;
96 } else if (r < 0 && r != -ENOENT) {
97 lderr(cct) << "failed to stat RBD directory: " << cpp_strerror(r) << dendl;
98 return r;
99 }
100
101 // allocate a self-managed snapshot id if this a new pool to force
102 // self-managed snapshot mode
103 uint64_t snap_id;
104 r = io_ctx.selfmanaged_snap_create(&snap_id);
105 if (r == -EINVAL) {
106 lderr(cct) << "pool not configured for self-managed RBD snapshot support"
107 << dendl;
108 return r;
109 } else if (r < 0) {
110 lderr(cct) << "failed to allocate self-managed snapshot: "
111 << cpp_strerror(r) << dendl;
112 return r;
113 }
114
115 r = io_ctx.selfmanaged_snap_remove(snap_id);
116 if (r < 0) {
117 lderr(cct) << "failed to release self-managed snapshot " << snap_id
118 << ": " << cpp_strerror(r) << dendl;
119 }
120 return 0;
121 }
122
123 } // anonymous namespace
124
125 int detect_format(IoCtx &io_ctx, const string &name,
126 bool *old_format, uint64_t *size)
127 {
128 CephContext *cct = (CephContext *)io_ctx.cct();
129 if (old_format)
130 *old_format = true;
131 int r = io_ctx.stat(util::old_header_name(name), size, NULL);
132 if (r == -ENOENT) {
133 if (old_format)
134 *old_format = false;
135 r = io_ctx.stat(util::id_obj_name(name), size, NULL);
136 if (r < 0)
137 return r;
138 } else if (r < 0) {
139 return r;
140 }
141
142 ldout(cct, 20) << "detect format of " << name << " : "
143 << (old_format ? (*old_format ? "old" : "new") :
144 "don't care") << dendl;
145 return 0;
146 }
147
// Reports whether offset `off` is backed by a parent image: a parent pool
// must be set (id other than -1) and `off` must not exceed `overlap`.
bool has_parent(int64_t parent_pool_id, uint64_t off, uint64_t overlap)
{
  if (parent_pool_id == -1) {
    return false;
  }
  return off <= overlap;
}
152
153 void init_rbd_header(struct rbd_obj_header_ondisk& ondisk,
154 uint64_t size, int order, uint64_t bid)
155 {
156 uint32_t hi = bid >> 32;
157 uint32_t lo = bid & 0xFFFFFFFF;
158 uint32_t extra = rand() % 0xFFFFFFFF;
159 // FIPS zeroization audit 20191117: this memset is not security related.
160 memset(&ondisk, 0, sizeof(ondisk));
161
162 memcpy(&ondisk.text, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT));
163 memcpy(&ondisk.signature, RBD_HEADER_SIGNATURE,
164 sizeof(RBD_HEADER_SIGNATURE));
165 memcpy(&ondisk.version, RBD_HEADER_VERSION, sizeof(RBD_HEADER_VERSION));
166
167 snprintf(ondisk.block_name, sizeof(ondisk.block_name), "rb.%x.%x.%x",
168 hi, lo, extra);
169
170 ondisk.image_size = size;
171 ondisk.options.order = order;
172 ondisk.options.crypt_type = RBD_CRYPT_NONE;
173 ondisk.options.comp_type = RBD_COMP_NONE;
174 ondisk.snap_seq = 0;
175 ondisk.snap_count = 0;
176 ondisk.reserved = 0;
177 ondisk.snap_names_len = 0;
178 }
179
180 void image_info(ImageCtx *ictx, image_info_t& info, size_t infosize)
181 {
182 int obj_order = ictx->order;
183 {
184 std::shared_lock locker{ictx->image_lock};
185 info.size = ictx->get_area_size(io::ImageArea::DATA);
186 }
187 info.obj_size = 1ULL << obj_order;
188 info.num_objs = Striper::get_num_objects(ictx->layout, info.size);
189 info.order = obj_order;
190 strncpy(info.block_name_prefix, ictx->object_prefix.c_str(),
191 RBD_MAX_BLOCK_NAME_SIZE);
192 info.block_name_prefix[RBD_MAX_BLOCK_NAME_SIZE - 1] = '\0';
193
194 // clear deprecated fields
195 info.parent_pool = -1L;
196 info.parent_name[0] = '\0';
197 }
198
// Extracts the object number from a data object name.
//
// The name is expected to be "<object_prefix><separator><hex number>": the
// prefix plus one separator character are skipped and the remainder is
// parsed as a hexadecimal number.
//
// Returns the parsed number, or 0 when nothing can be parsed (for example
// when `oid` is not longer than the prefix). The result is initialized
// up front because a stream that is already at end-of-input fails before
// the extraction writes to its target, which previously left the return
// value indeterminate.
uint64_t oid_to_object_no(const std::string& oid,
                          const std::string& object_prefix)
{
  std::istringstream iss(oid);
  // skip object prefix and separator
  iss.ignore(object_prefix.length() + 1);
  uint64_t num = 0;
  iss >> std::hex >> num;
  return num;
}
208
209 int read_header_bl(IoCtx& io_ctx, const string& header_oid,
210 bufferlist& header, uint64_t *ver)
211 {
212 int r;
213 uint64_t off = 0;
214 #define READ_SIZE 4096
215 do {
216 bufferlist bl;
217 r = io_ctx.read(header_oid, bl, READ_SIZE, off);
218 if (r < 0)
219 return r;
220 header.claim_append(bl);
221 off += r;
222 } while (r == READ_SIZE);
223
224 static_assert(sizeof(RBD_HEADER_TEXT) == sizeof(RBD_MIGRATE_HEADER_TEXT),
225 "length of rbd headers must be the same");
226
227 if (header.length() < sizeof(RBD_HEADER_TEXT) ||
228 (memcmp(RBD_HEADER_TEXT, header.c_str(),
229 sizeof(RBD_HEADER_TEXT)) != 0 &&
230 memcmp(RBD_MIGRATE_HEADER_TEXT, header.c_str(),
231 sizeof(RBD_MIGRATE_HEADER_TEXT)) != 0)) {
232 CephContext *cct = (CephContext *)io_ctx.cct();
233 lderr(cct) << "unrecognized header format" << dendl;
234 return -ENXIO;
235 }
236
237 if (ver)
238 *ver = io_ctx.get_last_version();
239
240 return 0;
241 }
242
243 int read_header(IoCtx& io_ctx, const string& header_oid,
244 struct rbd_obj_header_ondisk *header, uint64_t *ver)
245 {
246 bufferlist header_bl;
247 int r = read_header_bl(io_ctx, header_oid, header_bl, ver);
248 if (r < 0)
249 return r;
250 if (header_bl.length() < (int)sizeof(*header))
251 return -EIO;
252 memcpy(header, header_bl.c_str(), sizeof(*header));
253
254 return 0;
255 }
256
257 int tmap_set(IoCtx& io_ctx, const string& imgname)
258 {
259 bufferlist cmdbl, emptybl;
260 __u8 c = CEPH_OSD_TMAP_SET;
261 encode(c, cmdbl);
262 encode(imgname, cmdbl);
263 encode(emptybl, cmdbl);
264 return io_ctx.tmap_update(RBD_DIRECTORY, cmdbl);
265 }
266
267 int tmap_rm(IoCtx& io_ctx, const string& imgname)
268 {
269 bufferlist cmdbl;
270 __u8 c = CEPH_OSD_TMAP_RM;
271 encode(c, cmdbl);
272 encode(imgname, cmdbl);
273 return io_ctx.tmap_update(RBD_DIRECTORY, cmdbl);
274 }
275
276 typedef boost::variant<std::string,uint64_t> image_option_value_t;
277 typedef std::map<int,image_option_value_t> image_options_t;
278 typedef std::shared_ptr<image_options_t> image_options_ref;
279
280 enum image_option_type_t {
281 STR,
282 UINT64,
283 };
284
285 const std::map<int, image_option_type_t> IMAGE_OPTIONS_TYPE_MAPPING = {
286 {RBD_IMAGE_OPTION_FORMAT, UINT64},
287 {RBD_IMAGE_OPTION_FEATURES, UINT64},
288 {RBD_IMAGE_OPTION_ORDER, UINT64},
289 {RBD_IMAGE_OPTION_STRIPE_UNIT, UINT64},
290 {RBD_IMAGE_OPTION_STRIPE_COUNT, UINT64},
291 {RBD_IMAGE_OPTION_JOURNAL_ORDER, UINT64},
292 {RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH, UINT64},
293 {RBD_IMAGE_OPTION_JOURNAL_POOL, STR},
294 {RBD_IMAGE_OPTION_FEATURES_SET, UINT64},
295 {RBD_IMAGE_OPTION_FEATURES_CLEAR, UINT64},
296 {RBD_IMAGE_OPTION_DATA_POOL, STR},
297 {RBD_IMAGE_OPTION_FLATTEN, UINT64},
298 {RBD_IMAGE_OPTION_CLONE_FORMAT, UINT64},
299 {RBD_IMAGE_OPTION_MIRROR_IMAGE_MODE, UINT64},
300 };
301
302 std::string image_option_name(int optname) {
303 switch (optname) {
304 case RBD_IMAGE_OPTION_FORMAT:
305 return "format";
306 case RBD_IMAGE_OPTION_FEATURES:
307 return "features";
308 case RBD_IMAGE_OPTION_ORDER:
309 return "order";
310 case RBD_IMAGE_OPTION_STRIPE_UNIT:
311 return "stripe_unit";
312 case RBD_IMAGE_OPTION_STRIPE_COUNT:
313 return "stripe_count";
314 case RBD_IMAGE_OPTION_JOURNAL_ORDER:
315 return "journal_order";
316 case RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH:
317 return "journal_splay_width";
318 case RBD_IMAGE_OPTION_JOURNAL_POOL:
319 return "journal_pool";
320 case RBD_IMAGE_OPTION_FEATURES_SET:
321 return "features_set";
322 case RBD_IMAGE_OPTION_FEATURES_CLEAR:
323 return "features_clear";
324 case RBD_IMAGE_OPTION_DATA_POOL:
325 return "data_pool";
326 case RBD_IMAGE_OPTION_FLATTEN:
327 return "flatten";
328 case RBD_IMAGE_OPTION_CLONE_FORMAT:
329 return "clone_format";
330 case RBD_IMAGE_OPTION_MIRROR_IMAGE_MODE:
331 return "mirror_image_mode";
332 default:
333 return "unknown (" + stringify(optname) + ")";
334 }
335 }
336
337 void image_options_create(rbd_image_options_t* opts)
338 {
339 image_options_ref* opts_ = new image_options_ref(new image_options_t());
340
341 *opts = static_cast<rbd_image_options_t>(opts_);
342 }
343
344 void image_options_create_ref(rbd_image_options_t* opts,
345 rbd_image_options_t orig)
346 {
347 image_options_ref* orig_ = static_cast<image_options_ref*>(orig);
348 image_options_ref* opts_ = new image_options_ref(*orig_);
349
350 *opts = static_cast<rbd_image_options_t>(opts_);
351 }
352
353 void image_options_copy(rbd_image_options_t* opts,
354 const ImageOptions &orig)
355 {
356 image_options_ref* opts_ = new image_options_ref(new image_options_t());
357
358 *opts = static_cast<rbd_image_options_t>(opts_);
359
360 std::string str_val;
361 uint64_t uint64_val;
362 for (auto &i : IMAGE_OPTIONS_TYPE_MAPPING) {
363 switch (i.second) {
364 case STR:
365 if (orig.get(i.first, &str_val) == 0) {
366 image_options_set(*opts, i.first, str_val);
367 }
368 continue;
369 case UINT64:
370 if (orig.get(i.first, &uint64_val) == 0) {
371 image_options_set(*opts, i.first, uint64_val);
372 }
373 continue;
374 }
375 }
376 }
377
378 void image_options_destroy(rbd_image_options_t opts)
379 {
380 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
381
382 delete opts_;
383 }
384
385 int image_options_set(rbd_image_options_t opts, int optname,
386 const std::string& optval)
387 {
388 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
389
390 std::map<int, image_option_type_t>::const_iterator i =
391 IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
392
393 if (i == IMAGE_OPTIONS_TYPE_MAPPING.end() || i->second != STR) {
394 return -EINVAL;
395 }
396
397 (*opts_->get())[optname] = optval;
398 return 0;
399 }
400
401 int image_options_set(rbd_image_options_t opts, int optname, uint64_t optval)
402 {
403 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
404
405 std::map<int, image_option_type_t>::const_iterator i =
406 IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
407
408 if (i == IMAGE_OPTIONS_TYPE_MAPPING.end() || i->second != UINT64) {
409 return -EINVAL;
410 }
411
412 (*opts_->get())[optname] = optval;
413 return 0;
414 }
415
416 int image_options_get(rbd_image_options_t opts, int optname,
417 std::string* optval)
418 {
419 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
420
421 std::map<int, image_option_type_t>::const_iterator i =
422 IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
423
424 if (i == IMAGE_OPTIONS_TYPE_MAPPING.end() || i->second != STR) {
425 return -EINVAL;
426 }
427
428 image_options_t::const_iterator j = (*opts_)->find(optname);
429
430 if (j == (*opts_)->end()) {
431 return -ENOENT;
432 }
433
434 *optval = boost::get<std::string>(j->second);
435 return 0;
436 }
437
438 int image_options_get(rbd_image_options_t opts, int optname, uint64_t* optval)
439 {
440 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
441
442 std::map<int, image_option_type_t>::const_iterator i =
443 IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
444
445 if (i == IMAGE_OPTIONS_TYPE_MAPPING.end() || i->second != UINT64) {
446 return -EINVAL;
447 }
448
449 image_options_t::const_iterator j = (*opts_)->find(optname);
450
451 if (j == (*opts_)->end()) {
452 return -ENOENT;
453 }
454
455 *optval = boost::get<uint64_t>(j->second);
456 return 0;
457 }
458
459 int image_options_is_set(rbd_image_options_t opts, int optname,
460 bool* is_set)
461 {
462 if (IMAGE_OPTIONS_TYPE_MAPPING.find(optname) ==
463 IMAGE_OPTIONS_TYPE_MAPPING.end()) {
464 return -EINVAL;
465 }
466
467 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
468 *is_set = ((*opts_)->find(optname) != (*opts_)->end());
469 return 0;
470 }
471
472 int image_options_unset(rbd_image_options_t opts, int optname)
473 {
474 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
475
476 std::map<int, image_option_type_t>::const_iterator i =
477 IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
478
479 if (i == IMAGE_OPTIONS_TYPE_MAPPING.end()) {
480 ceph_assert((*opts_)->find(optname) == (*opts_)->end());
481 return -EINVAL;
482 }
483
484 image_options_t::const_iterator j = (*opts_)->find(optname);
485
486 if (j == (*opts_)->end()) {
487 return -ENOENT;
488 }
489
490 (*opts_)->erase(j);
491 return 0;
492 }
493
494 void image_options_clear(rbd_image_options_t opts)
495 {
496 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
497
498 (*opts_)->clear();
499 }
500
501 bool image_options_is_empty(rbd_image_options_t opts)
502 {
503 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
504
505 return (*opts_)->empty();
506 }
507
508 int create_v1(IoCtx& io_ctx, const char *imgname, uint64_t size, int order)
509 {
510 CephContext *cct = (CephContext *)io_ctx.cct();
511
512 ldout(cct, 20) << __func__ << " " << &io_ctx << " name = " << imgname
513 << " size = " << size << " order = " << order << dendl;
514 int r = validate_pool(io_ctx, cct);
515 if (r < 0) {
516 return r;
517 }
518
519 if (!io_ctx.get_namespace().empty()) {
520 lderr(cct) << "attempting to add v1 image to namespace" << dendl;
521 return -EINVAL;
522 }
523
524 ldout(cct, 2) << "adding rbd image to directory..." << dendl;
525 r = tmap_set(io_ctx, imgname);
526 if (r < 0) {
527 lderr(cct) << "error adding image to directory: " << cpp_strerror(r)
528 << dendl;
529 return r;
530 }
531
532 Rados rados(io_ctx);
533 uint64_t bid = rados.get_instance_id();
534
535 ldout(cct, 2) << "creating rbd image..." << dendl;
536 struct rbd_obj_header_ondisk header;
537 init_rbd_header(header, size, order, bid);
538
539 bufferlist bl;
540 bl.append((const char *)&header, sizeof(header));
541
542 string header_oid = util::old_header_name(imgname);
543 r = io_ctx.write(header_oid, bl, bl.length(), 0);
544 if (r < 0) {
545 lderr(cct) << "Error writing image header: " << cpp_strerror(r)
546 << dendl;
547 int remove_r = tmap_rm(io_ctx, imgname);
548 if (remove_r < 0) {
549 lderr(cct) << "Could not remove image from directory after "
550 << "header creation failed: "
551 << cpp_strerror(remove_r) << dendl;
552 }
553 return r;
554 }
555
556 ldout(cct, 2) << "done." << dendl;
557 return 0;
558 }
559
560 int create(librados::IoCtx& io_ctx, const char *imgname, uint64_t size,
561 int *order)
562 {
563 uint64_t order_ = *order;
564 ImageOptions opts;
565
566 int r = opts.set(RBD_IMAGE_OPTION_ORDER, order_);
567 ceph_assert(r == 0);
568
569 r = create(io_ctx, imgname, "", size, opts, "", "", false);
570
571 int r1 = opts.get(RBD_IMAGE_OPTION_ORDER, &order_);
572 ceph_assert(r1 == 0);
573 *order = order_;
574
575 return r;
576 }
577
578 int create(IoCtx& io_ctx, const char *imgname, uint64_t size,
579 bool old_format, uint64_t features, int *order,
580 uint64_t stripe_unit, uint64_t stripe_count)
581 {
582 if (!order)
583 return -EINVAL;
584
585 uint64_t order_ = *order;
586 uint64_t format = old_format ? 1 : 2;
587 ImageOptions opts;
588 int r;
589
590 r = opts.set(RBD_IMAGE_OPTION_FORMAT, format);
591 ceph_assert(r == 0);
592 r = opts.set(RBD_IMAGE_OPTION_FEATURES, features);
593 ceph_assert(r == 0);
594 r = opts.set(RBD_IMAGE_OPTION_ORDER, order_);
595 ceph_assert(r == 0);
596 r = opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit);
597 ceph_assert(r == 0);
598 r = opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count);
599 ceph_assert(r == 0);
600
601 r = create(io_ctx, imgname, "", size, opts, "", "", false);
602
603 int r1 = opts.get(RBD_IMAGE_OPTION_ORDER, &order_);
604 ceph_assert(r1 == 0);
605 *order = order_;
606
607 return r;
608 }
609
610 int create(IoCtx& io_ctx, const std::string &image_name,
611 const std::string &image_id, uint64_t size,
612 ImageOptions& opts,
613 const std::string &non_primary_global_image_id,
614 const std::string &primary_mirror_uuid,
615 bool skip_mirror_enable)
616 {
617 std::string id(image_id);
618 if (id.empty()) {
619 id = util::generate_image_id(io_ctx);
620 }
621
622 CephContext *cct = (CephContext *)io_ctx.cct();
623 uint64_t option;
624 if (opts.get(RBD_IMAGE_OPTION_FLATTEN, &option) == 0) {
625 lderr(cct) << "create does not support 'flatten' image option" << dendl;
626 return -EINVAL;
627 }
628 if (opts.get(RBD_IMAGE_OPTION_CLONE_FORMAT, &option) == 0) {
629 lderr(cct) << "create does not support 'clone_format' image option"
630 << dendl;
631 return -EINVAL;
632 }
633
634 ldout(cct, 10) << __func__ << " name=" << image_name << ", "
635 << "id= " << id << ", "
636 << "size=" << size << ", opts=" << opts << dendl;
637
638 uint64_t format;
639 if (opts.get(RBD_IMAGE_OPTION_FORMAT, &format) != 0)
640 format = cct->_conf.get_val<uint64_t>("rbd_default_format");
641 bool old_format = format == 1;
642
643 // make sure it doesn't already exist, in either format
644 int r = detect_format(io_ctx, image_name, NULL, NULL);
645 if (r != -ENOENT) {
646 if (r) {
647 lderr(cct) << "Could not tell if " << image_name << " already exists"
648 << dendl;
649 return r;
650 }
651 lderr(cct) << "rbd image " << image_name << " already exists" << dendl;
652 return -EEXIST;
653 }
654
655 uint64_t order = 0;
656 if (opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0 || order == 0) {
657 order = cct->_conf.get_val<uint64_t>("rbd_default_order");
658 }
659 r = image::CreateRequest<>::validate_order(cct, order);
660 if (r < 0) {
661 return r;
662 }
663
664 if (old_format) {
665 if ( !getenv("RBD_FORCE_ALLOW_V1") ) {
666 lderr(cct) << "Format 1 image creation unsupported. " << dendl;
667 return -EINVAL;
668 }
669 lderr(cct) << "Forced V1 image creation. " << dendl;
670 r = create_v1(io_ctx, image_name.c_str(), size, order);
671 } else {
672 AsioEngine asio_engine(io_ctx);
673
674 ConfigProxy config{cct->_conf};
675 api::Config<>::apply_pool_overrides(io_ctx, &config);
676
677 uint32_t create_flags = 0U;
678 uint64_t mirror_image_mode = RBD_MIRROR_IMAGE_MODE_JOURNAL;
679 if (skip_mirror_enable) {
680 create_flags = image::CREATE_FLAG_SKIP_MIRROR_ENABLE;
681 } else if (opts.get(RBD_IMAGE_OPTION_MIRROR_IMAGE_MODE,
682 &mirror_image_mode) == 0) {
683 create_flags = image::CREATE_FLAG_FORCE_MIRROR_ENABLE;
684 }
685
686 C_SaferCond cond;
687 image::CreateRequest<> *req = image::CreateRequest<>::create(
688 config, io_ctx, image_name, id, size, opts, create_flags,
689 static_cast<cls::rbd::MirrorImageMode>(mirror_image_mode),
690 non_primary_global_image_id, primary_mirror_uuid,
691 asio_engine.get_work_queue(), &cond);
692 req->send();
693
694 r = cond.wait();
695 }
696
697 int r1 = opts.set(RBD_IMAGE_OPTION_ORDER, order);
698 ceph_assert(r1 == 0);
699
700 return r;
701 }
702
703 /*
704 * Parent may be in different pool, hence different IoCtx
705 */
706 int clone(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name,
707 IoCtx& c_ioctx, const char *c_name,
708 uint64_t features, int *c_order,
709 uint64_t stripe_unit, int stripe_count)
710 {
711 uint64_t order = *c_order;
712
713 ImageOptions opts;
714 opts.set(RBD_IMAGE_OPTION_FEATURES, features);
715 opts.set(RBD_IMAGE_OPTION_ORDER, order);
716 opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit);
717 opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count);
718
719 int r = clone(p_ioctx, nullptr, p_name, CEPH_NOSNAP, p_snap_name,
720 c_ioctx, nullptr, c_name, opts, "", "");
721 opts.get(RBD_IMAGE_OPTION_ORDER, &order);
722 *c_order = order;
723 return r;
724 }
725
726 int clone(IoCtx& p_ioctx, const char *p_id, const char *p_name,
727 uint64_t p_snap_id, const char *p_snap_name, IoCtx& c_ioctx,
728 const char *c_id, const char *c_name, ImageOptions& c_opts,
729 const std::string &non_primary_global_image_id,
730 const std::string &primary_mirror_uuid)
731 {
732 CephContext *cct = (CephContext *)p_ioctx.cct();
733 ldout(cct, 10) << __func__
734 << " p_id=" << (p_id ?: "")
735 << ", p_name=" << (p_name ?: "")
736 << ", p_snap_id=" << p_snap_id
737 << ", p_snap_name=" << (p_snap_name ?: "")
738 << ", c_id=" << (c_id ?: "")
739 << ", c_name=" << c_name
740 << ", c_opts=" << c_opts
741 << ", non_primary_global_image_id=" << non_primary_global_image_id
742 << ", primary_mirror_uuid=" << primary_mirror_uuid
743 << dendl;
744
745 if (((p_id == nullptr) ^ (p_name == nullptr)) == 0) {
746 lderr(cct) << "must specify either parent image id or parent image name"
747 << dendl;
748 return -EINVAL;
749 }
750 if (((p_snap_id == CEPH_NOSNAP) ^ (p_snap_name == nullptr)) == 0) {
751 lderr(cct) << "must specify either parent snap id or parent snap name"
752 << dendl;
753 return -EINVAL;
754 }
755
756 uint64_t flatten;
757 if (c_opts.get(RBD_IMAGE_OPTION_FLATTEN, &flatten) == 0) {
758 lderr(cct) << "clone does not support 'flatten' image option" << dendl;
759 return -EINVAL;
760 }
761
762 int r;
763 std::string parent_id;
764 if (p_id == nullptr) {
765 r = cls_client::dir_get_id(&p_ioctx, RBD_DIRECTORY, p_name,
766 &parent_id);
767 if (r < 0) {
768 if (r != -ENOENT) {
769 lderr(cct) << "failed to retrieve parent image id: "
770 << cpp_strerror(r) << dendl;
771 }
772 return r;
773 }
774 } else {
775 parent_id = p_id;
776 }
777
778 std::string clone_id;
779 if (c_id == nullptr) {
780 clone_id = util::generate_image_id(c_ioctx);
781 } else {
782 clone_id = c_id;
783 }
784
785 ldout(cct, 10) << __func__ << " parent_id=" << parent_id
786 << ", clone_id=" << clone_id << dendl;
787
788 ConfigProxy config{reinterpret_cast<CephContext *>(c_ioctx.cct())->_conf};
789 api::Config<>::apply_pool_overrides(c_ioctx, &config);
790
791 AsioEngine asio_engine(p_ioctx);
792
793 C_SaferCond cond;
794 auto *req = image::CloneRequest<>::create(
795 config, p_ioctx, parent_id, (p_snap_name ?: ""),
796 {cls::rbd::UserSnapshotNamespace{}}, p_snap_id, c_ioctx, c_name,
797 clone_id, c_opts, cls::rbd::MIRROR_IMAGE_MODE_JOURNAL,
798 non_primary_global_image_id, primary_mirror_uuid,
799 asio_engine.get_work_queue(), &cond);
800 req->send();
801
802 r = cond.wait();
803 if (r < 0) {
804 return r;
805 }
806
807 return 0;
808 }
809
810 int rename(IoCtx& io_ctx, const char *srcname, const char *dstname)
811 {
812 CephContext *cct = (CephContext *)io_ctx.cct();
813 ldout(cct, 20) << "rename " << &io_ctx << " " << srcname << " -> "
814 << dstname << dendl;
815
816 ImageCtx *ictx = new ImageCtx(srcname, "", "", io_ctx, false);
817 int r = ictx->state->open(0);
818 if (r < 0) {
819 lderr(cct) << "error opening source image: " << cpp_strerror(r) << dendl;
820 return r;
821 }
822 BOOST_SCOPE_EXIT((ictx)) {
823 ictx->state->close();
824 } BOOST_SCOPE_EXIT_END
825
826 return ictx->operations->rename(dstname);
827 }
828
829 int info(ImageCtx *ictx, image_info_t& info, size_t infosize)
830 {
831 ldout(ictx->cct, 20) << "info " << ictx << dendl;
832
833 int r = ictx->state->refresh_if_required();
834 if (r < 0)
835 return r;
836
837 image_info(ictx, info, infosize);
838 return 0;
839 }
840
841 int get_old_format(ImageCtx *ictx, uint8_t *old)
842 {
843 int r = ictx->state->refresh_if_required();
844 if (r < 0)
845 return r;
846 *old = ictx->old_format;
847 return 0;
848 }
849
850 int get_size(ImageCtx *ictx, uint64_t *size)
851 {
852 int r = ictx->state->refresh_if_required();
853 if (r < 0)
854 return r;
855 std::shared_lock l2{ictx->image_lock};
856 *size = ictx->get_area_size(io::ImageArea::DATA);
857 return 0;
858 }
859
860 int get_features(ImageCtx *ictx, uint64_t *features)
861 {
862 int r = ictx->state->refresh_if_required();
863 if (r < 0)
864 return r;
865 std::shared_lock l{ictx->image_lock};
866 *features = ictx->features;
867 return 0;
868 }
869
870 int get_overlap(ImageCtx *ictx, uint64_t *overlap)
871 {
872 int r = ictx->state->refresh_if_required();
873 if (r < 0)
874 return r;
875
876 std::shared_lock image_locker{ictx->image_lock};
877 uint64_t raw_overlap;
878 r = ictx->get_parent_overlap(ictx->snap_id, &raw_overlap);
879 if (r < 0) {
880 return r;
881 }
882 auto _overlap = ictx->reduce_parent_overlap(raw_overlap, false);
883 *overlap = (_overlap.second == io::ImageArea::DATA ? _overlap.first : 0);
884 return 0;
885 }
886
887 int get_flags(ImageCtx *ictx, uint64_t *flags)
888 {
889 int r = ictx->state->refresh_if_required();
890 if (r < 0) {
891 return r;
892 }
893
894 std::shared_lock l2{ictx->image_lock};
895 return ictx->get_flags(ictx->snap_id, flags);
896 }
897
898 int set_image_notification(ImageCtx *ictx, int fd, int type)
899 {
900 CephContext *cct = ictx->cct;
901 ldout(cct, 20) << __func__ << " " << ictx << " fd " << fd << " type" << type << dendl;
902
903 int r = ictx->state->refresh_if_required();
904 if (r < 0) {
905 return r;
906 }
907
908 if (ictx->event_socket.is_valid())
909 return -EINVAL;
910 return ictx->event_socket.init(fd, type);
911 }
912
913 int is_exclusive_lock_owner(ImageCtx *ictx, bool *is_owner)
914 {
915 CephContext *cct = ictx->cct;
916 ldout(cct, 20) << __func__ << ": ictx=" << ictx << dendl;
917 *is_owner = false;
918
919 std::shared_lock owner_locker{ictx->owner_lock};
920 if (ictx->exclusive_lock == nullptr) {
921 return 0;
922 }
923
924 // might have been blocklisted by peer -- ensure we still own
925 // the lock by pinging the OSD
926 int r = ictx->exclusive_lock->assert_header_locked();
927 if (r == -EBUSY || r == -ENOENT) {
928 return 0;
929 } else if (r < 0) {
930 return r;
931 }
932
933 *is_owner = true;
934 return 0;
935 }
936
937 int lock_acquire(ImageCtx *ictx, rbd_lock_mode_t lock_mode)
938 {
939 CephContext *cct = ictx->cct;
940 ldout(cct, 20) << __func__ << ": ictx=" << ictx << ", "
941 << "lock_mode=" << lock_mode << dendl;
942
943 if (lock_mode != RBD_LOCK_MODE_EXCLUSIVE) {
944 return -EOPNOTSUPP;
945 }
946
947 C_SaferCond lock_ctx;
948 {
949 std::unique_lock l{ictx->owner_lock};
950
951 if (ictx->exclusive_lock == nullptr) {
952 lderr(cct) << "exclusive-lock feature is not enabled" << dendl;
953 return -EINVAL;
954 }
955
956 if (ictx->get_exclusive_lock_policy()->may_auto_request_lock()) {
957 ictx->set_exclusive_lock_policy(
958 new exclusive_lock::StandardPolicy(ictx));
959 }
960
961 if (ictx->exclusive_lock->is_lock_owner()) {
962 return 0;
963 }
964
965 ictx->exclusive_lock->acquire_lock(&lock_ctx);
966 }
967
968 int r = lock_ctx.wait();
969 if (r < 0) {
970 lderr(cct) << "failed to request exclusive lock: " << cpp_strerror(r)
971 << dendl;
972 return r;
973 }
974
975 std::shared_lock l{ictx->owner_lock};
976 if (ictx->exclusive_lock == nullptr) {
977 return -EINVAL;
978 } else if (!ictx->exclusive_lock->is_lock_owner()) {
979 lderr(cct) << "failed to acquire exclusive lock" << dendl;
980 return ictx->exclusive_lock->get_unlocked_op_error();
981 }
982
983 return 0;
984 }
985
986 int lock_release(ImageCtx *ictx)
987 {
988 CephContext *cct = ictx->cct;
989 ldout(cct, 20) << __func__ << ": ictx=" << ictx << dendl;
990
991 C_SaferCond lock_ctx;
992 {
993 std::unique_lock l{ictx->owner_lock};
994
995 if (ictx->exclusive_lock == nullptr ||
996 !ictx->exclusive_lock->is_lock_owner()) {
997 lderr(cct) << "not exclusive lock owner" << dendl;
998 return -EINVAL;
999 }
1000
1001 ictx->exclusive_lock->release_lock(&lock_ctx);
1002 }
1003
1004 int r = lock_ctx.wait();
1005 if (r < 0) {
1006 lderr(cct) << "failed to release exclusive lock: " << cpp_strerror(r)
1007 << dendl;
1008 return r;
1009 }
1010 return 0;
1011 }
1012
1013 int lock_get_owners(ImageCtx *ictx, rbd_lock_mode_t *lock_mode,
1014 std::list<std::string> *lock_owners)
1015 {
1016 CephContext *cct = ictx->cct;
1017 ldout(cct, 20) << __func__ << ": ictx=" << ictx << dendl;
1018
1019 managed_lock::Locker locker;
1020 C_SaferCond get_owner_ctx;
1021 {
1022 std::shared_lock owner_locker{ictx->owner_lock};
1023
1024 if (ictx->exclusive_lock == nullptr) {
1025 lderr(cct) << "exclusive-lock feature is not enabled" << dendl;
1026 return -EINVAL;
1027 }
1028
1029 ictx->exclusive_lock->get_locker(&locker, &get_owner_ctx);
1030 }
1031
1032 int r = get_owner_ctx.wait();
1033 if (r == -ENOENT) {
1034 return r;
1035 } else if (r < 0) {
1036 lderr(cct) << "failed to determine current lock owner: "
1037 << cpp_strerror(r) << dendl;
1038 return r;
1039 }
1040
1041 *lock_mode = RBD_LOCK_MODE_EXCLUSIVE;
1042 lock_owners->clear();
1043 lock_owners->emplace_back(locker.address);
1044 return 0;
1045 }
1046
1047 int lock_break(ImageCtx *ictx, rbd_lock_mode_t lock_mode,
1048 const std::string &lock_owner) {
1049 CephContext *cct = ictx->cct;
1050 ldout(cct, 20) << __func__ << ": ictx=" << ictx << ", "
1051 << "lock_mode=" << lock_mode << ", "
1052 << "lock_owner=" << lock_owner << dendl;
1053
1054 if (lock_mode != RBD_LOCK_MODE_EXCLUSIVE) {
1055 return -EOPNOTSUPP;
1056 }
1057
1058 if (ictx->read_only) {
1059 return -EROFS;
1060 }
1061
1062 managed_lock::Locker locker;
1063 C_SaferCond get_owner_ctx;
1064 {
1065 std::shared_lock l{ictx->owner_lock};
1066
1067 if (ictx->exclusive_lock == nullptr) {
1068 lderr(cct) << "exclusive-lock feature is not enabled" << dendl;
1069 return -EINVAL;
1070 }
1071
1072 ictx->exclusive_lock->get_locker(&locker, &get_owner_ctx);
1073 }
1074 int r = get_owner_ctx.wait();
1075 if (r == -ENOENT) {
1076 return r;
1077 } else if (r < 0) {
1078 lderr(cct) << "failed to determine current lock owner: "
1079 << cpp_strerror(r) << dendl;
1080 return r;
1081 }
1082
1083 if (locker.address != lock_owner) {
1084 return -EBUSY;
1085 }
1086
1087 C_SaferCond break_ctx;
1088 {
1089 std::shared_lock l{ictx->owner_lock};
1090
1091 if (ictx->exclusive_lock == nullptr) {
1092 lderr(cct) << "exclusive-lock feature is not enabled" << dendl;
1093 return -EINVAL;
1094 }
1095
1096 ictx->exclusive_lock->break_lock(locker, true, &break_ctx);
1097 }
1098 r = break_ctx.wait();
1099 if (r == -ENOENT) {
1100 return r;
1101 } else if (r < 0) {
1102 lderr(cct) << "failed to break lock: " << cpp_strerror(r) << dendl;
1103 return r;
1104 }
1105 return 0;
1106 }
1107
1108 int copy(ImageCtx *src, IoCtx& dest_md_ctx, const char *destname,
1109 ImageOptions& opts, ProgressContext &prog_ctx, size_t sparse_size)
1110 {
1111 CephContext *cct = (CephContext *)dest_md_ctx.cct();
1112 uint64_t option;
1113 if (opts.get(RBD_IMAGE_OPTION_FLATTEN, &option) == 0) {
1114 lderr(cct) << "copy does not support 'flatten' image option" << dendl;
1115 return -EINVAL;
1116 }
1117 if (opts.get(RBD_IMAGE_OPTION_CLONE_FORMAT, &option) == 0) {
1118 lderr(cct) << "copy does not support 'clone_format' image option"
1119 << dendl;
1120 return -EINVAL;
1121 }
1122
1123 ldout(cct, 20) << "copy " << src->name
1124 << (src->snap_name.length() ? "@" + src->snap_name : "")
1125 << " -> " << destname << " opts = " << opts << dendl;
1126
1127 src->image_lock.lock_shared();
1128 uint64_t features = src->features;
1129 uint64_t src_size = src->get_image_size(src->snap_id);
1130 src->image_lock.unlock_shared();
1131 uint64_t format = 2;
1132 if (opts.get(RBD_IMAGE_OPTION_FORMAT, &format) != 0) {
1133 opts.set(RBD_IMAGE_OPTION_FORMAT, format);
1134 }
1135 uint64_t stripe_unit = src->stripe_unit;
1136 if (opts.get(RBD_IMAGE_OPTION_STRIPE_UNIT, &stripe_unit) != 0) {
1137 opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit);
1138 }
1139 uint64_t stripe_count = src->stripe_count;
1140 if (opts.get(RBD_IMAGE_OPTION_STRIPE_COUNT, &stripe_count) != 0) {
1141 opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count);
1142 }
1143 uint64_t order = src->order;
1144 if (opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0) {
1145 opts.set(RBD_IMAGE_OPTION_ORDER, order);
1146 }
1147 if (opts.get(RBD_IMAGE_OPTION_FEATURES, &features) != 0) {
1148 opts.set(RBD_IMAGE_OPTION_FEATURES, features);
1149 }
1150 if (features & ~RBD_FEATURES_ALL) {
1151 lderr(cct) << "librbd does not support requested features" << dendl;
1152 return -ENOSYS;
1153 }
1154
1155 int r = create(dest_md_ctx, destname, "", src_size, opts, "", "", false);
1156 if (r < 0) {
1157 lderr(cct) << "header creation failed" << dendl;
1158 return r;
1159 }
1160 opts.set(RBD_IMAGE_OPTION_ORDER, static_cast<uint64_t>(order));
1161
1162 ImageCtx *dest = new librbd::ImageCtx(destname, "", nullptr, dest_md_ctx,
1163 false);
1164 r = dest->state->open(0);
1165 if (r < 0) {
1166 lderr(cct) << "failed to read newly created header" << dendl;
1167 return r;
1168 }
1169
1170 r = copy(src, dest, prog_ctx, sparse_size);
1171
1172 int close_r = dest->state->close();
1173 if (r == 0 && close_r < 0) {
1174 r = close_r;
1175 }
1176 return r;
1177 }
1178
1179 class C_CopyWrite : public Context {
1180 public:
1181 C_CopyWrite(bufferlist *bl, Context* ctx)
1182 : m_bl(bl), m_ctx(ctx) {}
1183 void finish(int r) override {
1184 delete m_bl;
1185 m_ctx->complete(r);
1186 }
1187 private:
1188 bufferlist *m_bl;
1189 Context *m_ctx;
1190 };
1191
1192 class C_CopyRead : public Context {
1193 public:
1194 C_CopyRead(SimpleThrottle *throttle, ImageCtx *dest, uint64_t offset,
1195 bufferlist *bl, size_t sparse_size)
1196 : m_throttle(throttle), m_dest(dest), m_offset(offset), m_bl(bl),
1197 m_sparse_size(sparse_size) {
1198 m_throttle->start_op();
1199 }
1200 void finish(int r) override {
1201 if (r < 0) {
1202 lderr(m_dest->cct) << "error reading from source image at offset "
1203 << m_offset << ": " << cpp_strerror(r) << dendl;
1204 delete m_bl;
1205 m_throttle->end_op(r);
1206 return;
1207 }
1208 ceph_assert(m_bl->length() == (size_t)r);
1209
1210 if (m_bl->is_zero()) {
1211 delete m_bl;
1212 m_throttle->end_op(r);
1213 return;
1214 }
1215
1216 if (!m_sparse_size) {
1217 m_sparse_size = (1 << m_dest->order);
1218 }
1219
1220 auto *throttle = m_throttle;
1221 auto *end_op_ctx = new LambdaContext([throttle](int r) {
1222 throttle->end_op(r);
1223 });
1224 auto gather_ctx = new C_Gather(m_dest->cct, end_op_ctx);
1225
1226 m_bl->rebuild(buffer::ptr_node::create(m_bl->length()));
1227 size_t write_offset = 0;
1228 size_t write_length = 0;
1229 size_t offset = 0;
1230 size_t length = m_bl->length();
1231 const auto& m_ptr = m_bl->front();
1232 while (offset < length) {
1233 if (util::calc_sparse_extent(m_ptr,
1234 m_sparse_size,
1235 length,
1236 &write_offset,
1237 &write_length,
1238 &offset)) {
1239 bufferlist *write_bl = new bufferlist();
1240 write_bl->push_back(
1241 buffer::ptr_node::create(m_ptr, write_offset, write_length));
1242 Context *ctx = new C_CopyWrite(write_bl, gather_ctx->new_sub());
1243 auto comp = io::AioCompletion::create(ctx);
1244
1245 // coordinate through AIO WQ to ensure lock is acquired if needed
1246 api::Io<>::aio_write(*m_dest, comp, m_offset + write_offset,
1247 write_length, std::move(*write_bl),
1248 LIBRADOS_OP_FLAG_FADVISE_DONTNEED,
1249 std::move(read_trace));
1250 write_offset = offset;
1251 write_length = 0;
1252 }
1253 }
1254 delete m_bl;
1255 ceph_assert(gather_ctx->get_sub_created_count() > 0);
1256 gather_ctx->activate();
1257 }
1258
1259 ZTracer::Trace read_trace;
1260
1261 private:
1262 SimpleThrottle *m_throttle;
1263 ImageCtx *m_dest;
1264 uint64_t m_offset;
1265 bufferlist *m_bl;
1266 size_t m_sparse_size;
1267 };
1268
1269 int copy(ImageCtx *src, ImageCtx *dest, ProgressContext &prog_ctx, size_t sparse_size)
1270 {
1271 src->image_lock.lock_shared();
1272 uint64_t src_size = src->get_image_size(src->snap_id);
1273 src->image_lock.unlock_shared();
1274
1275 dest->image_lock.lock_shared();
1276 uint64_t dest_size = dest->get_image_size(dest->snap_id);
1277 dest->image_lock.unlock_shared();
1278
1279 CephContext *cct = src->cct;
1280 if (dest_size < src_size) {
1281 lderr(cct) << " src size " << src_size << " > dest size "
1282 << dest_size << dendl;
1283 return -EINVAL;
1284 }
1285
1286 // ensure previous writes are visible to dest
1287 C_SaferCond flush_ctx;
1288 {
1289 auto aio_comp = io::AioCompletion::create_and_start(&flush_ctx, src,
1290 io::AIO_TYPE_FLUSH);
1291 auto req = io::ImageDispatchSpec::create_flush(
1292 *src, io::IMAGE_DISPATCH_LAYER_INTERNAL_START,
1293 aio_comp, io::FLUSH_SOURCE_INTERNAL, {});
1294 req->send();
1295 }
1296 int r = flush_ctx.wait();
1297 if (r < 0) {
1298 return r;
1299 }
1300
1301 C_SaferCond ctx;
1302 auto req = deep_copy::MetadataCopyRequest<>::create(
1303 src, dest, &ctx);
1304 req->send();
1305
1306 r = ctx.wait();
1307 if (r < 0) {
1308 lderr(cct) << "failed to copy metadata: " << cpp_strerror(r) << dendl;
1309 return r;
1310 }
1311
1312 ZTracer::Trace trace;
1313 if (src->blkin_trace_all) {
1314 trace.init("copy", &src->trace_endpoint);
1315 }
1316
1317 SimpleThrottle throttle(src->config.get_val<uint64_t>("rbd_concurrent_management_ops"), false);
1318 uint64_t period = src->get_stripe_period();
1319 unsigned fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL |
1320 LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
1321 uint64_t object_id = 0;
1322 for (uint64_t offset = 0; offset < src_size; offset += period) {
1323 if (throttle.pending_error()) {
1324 return throttle.wait_for_ret();
1325 }
1326
1327 {
1328 std::shared_lock image_locker{src->image_lock};
1329 if (src->object_map != nullptr) {
1330 bool skip = true;
1331 // each period is related to src->stripe_count objects, check them all
1332 for (uint64_t i=0; i < src->stripe_count; i++) {
1333 if (object_id < src->object_map->size() &&
1334 src->object_map->object_may_exist(object_id)) {
1335 skip = false;
1336 }
1337 ++object_id;
1338 }
1339
1340 if (skip) continue;
1341 } else {
1342 object_id += src->stripe_count;
1343 }
1344 }
1345
1346 uint64_t len = std::min(period, src_size - offset);
1347 bufferlist *bl = new bufferlist();
1348 auto ctx = new C_CopyRead(&throttle, dest, offset, bl, sparse_size);
1349 auto comp = io::AioCompletion::create_and_start<Context>(
1350 ctx, src, io::AIO_TYPE_READ);
1351 auto req = io::ImageDispatchSpec::create_read(
1352 *src, io::IMAGE_DISPATCH_LAYER_NONE, comp,
1353 {{offset, len}}, io::ImageArea::DATA, io::ReadResult{bl},
1354 src->get_data_io_context(), fadvise_flags, 0, trace);
1355
1356 ctx->read_trace = trace;
1357 req->send();
1358
1359 prog_ctx.update_progress(offset, src_size);
1360 }
1361
1362 r = throttle.wait_for_ret();
1363 if (r >= 0)
1364 prog_ctx.update_progress(src_size, src_size);
1365 return r;
1366 }
1367
1368 int list_lockers(ImageCtx *ictx,
1369 std::list<locker_t> *lockers,
1370 bool *exclusive,
1371 string *tag)
1372 {
1373 ldout(ictx->cct, 20) << "list_locks on image " << ictx << dendl;
1374
1375 int r = ictx->state->refresh_if_required();
1376 if (r < 0)
1377 return r;
1378
1379 std::shared_lock locker{ictx->image_lock};
1380 if (exclusive)
1381 *exclusive = ictx->exclusive_locked;
1382 if (tag)
1383 *tag = ictx->lock_tag;
1384 if (lockers) {
1385 lockers->clear();
1386 map<rados::cls::lock::locker_id_t,
1387 rados::cls::lock::locker_info_t>::const_iterator it;
1388 for (it = ictx->lockers.begin(); it != ictx->lockers.end(); ++it) {
1389 locker_t locker;
1390 locker.client = stringify(it->first.locker);
1391 locker.cookie = it->first.cookie;
1392 locker.address = it->second.addr.get_legacy_str();
1393 lockers->push_back(locker);
1394 }
1395 }
1396
1397 return 0;
1398 }
1399
1400 int lock(ImageCtx *ictx, bool exclusive, const string& cookie,
1401 const string& tag)
1402 {
1403 ldout(ictx->cct, 20) << "lock image " << ictx << " exclusive=" << exclusive
1404 << " cookie='" << cookie << "' tag='" << tag << "'"
1405 << dendl;
1406
1407 int r = ictx->state->refresh_if_required();
1408 if (r < 0)
1409 return r;
1410
1411 /**
1412 * If we wanted we could do something more intelligent, like local
1413 * checks that we think we will succeed. But for now, let's not
1414 * duplicate that code.
1415 */
1416 {
1417 std::shared_lock locker{ictx->image_lock};
1418 r = rados::cls::lock::lock(&ictx->md_ctx, ictx->header_oid, RBD_LOCK_NAME,
1419 exclusive ? ClsLockType::EXCLUSIVE : ClsLockType::SHARED,
1420 cookie, tag, "", utime_t(), 0);
1421 if (r < 0) {
1422 return r;
1423 }
1424 }
1425
1426 ictx->notify_update();
1427 return 0;
1428 }
1429
1430 int unlock(ImageCtx *ictx, const string& cookie)
1431 {
1432 ldout(ictx->cct, 20) << "unlock image " << ictx
1433 << " cookie='" << cookie << "'" << dendl;
1434
1435 int r = ictx->state->refresh_if_required();
1436 if (r < 0)
1437 return r;
1438
1439 {
1440 std::shared_lock locker{ictx->image_lock};
1441 r = rados::cls::lock::unlock(&ictx->md_ctx, ictx->header_oid,
1442 RBD_LOCK_NAME, cookie);
1443 if (r < 0) {
1444 return r;
1445 }
1446 }
1447
1448 ictx->notify_update();
1449 return 0;
1450 }
1451
1452 int break_lock(ImageCtx *ictx, const string& client,
1453 const string& cookie)
1454 {
1455 ldout(ictx->cct, 20) << "break_lock image " << ictx << " client='" << client
1456 << "' cookie='" << cookie << "'" << dendl;
1457
1458 int r = ictx->state->refresh_if_required();
1459 if (r < 0)
1460 return r;
1461
1462 entity_name_t lock_client;
1463 if (!lock_client.parse(client)) {
1464 lderr(ictx->cct) << "Unable to parse client '" << client
1465 << "'" << dendl;
1466 return -EINVAL;
1467 }
1468
1469 if (ictx->config.get_val<bool>("rbd_blocklist_on_break_lock")) {
1470 typedef std::map<rados::cls::lock::locker_id_t,
1471 rados::cls::lock::locker_info_t> Lockers;
1472 Lockers lockers;
1473 ClsLockType lock_type;
1474 std::string lock_tag;
1475 r = rados::cls::lock::get_lock_info(&ictx->md_ctx, ictx->header_oid,
1476 RBD_LOCK_NAME, &lockers, &lock_type,
1477 &lock_tag);
1478 if (r < 0) {
1479 lderr(ictx->cct) << "unable to retrieve lock info: " << cpp_strerror(r)
1480 << dendl;
1481 return r;
1482 }
1483
1484 std::string client_address;
1485 for (Lockers::iterator it = lockers.begin();
1486 it != lockers.end(); ++it) {
1487 if (it->first.locker == lock_client) {
1488 client_address = it->second.addr.get_legacy_str();
1489 break;
1490 }
1491 }
1492 if (client_address.empty()) {
1493 return -ENOENT;
1494 }
1495
1496 librados::Rados rados(ictx->md_ctx);
1497 r = rados.blocklist_add(
1498 client_address,
1499 ictx->config.get_val<uint64_t>("rbd_blocklist_expire_seconds"));
1500 if (r < 0) {
1501 lderr(ictx->cct) << "unable to blocklist client: " << cpp_strerror(r)
1502 << dendl;
1503 return r;
1504 }
1505 }
1506
1507 r = rados::cls::lock::break_lock(&ictx->md_ctx, ictx->header_oid,
1508 RBD_LOCK_NAME, cookie, lock_client);
1509 if (r < 0)
1510 return r;
1511 ictx->notify_update();
1512 return 0;
1513 }
1514
1515 void rbd_ctx_cb(completion_t cb, void *arg)
1516 {
1517 Context *ctx = reinterpret_cast<Context *>(arg);
1518 auto comp = reinterpret_cast<io::AioCompletion *>(cb);
1519 ctx->complete(comp->get_return_value());
1520 comp->release();
1521 }
1522
1523 int64_t read_iterate(ImageCtx *ictx, uint64_t off, uint64_t len,
1524 int (*cb)(uint64_t, size_t, const char *, void *),
1525 void *arg)
1526 {
1527 coarse_mono_time start_time;
1528 ceph::timespan elapsed;
1529
1530 ldout(ictx->cct, 20) << "read_iterate " << ictx << " off = " << off
1531 << " len = " << len << dendl;
1532
1533 int r = ictx->state->refresh_if_required();
1534 if (r < 0)
1535 return r;
1536
1537 uint64_t mylen = len;
1538 ictx->image_lock.lock_shared();
1539 r = clip_io(ictx, off, &mylen, io::ImageArea::DATA);
1540 ictx->image_lock.unlock_shared();
1541 if (r < 0)
1542 return r;
1543
1544 int64_t total_read = 0;
1545 uint64_t period = ictx->get_stripe_period();
1546 uint64_t left = mylen;
1547
1548 ZTracer::Trace trace;
1549 if (ictx->blkin_trace_all) {
1550 trace.init("read_iterate", &ictx->trace_endpoint);
1551 }
1552
1553 std::shared_lock owner_locker{ictx->owner_lock};
1554 start_time = coarse_mono_clock::now();
1555 while (left > 0) {
1556 uint64_t period_off = off - (off % period);
1557 uint64_t read_len = std::min(period_off + period - off, left);
1558
1559 bufferlist bl;
1560
1561 C_SaferCond ctx;
1562 auto c = io::AioCompletion::create_and_start(&ctx, ictx,
1563 io::AIO_TYPE_READ);
1564 auto req = io::ImageDispatchSpec::create_read(
1565 *ictx, io::IMAGE_DISPATCH_LAYER_NONE, c,
1566 {{off, read_len}}, io::ImageArea::DATA, io::ReadResult{&bl},
1567 ictx->get_data_io_context(), 0, 0, trace);
1568 req->send();
1569
1570 int ret = ctx.wait();
1571 if (ret < 0) {
1572 return ret;
1573 }
1574
1575 r = cb(total_read, ret, bl.c_str(), arg);
1576 if (r < 0) {
1577 return r;
1578 }
1579
1580 total_read += ret;
1581 left -= ret;
1582 off += ret;
1583 }
1584
1585 elapsed = coarse_mono_clock::now() - start_time;
1586 ictx->perfcounter->tinc(l_librbd_rd_latency, elapsed);
1587 ictx->perfcounter->inc(l_librbd_rd);
1588 ictx->perfcounter->inc(l_librbd_rd_bytes, mylen);
1589 return total_read;
1590 }
1591
1592 // validate extent against area size; clip to area size if necessary
1593 int clip_io(ImageCtx* ictx, uint64_t off, uint64_t* len, io::ImageArea area) {
1594 ceph_assert(ceph_mutex_is_locked(ictx->image_lock));
1595
1596 if (ictx->snap_id != CEPH_NOSNAP &&
1597 ictx->get_snap_info(ictx->snap_id) == nullptr) {
1598 return -ENOENT;
1599 }
1600
1601 // special-case "len == 0" requests: always valid
1602 if (*len == 0)
1603 return 0;
1604
1605 uint64_t area_size = ictx->get_area_size(area);
1606
1607 // can't start past end
1608 if (off >= area_size)
1609 return -EINVAL;
1610
1611 // clip requests that extend past end to just end
1612 if ((off + *len) > area_size)
1613 *len = (size_t)(area_size - off);
1614
1615 return 0;
1616 }
1617
1618 int invalidate_cache(ImageCtx *ictx)
1619 {
1620 CephContext *cct = ictx->cct;
1621 ldout(cct, 20) << "invalidate_cache " << ictx << dendl;
1622
1623 int r = ictx->state->refresh_if_required();
1624 if (r < 0) {
1625 return r;
1626 }
1627
1628 C_SaferCond ctx;
1629 {
1630 ictx->io_image_dispatcher->invalidate_cache(&ctx);
1631 }
1632 r = ctx.wait();
1633
1634 if (r < 0) {
1635 ldout(cct, 20) << "failed to invalidate image cache" << dendl;
1636 return r;
1637 }
1638
1639 ictx->perfcounter->inc(l_librbd_invalidate_cache);
1640
1641 // Delete writeback cache if it is not initialized
1642 if ((!ictx->exclusive_lock ||
1643 !ictx->exclusive_lock->is_lock_owner()) &&
1644 ictx->test_features(RBD_FEATURE_DIRTY_CACHE)) {
1645 C_SaferCond ctx3;
1646 ictx->plugin_registry->discard(&ctx3);
1647 r = ctx3.wait();
1648 }
1649 return r;
1650 }
1651
1652 int poll_io_events(ImageCtx *ictx, io::AioCompletion **comps, int numcomp)
1653 {
1654 if (numcomp <= 0)
1655 return -EINVAL;
1656 CephContext *cct = ictx->cct;
1657 ldout(cct, 20) << __func__ << " " << ictx << " numcomp = " << numcomp
1658 << dendl;
1659 int i = 0;
1660 while (i < numcomp && ictx->event_socket_completions.pop(comps[i])) {
1661 ++i;
1662 }
1663
1664 return i;
1665 }
1666
1667 int metadata_get(ImageCtx *ictx, const string &key, string *value)
1668 {
1669 CephContext *cct = ictx->cct;
1670 ldout(cct, 20) << "metadata_get " << ictx << " key=" << key << dendl;
1671
1672 int r = ictx->state->refresh_if_required();
1673 if (r < 0) {
1674 return r;
1675 }
1676
1677 return cls_client::metadata_get(&ictx->md_ctx, ictx->header_oid, key, value);
1678 }
1679
1680 int metadata_list(ImageCtx *ictx, const string &start, uint64_t max, map<string, bufferlist> *pairs)
1681 {
1682 CephContext *cct = ictx->cct;
1683 ldout(cct, 20) << "metadata_list " << ictx << dendl;
1684
1685 int r = ictx->state->refresh_if_required();
1686 if (r < 0) {
1687 return r;
1688 }
1689
1690 C_SaferCond ctx;
1691 auto req = image::GetMetadataRequest<>::create(
1692 ictx->md_ctx, ictx->header_oid, false, "", start, max, pairs, &ctx);
1693 req->send();
1694
1695 return ctx.wait();
1696 }
1697
1698 int list_watchers(ImageCtx *ictx,
1699 std::list<librbd::image_watcher_t> &watchers)
1700 {
1701 int r;
1702 std::string header_oid;
1703 std::list<obj_watch_t> obj_watchers;
1704
1705 if (ictx->old_format) {
1706 header_oid = util::old_header_name(ictx->name);
1707 } else {
1708 header_oid = util::header_name(ictx->id);
1709 }
1710
1711 r = ictx->md_ctx.list_watchers(header_oid, &obj_watchers);
1712 if (r < 0) {
1713 return r;
1714 }
1715
1716 watchers.clear();
1717 for (auto i = obj_watchers.begin(); i != obj_watchers.end(); ++i) {
1718 librbd::image_watcher_t watcher;
1719 watcher.addr = i->addr;
1720 watcher.id = i->watcher_id;
1721 watcher.cookie = i->cookie;
1722
1723 watchers.push_back(watcher);
1724 }
1725
1726 return 0;
1727 }
1728
1729 }
1730
1731 std::ostream &operator<<(std::ostream &os, const librbd::ImageOptions &opts) {
1732 os << "[";
1733
1734 const char *delimiter = "";
1735 for (auto &i : librbd::IMAGE_OPTIONS_TYPE_MAPPING) {
1736 if (i.second == librbd::STR) {
1737 std::string val;
1738 if (opts.get(i.first, &val) == 0) {
1739 os << delimiter << librbd::image_option_name(i.first) << "=" << val;
1740 delimiter = ", ";
1741 }
1742 } else if (i.second == librbd::UINT64) {
1743 uint64_t val;
1744 if (opts.get(i.first, &val) == 0) {
1745 os << delimiter << librbd::image_option_name(i.first) << "=" << val;
1746 delimiter = ", ";
1747 }
1748 }
1749 }
1750
1751 os << "]";
1752
1753 return os;
1754 }