]> git.proxmox.com Git - ceph.git/blame - ceph/src/librbd/internal.cc
d/control: depend on python3-yaml for ceph-mgr
[ceph.git] / ceph / src / librbd / internal.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3#include "include/int_types.h"
4
5#include <errno.h>
6#include <limits.h>
7
8#include "include/types.h"
9#include "include/uuid.h"
10#include "common/ceph_context.h"
11#include "common/dout.h"
12#include "common/errno.h"
13#include "common/Throttle.h"
14#include "common/event_socket.h"
11fdf7f2
TL
15#include "common/perf_counters.h"
16#include "osdc/Striper.h"
7c673cae
FG
17#include "include/stringify.h"
18
11fdf7f2 19#include "cls/lock/cls_lock_client.h"
7c673cae
FG
20#include "cls/rbd/cls_rbd.h"
21#include "cls/rbd/cls_rbd_types.h"
22#include "cls/rbd/cls_rbd_client.h"
23#include "cls/journal/cls_journal_types.h"
24#include "cls/journal/cls_journal_client.h"
25
26#include "librbd/ExclusiveLock.h"
27#include "librbd/ImageCtx.h"
28#include "librbd/ImageState.h"
29#include "librbd/internal.h"
30#include "librbd/Journal.h"
31#include "librbd/ObjectMap.h"
32#include "librbd/Operations.h"
33#include "librbd/Types.h"
34#include "librbd/Utils.h"
11fdf7f2 35#include "librbd/api/Config.h"
7c673cae
FG
36#include "librbd/api/Image.h"
37#include "librbd/exclusive_lock/AutomaticPolicy.h"
38#include "librbd/exclusive_lock/StandardPolicy.h"
9f95a23c 39#include "librbd/deep_copy/MetadataCopyRequest.h"
7c673cae
FG
40#include "librbd/image/CloneRequest.h"
41#include "librbd/image/CreateRequest.h"
9f95a23c 42#include "librbd/image/GetMetadataRequest.h"
1911f103 43#include "librbd/image/Types.h"
7c673cae
FG
44#include "librbd/io/AioCompletion.h"
45#include "librbd/io/ImageRequest.h"
46#include "librbd/io/ImageRequestWQ.h"
11fdf7f2 47#include "librbd/io/ObjectDispatcher.h"
7c673cae
FG
48#include "librbd/io/ObjectRequest.h"
49#include "librbd/io/ReadResult.h"
50#include "librbd/journal/Types.h"
51#include "librbd/managed_lock/Types.h"
52#include "librbd/mirror/EnableRequest.h"
53#include "librbd/operation/TrimRequest.h"
54
55#include "journal/Journaler.h"
56
57#include <boost/scope_exit.hpp>
58#include <boost/variant.hpp>
11fdf7f2 59#include "include/ceph_assert.h"
7c673cae
FG
60
61#define dout_subsys ceph_subsys_rbd
62#undef dout_prefix
63#define dout_prefix *_dout << "librbd: "
64
65#define rbd_howmany(x, y) (((x) + (y) - 1) / (y))
66
67using std::map;
68using std::pair;
69using std::set;
70using std::string;
71using std::vector;
72// list binds to list() here, so std::list is explicitly used below
73
74using ceph::bufferlist;
75using librados::snap_t;
76using librados::IoCtx;
77using librados::Rados;
78
79namespace librbd {
80
81namespace {
82
83int validate_pool(IoCtx &io_ctx, CephContext *cct) {
11fdf7f2 84 if (!cct->_conf.get_val<bool>("rbd_validate_pool")) {
7c673cae
FG
85 return 0;
86 }
87
88 int r = io_ctx.stat(RBD_DIRECTORY, NULL, NULL);
89 if (r == 0) {
90 return 0;
91 } else if (r < 0 && r != -ENOENT) {
92 lderr(cct) << "failed to stat RBD directory: " << cpp_strerror(r) << dendl;
93 return r;
94 }
95
96 // allocate a self-managed snapshot id if this a new pool to force
97 // self-managed snapshot mode
98 uint64_t snap_id;
99 r = io_ctx.selfmanaged_snap_create(&snap_id);
100 if (r == -EINVAL) {
101 lderr(cct) << "pool not configured for self-managed RBD snapshot support"
102 << dendl;
103 return r;
104 } else if (r < 0) {
105 lderr(cct) << "failed to allocate self-managed snapshot: "
106 << cpp_strerror(r) << dendl;
107 return r;
108 }
109
110 r = io_ctx.selfmanaged_snap_remove(snap_id);
111 if (r < 0) {
112 lderr(cct) << "failed to release self-managed snapshot " << snap_id
113 << ": " << cpp_strerror(r) << dendl;
114 }
115 return 0;
116}
117
7c673cae
FG
118} // anonymous namespace
119
120 int detect_format(IoCtx &io_ctx, const string &name,
121 bool *old_format, uint64_t *size)
122 {
123 CephContext *cct = (CephContext *)io_ctx.cct();
124 if (old_format)
125 *old_format = true;
126 int r = io_ctx.stat(util::old_header_name(name), size, NULL);
127 if (r == -ENOENT) {
128 if (old_format)
129 *old_format = false;
130 r = io_ctx.stat(util::id_obj_name(name), size, NULL);
131 if (r < 0)
132 return r;
133 } else if (r < 0) {
134 return r;
135 }
136
137 ldout(cct, 20) << "detect format of " << name << " : "
138 << (old_format ? (*old_format ? "old" : "new") :
139 "don't care") << dendl;
140 return 0;
141 }
142
143 bool has_parent(int64_t parent_pool_id, uint64_t off, uint64_t overlap)
144 {
145 return (parent_pool_id != -1 && off <= overlap);
146 }
147
148 void init_rbd_header(struct rbd_obj_header_ondisk& ondisk,
149 uint64_t size, int order, uint64_t bid)
150 {
151 uint32_t hi = bid >> 32;
152 uint32_t lo = bid & 0xFFFFFFFF;
153 uint32_t extra = rand() % 0xFFFFFFFF;
92f5a8d4 154 // FIPS zeroization audit 20191117: this memset is not security related.
7c673cae
FG
155 memset(&ondisk, 0, sizeof(ondisk));
156
157 memcpy(&ondisk.text, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT));
158 memcpy(&ondisk.signature, RBD_HEADER_SIGNATURE,
159 sizeof(RBD_HEADER_SIGNATURE));
160 memcpy(&ondisk.version, RBD_HEADER_VERSION, sizeof(RBD_HEADER_VERSION));
161
162 snprintf(ondisk.block_name, sizeof(ondisk.block_name), "rb.%x.%x.%x",
163 hi, lo, extra);
164
165 ondisk.image_size = size;
166 ondisk.options.order = order;
167 ondisk.options.crypt_type = RBD_CRYPT_NONE;
168 ondisk.options.comp_type = RBD_COMP_NONE;
169 ondisk.snap_seq = 0;
170 ondisk.snap_count = 0;
171 ondisk.reserved = 0;
172 ondisk.snap_names_len = 0;
173 }
174
175 void image_info(ImageCtx *ictx, image_info_t& info, size_t infosize)
176 {
177 int obj_order = ictx->order;
9f95a23c
TL
178 {
179 std::shared_lock locker{ictx->image_lock};
180 info.size = ictx->get_image_size(ictx->snap_id);
181 }
7c673cae
FG
182 info.obj_size = 1ULL << obj_order;
183 info.num_objs = Striper::get_num_objects(ictx->layout, info.size);
184 info.order = obj_order;
185 strncpy(info.block_name_prefix, ictx->object_prefix.c_str(),
186 RBD_MAX_BLOCK_NAME_SIZE);
187 info.block_name_prefix[RBD_MAX_BLOCK_NAME_SIZE - 1] = '\0';
188
189 // clear deprecated fields
190 info.parent_pool = -1L;
191 info.parent_name[0] = '\0';
192 }
193
194 uint64_t oid_to_object_no(const string& oid, const string& object_prefix)
195 {
196 istringstream iss(oid);
197 // skip object prefix and separator
198 iss.ignore(object_prefix.length() + 1);
199 uint64_t num;
200 iss >> std::hex >> num;
201 return num;
202 }
203
204 void trim_image(ImageCtx *ictx, uint64_t newsize, ProgressContext& prog_ctx)
205 {
9f95a23c 206 ceph_assert(ceph_mutex_is_locked(ictx->owner_lock));
11fdf7f2
TL
207 ceph_assert(ictx->exclusive_lock == nullptr ||
208 ictx->exclusive_lock->is_lock_owner());
7c673cae
FG
209
210 C_SaferCond ctx;
9f95a23c 211 ictx->image_lock.lock_shared();
7c673cae
FG
212 operation::TrimRequest<> *req = operation::TrimRequest<>::create(
213 *ictx, &ctx, ictx->size, newsize, prog_ctx);
9f95a23c 214 ictx->image_lock.unlock_shared();
7c673cae
FG
215 req->send();
216
217 int r = ctx.wait();
218 if (r < 0) {
219 lderr(ictx->cct) << "warning: failed to remove some object(s): "
220 << cpp_strerror(r) << dendl;
221 }
222 }
223
224 int read_header_bl(IoCtx& io_ctx, const string& header_oid,
225 bufferlist& header, uint64_t *ver)
226 {
227 int r;
228 uint64_t off = 0;
229#define READ_SIZE 4096
230 do {
231 bufferlist bl;
232 r = io_ctx.read(header_oid, bl, READ_SIZE, off);
233 if (r < 0)
234 return r;
235 header.claim_append(bl);
236 off += r;
237 } while (r == READ_SIZE);
238
11fdf7f2
TL
239 static_assert(sizeof(RBD_HEADER_TEXT) == sizeof(RBD_MIGRATE_HEADER_TEXT),
240 "length of rbd headers must be the same");
241
7c673cae 242 if (header.length() < sizeof(RBD_HEADER_TEXT) ||
11fdf7f2
TL
243 (memcmp(RBD_HEADER_TEXT, header.c_str(),
244 sizeof(RBD_HEADER_TEXT)) != 0 &&
245 memcmp(RBD_MIGRATE_HEADER_TEXT, header.c_str(),
246 sizeof(RBD_MIGRATE_HEADER_TEXT)) != 0)) {
7c673cae
FG
247 CephContext *cct = (CephContext *)io_ctx.cct();
248 lderr(cct) << "unrecognized header format" << dendl;
249 return -ENXIO;
250 }
251
252 if (ver)
253 *ver = io_ctx.get_last_version();
254
255 return 0;
256 }
257
258 int read_header(IoCtx& io_ctx, const string& header_oid,
259 struct rbd_obj_header_ondisk *header, uint64_t *ver)
260 {
261 bufferlist header_bl;
262 int r = read_header_bl(io_ctx, header_oid, header_bl, ver);
263 if (r < 0)
264 return r;
265 if (header_bl.length() < (int)sizeof(*header))
266 return -EIO;
267 memcpy(header, header_bl.c_str(), sizeof(*header));
268
269 return 0;
270 }
271
272 int tmap_set(IoCtx& io_ctx, const string& imgname)
273 {
274 bufferlist cmdbl, emptybl;
275 __u8 c = CEPH_OSD_TMAP_SET;
11fdf7f2
TL
276 encode(c, cmdbl);
277 encode(imgname, cmdbl);
278 encode(emptybl, cmdbl);
7c673cae
FG
279 return io_ctx.tmap_update(RBD_DIRECTORY, cmdbl);
280 }
281
282 int tmap_rm(IoCtx& io_ctx, const string& imgname)
283 {
284 bufferlist cmdbl;
285 __u8 c = CEPH_OSD_TMAP_RM;
11fdf7f2
TL
286 encode(c, cmdbl);
287 encode(imgname, cmdbl);
7c673cae
FG
288 return io_ctx.tmap_update(RBD_DIRECTORY, cmdbl);
289 }
290
291 typedef boost::variant<std::string,uint64_t> image_option_value_t;
292 typedef std::map<int,image_option_value_t> image_options_t;
293 typedef std::shared_ptr<image_options_t> image_options_ref;
294
295 enum image_option_type_t {
296 STR,
297 UINT64,
298 };
299
300 const std::map<int, image_option_type_t> IMAGE_OPTIONS_TYPE_MAPPING = {
301 {RBD_IMAGE_OPTION_FORMAT, UINT64},
302 {RBD_IMAGE_OPTION_FEATURES, UINT64},
303 {RBD_IMAGE_OPTION_ORDER, UINT64},
304 {RBD_IMAGE_OPTION_STRIPE_UNIT, UINT64},
305 {RBD_IMAGE_OPTION_STRIPE_COUNT, UINT64},
306 {RBD_IMAGE_OPTION_JOURNAL_ORDER, UINT64},
307 {RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH, UINT64},
308 {RBD_IMAGE_OPTION_JOURNAL_POOL, STR},
309 {RBD_IMAGE_OPTION_FEATURES_SET, UINT64},
310 {RBD_IMAGE_OPTION_FEATURES_CLEAR, UINT64},
311 {RBD_IMAGE_OPTION_DATA_POOL, STR},
11fdf7f2 312 {RBD_IMAGE_OPTION_FLATTEN, UINT64},
92f5a8d4 313 {RBD_IMAGE_OPTION_CLONE_FORMAT, UINT64},
1911f103 314 {RBD_IMAGE_OPTION_MIRROR_IMAGE_MODE, UINT64},
7c673cae
FG
315 };
316
317 std::string image_option_name(int optname) {
318 switch (optname) {
319 case RBD_IMAGE_OPTION_FORMAT:
320 return "format";
321 case RBD_IMAGE_OPTION_FEATURES:
322 return "features";
323 case RBD_IMAGE_OPTION_ORDER:
324 return "order";
325 case RBD_IMAGE_OPTION_STRIPE_UNIT:
326 return "stripe_unit";
327 case RBD_IMAGE_OPTION_STRIPE_COUNT:
328 return "stripe_count";
329 case RBD_IMAGE_OPTION_JOURNAL_ORDER:
330 return "journal_order";
331 case RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH:
332 return "journal_splay_width";
333 case RBD_IMAGE_OPTION_JOURNAL_POOL:
334 return "journal_pool";
335 case RBD_IMAGE_OPTION_FEATURES_SET:
336 return "features_set";
337 case RBD_IMAGE_OPTION_FEATURES_CLEAR:
338 return "features_clear";
339 case RBD_IMAGE_OPTION_DATA_POOL:
340 return "data_pool";
11fdf7f2
TL
341 case RBD_IMAGE_OPTION_FLATTEN:
342 return "flatten";
92f5a8d4
TL
343 case RBD_IMAGE_OPTION_CLONE_FORMAT:
344 return "clone_format";
1911f103
TL
345 case RBD_IMAGE_OPTION_MIRROR_IMAGE_MODE:
346 return "mirror_image_mode";
7c673cae
FG
347 default:
348 return "unknown (" + stringify(optname) + ")";
349 }
350 }
351
7c673cae
FG
352 void image_options_create(rbd_image_options_t* opts)
353 {
354 image_options_ref* opts_ = new image_options_ref(new image_options_t());
355
356 *opts = static_cast<rbd_image_options_t>(opts_);
357 }
358
359 void image_options_create_ref(rbd_image_options_t* opts,
360 rbd_image_options_t orig)
361 {
362 image_options_ref* orig_ = static_cast<image_options_ref*>(orig);
363 image_options_ref* opts_ = new image_options_ref(*orig_);
364
365 *opts = static_cast<rbd_image_options_t>(opts_);
366 }
367
368 void image_options_copy(rbd_image_options_t* opts,
369 const ImageOptions &orig)
370 {
371 image_options_ref* opts_ = new image_options_ref(new image_options_t());
372
373 *opts = static_cast<rbd_image_options_t>(opts_);
374
375 std::string str_val;
376 uint64_t uint64_val;
377 for (auto &i : IMAGE_OPTIONS_TYPE_MAPPING) {
378 switch (i.second) {
379 case STR:
380 if (orig.get(i.first, &str_val) == 0) {
381 image_options_set(*opts, i.first, str_val);
382 }
383 continue;
384 case UINT64:
385 if (orig.get(i.first, &uint64_val) == 0) {
386 image_options_set(*opts, i.first, uint64_val);
387 }
388 continue;
389 }
390 }
391 }
392
393 void image_options_destroy(rbd_image_options_t opts)
394 {
395 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
396
397 delete opts_;
398 }
399
400 int image_options_set(rbd_image_options_t opts, int optname,
401 const std::string& optval)
402 {
403 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
404
405 std::map<int, image_option_type_t>::const_iterator i =
406 IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
407
408 if (i == IMAGE_OPTIONS_TYPE_MAPPING.end() || i->second != STR) {
409 return -EINVAL;
410 }
411
412 (*opts_->get())[optname] = optval;
413 return 0;
414 }
415
416 int image_options_set(rbd_image_options_t opts, int optname, uint64_t optval)
417 {
418 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
419
420 std::map<int, image_option_type_t>::const_iterator i =
421 IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
422
423 if (i == IMAGE_OPTIONS_TYPE_MAPPING.end() || i->second != UINT64) {
424 return -EINVAL;
425 }
426
427 (*opts_->get())[optname] = optval;
428 return 0;
429 }
430
431 int image_options_get(rbd_image_options_t opts, int optname,
432 std::string* optval)
433 {
434 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
435
436 std::map<int, image_option_type_t>::const_iterator i =
437 IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
438
439 if (i == IMAGE_OPTIONS_TYPE_MAPPING.end() || i->second != STR) {
440 return -EINVAL;
441 }
442
443 image_options_t::const_iterator j = (*opts_)->find(optname);
444
445 if (j == (*opts_)->end()) {
446 return -ENOENT;
447 }
448
449 *optval = boost::get<std::string>(j->second);
450 return 0;
451 }
452
453 int image_options_get(rbd_image_options_t opts, int optname, uint64_t* optval)
454 {
455 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
456
457 std::map<int, image_option_type_t>::const_iterator i =
458 IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
459
460 if (i == IMAGE_OPTIONS_TYPE_MAPPING.end() || i->second != UINT64) {
461 return -EINVAL;
462 }
463
464 image_options_t::const_iterator j = (*opts_)->find(optname);
465
466 if (j == (*opts_)->end()) {
467 return -ENOENT;
468 }
469
470 *optval = boost::get<uint64_t>(j->second);
471 return 0;
472 }
473
474 int image_options_is_set(rbd_image_options_t opts, int optname,
475 bool* is_set)
476 {
477 if (IMAGE_OPTIONS_TYPE_MAPPING.find(optname) ==
478 IMAGE_OPTIONS_TYPE_MAPPING.end()) {
479 return -EINVAL;
480 }
481
482 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
483 *is_set = ((*opts_)->find(optname) != (*opts_)->end());
484 return 0;
485 }
486
487 int image_options_unset(rbd_image_options_t opts, int optname)
488 {
489 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
490
491 std::map<int, image_option_type_t>::const_iterator i =
492 IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
493
494 if (i == IMAGE_OPTIONS_TYPE_MAPPING.end()) {
11fdf7f2 495 ceph_assert((*opts_)->find(optname) == (*opts_)->end());
7c673cae
FG
496 return -EINVAL;
497 }
498
499 image_options_t::const_iterator j = (*opts_)->find(optname);
500
501 if (j == (*opts_)->end()) {
502 return -ENOENT;
503 }
504
505 (*opts_)->erase(j);
506 return 0;
507 }
508
509 void image_options_clear(rbd_image_options_t opts)
510 {
511 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
512
513 (*opts_)->clear();
514 }
515
516 bool image_options_is_empty(rbd_image_options_t opts)
517 {
518 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
519
520 return (*opts_)->empty();
521 }
522
7c673cae
FG
523 int create_v1(IoCtx& io_ctx, const char *imgname, uint64_t size, int order)
524 {
525 CephContext *cct = (CephContext *)io_ctx.cct();
526
527 ldout(cct, 20) << __func__ << " " << &io_ctx << " name = " << imgname
528 << " size = " << size << " order = " << order << dendl;
529 int r = validate_pool(io_ctx, cct);
530 if (r < 0) {
531 return r;
532 }
533
11fdf7f2
TL
534 if (!io_ctx.get_namespace().empty()) {
535 lderr(cct) << "attempting to add v1 image to namespace" << dendl;
536 return -EINVAL;
537 }
538
7c673cae
FG
539 ldout(cct, 2) << "adding rbd image to directory..." << dendl;
540 r = tmap_set(io_ctx, imgname);
541 if (r < 0) {
542 lderr(cct) << "error adding image to directory: " << cpp_strerror(r)
543 << dendl;
544 return r;
545 }
546
547 Rados rados(io_ctx);
548 uint64_t bid = rados.get_instance_id();
549
550 ldout(cct, 2) << "creating rbd image..." << dendl;
551 struct rbd_obj_header_ondisk header;
552 init_rbd_header(header, size, order, bid);
553
554 bufferlist bl;
555 bl.append((const char *)&header, sizeof(header));
556
557 string header_oid = util::old_header_name(imgname);
558 r = io_ctx.write(header_oid, bl, bl.length(), 0);
559 if (r < 0) {
560 lderr(cct) << "Error writing image header: " << cpp_strerror(r)
561 << dendl;
562 int remove_r = tmap_rm(io_ctx, imgname);
563 if (remove_r < 0) {
564 lderr(cct) << "Could not remove image from directory after "
565 << "header creation failed: "
566 << cpp_strerror(remove_r) << dendl;
567 }
568 return r;
569 }
570
571 ldout(cct, 2) << "done." << dendl;
572 return 0;
573 }
574
575 int create(librados::IoCtx& io_ctx, const char *imgname, uint64_t size,
576 int *order)
577 {
578 uint64_t order_ = *order;
579 ImageOptions opts;
580
581 int r = opts.set(RBD_IMAGE_OPTION_ORDER, order_);
11fdf7f2 582 ceph_assert(r == 0);
7c673cae
FG
583
584 r = create(io_ctx, imgname, "", size, opts, "", "", false);
585
586 int r1 = opts.get(RBD_IMAGE_OPTION_ORDER, &order_);
11fdf7f2 587 ceph_assert(r1 == 0);
7c673cae
FG
588 *order = order_;
589
590 return r;
591 }
592
593 int create(IoCtx& io_ctx, const char *imgname, uint64_t size,
594 bool old_format, uint64_t features, int *order,
595 uint64_t stripe_unit, uint64_t stripe_count)
596 {
597 if (!order)
598 return -EINVAL;
599
600 uint64_t order_ = *order;
601 uint64_t format = old_format ? 1 : 2;
602 ImageOptions opts;
603 int r;
604
605 r = opts.set(RBD_IMAGE_OPTION_FORMAT, format);
11fdf7f2 606 ceph_assert(r == 0);
7c673cae 607 r = opts.set(RBD_IMAGE_OPTION_FEATURES, features);
11fdf7f2 608 ceph_assert(r == 0);
7c673cae 609 r = opts.set(RBD_IMAGE_OPTION_ORDER, order_);
11fdf7f2 610 ceph_assert(r == 0);
7c673cae 611 r = opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit);
11fdf7f2 612 ceph_assert(r == 0);
7c673cae 613 r = opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count);
11fdf7f2 614 ceph_assert(r == 0);
7c673cae
FG
615
616 r = create(io_ctx, imgname, "", size, opts, "", "", false);
617
618 int r1 = opts.get(RBD_IMAGE_OPTION_ORDER, &order_);
11fdf7f2 619 ceph_assert(r1 == 0);
7c673cae
FG
620 *order = order_;
621
622 return r;
623 }
624
625 int create(IoCtx& io_ctx, const std::string &image_name,
626 const std::string &image_id, uint64_t size,
627 ImageOptions& opts,
628 const std::string &non_primary_global_image_id,
629 const std::string &primary_mirror_uuid,
630 bool skip_mirror_enable)
631 {
632 std::string id(image_id);
633 if (id.empty()) {
634 id = util::generate_image_id(io_ctx);
635 }
636
637 CephContext *cct = (CephContext *)io_ctx.cct();
92f5a8d4
TL
638 uint64_t option;
639 if (opts.get(RBD_IMAGE_OPTION_FLATTEN, &option) == 0) {
11fdf7f2
TL
640 lderr(cct) << "create does not support 'flatten' image option" << dendl;
641 return -EINVAL;
642 }
92f5a8d4
TL
643 if (opts.get(RBD_IMAGE_OPTION_CLONE_FORMAT, &option) == 0) {
644 lderr(cct) << "create does not support 'clone_format' image option"
645 << dendl;
646 return -EINVAL;
647 }
11fdf7f2 648
7c673cae
FG
649 ldout(cct, 10) << __func__ << " name=" << image_name << ", "
650 << "id= " << id << ", "
651 << "size=" << size << ", opts=" << opts << dendl;
652
653 uint64_t format;
654 if (opts.get(RBD_IMAGE_OPTION_FORMAT, &format) != 0)
11fdf7f2 655 format = cct->_conf.get_val<uint64_t>("rbd_default_format");
7c673cae
FG
656 bool old_format = format == 1;
657
658 // make sure it doesn't already exist, in either format
659 int r = detect_format(io_ctx, image_name, NULL, NULL);
660 if (r != -ENOENT) {
661 if (r) {
662 lderr(cct) << "Could not tell if " << image_name << " already exists"
663 << dendl;
664 return r;
665 }
666 lderr(cct) << "rbd image " << image_name << " already exists" << dendl;
667 return -EEXIST;
668 }
669
670 uint64_t order = 0;
671 if (opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0 || order == 0) {
11fdf7f2 672 order = cct->_conf.get_val<uint64_t>("rbd_default_order");
7c673cae
FG
673 }
674 r = image::CreateRequest<>::validate_order(cct, order);
675 if (r < 0) {
676 return r;
677 }
678
679 if (old_format) {
11fdf7f2
TL
680 if ( !getenv("RBD_FORCE_ALLOW_V1") ) {
681 lderr(cct) << "Format 1 image creation unsupported. " << dendl;
682 return -EINVAL;
683 }
684 lderr(cct) << "Forced V1 image creation. " << dendl;
7c673cae
FG
685 r = create_v1(io_ctx, image_name.c_str(), size, order);
686 } else {
687 ThreadPool *thread_pool;
688 ContextWQ *op_work_queue;
689 ImageCtx::get_thread_pool_instance(cct, &thread_pool, &op_work_queue);
690
11fdf7f2
TL
691 ConfigProxy config{cct->_conf};
692 api::Config<>::apply_pool_overrides(io_ctx, &config);
693
1911f103
TL
694 uint32_t create_flags = 0U;
695 uint64_t mirror_image_mode = RBD_MIRROR_IMAGE_MODE_JOURNAL;
696 if (skip_mirror_enable) {
697 create_flags = image::CREATE_FLAG_SKIP_MIRROR_ENABLE;
698 } else if (opts.get(RBD_IMAGE_OPTION_MIRROR_IMAGE_MODE,
699 &mirror_image_mode) == 0) {
700 create_flags = image::CREATE_FLAG_FORCE_MIRROR_ENABLE;
701 }
702
7c673cae
FG
703 C_SaferCond cond;
704 image::CreateRequest<> *req = image::CreateRequest<>::create(
1911f103
TL
705 config, io_ctx, image_name, id, size, opts, create_flags,
706 static_cast<cls::rbd::MirrorImageMode>(mirror_image_mode),
707 non_primary_global_image_id, primary_mirror_uuid, op_work_queue, &cond);
7c673cae
FG
708 req->send();
709
710 r = cond.wait();
711 }
712
713 int r1 = opts.set(RBD_IMAGE_OPTION_ORDER, order);
11fdf7f2 714 ceph_assert(r1 == 0);
7c673cae
FG
715
716 return r;
717 }
718
719 /*
720 * Parent may be in different pool, hence different IoCtx
721 */
722 int clone(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name,
723 IoCtx& c_ioctx, const char *c_name,
724 uint64_t features, int *c_order,
725 uint64_t stripe_unit, int stripe_count)
726 {
727 uint64_t order = *c_order;
728
729 ImageOptions opts;
730 opts.set(RBD_IMAGE_OPTION_FEATURES, features);
731 opts.set(RBD_IMAGE_OPTION_ORDER, order);
732 opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit);
733 opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count);
734
11fdf7f2
TL
735 int r = clone(p_ioctx, nullptr, p_name, p_snap_name, c_ioctx, nullptr,
736 c_name, opts, "", "");
7c673cae
FG
737 opts.get(RBD_IMAGE_OPTION_ORDER, &order);
738 *c_order = order;
739 return r;
740 }
741
11fdf7f2
TL
742 int clone(IoCtx& p_ioctx, const char *p_id, const char *p_name,
743 const char *p_snap_name, IoCtx& c_ioctx, const char *c_id,
744 const char *c_name, ImageOptions& c_opts,
745 const std::string &non_primary_global_image_id,
746 const std::string &primary_mirror_uuid)
7c673cae 747 {
11fdf7f2
TL
748 ceph_assert((p_id == nullptr) ^ (p_name == nullptr));
749
7c673cae 750 CephContext *cct = (CephContext *)p_ioctx.cct();
11fdf7f2 751 if (p_snap_name == nullptr) {
7c673cae
FG
752 lderr(cct) << "image to be cloned must be a snapshot" << dendl;
753 return -EINVAL;
754 }
755
11fdf7f2
TL
756 uint64_t flatten;
757 if (c_opts.get(RBD_IMAGE_OPTION_FLATTEN, &flatten) == 0) {
758 lderr(cct) << "clone does not support 'flatten' image option" << dendl;
759 return -EINVAL;
7c673cae
FG
760 }
761
11fdf7f2
TL
762 int r;
763 std::string parent_id;
764 if (p_id == nullptr) {
765 r = cls_client::dir_get_id(&p_ioctx, RBD_DIRECTORY, p_name,
766 &parent_id);
767 if (r < 0) {
768 if (r != -ENOENT) {
769 lderr(cct) << "failed to retrieve parent image id: "
770 << cpp_strerror(r) << dendl;
771 }
772 return r;
773 }
774 } else {
775 parent_id = p_id;
7c673cae 776 }
7c673cae 777
11fdf7f2
TL
778 std::string clone_id;
779 if (c_id == nullptr) {
780 clone_id = util::generate_image_id(c_ioctx);
781 } else {
782 clone_id = c_id;
7c673cae
FG
783 }
784
7c673cae
FG
785 ldout(cct, 10) << __func__ << " "
786 << "c_name=" << c_name << ", "
11fdf7f2 787 << "c_id= " << clone_id << ", "
7c673cae
FG
788 << "c_opts=" << c_opts << dendl;
789
11fdf7f2
TL
790 ConfigProxy config{reinterpret_cast<CephContext *>(c_ioctx.cct())->_conf};
791 api::Config<>::apply_pool_overrides(c_ioctx, &config);
792
7c673cae
FG
793 ThreadPool *thread_pool;
794 ContextWQ *op_work_queue;
795 ImageCtx::get_thread_pool_instance(cct, &thread_pool, &op_work_queue);
796
797 C_SaferCond cond;
798 auto *req = image::CloneRequest<>::create(
9f95a23c
TL
799 config, p_ioctx, parent_id, p_snap_name,
800 {cls::rbd::UserSnapshotNamespace{}}, CEPH_NOSNAP, c_ioctx, c_name,
801 clone_id, c_opts, cls::rbd::MIRROR_IMAGE_MODE_JOURNAL,
802 non_primary_global_image_id, primary_mirror_uuid, op_work_queue, &cond);
7c673cae
FG
803 req->send();
804
11fdf7f2
TL
805 r = cond.wait();
806 if (r < 0) {
807 return r;
808 }
809
810 return 0;
7c673cae
FG
811 }
812
813 int rename(IoCtx& io_ctx, const char *srcname, const char *dstname)
814 {
815 CephContext *cct = (CephContext *)io_ctx.cct();
816 ldout(cct, 20) << "rename " << &io_ctx << " " << srcname << " -> "
817 << dstname << dendl;
818
819 ImageCtx *ictx = new ImageCtx(srcname, "", "", io_ctx, false);
11fdf7f2 820 int r = ictx->state->open(0);
7c673cae 821 if (r < 0) {
181888fb 822 lderr(cct) << "error opening source image: " << cpp_strerror(r) << dendl;
7c673cae
FG
823 return r;
824 }
825 BOOST_SCOPE_EXIT((ictx)) {
826 ictx->state->close();
827 } BOOST_SCOPE_EXIT_END
828
829 return ictx->operations->rename(dstname);
830 }
831
832 int info(ImageCtx *ictx, image_info_t& info, size_t infosize)
833 {
834 ldout(ictx->cct, 20) << "info " << ictx << dendl;
835
836 int r = ictx->state->refresh_if_required();
837 if (r < 0)
838 return r;
839
840 image_info(ictx, info, infosize);
841 return 0;
842 }
843
844 int get_old_format(ImageCtx *ictx, uint8_t *old)
845 {
846 int r = ictx->state->refresh_if_required();
847 if (r < 0)
848 return r;
849 *old = ictx->old_format;
850 return 0;
851 }
852
853 int get_size(ImageCtx *ictx, uint64_t *size)
854 {
855 int r = ictx->state->refresh_if_required();
856 if (r < 0)
857 return r;
9f95a23c 858 std::shared_lock l2{ictx->image_lock};
7c673cae
FG
859 *size = ictx->get_image_size(ictx->snap_id);
860 return 0;
861 }
862
863 int get_features(ImageCtx *ictx, uint64_t *features)
864 {
865 int r = ictx->state->refresh_if_required();
866 if (r < 0)
867 return r;
9f95a23c 868 std::shared_lock l{ictx->image_lock};
7c673cae
FG
869 *features = ictx->features;
870 return 0;
871 }
872
873 int get_overlap(ImageCtx *ictx, uint64_t *overlap)
874 {
875 int r = ictx->state->refresh_if_required();
876 if (r < 0)
877 return r;
9f95a23c 878 std::shared_lock image_locker{ictx->image_lock};
7c673cae
FG
879 return ictx->get_parent_overlap(ictx->snap_id, overlap);
880 }
881
7c673cae
FG
882 int get_flags(ImageCtx *ictx, uint64_t *flags)
883 {
884 int r = ictx->state->refresh_if_required();
885 if (r < 0) {
886 return r;
887 }
888
9f95a23c 889 std::shared_lock l2{ictx->image_lock};
7c673cae
FG
890 return ictx->get_flags(ictx->snap_id, flags);
891 }
892
893 int set_image_notification(ImageCtx *ictx, int fd, int type)
894 {
895 CephContext *cct = ictx->cct;
896 ldout(cct, 20) << __func__ << " " << ictx << " fd " << fd << " type" << type << dendl;
897
898 int r = ictx->state->refresh_if_required();
899 if (r < 0) {
900 return r;
901 }
902
903 if (ictx->event_socket.is_valid())
904 return -EINVAL;
905 return ictx->event_socket.init(fd, type);
906 }
907
908 int is_exclusive_lock_owner(ImageCtx *ictx, bool *is_owner)
909 {
91327a77
AA
910 CephContext *cct = ictx->cct;
911 ldout(cct, 20) << __func__ << ": ictx=" << ictx << dendl;
7c673cae
FG
912 *is_owner = false;
913
9f95a23c 914 std::shared_lock owner_locker{ictx->owner_lock};
91327a77 915 if (ictx->exclusive_lock == nullptr) {
7c673cae
FG
916 return 0;
917 }
918
919 // might have been blacklisted by peer -- ensure we still own
920 // the lock by pinging the OSD
921 int r = ictx->exclusive_lock->assert_header_locked();
31f18b77
FG
922 if (r == -EBUSY || r == -ENOENT) {
923 return 0;
924 } else if (r < 0) {
7c673cae
FG
925 return r;
926 }
927
928 *is_owner = true;
929 return 0;
930 }
931
932 int lock_acquire(ImageCtx *ictx, rbd_lock_mode_t lock_mode)
933 {
934 CephContext *cct = ictx->cct;
935 ldout(cct, 20) << __func__ << ": ictx=" << ictx << ", "
936 << "lock_mode=" << lock_mode << dendl;
937
938 if (lock_mode != RBD_LOCK_MODE_EXCLUSIVE) {
939 return -EOPNOTSUPP;
940 }
941
942 C_SaferCond lock_ctx;
943 {
9f95a23c 944 std::unique_lock l{ictx->owner_lock};
7c673cae
FG
945
946 if (ictx->exclusive_lock == nullptr) {
947 lderr(cct) << "exclusive-lock feature is not enabled" << dendl;
948 return -EINVAL;
949 }
950
951 if (ictx->get_exclusive_lock_policy()->may_auto_request_lock()) {
952 ictx->set_exclusive_lock_policy(
953 new exclusive_lock::StandardPolicy(ictx));
954 }
955
956 if (ictx->exclusive_lock->is_lock_owner()) {
957 return 0;
958 }
959
960 ictx->exclusive_lock->acquire_lock(&lock_ctx);
961 }
962
963 int r = lock_ctx.wait();
964 if (r < 0) {
965 lderr(cct) << "failed to request exclusive lock: " << cpp_strerror(r)
966 << dendl;
967 return r;
968 }
969
9f95a23c 970 std::shared_lock l{ictx->owner_lock};
91327a77
AA
971 if (ictx->exclusive_lock == nullptr) {
972 return -EINVAL;
973 } else if (!ictx->exclusive_lock->is_lock_owner()) {
7c673cae 974 lderr(cct) << "failed to acquire exclusive lock" << dendl;
91327a77 975 return ictx->exclusive_lock->get_unlocked_op_error();
7c673cae
FG
976 }
977
978 return 0;
979 }
980
981 int lock_release(ImageCtx *ictx)
982 {
983 CephContext *cct = ictx->cct;
984 ldout(cct, 20) << __func__ << ": ictx=" << ictx << dendl;
985
986 C_SaferCond lock_ctx;
987 {
9f95a23c 988 std::unique_lock l{ictx->owner_lock};
7c673cae
FG
989
990 if (ictx->exclusive_lock == nullptr ||
991 !ictx->exclusive_lock->is_lock_owner()) {
992 lderr(cct) << "not exclusive lock owner" << dendl;
993 return -EINVAL;
994 }
995
996 ictx->exclusive_lock->release_lock(&lock_ctx);
997 }
998
999 int r = lock_ctx.wait();
1000 if (r < 0) {
1001 lderr(cct) << "failed to release exclusive lock: " << cpp_strerror(r)
1002 << dendl;
1003 return r;
1004 }
1005 return 0;
1006 }
1007
1008 int lock_get_owners(ImageCtx *ictx, rbd_lock_mode_t *lock_mode,
1009 std::list<std::string> *lock_owners)
1010 {
1011 CephContext *cct = ictx->cct;
1012 ldout(cct, 20) << __func__ << ": ictx=" << ictx << dendl;
1013
7c673cae
FG
1014 managed_lock::Locker locker;
1015 C_SaferCond get_owner_ctx;
9f95a23c
TL
1016 {
1017 std::shared_lock owner_locker{ictx->owner_lock};
1018
1019 if (ictx->exclusive_lock == nullptr) {
1020 lderr(cct) << "exclusive-lock feature is not enabled" << dendl;
1021 return -EINVAL;
1022 }
1023
1024 ictx->exclusive_lock->get_locker(&locker, &get_owner_ctx);
1025 }
1026
7c673cae
FG
1027 int r = get_owner_ctx.wait();
1028 if (r == -ENOENT) {
1029 return r;
1030 } else if (r < 0) {
1031 lderr(cct) << "failed to determine current lock owner: "
1032 << cpp_strerror(r) << dendl;
1033 return r;
1034 }
1035
1036 *lock_mode = RBD_LOCK_MODE_EXCLUSIVE;
1037 lock_owners->clear();
1038 lock_owners->emplace_back(locker.address);
1039 return 0;
1040 }
1041
1042 int lock_break(ImageCtx *ictx, rbd_lock_mode_t lock_mode,
11fdf7f2 1043 const std::string &lock_owner) {
7c673cae
FG
1044 CephContext *cct = ictx->cct;
1045 ldout(cct, 20) << __func__ << ": ictx=" << ictx << ", "
1046 << "lock_mode=" << lock_mode << ", "
1047 << "lock_owner=" << lock_owner << dendl;
1048
1049 if (lock_mode != RBD_LOCK_MODE_EXCLUSIVE) {
1050 return -EOPNOTSUPP;
1051 }
1052
1053 if (ictx->read_only) {
1054 return -EROFS;
1055 }
1056
1057 managed_lock::Locker locker;
1058 C_SaferCond get_owner_ctx;
1059 {
9f95a23c 1060 std::shared_lock l{ictx->owner_lock};
7c673cae
FG
1061
1062 if (ictx->exclusive_lock == nullptr) {
1063 lderr(cct) << "exclusive-lock feature is not enabled" << dendl;
1064 return -EINVAL;
1065 }
1066
1067 ictx->exclusive_lock->get_locker(&locker, &get_owner_ctx);
1068 }
1069 int r = get_owner_ctx.wait();
1070 if (r == -ENOENT) {
1071 return r;
1072 } else if (r < 0) {
1073 lderr(cct) << "failed to determine current lock owner: "
1074 << cpp_strerror(r) << dendl;
1075 return r;
1076 }
1077
1078 if (locker.address != lock_owner) {
1079 return -EBUSY;
1080 }
1081
1082 C_SaferCond break_ctx;
1083 {
9f95a23c 1084 std::shared_lock l{ictx->owner_lock};
7c673cae
FG
1085
1086 if (ictx->exclusive_lock == nullptr) {
1087 lderr(cct) << "exclusive-lock feature is not enabled" << dendl;
1088 return -EINVAL;
1089 }
1090
1091 ictx->exclusive_lock->break_lock(locker, true, &break_ctx);
1092 }
1093 r = break_ctx.wait();
1094 if (r == -ENOENT) {
1095 return r;
1096 } else if (r < 0) {
1097 lderr(cct) << "failed to break lock: " << cpp_strerror(r) << dendl;
1098 return r;
1099 }
1100 return 0;
1101 }
1102
7c673cae
FG
1103 int copy(ImageCtx *src, IoCtx& dest_md_ctx, const char *destname,
1104 ImageOptions& opts, ProgressContext &prog_ctx, size_t sparse_size)
1105 {
1106 CephContext *cct = (CephContext *)dest_md_ctx.cct();
92f5a8d4
TL
1107 uint64_t option;
1108 if (opts.get(RBD_IMAGE_OPTION_FLATTEN, &option) == 0) {
11fdf7f2
TL
1109 lderr(cct) << "copy does not support 'flatten' image option" << dendl;
1110 return -EINVAL;
1111 }
92f5a8d4
TL
1112 if (opts.get(RBD_IMAGE_OPTION_CLONE_FORMAT, &option) == 0) {
1113 lderr(cct) << "copy does not support 'clone_format' image option"
1114 << dendl;
1115 return -EINVAL;
1116 }
11fdf7f2 1117
7c673cae
FG
1118 ldout(cct, 20) << "copy " << src->name
1119 << (src->snap_name.length() ? "@" + src->snap_name : "")
1120 << " -> " << destname << " opts = " << opts << dendl;
1121
9f95a23c 1122 src->image_lock.lock_shared();
7c673cae
FG
1123 uint64_t features = src->features;
1124 uint64_t src_size = src->get_image_size(src->snap_id);
9f95a23c 1125 src->image_lock.unlock_shared();
7c673cae
FG
1126 uint64_t format = src->old_format ? 1 : 2;
1127 if (opts.get(RBD_IMAGE_OPTION_FORMAT, &format) != 0) {
1128 opts.set(RBD_IMAGE_OPTION_FORMAT, format);
1129 }
1130 uint64_t stripe_unit = src->stripe_unit;
1131 if (opts.get(RBD_IMAGE_OPTION_STRIPE_UNIT, &stripe_unit) != 0) {
1132 opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit);
1133 }
1134 uint64_t stripe_count = src->stripe_count;
1135 if (opts.get(RBD_IMAGE_OPTION_STRIPE_COUNT, &stripe_count) != 0) {
1136 opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count);
1137 }
1138 uint64_t order = src->order;
1139 if (opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0) {
1140 opts.set(RBD_IMAGE_OPTION_ORDER, order);
1141 }
1142 if (opts.get(RBD_IMAGE_OPTION_FEATURES, &features) != 0) {
1143 opts.set(RBD_IMAGE_OPTION_FEATURES, features);
1144 }
1145 if (features & ~RBD_FEATURES_ALL) {
1146 lderr(cct) << "librbd does not support requested features" << dendl;
1147 return -ENOSYS;
1148 }
1149
1150 int r = create(dest_md_ctx, destname, "", src_size, opts, "", "", false);
1151 if (r < 0) {
1152 lderr(cct) << "header creation failed" << dendl;
1153 return r;
1154 }
1155 opts.set(RBD_IMAGE_OPTION_ORDER, static_cast<uint64_t>(order));
1156
11fdf7f2
TL
1157 ImageCtx *dest = new librbd::ImageCtx(destname, "", nullptr, dest_md_ctx,
1158 false);
1159 r = dest->state->open(0);
7c673cae
FG
1160 if (r < 0) {
1161 lderr(cct) << "failed to read newly created header" << dendl;
1162 return r;
1163 }
1164
1165 r = copy(src, dest, prog_ctx, sparse_size);
1166
1167 int close_r = dest->state->close();
1168 if (r == 0 && close_r < 0) {
1169 r = close_r;
1170 }
1171 return r;
1172 }
1173
1174 class C_CopyWrite : public Context {
1175 public:
1176 C_CopyWrite(bufferlist *bl, Context* ctx)
1177 : m_bl(bl), m_ctx(ctx) {}
1178 void finish(int r) override {
1179 delete m_bl;
1180 m_ctx->complete(r);
1181 }
1182 private:
1183 bufferlist *m_bl;
1184 Context *m_ctx;
1185 };
1186
1187 class C_CopyRead : public Context {
1188 public:
1189 C_CopyRead(SimpleThrottle *throttle, ImageCtx *dest, uint64_t offset,
1190 bufferlist *bl, size_t sparse_size)
1191 : m_throttle(throttle), m_dest(dest), m_offset(offset), m_bl(bl),
1192 m_sparse_size(sparse_size) {
1193 m_throttle->start_op();
1194 }
1195 void finish(int r) override {
1196 if (r < 0) {
1197 lderr(m_dest->cct) << "error reading from source image at offset "
1198 << m_offset << ": " << cpp_strerror(r) << dendl;
1199 delete m_bl;
1200 m_throttle->end_op(r);
1201 return;
1202 }
11fdf7f2 1203 ceph_assert(m_bl->length() == (size_t)r);
7c673cae
FG
1204
1205 if (m_bl->is_zero()) {
1206 delete m_bl;
1207 m_throttle->end_op(r);
1208 return;
1209 }
1210
1211 if (!m_sparse_size) {
1212 m_sparse_size = (1 << m_dest->order);
1213 }
1214
1215 auto *throttle = m_throttle;
9f95a23c 1216 auto *end_op_ctx = new LambdaContext([throttle](int r) {
7c673cae
FG
1217 throttle->end_op(r);
1218 });
1219 auto gather_ctx = new C_Gather(m_dest->cct, end_op_ctx);
1220
11fdf7f2 1221 m_bl->rebuild(buffer::ptr_node::create(m_bl->length()));
7c673cae
FG
1222 size_t write_offset = 0;
1223 size_t write_length = 0;
1224 size_t offset = 0;
1225 size_t length = m_bl->length();
11fdf7f2 1226 const auto& m_ptr = m_bl->front();
7c673cae
FG
1227 while (offset < length) {
1228 if (util::calc_sparse_extent(m_ptr,
1229 m_sparse_size,
1230 length,
1231 &write_offset,
1232 &write_length,
1233 &offset)) {
7c673cae 1234 bufferlist *write_bl = new bufferlist();
11fdf7f2
TL
1235 write_bl->push_back(
1236 buffer::ptr_node::create(m_ptr, write_offset, write_length));
7c673cae
FG
1237 Context *ctx = new C_CopyWrite(write_bl, gather_ctx->new_sub());
1238 auto comp = io::AioCompletion::create(ctx);
1239
1240 // coordinate through AIO WQ to ensure lock is acquired if needed
1241 m_dest->io_work_queue->aio_write(comp, m_offset + write_offset,
1242 write_length,
1243 std::move(*write_bl),
31f18b77
FG
1244 LIBRADOS_OP_FLAG_FADVISE_DONTNEED,
1245 std::move(read_trace));
7c673cae
FG
1246 write_offset = offset;
1247 write_length = 0;
1248 }
1249 }
1250 delete m_bl;
11fdf7f2 1251 ceph_assert(gather_ctx->get_sub_created_count() > 0);
7c673cae
FG
1252 gather_ctx->activate();
1253 }
1254
31f18b77
FG
1255 ZTracer::Trace read_trace;
1256
7c673cae
FG
1257 private:
1258 SimpleThrottle *m_throttle;
1259 ImageCtx *m_dest;
1260 uint64_t m_offset;
1261 bufferlist *m_bl;
1262 size_t m_sparse_size;
1263 };
1264
1265 int copy(ImageCtx *src, ImageCtx *dest, ProgressContext &prog_ctx, size_t sparse_size)
1266 {
9f95a23c 1267 src->image_lock.lock_shared();
7c673cae 1268 uint64_t src_size = src->get_image_size(src->snap_id);
9f95a23c 1269 src->image_lock.unlock_shared();
7c673cae 1270
9f95a23c 1271 dest->image_lock.lock_shared();
7c673cae 1272 uint64_t dest_size = dest->get_image_size(dest->snap_id);
9f95a23c 1273 dest->image_lock.unlock_shared();
7c673cae
FG
1274
1275 CephContext *cct = src->cct;
1276 if (dest_size < src_size) {
1277 lderr(cct) << " src size " << src_size << " > dest size "
1278 << dest_size << dendl;
1279 return -EINVAL;
1280 }
b32b8144 1281
9f95a23c
TL
1282 C_SaferCond ctx;
1283 auto req = deep_copy::MetadataCopyRequest<>::create(
1284 src, dest, &ctx);
1285 req->send();
b32b8144 1286
9f95a23c
TL
1287 int r = ctx.wait();
1288 if (r < 0) {
1289 lderr(cct) << "failed to copy metadata: " << cpp_strerror(r) << dendl;
1290 return r;
7c673cae
FG
1291 }
1292
31f18b77 1293 ZTracer::Trace trace;
181888fb 1294 if (src->blkin_trace_all) {
31f18b77
FG
1295 trace.init("copy", &src->trace_endpoint);
1296 }
1297
9f95a23c 1298 std::shared_lock owner_lock{src->owner_lock};
11fdf7f2 1299 SimpleThrottle throttle(src->config.get_val<uint64_t>("rbd_concurrent_management_ops"), false);
7c673cae 1300 uint64_t period = src->get_stripe_period();
31f18b77
FG
1301 unsigned fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL |
1302 LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
11fdf7f2 1303 uint64_t object_id = 0;
7c673cae
FG
1304 for (uint64_t offset = 0; offset < src_size; offset += period) {
1305 if (throttle.pending_error()) {
1306 return throttle.wait_for_ret();
1307 }
1308
11fdf7f2 1309 {
9f95a23c 1310 std::shared_lock image_locker{src->image_lock};
11fdf7f2
TL
1311 if (src->object_map != nullptr) {
1312 bool skip = true;
1313 // each period is related to src->stripe_count objects, check them all
1314 for (uint64_t i=0; i < src->stripe_count; i++) {
1315 if (object_id < src->object_map->size() &&
1316 src->object_map->object_may_exist(object_id)) {
1317 skip = false;
1318 }
1319 ++object_id;
1320 }
1321
1322 if (skip) continue;
1323 } else {
1324 object_id += src->stripe_count;
1325 }
1326 }
1327
7c673cae
FG
1328 uint64_t len = min(period, src_size - offset);
1329 bufferlist *bl = new bufferlist();
31f18b77
FG
1330 auto ctx = new C_CopyRead(&throttle, dest, offset, bl, sparse_size);
1331 auto comp = io::AioCompletion::create_and_start<Context>(
1332 ctx, src, io::AIO_TYPE_READ);
1333
1334 io::ImageReadRequest<> req(*src, comp, {{offset, len}},
1335 io::ReadResult{bl}, fadvise_flags,
1336 std::move(trace));
1337 ctx->read_trace = req.get_trace();
1338
1339 req.send();
7c673cae
FG
1340 prog_ctx.update_progress(offset, src_size);
1341 }
1342
1343 r = throttle.wait_for_ret();
1344 if (r >= 0)
1345 prog_ctx.update_progress(src_size, src_size);
1346 return r;
1347 }
1348
7c673cae
FG
1349 int list_lockers(ImageCtx *ictx,
1350 std::list<locker_t> *lockers,
1351 bool *exclusive,
1352 string *tag)
1353 {
1354 ldout(ictx->cct, 20) << "list_locks on image " << ictx << dendl;
1355
1356 int r = ictx->state->refresh_if_required();
1357 if (r < 0)
1358 return r;
1359
9f95a23c 1360 std::shared_lock locker{ictx->image_lock};
7c673cae
FG
1361 if (exclusive)
1362 *exclusive = ictx->exclusive_locked;
1363 if (tag)
1364 *tag = ictx->lock_tag;
1365 if (lockers) {
1366 lockers->clear();
1367 map<rados::cls::lock::locker_id_t,
1368 rados::cls::lock::locker_info_t>::const_iterator it;
1369 for (it = ictx->lockers.begin(); it != ictx->lockers.end(); ++it) {
1370 locker_t locker;
1371 locker.client = stringify(it->first.locker);
1372 locker.cookie = it->first.cookie;
11fdf7f2 1373 locker.address = it->second.addr.get_legacy_str();
7c673cae
FG
1374 lockers->push_back(locker);
1375 }
1376 }
1377
1378 return 0;
1379 }
1380
1381 int lock(ImageCtx *ictx, bool exclusive, const string& cookie,
1382 const string& tag)
1383 {
1384 ldout(ictx->cct, 20) << "lock image " << ictx << " exclusive=" << exclusive
1385 << " cookie='" << cookie << "' tag='" << tag << "'"
1386 << dendl;
1387
1388 int r = ictx->state->refresh_if_required();
1389 if (r < 0)
1390 return r;
1391
1392 /**
1393 * If we wanted we could do something more intelligent, like local
1394 * checks that we think we will succeed. But for now, let's not
1395 * duplicate that code.
1396 */
1397 {
9f95a23c 1398 std::shared_lock locker{ictx->image_lock};
7c673cae
FG
1399 r = rados::cls::lock::lock(&ictx->md_ctx, ictx->header_oid, RBD_LOCK_NAME,
1400 exclusive ? LOCK_EXCLUSIVE : LOCK_SHARED,
1401 cookie, tag, "", utime_t(), 0);
1402 if (r < 0) {
1403 return r;
1404 }
1405 }
1406
1407 ictx->notify_update();
1408 return 0;
1409 }
1410
1411 int unlock(ImageCtx *ictx, const string& cookie)
1412 {
1413 ldout(ictx->cct, 20) << "unlock image " << ictx
1414 << " cookie='" << cookie << "'" << dendl;
1415
1416 int r = ictx->state->refresh_if_required();
1417 if (r < 0)
1418 return r;
1419
1420 {
9f95a23c 1421 std::shared_lock locker{ictx->image_lock};
7c673cae
FG
1422 r = rados::cls::lock::unlock(&ictx->md_ctx, ictx->header_oid,
1423 RBD_LOCK_NAME, cookie);
1424 if (r < 0) {
1425 return r;
1426 }
1427 }
1428
1429 ictx->notify_update();
1430 return 0;
1431 }
1432
1433 int break_lock(ImageCtx *ictx, const string& client,
1434 const string& cookie)
1435 {
1436 ldout(ictx->cct, 20) << "break_lock image " << ictx << " client='" << client
1437 << "' cookie='" << cookie << "'" << dendl;
1438
1439 int r = ictx->state->refresh_if_required();
1440 if (r < 0)
1441 return r;
1442
1443 entity_name_t lock_client;
1444 if (!lock_client.parse(client)) {
1445 lderr(ictx->cct) << "Unable to parse client '" << client
1446 << "'" << dendl;
1447 return -EINVAL;
1448 }
1449
11fdf7f2 1450 if (ictx->config.get_val<bool>("rbd_blacklist_on_break_lock")) {
7c673cae
FG
1451 typedef std::map<rados::cls::lock::locker_id_t,
1452 rados::cls::lock::locker_info_t> Lockers;
1453 Lockers lockers;
1454 ClsLockType lock_type;
1455 std::string lock_tag;
1456 r = rados::cls::lock::get_lock_info(&ictx->md_ctx, ictx->header_oid,
1457 RBD_LOCK_NAME, &lockers, &lock_type,
1458 &lock_tag);
1459 if (r < 0) {
1460 lderr(ictx->cct) << "unable to retrieve lock info: " << cpp_strerror(r)
1461 << dendl;
1462 return r;
1463 }
1464
1465 std::string client_address;
1466 for (Lockers::iterator it = lockers.begin();
1467 it != lockers.end(); ++it) {
1468 if (it->first.locker == lock_client) {
11fdf7f2 1469 client_address = it->second.addr.get_legacy_str();
7c673cae
FG
1470 break;
1471 }
1472 }
1473 if (client_address.empty()) {
1474 return -ENOENT;
1475 }
1476
7c673cae 1477 librados::Rados rados(ictx->md_ctx);
11fdf7f2
TL
1478 r = rados.blacklist_add(
1479 client_address,
1480 ictx->config.get_val<uint64_t>("rbd_blacklist_expire_seconds"));
7c673cae
FG
1481 if (r < 0) {
1482 lderr(ictx->cct) << "unable to blacklist client: " << cpp_strerror(r)
1483 << dendl;
1484 return r;
1485 }
1486 }
1487
1488 r = rados::cls::lock::break_lock(&ictx->md_ctx, ictx->header_oid,
1489 RBD_LOCK_NAME, cookie, lock_client);
1490 if (r < 0)
1491 return r;
1492 ictx->notify_update();
1493 return 0;
1494 }
1495
1496 void rbd_ctx_cb(completion_t cb, void *arg)
1497 {
1498 Context *ctx = reinterpret_cast<Context *>(arg);
1499 auto comp = reinterpret_cast<io::AioCompletion *>(cb);
1500 ctx->complete(comp->get_return_value());
1501 comp->release();
1502 }
1503
1504 int64_t read_iterate(ImageCtx *ictx, uint64_t off, uint64_t len,
1505 int (*cb)(uint64_t, size_t, const char *, void *),
1506 void *arg)
1507 {
11fdf7f2
TL
1508 coarse_mono_time start_time;
1509 ceph::timespan elapsed;
7c673cae
FG
1510
1511 ldout(ictx->cct, 20) << "read_iterate " << ictx << " off = " << off
1512 << " len = " << len << dendl;
1513
1514 int r = ictx->state->refresh_if_required();
1515 if (r < 0)
1516 return r;
1517
1518 uint64_t mylen = len;
9f95a23c 1519 ictx->image_lock.lock_shared();
7c673cae 1520 r = clip_io(ictx, off, &mylen);
9f95a23c 1521 ictx->image_lock.unlock_shared();
7c673cae
FG
1522 if (r < 0)
1523 return r;
1524
1525 int64_t total_read = 0;
1526 uint64_t period = ictx->get_stripe_period();
1527 uint64_t left = mylen;
1528
31f18b77 1529 ZTracer::Trace trace;
181888fb 1530 if (ictx->blkin_trace_all) {
31f18b77
FG
1531 trace.init("read_iterate", &ictx->trace_endpoint);
1532 }
1533
9f95a23c 1534 std::shared_lock owner_locker{ictx->owner_lock};
11fdf7f2 1535 start_time = coarse_mono_clock::now();
7c673cae
FG
1536 while (left > 0) {
1537 uint64_t period_off = off - (off % period);
1538 uint64_t read_len = min(period_off + period - off, left);
1539
1540 bufferlist bl;
1541
1542 C_SaferCond ctx;
1543 auto c = io::AioCompletion::create_and_start(&ctx, ictx,
1544 io::AIO_TYPE_READ);
1545 io::ImageRequest<>::aio_read(ictx, c, {{off, read_len}},
31f18b77 1546 io::ReadResult{&bl}, 0, std::move(trace));
7c673cae
FG
1547
1548 int ret = ctx.wait();
1549 if (ret < 0) {
1550 return ret;
1551 }
1552
1553 r = cb(total_read, ret, bl.c_str(), arg);
1554 if (r < 0) {
1555 return r;
1556 }
1557
1558 total_read += ret;
1559 left -= ret;
1560 off += ret;
1561 }
1562
11fdf7f2 1563 elapsed = coarse_mono_clock::now() - start_time;
7c673cae
FG
1564 ictx->perfcounter->tinc(l_librbd_rd_latency, elapsed);
1565 ictx->perfcounter->inc(l_librbd_rd);
1566 ictx->perfcounter->inc(l_librbd_rd_bytes, mylen);
1567 return total_read;
1568 }
1569
1570 // validate extent against image size; clip to image size if necessary
1571 int clip_io(ImageCtx *ictx, uint64_t off, uint64_t *len)
1572 {
9f95a23c 1573 ceph_assert(ceph_mutex_is_locked(ictx->image_lock));
7c673cae
FG
1574 uint64_t image_size = ictx->get_image_size(ictx->snap_id);
1575 bool snap_exists = ictx->snap_exists;
1576
1577 if (!snap_exists)
1578 return -ENOENT;
1579
1580 // special-case "len == 0" requests: always valid
1581 if (*len == 0)
1582 return 0;
1583
1584 // can't start past end
1585 if (off >= image_size)
1586 return -EINVAL;
1587
1588 // clip requests that extend past end to just end
1589 if ((off + *len) > image_size)
1590 *len = (size_t)(image_size - off);
1591
1592 return 0;
1593 }
1594
11fdf7f2 1595 int invalidate_cache(ImageCtx *ictx)
7c673cae
FG
1596 {
1597 CephContext *cct = ictx->cct;
11fdf7f2 1598 ldout(cct, 20) << "invalidate_cache " << ictx << dendl;
7c673cae
FG
1599
1600 int r = ictx->state->refresh_if_required();
1601 if (r < 0) {
1602 return r;
1603 }
1604
7c673cae
FG
1605 C_SaferCond ctx;
1606 {
9f95a23c 1607 std::shared_lock owner_locker{ictx->owner_lock};
11fdf7f2 1608 ictx->io_object_dispatcher->invalidate_cache(&ctx);
7c673cae
FG
1609 }
1610 r = ctx.wait();
7c673cae
FG
1611 ictx->perfcounter->inc(l_librbd_invalidate_cache);
1612 return r;
1613 }
1614
1615 int poll_io_events(ImageCtx *ictx, io::AioCompletion **comps, int numcomp)
1616 {
1617 if (numcomp <= 0)
1618 return -EINVAL;
1619 CephContext *cct = ictx->cct;
1620 ldout(cct, 20) << __func__ << " " << ictx << " numcomp = " << numcomp
1621 << dendl;
1622 int i = 0;
9f95a23c
TL
1623 while (i < numcomp && ictx->event_socket_completions.pop(comps[i])) {
1624 ++i;
7c673cae 1625 }
9f95a23c 1626
7c673cae
FG
1627 return i;
1628 }
1629
1630 int metadata_get(ImageCtx *ictx, const string &key, string *value)
1631 {
1632 CephContext *cct = ictx->cct;
1633 ldout(cct, 20) << "metadata_get " << ictx << " key=" << key << dendl;
1634
1635 int r = ictx->state->refresh_if_required();
1636 if (r < 0) {
1637 return r;
1638 }
1639
1640 return cls_client::metadata_get(&ictx->md_ctx, ictx->header_oid, key, value);
1641 }
1642
1643 int metadata_list(ImageCtx *ictx, const string &start, uint64_t max, map<string, bufferlist> *pairs)
1644 {
1645 CephContext *cct = ictx->cct;
1646 ldout(cct, 20) << "metadata_list " << ictx << dendl;
1647
1648 int r = ictx->state->refresh_if_required();
1649 if (r < 0) {
1650 return r;
1651 }
1652
9f95a23c
TL
1653 C_SaferCond ctx;
1654 auto req = image::GetMetadataRequest<>::create(
1655 ictx->md_ctx, ictx->header_oid, false, "", start, max, pairs, &ctx);
1656 req->send();
7c673cae 1657
9f95a23c 1658 return ctx.wait();
7c673cae
FG
1659 }
1660
11fdf7f2
TL
1661 int list_watchers(ImageCtx *ictx,
1662 std::list<librbd::image_watcher_t> &watchers)
1663 {
1664 int r;
1665 std::string header_oid;
1666 std::list<obj_watch_t> obj_watchers;
1667
1668 if (ictx->old_format) {
1669 header_oid = util::old_header_name(ictx->name);
1670 } else {
1671 header_oid = util::header_name(ictx->id);
1672 }
1673
1674 r = ictx->md_ctx.list_watchers(header_oid, &obj_watchers);
1675 if (r < 0) {
1676 return r;
1677 }
1678
1679 for (auto i = obj_watchers.begin(); i != obj_watchers.end(); ++i) {
1680 librbd::image_watcher_t watcher;
1681 watcher.addr = i->addr;
1682 watcher.id = i->watcher_id;
1683 watcher.cookie = i->cookie;
1684
1685 watchers.push_back(watcher);
1686 }
1687
1688 return 0;
1689 }
1690
1691}
1692
1693std::ostream &operator<<(std::ostream &os, const librbd::ImageOptions &opts) {
1694 os << "[";
1695
1696 const char *delimiter = "";
1697 for (auto &i : librbd::IMAGE_OPTIONS_TYPE_MAPPING) {
1698 if (i.second == librbd::STR) {
1699 std::string val;
1700 if (opts.get(i.first, &val) == 0) {
1701 os << delimiter << librbd::image_option_name(i.first) << "=" << val;
1702 delimiter = ", ";
1703 }
1704 } else if (i.second == librbd::UINT64) {
1705 uint64_t val;
1706 if (opts.get(i.first, &val) == 0) {
1707 os << delimiter << librbd::image_option_name(i.first) << "=" << val;
1708 delimiter = ", ";
1709 }
1710 }
1711 }
7c673cae 1712
11fdf7f2 1713 os << "]";
7c673cae 1714
11fdf7f2 1715 return os;
7c673cae 1716}