]> git.proxmox.com Git - ceph.git/blame - ceph/src/librbd/internal.cc
import 15.2.0 Octopus source
[ceph.git] / ceph / src / librbd / internal.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3#include "include/int_types.h"
4
5#include <errno.h>
6#include <limits.h>
7
8#include "include/types.h"
9#include "include/uuid.h"
10#include "common/ceph_context.h"
11#include "common/dout.h"
12#include "common/errno.h"
13#include "common/Throttle.h"
14#include "common/event_socket.h"
11fdf7f2
TL
15#include "common/perf_counters.h"
16#include "osdc/Striper.h"
7c673cae
FG
17#include "include/stringify.h"
18
11fdf7f2 19#include "cls/lock/cls_lock_client.h"
7c673cae
FG
20#include "cls/rbd/cls_rbd.h"
21#include "cls/rbd/cls_rbd_types.h"
22#include "cls/rbd/cls_rbd_client.h"
23#include "cls/journal/cls_journal_types.h"
24#include "cls/journal/cls_journal_client.h"
25
26#include "librbd/ExclusiveLock.h"
27#include "librbd/ImageCtx.h"
28#include "librbd/ImageState.h"
29#include "librbd/internal.h"
30#include "librbd/Journal.h"
31#include "librbd/ObjectMap.h"
32#include "librbd/Operations.h"
33#include "librbd/Types.h"
34#include "librbd/Utils.h"
11fdf7f2 35#include "librbd/api/Config.h"
7c673cae
FG
36#include "librbd/api/Image.h"
37#include "librbd/exclusive_lock/AutomaticPolicy.h"
38#include "librbd/exclusive_lock/StandardPolicy.h"
9f95a23c 39#include "librbd/deep_copy/MetadataCopyRequest.h"
7c673cae
FG
40#include "librbd/image/CloneRequest.h"
41#include "librbd/image/CreateRequest.h"
9f95a23c 42#include "librbd/image/GetMetadataRequest.h"
7c673cae
FG
43#include "librbd/io/AioCompletion.h"
44#include "librbd/io/ImageRequest.h"
45#include "librbd/io/ImageRequestWQ.h"
11fdf7f2 46#include "librbd/io/ObjectDispatcher.h"
7c673cae
FG
47#include "librbd/io/ObjectRequest.h"
48#include "librbd/io/ReadResult.h"
49#include "librbd/journal/Types.h"
50#include "librbd/managed_lock/Types.h"
51#include "librbd/mirror/EnableRequest.h"
52#include "librbd/operation/TrimRequest.h"
53
54#include "journal/Journaler.h"
55
56#include <boost/scope_exit.hpp>
57#include <boost/variant.hpp>
11fdf7f2 58#include "include/ceph_assert.h"
7c673cae
FG
59
60#define dout_subsys ceph_subsys_rbd
61#undef dout_prefix
62#define dout_prefix *_dout << "librbd: "
63
64#define rbd_howmany(x, y) (((x) + (y) - 1) / (y))
65
66using std::map;
67using std::pair;
68using std::set;
69using std::string;
70using std::vector;
71// list binds to list() here, so std::list is explicitly used below
72
73using ceph::bufferlist;
74using librados::snap_t;
75using librados::IoCtx;
76using librados::Rados;
77
78namespace librbd {
79
80namespace {
81
82int validate_pool(IoCtx &io_ctx, CephContext *cct) {
11fdf7f2 83 if (!cct->_conf.get_val<bool>("rbd_validate_pool")) {
7c673cae
FG
84 return 0;
85 }
86
87 int r = io_ctx.stat(RBD_DIRECTORY, NULL, NULL);
88 if (r == 0) {
89 return 0;
90 } else if (r < 0 && r != -ENOENT) {
91 lderr(cct) << "failed to stat RBD directory: " << cpp_strerror(r) << dendl;
92 return r;
93 }
94
95 // allocate a self-managed snapshot id if this a new pool to force
96 // self-managed snapshot mode
97 uint64_t snap_id;
98 r = io_ctx.selfmanaged_snap_create(&snap_id);
99 if (r == -EINVAL) {
100 lderr(cct) << "pool not configured for self-managed RBD snapshot support"
101 << dendl;
102 return r;
103 } else if (r < 0) {
104 lderr(cct) << "failed to allocate self-managed snapshot: "
105 << cpp_strerror(r) << dendl;
106 return r;
107 }
108
109 r = io_ctx.selfmanaged_snap_remove(snap_id);
110 if (r < 0) {
111 lderr(cct) << "failed to release self-managed snapshot " << snap_id
112 << ": " << cpp_strerror(r) << dendl;
113 }
114 return 0;
115}
116
7c673cae
FG
117} // anonymous namespace
118
119 int detect_format(IoCtx &io_ctx, const string &name,
120 bool *old_format, uint64_t *size)
121 {
122 CephContext *cct = (CephContext *)io_ctx.cct();
123 if (old_format)
124 *old_format = true;
125 int r = io_ctx.stat(util::old_header_name(name), size, NULL);
126 if (r == -ENOENT) {
127 if (old_format)
128 *old_format = false;
129 r = io_ctx.stat(util::id_obj_name(name), size, NULL);
130 if (r < 0)
131 return r;
132 } else if (r < 0) {
133 return r;
134 }
135
136 ldout(cct, 20) << "detect format of " << name << " : "
137 << (old_format ? (*old_format ? "old" : "new") :
138 "don't care") << dendl;
139 return 0;
140 }
141
/**
 * Tell whether an offset is covered by a parent image.
 *
 * @param parent_pool_id parent pool id, -1 meaning "no parent"
 * @param off            offset to test
 * @param overlap        parent overlap
 * @return true when a parent is set and off does not exceed overlap
 */
bool has_parent(int64_t parent_pool_id, uint64_t off, uint64_t overlap)
{
  if (parent_pool_id == -1) {
    return false;
  }
  return off <= overlap;
}
146
147 void init_rbd_header(struct rbd_obj_header_ondisk& ondisk,
148 uint64_t size, int order, uint64_t bid)
149 {
150 uint32_t hi = bid >> 32;
151 uint32_t lo = bid & 0xFFFFFFFF;
152 uint32_t extra = rand() % 0xFFFFFFFF;
92f5a8d4 153 // FIPS zeroization audit 20191117: this memset is not security related.
7c673cae
FG
154 memset(&ondisk, 0, sizeof(ondisk));
155
156 memcpy(&ondisk.text, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT));
157 memcpy(&ondisk.signature, RBD_HEADER_SIGNATURE,
158 sizeof(RBD_HEADER_SIGNATURE));
159 memcpy(&ondisk.version, RBD_HEADER_VERSION, sizeof(RBD_HEADER_VERSION));
160
161 snprintf(ondisk.block_name, sizeof(ondisk.block_name), "rb.%x.%x.%x",
162 hi, lo, extra);
163
164 ondisk.image_size = size;
165 ondisk.options.order = order;
166 ondisk.options.crypt_type = RBD_CRYPT_NONE;
167 ondisk.options.comp_type = RBD_COMP_NONE;
168 ondisk.snap_seq = 0;
169 ondisk.snap_count = 0;
170 ondisk.reserved = 0;
171 ondisk.snap_names_len = 0;
172 }
173
174 void image_info(ImageCtx *ictx, image_info_t& info, size_t infosize)
175 {
176 int obj_order = ictx->order;
9f95a23c
TL
177 {
178 std::shared_lock locker{ictx->image_lock};
179 info.size = ictx->get_image_size(ictx->snap_id);
180 }
7c673cae
FG
181 info.obj_size = 1ULL << obj_order;
182 info.num_objs = Striper::get_num_objects(ictx->layout, info.size);
183 info.order = obj_order;
184 strncpy(info.block_name_prefix, ictx->object_prefix.c_str(),
185 RBD_MAX_BLOCK_NAME_SIZE);
186 info.block_name_prefix[RBD_MAX_BLOCK_NAME_SIZE - 1] = '\0';
187
188 // clear deprecated fields
189 info.parent_pool = -1L;
190 info.parent_name[0] = '\0';
191 }
192
/**
 * Extract the object number encoded in a data object name.
 *
 * The name is expected to be the object prefix, one separator character and
 * the object number in hexadecimal.
 *
 * @param oid           full object name
 * @param object_prefix prefix to skip (plus one separator character)
 * @return the parsed object number, or 0 when no number follows the prefix
 */
uint64_t oid_to_object_no(const std::string& oid,
                          const std::string& object_prefix)
{
  std::istringstream iss(oid);
  // skip object prefix and separator
  iss.ignore(object_prefix.length() + 1);

  // Start from 0: if ignore() already consumed the whole name, the
  // extraction below fails before it ever writes to num.
  uint64_t num = 0;
  iss >> std::hex >> num;
  return num;
}
202
203 void trim_image(ImageCtx *ictx, uint64_t newsize, ProgressContext& prog_ctx)
204 {
9f95a23c 205 ceph_assert(ceph_mutex_is_locked(ictx->owner_lock));
11fdf7f2
TL
206 ceph_assert(ictx->exclusive_lock == nullptr ||
207 ictx->exclusive_lock->is_lock_owner());
7c673cae
FG
208
209 C_SaferCond ctx;
9f95a23c 210 ictx->image_lock.lock_shared();
7c673cae
FG
211 operation::TrimRequest<> *req = operation::TrimRequest<>::create(
212 *ictx, &ctx, ictx->size, newsize, prog_ctx);
9f95a23c 213 ictx->image_lock.unlock_shared();
7c673cae
FG
214 req->send();
215
216 int r = ctx.wait();
217 if (r < 0) {
218 lderr(ictx->cct) << "warning: failed to remove some object(s): "
219 << cpp_strerror(r) << dendl;
220 }
221 }
222
223 int read_header_bl(IoCtx& io_ctx, const string& header_oid,
224 bufferlist& header, uint64_t *ver)
225 {
226 int r;
227 uint64_t off = 0;
228#define READ_SIZE 4096
229 do {
230 bufferlist bl;
231 r = io_ctx.read(header_oid, bl, READ_SIZE, off);
232 if (r < 0)
233 return r;
234 header.claim_append(bl);
235 off += r;
236 } while (r == READ_SIZE);
237
11fdf7f2
TL
238 static_assert(sizeof(RBD_HEADER_TEXT) == sizeof(RBD_MIGRATE_HEADER_TEXT),
239 "length of rbd headers must be the same");
240
7c673cae 241 if (header.length() < sizeof(RBD_HEADER_TEXT) ||
11fdf7f2
TL
242 (memcmp(RBD_HEADER_TEXT, header.c_str(),
243 sizeof(RBD_HEADER_TEXT)) != 0 &&
244 memcmp(RBD_MIGRATE_HEADER_TEXT, header.c_str(),
245 sizeof(RBD_MIGRATE_HEADER_TEXT)) != 0)) {
7c673cae
FG
246 CephContext *cct = (CephContext *)io_ctx.cct();
247 lderr(cct) << "unrecognized header format" << dendl;
248 return -ENXIO;
249 }
250
251 if (ver)
252 *ver = io_ctx.get_last_version();
253
254 return 0;
255 }
256
257 int read_header(IoCtx& io_ctx, const string& header_oid,
258 struct rbd_obj_header_ondisk *header, uint64_t *ver)
259 {
260 bufferlist header_bl;
261 int r = read_header_bl(io_ctx, header_oid, header_bl, ver);
262 if (r < 0)
263 return r;
264 if (header_bl.length() < (int)sizeof(*header))
265 return -EIO;
266 memcpy(header, header_bl.c_str(), sizeof(*header));
267
268 return 0;
269 }
270
271 int tmap_set(IoCtx& io_ctx, const string& imgname)
272 {
273 bufferlist cmdbl, emptybl;
274 __u8 c = CEPH_OSD_TMAP_SET;
11fdf7f2
TL
275 encode(c, cmdbl);
276 encode(imgname, cmdbl);
277 encode(emptybl, cmdbl);
7c673cae
FG
278 return io_ctx.tmap_update(RBD_DIRECTORY, cmdbl);
279 }
280
281 int tmap_rm(IoCtx& io_ctx, const string& imgname)
282 {
283 bufferlist cmdbl;
284 __u8 c = CEPH_OSD_TMAP_RM;
11fdf7f2
TL
285 encode(c, cmdbl);
286 encode(imgname, cmdbl);
7c673cae
FG
287 return io_ctx.tmap_update(RBD_DIRECTORY, cmdbl);
288 }
289
290 typedef boost::variant<std::string,uint64_t> image_option_value_t;
291 typedef std::map<int,image_option_value_t> image_options_t;
292 typedef std::shared_ptr<image_options_t> image_options_ref;
293
294 enum image_option_type_t {
295 STR,
296 UINT64,
297 };
298
299 const std::map<int, image_option_type_t> IMAGE_OPTIONS_TYPE_MAPPING = {
300 {RBD_IMAGE_OPTION_FORMAT, UINT64},
301 {RBD_IMAGE_OPTION_FEATURES, UINT64},
302 {RBD_IMAGE_OPTION_ORDER, UINT64},
303 {RBD_IMAGE_OPTION_STRIPE_UNIT, UINT64},
304 {RBD_IMAGE_OPTION_STRIPE_COUNT, UINT64},
305 {RBD_IMAGE_OPTION_JOURNAL_ORDER, UINT64},
306 {RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH, UINT64},
307 {RBD_IMAGE_OPTION_JOURNAL_POOL, STR},
308 {RBD_IMAGE_OPTION_FEATURES_SET, UINT64},
309 {RBD_IMAGE_OPTION_FEATURES_CLEAR, UINT64},
310 {RBD_IMAGE_OPTION_DATA_POOL, STR},
11fdf7f2 311 {RBD_IMAGE_OPTION_FLATTEN, UINT64},
92f5a8d4 312 {RBD_IMAGE_OPTION_CLONE_FORMAT, UINT64},
7c673cae
FG
313 };
314
315 std::string image_option_name(int optname) {
316 switch (optname) {
317 case RBD_IMAGE_OPTION_FORMAT:
318 return "format";
319 case RBD_IMAGE_OPTION_FEATURES:
320 return "features";
321 case RBD_IMAGE_OPTION_ORDER:
322 return "order";
323 case RBD_IMAGE_OPTION_STRIPE_UNIT:
324 return "stripe_unit";
325 case RBD_IMAGE_OPTION_STRIPE_COUNT:
326 return "stripe_count";
327 case RBD_IMAGE_OPTION_JOURNAL_ORDER:
328 return "journal_order";
329 case RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH:
330 return "journal_splay_width";
331 case RBD_IMAGE_OPTION_JOURNAL_POOL:
332 return "journal_pool";
333 case RBD_IMAGE_OPTION_FEATURES_SET:
334 return "features_set";
335 case RBD_IMAGE_OPTION_FEATURES_CLEAR:
336 return "features_clear";
337 case RBD_IMAGE_OPTION_DATA_POOL:
338 return "data_pool";
11fdf7f2
TL
339 case RBD_IMAGE_OPTION_FLATTEN:
340 return "flatten";
92f5a8d4
TL
341 case RBD_IMAGE_OPTION_CLONE_FORMAT:
342 return "clone_format";
7c673cae
FG
343 default:
344 return "unknown (" + stringify(optname) + ")";
345 }
346 }
347
7c673cae
FG
348 void image_options_create(rbd_image_options_t* opts)
349 {
350 image_options_ref* opts_ = new image_options_ref(new image_options_t());
351
352 *opts = static_cast<rbd_image_options_t>(opts_);
353 }
354
355 void image_options_create_ref(rbd_image_options_t* opts,
356 rbd_image_options_t orig)
357 {
358 image_options_ref* orig_ = static_cast<image_options_ref*>(orig);
359 image_options_ref* opts_ = new image_options_ref(*orig_);
360
361 *opts = static_cast<rbd_image_options_t>(opts_);
362 }
363
364 void image_options_copy(rbd_image_options_t* opts,
365 const ImageOptions &orig)
366 {
367 image_options_ref* opts_ = new image_options_ref(new image_options_t());
368
369 *opts = static_cast<rbd_image_options_t>(opts_);
370
371 std::string str_val;
372 uint64_t uint64_val;
373 for (auto &i : IMAGE_OPTIONS_TYPE_MAPPING) {
374 switch (i.second) {
375 case STR:
376 if (orig.get(i.first, &str_val) == 0) {
377 image_options_set(*opts, i.first, str_val);
378 }
379 continue;
380 case UINT64:
381 if (orig.get(i.first, &uint64_val) == 0) {
382 image_options_set(*opts, i.first, uint64_val);
383 }
384 continue;
385 }
386 }
387 }
388
389 void image_options_destroy(rbd_image_options_t opts)
390 {
391 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
392
393 delete opts_;
394 }
395
396 int image_options_set(rbd_image_options_t opts, int optname,
397 const std::string& optval)
398 {
399 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
400
401 std::map<int, image_option_type_t>::const_iterator i =
402 IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
403
404 if (i == IMAGE_OPTIONS_TYPE_MAPPING.end() || i->second != STR) {
405 return -EINVAL;
406 }
407
408 (*opts_->get())[optname] = optval;
409 return 0;
410 }
411
412 int image_options_set(rbd_image_options_t opts, int optname, uint64_t optval)
413 {
414 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
415
416 std::map<int, image_option_type_t>::const_iterator i =
417 IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
418
419 if (i == IMAGE_OPTIONS_TYPE_MAPPING.end() || i->second != UINT64) {
420 return -EINVAL;
421 }
422
423 (*opts_->get())[optname] = optval;
424 return 0;
425 }
426
427 int image_options_get(rbd_image_options_t opts, int optname,
428 std::string* optval)
429 {
430 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
431
432 std::map<int, image_option_type_t>::const_iterator i =
433 IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
434
435 if (i == IMAGE_OPTIONS_TYPE_MAPPING.end() || i->second != STR) {
436 return -EINVAL;
437 }
438
439 image_options_t::const_iterator j = (*opts_)->find(optname);
440
441 if (j == (*opts_)->end()) {
442 return -ENOENT;
443 }
444
445 *optval = boost::get<std::string>(j->second);
446 return 0;
447 }
448
449 int image_options_get(rbd_image_options_t opts, int optname, uint64_t* optval)
450 {
451 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
452
453 std::map<int, image_option_type_t>::const_iterator i =
454 IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
455
456 if (i == IMAGE_OPTIONS_TYPE_MAPPING.end() || i->second != UINT64) {
457 return -EINVAL;
458 }
459
460 image_options_t::const_iterator j = (*opts_)->find(optname);
461
462 if (j == (*opts_)->end()) {
463 return -ENOENT;
464 }
465
466 *optval = boost::get<uint64_t>(j->second);
467 return 0;
468 }
469
470 int image_options_is_set(rbd_image_options_t opts, int optname,
471 bool* is_set)
472 {
473 if (IMAGE_OPTIONS_TYPE_MAPPING.find(optname) ==
474 IMAGE_OPTIONS_TYPE_MAPPING.end()) {
475 return -EINVAL;
476 }
477
478 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
479 *is_set = ((*opts_)->find(optname) != (*opts_)->end());
480 return 0;
481 }
482
483 int image_options_unset(rbd_image_options_t opts, int optname)
484 {
485 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
486
487 std::map<int, image_option_type_t>::const_iterator i =
488 IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
489
490 if (i == IMAGE_OPTIONS_TYPE_MAPPING.end()) {
11fdf7f2 491 ceph_assert((*opts_)->find(optname) == (*opts_)->end());
7c673cae
FG
492 return -EINVAL;
493 }
494
495 image_options_t::const_iterator j = (*opts_)->find(optname);
496
497 if (j == (*opts_)->end()) {
498 return -ENOENT;
499 }
500
501 (*opts_)->erase(j);
502 return 0;
503 }
504
505 void image_options_clear(rbd_image_options_t opts)
506 {
507 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
508
509 (*opts_)->clear();
510 }
511
512 bool image_options_is_empty(rbd_image_options_t opts)
513 {
514 image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
515
516 return (*opts_)->empty();
517 }
518
7c673cae
FG
519 int create_v1(IoCtx& io_ctx, const char *imgname, uint64_t size, int order)
520 {
521 CephContext *cct = (CephContext *)io_ctx.cct();
522
523 ldout(cct, 20) << __func__ << " " << &io_ctx << " name = " << imgname
524 << " size = " << size << " order = " << order << dendl;
525 int r = validate_pool(io_ctx, cct);
526 if (r < 0) {
527 return r;
528 }
529
11fdf7f2
TL
530 if (!io_ctx.get_namespace().empty()) {
531 lderr(cct) << "attempting to add v1 image to namespace" << dendl;
532 return -EINVAL;
533 }
534
7c673cae
FG
535 ldout(cct, 2) << "adding rbd image to directory..." << dendl;
536 r = tmap_set(io_ctx, imgname);
537 if (r < 0) {
538 lderr(cct) << "error adding image to directory: " << cpp_strerror(r)
539 << dendl;
540 return r;
541 }
542
543 Rados rados(io_ctx);
544 uint64_t bid = rados.get_instance_id();
545
546 ldout(cct, 2) << "creating rbd image..." << dendl;
547 struct rbd_obj_header_ondisk header;
548 init_rbd_header(header, size, order, bid);
549
550 bufferlist bl;
551 bl.append((const char *)&header, sizeof(header));
552
553 string header_oid = util::old_header_name(imgname);
554 r = io_ctx.write(header_oid, bl, bl.length(), 0);
555 if (r < 0) {
556 lderr(cct) << "Error writing image header: " << cpp_strerror(r)
557 << dendl;
558 int remove_r = tmap_rm(io_ctx, imgname);
559 if (remove_r < 0) {
560 lderr(cct) << "Could not remove image from directory after "
561 << "header creation failed: "
562 << cpp_strerror(remove_r) << dendl;
563 }
564 return r;
565 }
566
567 ldout(cct, 2) << "done." << dendl;
568 return 0;
569 }
570
571 int create(librados::IoCtx& io_ctx, const char *imgname, uint64_t size,
572 int *order)
573 {
574 uint64_t order_ = *order;
575 ImageOptions opts;
576
577 int r = opts.set(RBD_IMAGE_OPTION_ORDER, order_);
11fdf7f2 578 ceph_assert(r == 0);
7c673cae
FG
579
580 r = create(io_ctx, imgname, "", size, opts, "", "", false);
581
582 int r1 = opts.get(RBD_IMAGE_OPTION_ORDER, &order_);
11fdf7f2 583 ceph_assert(r1 == 0);
7c673cae
FG
584 *order = order_;
585
586 return r;
587 }
588
589 int create(IoCtx& io_ctx, const char *imgname, uint64_t size,
590 bool old_format, uint64_t features, int *order,
591 uint64_t stripe_unit, uint64_t stripe_count)
592 {
593 if (!order)
594 return -EINVAL;
595
596 uint64_t order_ = *order;
597 uint64_t format = old_format ? 1 : 2;
598 ImageOptions opts;
599 int r;
600
601 r = opts.set(RBD_IMAGE_OPTION_FORMAT, format);
11fdf7f2 602 ceph_assert(r == 0);
7c673cae 603 r = opts.set(RBD_IMAGE_OPTION_FEATURES, features);
11fdf7f2 604 ceph_assert(r == 0);
7c673cae 605 r = opts.set(RBD_IMAGE_OPTION_ORDER, order_);
11fdf7f2 606 ceph_assert(r == 0);
7c673cae 607 r = opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit);
11fdf7f2 608 ceph_assert(r == 0);
7c673cae 609 r = opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count);
11fdf7f2 610 ceph_assert(r == 0);
7c673cae
FG
611
612 r = create(io_ctx, imgname, "", size, opts, "", "", false);
613
614 int r1 = opts.get(RBD_IMAGE_OPTION_ORDER, &order_);
11fdf7f2 615 ceph_assert(r1 == 0);
7c673cae
FG
616 *order = order_;
617
618 return r;
619 }
620
621 int create(IoCtx& io_ctx, const std::string &image_name,
622 const std::string &image_id, uint64_t size,
623 ImageOptions& opts,
624 const std::string &non_primary_global_image_id,
625 const std::string &primary_mirror_uuid,
626 bool skip_mirror_enable)
627 {
628 std::string id(image_id);
629 if (id.empty()) {
630 id = util::generate_image_id(io_ctx);
631 }
632
633 CephContext *cct = (CephContext *)io_ctx.cct();
92f5a8d4
TL
634 uint64_t option;
635 if (opts.get(RBD_IMAGE_OPTION_FLATTEN, &option) == 0) {
11fdf7f2
TL
636 lderr(cct) << "create does not support 'flatten' image option" << dendl;
637 return -EINVAL;
638 }
92f5a8d4
TL
639 if (opts.get(RBD_IMAGE_OPTION_CLONE_FORMAT, &option) == 0) {
640 lderr(cct) << "create does not support 'clone_format' image option"
641 << dendl;
642 return -EINVAL;
643 }
11fdf7f2 644
7c673cae
FG
645 ldout(cct, 10) << __func__ << " name=" << image_name << ", "
646 << "id= " << id << ", "
647 << "size=" << size << ", opts=" << opts << dendl;
648
649 uint64_t format;
650 if (opts.get(RBD_IMAGE_OPTION_FORMAT, &format) != 0)
11fdf7f2 651 format = cct->_conf.get_val<uint64_t>("rbd_default_format");
7c673cae
FG
652 bool old_format = format == 1;
653
654 // make sure it doesn't already exist, in either format
655 int r = detect_format(io_ctx, image_name, NULL, NULL);
656 if (r != -ENOENT) {
657 if (r) {
658 lderr(cct) << "Could not tell if " << image_name << " already exists"
659 << dendl;
660 return r;
661 }
662 lderr(cct) << "rbd image " << image_name << " already exists" << dendl;
663 return -EEXIST;
664 }
665
666 uint64_t order = 0;
667 if (opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0 || order == 0) {
11fdf7f2 668 order = cct->_conf.get_val<uint64_t>("rbd_default_order");
7c673cae
FG
669 }
670 r = image::CreateRequest<>::validate_order(cct, order);
671 if (r < 0) {
672 return r;
673 }
674
675 if (old_format) {
11fdf7f2
TL
676 if ( !getenv("RBD_FORCE_ALLOW_V1") ) {
677 lderr(cct) << "Format 1 image creation unsupported. " << dendl;
678 return -EINVAL;
679 }
680 lderr(cct) << "Forced V1 image creation. " << dendl;
7c673cae
FG
681 r = create_v1(io_ctx, image_name.c_str(), size, order);
682 } else {
683 ThreadPool *thread_pool;
684 ContextWQ *op_work_queue;
685 ImageCtx::get_thread_pool_instance(cct, &thread_pool, &op_work_queue);
686
11fdf7f2
TL
687 ConfigProxy config{cct->_conf};
688 api::Config<>::apply_pool_overrides(io_ctx, &config);
689
7c673cae
FG
690 C_SaferCond cond;
691 image::CreateRequest<> *req = image::CreateRequest<>::create(
9f95a23c
TL
692 config, io_ctx, image_name, id, size, opts, skip_mirror_enable,
693 cls::rbd::MIRROR_IMAGE_MODE_JOURNAL, non_primary_global_image_id,
694 primary_mirror_uuid, op_work_queue, &cond);
7c673cae
FG
695 req->send();
696
697 r = cond.wait();
698 }
699
700 int r1 = opts.set(RBD_IMAGE_OPTION_ORDER, order);
11fdf7f2 701 ceph_assert(r1 == 0);
7c673cae
FG
702
703 return r;
704 }
705
706 /*
707 * Parent may be in different pool, hence different IoCtx
708 */
709 int clone(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name,
710 IoCtx& c_ioctx, const char *c_name,
711 uint64_t features, int *c_order,
712 uint64_t stripe_unit, int stripe_count)
713 {
714 uint64_t order = *c_order;
715
716 ImageOptions opts;
717 opts.set(RBD_IMAGE_OPTION_FEATURES, features);
718 opts.set(RBD_IMAGE_OPTION_ORDER, order);
719 opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit);
720 opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count);
721
11fdf7f2
TL
722 int r = clone(p_ioctx, nullptr, p_name, p_snap_name, c_ioctx, nullptr,
723 c_name, opts, "", "");
7c673cae
FG
724 opts.get(RBD_IMAGE_OPTION_ORDER, &order);
725 *c_order = order;
726 return r;
727 }
728
11fdf7f2
TL
729 int clone(IoCtx& p_ioctx, const char *p_id, const char *p_name,
730 const char *p_snap_name, IoCtx& c_ioctx, const char *c_id,
731 const char *c_name, ImageOptions& c_opts,
732 const std::string &non_primary_global_image_id,
733 const std::string &primary_mirror_uuid)
7c673cae 734 {
11fdf7f2
TL
735 ceph_assert((p_id == nullptr) ^ (p_name == nullptr));
736
7c673cae 737 CephContext *cct = (CephContext *)p_ioctx.cct();
11fdf7f2 738 if (p_snap_name == nullptr) {
7c673cae
FG
739 lderr(cct) << "image to be cloned must be a snapshot" << dendl;
740 return -EINVAL;
741 }
742
11fdf7f2
TL
743 uint64_t flatten;
744 if (c_opts.get(RBD_IMAGE_OPTION_FLATTEN, &flatten) == 0) {
745 lderr(cct) << "clone does not support 'flatten' image option" << dendl;
746 return -EINVAL;
7c673cae
FG
747 }
748
11fdf7f2
TL
749 int r;
750 std::string parent_id;
751 if (p_id == nullptr) {
752 r = cls_client::dir_get_id(&p_ioctx, RBD_DIRECTORY, p_name,
753 &parent_id);
754 if (r < 0) {
755 if (r != -ENOENT) {
756 lderr(cct) << "failed to retrieve parent image id: "
757 << cpp_strerror(r) << dendl;
758 }
759 return r;
760 }
761 } else {
762 parent_id = p_id;
7c673cae 763 }
7c673cae 764
11fdf7f2
TL
765 std::string clone_id;
766 if (c_id == nullptr) {
767 clone_id = util::generate_image_id(c_ioctx);
768 } else {
769 clone_id = c_id;
7c673cae
FG
770 }
771
7c673cae
FG
772 ldout(cct, 10) << __func__ << " "
773 << "c_name=" << c_name << ", "
11fdf7f2 774 << "c_id= " << clone_id << ", "
7c673cae
FG
775 << "c_opts=" << c_opts << dendl;
776
11fdf7f2
TL
777 ConfigProxy config{reinterpret_cast<CephContext *>(c_ioctx.cct())->_conf};
778 api::Config<>::apply_pool_overrides(c_ioctx, &config);
779
7c673cae
FG
780 ThreadPool *thread_pool;
781 ContextWQ *op_work_queue;
782 ImageCtx::get_thread_pool_instance(cct, &thread_pool, &op_work_queue);
783
784 C_SaferCond cond;
785 auto *req = image::CloneRequest<>::create(
9f95a23c
TL
786 config, p_ioctx, parent_id, p_snap_name,
787 {cls::rbd::UserSnapshotNamespace{}}, CEPH_NOSNAP, c_ioctx, c_name,
788 clone_id, c_opts, cls::rbd::MIRROR_IMAGE_MODE_JOURNAL,
789 non_primary_global_image_id, primary_mirror_uuid, op_work_queue, &cond);
7c673cae
FG
790 req->send();
791
11fdf7f2
TL
792 r = cond.wait();
793 if (r < 0) {
794 return r;
795 }
796
797 return 0;
7c673cae
FG
798 }
799
800 int rename(IoCtx& io_ctx, const char *srcname, const char *dstname)
801 {
802 CephContext *cct = (CephContext *)io_ctx.cct();
803 ldout(cct, 20) << "rename " << &io_ctx << " " << srcname << " -> "
804 << dstname << dendl;
805
806 ImageCtx *ictx = new ImageCtx(srcname, "", "", io_ctx, false);
11fdf7f2 807 int r = ictx->state->open(0);
7c673cae 808 if (r < 0) {
181888fb 809 lderr(cct) << "error opening source image: " << cpp_strerror(r) << dendl;
7c673cae
FG
810 return r;
811 }
812 BOOST_SCOPE_EXIT((ictx)) {
813 ictx->state->close();
814 } BOOST_SCOPE_EXIT_END
815
816 return ictx->operations->rename(dstname);
817 }
818
819 int info(ImageCtx *ictx, image_info_t& info, size_t infosize)
820 {
821 ldout(ictx->cct, 20) << "info " << ictx << dendl;
822
823 int r = ictx->state->refresh_if_required();
824 if (r < 0)
825 return r;
826
827 image_info(ictx, info, infosize);
828 return 0;
829 }
830
831 int get_old_format(ImageCtx *ictx, uint8_t *old)
832 {
833 int r = ictx->state->refresh_if_required();
834 if (r < 0)
835 return r;
836 *old = ictx->old_format;
837 return 0;
838 }
839
840 int get_size(ImageCtx *ictx, uint64_t *size)
841 {
842 int r = ictx->state->refresh_if_required();
843 if (r < 0)
844 return r;
9f95a23c 845 std::shared_lock l2{ictx->image_lock};
7c673cae
FG
846 *size = ictx->get_image_size(ictx->snap_id);
847 return 0;
848 }
849
850 int get_features(ImageCtx *ictx, uint64_t *features)
851 {
852 int r = ictx->state->refresh_if_required();
853 if (r < 0)
854 return r;
9f95a23c 855 std::shared_lock l{ictx->image_lock};
7c673cae
FG
856 *features = ictx->features;
857 return 0;
858 }
859
860 int get_overlap(ImageCtx *ictx, uint64_t *overlap)
861 {
862 int r = ictx->state->refresh_if_required();
863 if (r < 0)
864 return r;
9f95a23c 865 std::shared_lock image_locker{ictx->image_lock};
7c673cae
FG
866 return ictx->get_parent_overlap(ictx->snap_id, overlap);
867 }
868
7c673cae
FG
869 int get_flags(ImageCtx *ictx, uint64_t *flags)
870 {
871 int r = ictx->state->refresh_if_required();
872 if (r < 0) {
873 return r;
874 }
875
9f95a23c 876 std::shared_lock l2{ictx->image_lock};
7c673cae
FG
877 return ictx->get_flags(ictx->snap_id, flags);
878 }
879
880 int set_image_notification(ImageCtx *ictx, int fd, int type)
881 {
882 CephContext *cct = ictx->cct;
883 ldout(cct, 20) << __func__ << " " << ictx << " fd " << fd << " type" << type << dendl;
884
885 int r = ictx->state->refresh_if_required();
886 if (r < 0) {
887 return r;
888 }
889
890 if (ictx->event_socket.is_valid())
891 return -EINVAL;
892 return ictx->event_socket.init(fd, type);
893 }
894
895 int is_exclusive_lock_owner(ImageCtx *ictx, bool *is_owner)
896 {
91327a77
AA
897 CephContext *cct = ictx->cct;
898 ldout(cct, 20) << __func__ << ": ictx=" << ictx << dendl;
7c673cae
FG
899 *is_owner = false;
900
9f95a23c 901 std::shared_lock owner_locker{ictx->owner_lock};
91327a77 902 if (ictx->exclusive_lock == nullptr) {
7c673cae
FG
903 return 0;
904 }
905
906 // might have been blacklisted by peer -- ensure we still own
907 // the lock by pinging the OSD
908 int r = ictx->exclusive_lock->assert_header_locked();
31f18b77
FG
909 if (r == -EBUSY || r == -ENOENT) {
910 return 0;
911 } else if (r < 0) {
7c673cae
FG
912 return r;
913 }
914
915 *is_owner = true;
916 return 0;
917 }
918
919 int lock_acquire(ImageCtx *ictx, rbd_lock_mode_t lock_mode)
920 {
921 CephContext *cct = ictx->cct;
922 ldout(cct, 20) << __func__ << ": ictx=" << ictx << ", "
923 << "lock_mode=" << lock_mode << dendl;
924
925 if (lock_mode != RBD_LOCK_MODE_EXCLUSIVE) {
926 return -EOPNOTSUPP;
927 }
928
929 C_SaferCond lock_ctx;
930 {
9f95a23c 931 std::unique_lock l{ictx->owner_lock};
7c673cae
FG
932
933 if (ictx->exclusive_lock == nullptr) {
934 lderr(cct) << "exclusive-lock feature is not enabled" << dendl;
935 return -EINVAL;
936 }
937
938 if (ictx->get_exclusive_lock_policy()->may_auto_request_lock()) {
939 ictx->set_exclusive_lock_policy(
940 new exclusive_lock::StandardPolicy(ictx));
941 }
942
943 if (ictx->exclusive_lock->is_lock_owner()) {
944 return 0;
945 }
946
947 ictx->exclusive_lock->acquire_lock(&lock_ctx);
948 }
949
950 int r = lock_ctx.wait();
951 if (r < 0) {
952 lderr(cct) << "failed to request exclusive lock: " << cpp_strerror(r)
953 << dendl;
954 return r;
955 }
956
9f95a23c 957 std::shared_lock l{ictx->owner_lock};
91327a77
AA
958 if (ictx->exclusive_lock == nullptr) {
959 return -EINVAL;
960 } else if (!ictx->exclusive_lock->is_lock_owner()) {
7c673cae 961 lderr(cct) << "failed to acquire exclusive lock" << dendl;
91327a77 962 return ictx->exclusive_lock->get_unlocked_op_error();
7c673cae
FG
963 }
964
965 return 0;
966 }
967
968 int lock_release(ImageCtx *ictx)
969 {
970 CephContext *cct = ictx->cct;
971 ldout(cct, 20) << __func__ << ": ictx=" << ictx << dendl;
972
973 C_SaferCond lock_ctx;
974 {
9f95a23c 975 std::unique_lock l{ictx->owner_lock};
7c673cae
FG
976
977 if (ictx->exclusive_lock == nullptr ||
978 !ictx->exclusive_lock->is_lock_owner()) {
979 lderr(cct) << "not exclusive lock owner" << dendl;
980 return -EINVAL;
981 }
982
983 ictx->exclusive_lock->release_lock(&lock_ctx);
984 }
985
986 int r = lock_ctx.wait();
987 if (r < 0) {
988 lderr(cct) << "failed to release exclusive lock: " << cpp_strerror(r)
989 << dendl;
990 return r;
991 }
992 return 0;
993 }
994
995 int lock_get_owners(ImageCtx *ictx, rbd_lock_mode_t *lock_mode,
996 std::list<std::string> *lock_owners)
997 {
998 CephContext *cct = ictx->cct;
999 ldout(cct, 20) << __func__ << ": ictx=" << ictx << dendl;
1000
7c673cae
FG
1001 managed_lock::Locker locker;
1002 C_SaferCond get_owner_ctx;
9f95a23c
TL
1003 {
1004 std::shared_lock owner_locker{ictx->owner_lock};
1005
1006 if (ictx->exclusive_lock == nullptr) {
1007 lderr(cct) << "exclusive-lock feature is not enabled" << dendl;
1008 return -EINVAL;
1009 }
1010
1011 ictx->exclusive_lock->get_locker(&locker, &get_owner_ctx);
1012 }
1013
7c673cae
FG
1014 int r = get_owner_ctx.wait();
1015 if (r == -ENOENT) {
1016 return r;
1017 } else if (r < 0) {
1018 lderr(cct) << "failed to determine current lock owner: "
1019 << cpp_strerror(r) << dendl;
1020 return r;
1021 }
1022
1023 *lock_mode = RBD_LOCK_MODE_EXCLUSIVE;
1024 lock_owners->clear();
1025 lock_owners->emplace_back(locker.address);
1026 return 0;
1027 }
1028
1029 int lock_break(ImageCtx *ictx, rbd_lock_mode_t lock_mode,
11fdf7f2 1030 const std::string &lock_owner) {
7c673cae
FG
1031 CephContext *cct = ictx->cct;
1032 ldout(cct, 20) << __func__ << ": ictx=" << ictx << ", "
1033 << "lock_mode=" << lock_mode << ", "
1034 << "lock_owner=" << lock_owner << dendl;
1035
1036 if (lock_mode != RBD_LOCK_MODE_EXCLUSIVE) {
1037 return -EOPNOTSUPP;
1038 }
1039
1040 if (ictx->read_only) {
1041 return -EROFS;
1042 }
1043
1044 managed_lock::Locker locker;
1045 C_SaferCond get_owner_ctx;
1046 {
9f95a23c 1047 std::shared_lock l{ictx->owner_lock};
7c673cae
FG
1048
1049 if (ictx->exclusive_lock == nullptr) {
1050 lderr(cct) << "exclusive-lock feature is not enabled" << dendl;
1051 return -EINVAL;
1052 }
1053
1054 ictx->exclusive_lock->get_locker(&locker, &get_owner_ctx);
1055 }
1056 int r = get_owner_ctx.wait();
1057 if (r == -ENOENT) {
1058 return r;
1059 } else if (r < 0) {
1060 lderr(cct) << "failed to determine current lock owner: "
1061 << cpp_strerror(r) << dendl;
1062 return r;
1063 }
1064
1065 if (locker.address != lock_owner) {
1066 return -EBUSY;
1067 }
1068
1069 C_SaferCond break_ctx;
1070 {
9f95a23c 1071 std::shared_lock l{ictx->owner_lock};
7c673cae
FG
1072
1073 if (ictx->exclusive_lock == nullptr) {
1074 lderr(cct) << "exclusive-lock feature is not enabled" << dendl;
1075 return -EINVAL;
1076 }
1077
1078 ictx->exclusive_lock->break_lock(locker, true, &break_ctx);
1079 }
1080 r = break_ctx.wait();
1081 if (r == -ENOENT) {
1082 return r;
1083 } else if (r < 0) {
1084 lderr(cct) << "failed to break lock: " << cpp_strerror(r) << dendl;
1085 return r;
1086 }
1087 return 0;
1088 }
1089
7c673cae
FG
1090 int copy(ImageCtx *src, IoCtx& dest_md_ctx, const char *destname,
1091 ImageOptions& opts, ProgressContext &prog_ctx, size_t sparse_size)
1092 {
1093 CephContext *cct = (CephContext *)dest_md_ctx.cct();
92f5a8d4
TL
1094 uint64_t option;
1095 if (opts.get(RBD_IMAGE_OPTION_FLATTEN, &option) == 0) {
11fdf7f2
TL
1096 lderr(cct) << "copy does not support 'flatten' image option" << dendl;
1097 return -EINVAL;
1098 }
92f5a8d4
TL
1099 if (opts.get(RBD_IMAGE_OPTION_CLONE_FORMAT, &option) == 0) {
1100 lderr(cct) << "copy does not support 'clone_format' image option"
1101 << dendl;
1102 return -EINVAL;
1103 }
11fdf7f2 1104
7c673cae
FG
1105 ldout(cct, 20) << "copy " << src->name
1106 << (src->snap_name.length() ? "@" + src->snap_name : "")
1107 << " -> " << destname << " opts = " << opts << dendl;
1108
9f95a23c 1109 src->image_lock.lock_shared();
7c673cae
FG
1110 uint64_t features = src->features;
1111 uint64_t src_size = src->get_image_size(src->snap_id);
9f95a23c 1112 src->image_lock.unlock_shared();
7c673cae
FG
1113 uint64_t format = src->old_format ? 1 : 2;
1114 if (opts.get(RBD_IMAGE_OPTION_FORMAT, &format) != 0) {
1115 opts.set(RBD_IMAGE_OPTION_FORMAT, format);
1116 }
1117 uint64_t stripe_unit = src->stripe_unit;
1118 if (opts.get(RBD_IMAGE_OPTION_STRIPE_UNIT, &stripe_unit) != 0) {
1119 opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit);
1120 }
1121 uint64_t stripe_count = src->stripe_count;
1122 if (opts.get(RBD_IMAGE_OPTION_STRIPE_COUNT, &stripe_count) != 0) {
1123 opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count);
1124 }
1125 uint64_t order = src->order;
1126 if (opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0) {
1127 opts.set(RBD_IMAGE_OPTION_ORDER, order);
1128 }
1129 if (opts.get(RBD_IMAGE_OPTION_FEATURES, &features) != 0) {
1130 opts.set(RBD_IMAGE_OPTION_FEATURES, features);
1131 }
1132 if (features & ~RBD_FEATURES_ALL) {
1133 lderr(cct) << "librbd does not support requested features" << dendl;
1134 return -ENOSYS;
1135 }
1136
1137 int r = create(dest_md_ctx, destname, "", src_size, opts, "", "", false);
1138 if (r < 0) {
1139 lderr(cct) << "header creation failed" << dendl;
1140 return r;
1141 }
1142 opts.set(RBD_IMAGE_OPTION_ORDER, static_cast<uint64_t>(order));
1143
11fdf7f2
TL
1144 ImageCtx *dest = new librbd::ImageCtx(destname, "", nullptr, dest_md_ctx,
1145 false);
1146 r = dest->state->open(0);
7c673cae
FG
1147 if (r < 0) {
1148 lderr(cct) << "failed to read newly created header" << dendl;
1149 return r;
1150 }
1151
1152 r = copy(src, dest, prog_ctx, sparse_size);
1153
1154 int close_r = dest->state->close();
1155 if (r == 0 && close_r < 0) {
1156 r = close_r;
1157 }
1158 return r;
1159 }
1160
1161 class C_CopyWrite : public Context {
1162 public:
1163 C_CopyWrite(bufferlist *bl, Context* ctx)
1164 : m_bl(bl), m_ctx(ctx) {}
1165 void finish(int r) override {
1166 delete m_bl;
1167 m_ctx->complete(r);
1168 }
1169 private:
1170 bufferlist *m_bl;
1171 Context *m_ctx;
1172 };
1173
1174 class C_CopyRead : public Context {
1175 public:
1176 C_CopyRead(SimpleThrottle *throttle, ImageCtx *dest, uint64_t offset,
1177 bufferlist *bl, size_t sparse_size)
1178 : m_throttle(throttle), m_dest(dest), m_offset(offset), m_bl(bl),
1179 m_sparse_size(sparse_size) {
1180 m_throttle->start_op();
1181 }
1182 void finish(int r) override {
1183 if (r < 0) {
1184 lderr(m_dest->cct) << "error reading from source image at offset "
1185 << m_offset << ": " << cpp_strerror(r) << dendl;
1186 delete m_bl;
1187 m_throttle->end_op(r);
1188 return;
1189 }
11fdf7f2 1190 ceph_assert(m_bl->length() == (size_t)r);
7c673cae
FG
1191
1192 if (m_bl->is_zero()) {
1193 delete m_bl;
1194 m_throttle->end_op(r);
1195 return;
1196 }
1197
1198 if (!m_sparse_size) {
1199 m_sparse_size = (1 << m_dest->order);
1200 }
1201
1202 auto *throttle = m_throttle;
9f95a23c 1203 auto *end_op_ctx = new LambdaContext([throttle](int r) {
7c673cae
FG
1204 throttle->end_op(r);
1205 });
1206 auto gather_ctx = new C_Gather(m_dest->cct, end_op_ctx);
1207
11fdf7f2 1208 m_bl->rebuild(buffer::ptr_node::create(m_bl->length()));
7c673cae
FG
1209 size_t write_offset = 0;
1210 size_t write_length = 0;
1211 size_t offset = 0;
1212 size_t length = m_bl->length();
11fdf7f2 1213 const auto& m_ptr = m_bl->front();
7c673cae
FG
1214 while (offset < length) {
1215 if (util::calc_sparse_extent(m_ptr,
1216 m_sparse_size,
1217 length,
1218 &write_offset,
1219 &write_length,
1220 &offset)) {
7c673cae 1221 bufferlist *write_bl = new bufferlist();
11fdf7f2
TL
1222 write_bl->push_back(
1223 buffer::ptr_node::create(m_ptr, write_offset, write_length));
7c673cae
FG
1224 Context *ctx = new C_CopyWrite(write_bl, gather_ctx->new_sub());
1225 auto comp = io::AioCompletion::create(ctx);
1226
1227 // coordinate through AIO WQ to ensure lock is acquired if needed
1228 m_dest->io_work_queue->aio_write(comp, m_offset + write_offset,
1229 write_length,
1230 std::move(*write_bl),
31f18b77
FG
1231 LIBRADOS_OP_FLAG_FADVISE_DONTNEED,
1232 std::move(read_trace));
7c673cae
FG
1233 write_offset = offset;
1234 write_length = 0;
1235 }
1236 }
1237 delete m_bl;
11fdf7f2 1238 ceph_assert(gather_ctx->get_sub_created_count() > 0);
7c673cae
FG
1239 gather_ctx->activate();
1240 }
1241
31f18b77
FG
1242 ZTracer::Trace read_trace;
1243
7c673cae
FG
1244 private:
1245 SimpleThrottle *m_throttle;
1246 ImageCtx *m_dest;
1247 uint64_t m_offset;
1248 bufferlist *m_bl;
1249 size_t m_sparse_size;
1250 };
1251
1252 int copy(ImageCtx *src, ImageCtx *dest, ProgressContext &prog_ctx, size_t sparse_size)
1253 {
9f95a23c 1254 src->image_lock.lock_shared();
7c673cae 1255 uint64_t src_size = src->get_image_size(src->snap_id);
9f95a23c 1256 src->image_lock.unlock_shared();
7c673cae 1257
9f95a23c 1258 dest->image_lock.lock_shared();
7c673cae 1259 uint64_t dest_size = dest->get_image_size(dest->snap_id);
9f95a23c 1260 dest->image_lock.unlock_shared();
7c673cae
FG
1261
1262 CephContext *cct = src->cct;
1263 if (dest_size < src_size) {
1264 lderr(cct) << " src size " << src_size << " > dest size "
1265 << dest_size << dendl;
1266 return -EINVAL;
1267 }
b32b8144 1268
9f95a23c
TL
1269 C_SaferCond ctx;
1270 auto req = deep_copy::MetadataCopyRequest<>::create(
1271 src, dest, &ctx);
1272 req->send();
b32b8144 1273
9f95a23c
TL
1274 int r = ctx.wait();
1275 if (r < 0) {
1276 lderr(cct) << "failed to copy metadata: " << cpp_strerror(r) << dendl;
1277 return r;
7c673cae
FG
1278 }
1279
31f18b77 1280 ZTracer::Trace trace;
181888fb 1281 if (src->blkin_trace_all) {
31f18b77
FG
1282 trace.init("copy", &src->trace_endpoint);
1283 }
1284
9f95a23c 1285 std::shared_lock owner_lock{src->owner_lock};
11fdf7f2 1286 SimpleThrottle throttle(src->config.get_val<uint64_t>("rbd_concurrent_management_ops"), false);
7c673cae 1287 uint64_t period = src->get_stripe_period();
31f18b77
FG
1288 unsigned fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL |
1289 LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
11fdf7f2 1290 uint64_t object_id = 0;
7c673cae
FG
1291 for (uint64_t offset = 0; offset < src_size; offset += period) {
1292 if (throttle.pending_error()) {
1293 return throttle.wait_for_ret();
1294 }
1295
11fdf7f2 1296 {
9f95a23c 1297 std::shared_lock image_locker{src->image_lock};
11fdf7f2
TL
1298 if (src->object_map != nullptr) {
1299 bool skip = true;
1300 // each period is related to src->stripe_count objects, check them all
1301 for (uint64_t i=0; i < src->stripe_count; i++) {
1302 if (object_id < src->object_map->size() &&
1303 src->object_map->object_may_exist(object_id)) {
1304 skip = false;
1305 }
1306 ++object_id;
1307 }
1308
1309 if (skip) continue;
1310 } else {
1311 object_id += src->stripe_count;
1312 }
1313 }
1314
7c673cae
FG
1315 uint64_t len = min(period, src_size - offset);
1316 bufferlist *bl = new bufferlist();
31f18b77
FG
1317 auto ctx = new C_CopyRead(&throttle, dest, offset, bl, sparse_size);
1318 auto comp = io::AioCompletion::create_and_start<Context>(
1319 ctx, src, io::AIO_TYPE_READ);
1320
1321 io::ImageReadRequest<> req(*src, comp, {{offset, len}},
1322 io::ReadResult{bl}, fadvise_flags,
1323 std::move(trace));
1324 ctx->read_trace = req.get_trace();
1325
1326 req.send();
7c673cae
FG
1327 prog_ctx.update_progress(offset, src_size);
1328 }
1329
1330 r = throttle.wait_for_ret();
1331 if (r >= 0)
1332 prog_ctx.update_progress(src_size, src_size);
1333 return r;
1334 }
1335
7c673cae
FG
1336 int list_lockers(ImageCtx *ictx,
1337 std::list<locker_t> *lockers,
1338 bool *exclusive,
1339 string *tag)
1340 {
1341 ldout(ictx->cct, 20) << "list_locks on image " << ictx << dendl;
1342
1343 int r = ictx->state->refresh_if_required();
1344 if (r < 0)
1345 return r;
1346
9f95a23c 1347 std::shared_lock locker{ictx->image_lock};
7c673cae
FG
1348 if (exclusive)
1349 *exclusive = ictx->exclusive_locked;
1350 if (tag)
1351 *tag = ictx->lock_tag;
1352 if (lockers) {
1353 lockers->clear();
1354 map<rados::cls::lock::locker_id_t,
1355 rados::cls::lock::locker_info_t>::const_iterator it;
1356 for (it = ictx->lockers.begin(); it != ictx->lockers.end(); ++it) {
1357 locker_t locker;
1358 locker.client = stringify(it->first.locker);
1359 locker.cookie = it->first.cookie;
11fdf7f2 1360 locker.address = it->second.addr.get_legacy_str();
7c673cae
FG
1361 lockers->push_back(locker);
1362 }
1363 }
1364
1365 return 0;
1366 }
1367
1368 int lock(ImageCtx *ictx, bool exclusive, const string& cookie,
1369 const string& tag)
1370 {
1371 ldout(ictx->cct, 20) << "lock image " << ictx << " exclusive=" << exclusive
1372 << " cookie='" << cookie << "' tag='" << tag << "'"
1373 << dendl;
1374
1375 int r = ictx->state->refresh_if_required();
1376 if (r < 0)
1377 return r;
1378
1379 /**
1380 * If we wanted we could do something more intelligent, like local
1381 * checks that we think we will succeed. But for now, let's not
1382 * duplicate that code.
1383 */
1384 {
9f95a23c 1385 std::shared_lock locker{ictx->image_lock};
7c673cae
FG
1386 r = rados::cls::lock::lock(&ictx->md_ctx, ictx->header_oid, RBD_LOCK_NAME,
1387 exclusive ? LOCK_EXCLUSIVE : LOCK_SHARED,
1388 cookie, tag, "", utime_t(), 0);
1389 if (r < 0) {
1390 return r;
1391 }
1392 }
1393
1394 ictx->notify_update();
1395 return 0;
1396 }
1397
1398 int unlock(ImageCtx *ictx, const string& cookie)
1399 {
1400 ldout(ictx->cct, 20) << "unlock image " << ictx
1401 << " cookie='" << cookie << "'" << dendl;
1402
1403 int r = ictx->state->refresh_if_required();
1404 if (r < 0)
1405 return r;
1406
1407 {
9f95a23c 1408 std::shared_lock locker{ictx->image_lock};
7c673cae
FG
1409 r = rados::cls::lock::unlock(&ictx->md_ctx, ictx->header_oid,
1410 RBD_LOCK_NAME, cookie);
1411 if (r < 0) {
1412 return r;
1413 }
1414 }
1415
1416 ictx->notify_update();
1417 return 0;
1418 }
1419
1420 int break_lock(ImageCtx *ictx, const string& client,
1421 const string& cookie)
1422 {
1423 ldout(ictx->cct, 20) << "break_lock image " << ictx << " client='" << client
1424 << "' cookie='" << cookie << "'" << dendl;
1425
1426 int r = ictx->state->refresh_if_required();
1427 if (r < 0)
1428 return r;
1429
1430 entity_name_t lock_client;
1431 if (!lock_client.parse(client)) {
1432 lderr(ictx->cct) << "Unable to parse client '" << client
1433 << "'" << dendl;
1434 return -EINVAL;
1435 }
1436
11fdf7f2 1437 if (ictx->config.get_val<bool>("rbd_blacklist_on_break_lock")) {
7c673cae
FG
1438 typedef std::map<rados::cls::lock::locker_id_t,
1439 rados::cls::lock::locker_info_t> Lockers;
1440 Lockers lockers;
1441 ClsLockType lock_type;
1442 std::string lock_tag;
1443 r = rados::cls::lock::get_lock_info(&ictx->md_ctx, ictx->header_oid,
1444 RBD_LOCK_NAME, &lockers, &lock_type,
1445 &lock_tag);
1446 if (r < 0) {
1447 lderr(ictx->cct) << "unable to retrieve lock info: " << cpp_strerror(r)
1448 << dendl;
1449 return r;
1450 }
1451
1452 std::string client_address;
1453 for (Lockers::iterator it = lockers.begin();
1454 it != lockers.end(); ++it) {
1455 if (it->first.locker == lock_client) {
11fdf7f2 1456 client_address = it->second.addr.get_legacy_str();
7c673cae
FG
1457 break;
1458 }
1459 }
1460 if (client_address.empty()) {
1461 return -ENOENT;
1462 }
1463
7c673cae 1464 librados::Rados rados(ictx->md_ctx);
11fdf7f2
TL
1465 r = rados.blacklist_add(
1466 client_address,
1467 ictx->config.get_val<uint64_t>("rbd_blacklist_expire_seconds"));
7c673cae
FG
1468 if (r < 0) {
1469 lderr(ictx->cct) << "unable to blacklist client: " << cpp_strerror(r)
1470 << dendl;
1471 return r;
1472 }
1473 }
1474
1475 r = rados::cls::lock::break_lock(&ictx->md_ctx, ictx->header_oid,
1476 RBD_LOCK_NAME, cookie, lock_client);
1477 if (r < 0)
1478 return r;
1479 ictx->notify_update();
1480 return 0;
1481 }
1482
1483 void rbd_ctx_cb(completion_t cb, void *arg)
1484 {
1485 Context *ctx = reinterpret_cast<Context *>(arg);
1486 auto comp = reinterpret_cast<io::AioCompletion *>(cb);
1487 ctx->complete(comp->get_return_value());
1488 comp->release();
1489 }
1490
1491 int64_t read_iterate(ImageCtx *ictx, uint64_t off, uint64_t len,
1492 int (*cb)(uint64_t, size_t, const char *, void *),
1493 void *arg)
1494 {
11fdf7f2
TL
1495 coarse_mono_time start_time;
1496 ceph::timespan elapsed;
7c673cae
FG
1497
1498 ldout(ictx->cct, 20) << "read_iterate " << ictx << " off = " << off
1499 << " len = " << len << dendl;
1500
1501 int r = ictx->state->refresh_if_required();
1502 if (r < 0)
1503 return r;
1504
1505 uint64_t mylen = len;
9f95a23c 1506 ictx->image_lock.lock_shared();
7c673cae 1507 r = clip_io(ictx, off, &mylen);
9f95a23c 1508 ictx->image_lock.unlock_shared();
7c673cae
FG
1509 if (r < 0)
1510 return r;
1511
1512 int64_t total_read = 0;
1513 uint64_t period = ictx->get_stripe_period();
1514 uint64_t left = mylen;
1515
31f18b77 1516 ZTracer::Trace trace;
181888fb 1517 if (ictx->blkin_trace_all) {
31f18b77
FG
1518 trace.init("read_iterate", &ictx->trace_endpoint);
1519 }
1520
9f95a23c 1521 std::shared_lock owner_locker{ictx->owner_lock};
11fdf7f2 1522 start_time = coarse_mono_clock::now();
7c673cae
FG
1523 while (left > 0) {
1524 uint64_t period_off = off - (off % period);
1525 uint64_t read_len = min(period_off + period - off, left);
1526
1527 bufferlist bl;
1528
1529 C_SaferCond ctx;
1530 auto c = io::AioCompletion::create_and_start(&ctx, ictx,
1531 io::AIO_TYPE_READ);
1532 io::ImageRequest<>::aio_read(ictx, c, {{off, read_len}},
31f18b77 1533 io::ReadResult{&bl}, 0, std::move(trace));
7c673cae
FG
1534
1535 int ret = ctx.wait();
1536 if (ret < 0) {
1537 return ret;
1538 }
1539
1540 r = cb(total_read, ret, bl.c_str(), arg);
1541 if (r < 0) {
1542 return r;
1543 }
1544
1545 total_read += ret;
1546 left -= ret;
1547 off += ret;
1548 }
1549
11fdf7f2 1550 elapsed = coarse_mono_clock::now() - start_time;
7c673cae
FG
1551 ictx->perfcounter->tinc(l_librbd_rd_latency, elapsed);
1552 ictx->perfcounter->inc(l_librbd_rd);
1553 ictx->perfcounter->inc(l_librbd_rd_bytes, mylen);
1554 return total_read;
1555 }
1556
1557 // validate extent against image size; clip to image size if necessary
1558 int clip_io(ImageCtx *ictx, uint64_t off, uint64_t *len)
1559 {
9f95a23c 1560 ceph_assert(ceph_mutex_is_locked(ictx->image_lock));
7c673cae
FG
1561 uint64_t image_size = ictx->get_image_size(ictx->snap_id);
1562 bool snap_exists = ictx->snap_exists;
1563
1564 if (!snap_exists)
1565 return -ENOENT;
1566
1567 // special-case "len == 0" requests: always valid
1568 if (*len == 0)
1569 return 0;
1570
1571 // can't start past end
1572 if (off >= image_size)
1573 return -EINVAL;
1574
1575 // clip requests that extend past end to just end
1576 if ((off + *len) > image_size)
1577 *len = (size_t)(image_size - off);
1578
1579 return 0;
1580 }
1581
11fdf7f2 1582 int invalidate_cache(ImageCtx *ictx)
7c673cae
FG
1583 {
1584 CephContext *cct = ictx->cct;
11fdf7f2 1585 ldout(cct, 20) << "invalidate_cache " << ictx << dendl;
7c673cae
FG
1586
1587 int r = ictx->state->refresh_if_required();
1588 if (r < 0) {
1589 return r;
1590 }
1591
7c673cae
FG
1592 C_SaferCond ctx;
1593 {
9f95a23c 1594 std::shared_lock owner_locker{ictx->owner_lock};
11fdf7f2 1595 ictx->io_object_dispatcher->invalidate_cache(&ctx);
7c673cae
FG
1596 }
1597 r = ctx.wait();
7c673cae
FG
1598 ictx->perfcounter->inc(l_librbd_invalidate_cache);
1599 return r;
1600 }
1601
1602 int poll_io_events(ImageCtx *ictx, io::AioCompletion **comps, int numcomp)
1603 {
1604 if (numcomp <= 0)
1605 return -EINVAL;
1606 CephContext *cct = ictx->cct;
1607 ldout(cct, 20) << __func__ << " " << ictx << " numcomp = " << numcomp
1608 << dendl;
1609 int i = 0;
9f95a23c
TL
1610 while (i < numcomp && ictx->event_socket_completions.pop(comps[i])) {
1611 ++i;
7c673cae 1612 }
9f95a23c 1613
7c673cae
FG
1614 return i;
1615 }
1616
1617 int metadata_get(ImageCtx *ictx, const string &key, string *value)
1618 {
1619 CephContext *cct = ictx->cct;
1620 ldout(cct, 20) << "metadata_get " << ictx << " key=" << key << dendl;
1621
1622 int r = ictx->state->refresh_if_required();
1623 if (r < 0) {
1624 return r;
1625 }
1626
1627 return cls_client::metadata_get(&ictx->md_ctx, ictx->header_oid, key, value);
1628 }
1629
1630 int metadata_list(ImageCtx *ictx, const string &start, uint64_t max, map<string, bufferlist> *pairs)
1631 {
1632 CephContext *cct = ictx->cct;
1633 ldout(cct, 20) << "metadata_list " << ictx << dendl;
1634
1635 int r = ictx->state->refresh_if_required();
1636 if (r < 0) {
1637 return r;
1638 }
1639
9f95a23c
TL
1640 C_SaferCond ctx;
1641 auto req = image::GetMetadataRequest<>::create(
1642 ictx->md_ctx, ictx->header_oid, false, "", start, max, pairs, &ctx);
1643 req->send();
7c673cae 1644
9f95a23c 1645 return ctx.wait();
7c673cae
FG
1646 }
1647
11fdf7f2
TL
1648 int list_watchers(ImageCtx *ictx,
1649 std::list<librbd::image_watcher_t> &watchers)
1650 {
1651 int r;
1652 std::string header_oid;
1653 std::list<obj_watch_t> obj_watchers;
1654
1655 if (ictx->old_format) {
1656 header_oid = util::old_header_name(ictx->name);
1657 } else {
1658 header_oid = util::header_name(ictx->id);
1659 }
1660
1661 r = ictx->md_ctx.list_watchers(header_oid, &obj_watchers);
1662 if (r < 0) {
1663 return r;
1664 }
1665
1666 for (auto i = obj_watchers.begin(); i != obj_watchers.end(); ++i) {
1667 librbd::image_watcher_t watcher;
1668 watcher.addr = i->addr;
1669 watcher.id = i->watcher_id;
1670 watcher.cookie = i->cookie;
1671
1672 watchers.push_back(watcher);
1673 }
1674
1675 return 0;
1676 }
1677
1678}
1679
1680std::ostream &operator<<(std::ostream &os, const librbd::ImageOptions &opts) {
1681 os << "[";
1682
1683 const char *delimiter = "";
1684 for (auto &i : librbd::IMAGE_OPTIONS_TYPE_MAPPING) {
1685 if (i.second == librbd::STR) {
1686 std::string val;
1687 if (opts.get(i.first, &val) == 0) {
1688 os << delimiter << librbd::image_option_name(i.first) << "=" << val;
1689 delimiter = ", ";
1690 }
1691 } else if (i.second == librbd::UINT64) {
1692 uint64_t val;
1693 if (opts.get(i.first, &val) == 0) {
1694 os << delimiter << librbd::image_option_name(i.first) << "=" << val;
1695 delimiter = ", ";
1696 }
1697 }
1698 }
7c673cae 1699
11fdf7f2 1700 os << "]";
7c673cae 1701
11fdf7f2 1702 return os;
7c673cae 1703}