]> git.proxmox.com Git - ceph.git/blame - ceph/src/test/librbd/fsx.cc
import ceph 14.2.5
[ceph.git] / ceph / src / test / librbd / fsx.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:8; indent-tabs-mode:t -*-
2// vim: ts=8 sw=8 smarttab
3/*
4 * Copyright (C) 1991, NeXT Computer, Inc. All Rights Reserverd.
5 *
6 * File: fsx.cc
7 * Author: Avadis Tevanian, Jr.
8 *
9 * File system exerciser.
10 *
11 * Rewritten 8/98 by Conrad Minshall.
12 *
13 * Small changes to work under Linux -- davej.
14 *
15 * Checks for mmap last-page zero fill.
16 */
17
18#include <sys/types.h>
19#include <unistd.h>
11fdf7f2 20#include <getopt.h>
7c673cae 21#include <limits.h>
7c673cae 22#include <strings.h>
11fdf7f2
TL
23#if defined(__FreeBSD__)
24#include <sys/disk.h>
25#endif
7c673cae
FG
26#include <sys/file.h>
27#include <sys/stat.h>
28#include <sys/mman.h>
11fdf7f2 29#if defined(__linux__)
7c673cae 30#include <linux/fs.h>
11fdf7f2 31#endif
7c673cae
FG
32#include <sys/ioctl.h>
33#ifdef HAVE_ERR_H
34#include <err.h>
35#endif
36#include <signal.h>
7c673cae
FG
37#include <stddef.h>
38#include <stdio.h>
39#include <stdlib.h>
40#include <string.h>
41#include <stdarg.h>
42#include <assert.h>
43#include <errno.h>
44#include <math.h>
45#include <fcntl.h>
46#include <random>
47
11fdf7f2 48#include "include/compat.h"
7c673cae 49#include "include/intarith.h"
11fdf7f2 50#if defined(WITH_KRBD)
7c673cae 51#include "include/krbd.h"
11fdf7f2 52#endif
7c673cae
FG
53#include "include/rados/librados.h"
54#include "include/rados/librados.hpp"
55#include "include/rbd/librbd.h"
56#include "include/rbd/librbd.hpp"
57#include "common/Cond.h"
58#include "common/SubProcess.h"
59#include "common/safe_io.h"
60#include "journal/Journaler.h"
61#include "journal/ReplayEntry.h"
62#include "journal/ReplayHandler.h"
63#include "journal/Settings.h"
64
65#include <boost/scope_exit.hpp>
66
67#define NUMPRINTCOLUMNS 32 /* # columns of data to print on each line */
68
69/*
70 * A log entry is an operation and a bunch of arguments.
71 */
72
/* One recorded operation: its opcode plus up to three integer arguments. */
struct log_entry {
	int	operation;	/* opcode of the logged operation */
	int	args[3];	/* operation-specific arguments */
};
77
78#define LOGSIZE 1000
79
80struct log_entry oplog[LOGSIZE]; /* the log */
81int logptr = 0; /* current position in log */
82int logcount = 0; /* total ops */
83
84/*
85 * The operation matrix is complex due to conditional execution of different
86 * features. Hence when we come to deciding what operation to run, we need to
87 * be careful in how we select the different operations. The active operations
88 * are mapped to numbers as follows:
89 *
90 * lite !lite
91 * READ: 0 0
92 * WRITE: 1 1
93 * MAPREAD: 2 2
94 * MAPWRITE: 3 3
95 * TRUNCATE: - 4
96 * FALLOCATE: - 5
97 * PUNCH HOLE: - 6
98 * WRITESAME: - 7
c07f9fc5 99 * COMPAREANDWRITE: - 8
7c673cae
FG
100 *
101 * When mapped read/writes are disabled, they are simply converted to normal
102 * reads and writes. When fallocate/fpunch calls are disabled, they are
103 * converted to OP_SKIPPED. Hence OP_SKIPPED needs to have a number higher than
104 * the operation selection matrix, as does the OP_CLOSEOPEN which is an
105 * operation modifier rather than an operation in itself.
106 *
107 * Because of the "lite" version, we also need to have different "maximum
108 * operation" defines to allow the ops to be selected correctly based on the
109 * mode being run.
110 */
111
112/* common operations */
113#define OP_READ 0
114#define OP_WRITE 1
115#define OP_MAPREAD 2
116#define OP_MAPWRITE 3
117#define OP_MAX_LITE 4
118
119/* !lite operations */
120#define OP_TRUNCATE 4
121#define OP_FALLOCATE 5
122#define OP_PUNCH_HOLE 6
123#define OP_WRITESAME 7
c07f9fc5 124#define OP_COMPARE_AND_WRITE 8
7c673cae 125/* rbd-specific operations */
c07f9fc5
FG
126#define OP_CLONE 9
127#define OP_FLATTEN 10
128#define OP_MAX_FULL 11
7c673cae
FG
129
130/* operation modifiers */
131#define OP_CLOSEOPEN 100
132#define OP_SKIPPED 101
133
134#undef PAGE_SIZE
135#define PAGE_SIZE getpagesize()
136#undef PAGE_MASK
137#define PAGE_MASK (PAGE_SIZE - 1)
138
139
140char *original_buf; /* a pointer to the original data */
141char *good_buf; /* a pointer to the correct data */
142char *temp_buf; /* a pointer to the current data */
143
144char dirpath[1024];
145
146off_t file_size = 0;
147off_t biggest = 0;
148unsigned long testcalls = 0; /* calls to function "test" */
149
11fdf7f2
TL
150const char* cluster_name = "ceph"; /* --cluster optional */
151const char* client_id = "admin"; /* --id optional */
152
7c673cae
FG
153unsigned long simulatedopcount = 0; /* -b flag */
154int closeprob = 0; /* -c flag */
155int debug = 0; /* -d flag */
156unsigned long debugstart = 0; /* -D flag */
157int flush_enabled = 0; /* -f flag */
11fdf7f2 158int deep_copy = 0; /* -g flag */
7c673cae
FG
159int holebdy = 1; /* -h flag */
160bool journal_replay = false; /* -j flag */
161int keep_on_success = 0; /* -k flag */
162int do_fsync = 0; /* -y flag */
163unsigned long maxfilelen = 256 * 1024; /* -l flag */
164int sizechecks = 1; /* -n flag disables them */
165int maxoplen = 64 * 1024; /* -o flag */
166int quiet = 0; /* -q flag */
167unsigned long progressinterval = 0; /* -p flag */
168int readbdy = 1; /* -r flag */
169int style = 0; /* -s flag */
170int prealloc = 0; /* -x flag */
171int truncbdy = 1; /* -t flag */
172int writebdy = 1; /* -w flag */
173long monitorstart = -1; /* -m flag */
174long monitorend = -1; /* -m flag */
175int lite = 0; /* -L flag */
176long numops = -1; /* -N flag */
177int randomoplen = 1; /* -O flag disables it */
178int seed = 1; /* -S flag */
179int mapped_writes = 0; /* -W flag disables */
180int fallocate_calls = 0; /* -F flag disables */
181int punch_hole_calls = 1; /* -H flag disables */
182int clone_calls = 1; /* -C flag disables */
183int randomize_striping = 1; /* -U flag disables */
184int randomize_parent_overlap = 1;
185int mapped_reads = 0; /* -R flag disables it */
186int fsxgoodfd = 0;
187int o_direct = 0; /* -Z flag */
188
189int num_clones = 0;
190
191int page_size;
192int page_mask;
193int mmap_mask;
194
195FILE * fsxlogf = NULL;
196int badoff = -1;
197int closeopen = 0;
198
/*
 * Write "fsx: [<formatted message>: ]<strerror(code)>" and a newline to
 * stderr.  The formatted part (and its ": " separator) is omitted when
 * fmt is NULL.
 */
void
vwarnc(int code, const char *fmt, va_list ap)
{
	fputs("fsx: ", stderr);
	if (fmt) {
		vfprintf(stderr, fmt, ap);
		fputs(": ", stderr);
	}
	fprintf(stderr, "%s\n", strerror(code));
}
208
209void
210warn(const char * fmt, ...) {
211 va_list ap;
212 va_start(ap, fmt);
213 vwarnc(errno, fmt, ap);
214 va_end(ap);
215}
216
217#define BUF_SIZE 1024
218
219void
220prt(const char *fmt, ...)
221{
222 va_list args;
223 char buffer[BUF_SIZE];
224
225 va_start(args, fmt);
226 vsnprintf(buffer, BUF_SIZE, fmt, args);
227 va_end(args);
228 fprintf(stdout, "%s", buffer);
229 if (fsxlogf)
230 fprintf(fsxlogf, "%s", buffer);
231}
232
233void
234prterr(const char *prefix)
235{
236 prt("%s%s%s\n", prefix, prefix ? ": " : "", strerror(errno));
237}
238
239void
240prterrcode(const char *prefix, int code)
241{
242 prt("%s%s%s\n", prefix, prefix ? ": " : "", strerror(-code));
243}
244
/*
 * Report "<msg>: <strerror(-err)>" on stderr only (the fsx log file is
 * not written).  err is a negative errno value.
 */
void
simple_err(const char *msg, int err)
{
	const char *reason = strerror(-err);

	fprintf(stderr, "%s: %s\n", msg, reason);
}
250
251/*
252 * random
253 */
254std::mt19937 random_generator;
255
256uint_fast32_t
257get_random(void)
258{
259 return random_generator();
260}
261
11fdf7f2 262int get_features(uint64_t* features);
7c673cae
FG
263void replay_imagename(char *buf, size_t len, int clones);
264
265namespace {
266
267static const std::string JOURNAL_CLIENT_ID("fsx");
268
269struct ReplayHandler : public journal::ReplayHandler {
270 journal::Journaler *journaler;
271 journal::Journaler *replay_journaler;
272 Context *on_finish;
273
274 ReplayHandler(journal::Journaler *journaler,
275 journal::Journaler *replay_journaler, Context *on_finish)
276 : journaler(journaler), replay_journaler(replay_journaler),
277 on_finish(on_finish) {
278 }
279
280 void get() override {
281 }
282 void put() override {
283 }
284
285 void handle_entries_available() override {
286 while (true) {
287 journal::ReplayEntry replay_entry;
288 if (!journaler->try_pop_front(&replay_entry)) {
289 return;
290 }
291
292 replay_journaler->append(0, replay_entry.get_data());
293 }
294 }
295
296 void handle_complete(int r) override {
297 on_finish->complete(r);
298 }
299};
300
301int get_image_id(librados::IoCtx &io_ctx, const char *image_name,
302 std::string *image_id) {
303 librbd::RBD rbd;
304 librbd::Image image;
305 int r = rbd.open(io_ctx, image, image_name);
306 if (r < 0) {
307 simple_err("failed to open image", r);
308 return r;
309 }
310
311 rbd_image_info_t info;
312 r = image.stat(info, sizeof(info));
313 if (r < 0) {
314 simple_err("failed to stat image", r);
315 return r;
316 }
317
318 *image_id = std::string(&info.block_name_prefix[strlen(RBD_DATA_PREFIX)]);
319 return 0;
320}
321
322int register_journal(rados_ioctx_t ioctx, const char *image_name) {
323 librados::IoCtx io_ctx;
324 librados::IoCtx::from_rados_ioctx_t(ioctx, io_ctx);
325
326 std::string image_id;
327 int r = get_image_id(io_ctx, image_name, &image_id);
328 if (r < 0) {
329 return r;
330 }
331
332 journal::Journaler journaler(io_ctx, image_id, JOURNAL_CLIENT_ID, {});
333 r = journaler.register_client(bufferlist());
334 if (r < 0) {
335 simple_err("failed to register journal client", r);
336 return r;
337 }
338 return 0;
339}
340
341int unregister_journal(rados_ioctx_t ioctx, const char *image_name) {
342 librados::IoCtx io_ctx;
343 librados::IoCtx::from_rados_ioctx_t(ioctx, io_ctx);
344
345 std::string image_id;
346 int r = get_image_id(io_ctx, image_name, &image_id);
347 if (r < 0) {
348 return r;
349 }
350
351 journal::Journaler journaler(io_ctx, image_id, JOURNAL_CLIENT_ID, {});
352 r = journaler.unregister_client();
353 if (r < 0) {
354 simple_err("failed to unregister journal client", r);
355 return r;
356 }
357 return 0;
358}
359
360int create_replay_image(rados_ioctx_t ioctx, int order,
361 uint64_t stripe_unit, int stripe_count,
362 const char *replay_image_name,
363 const char *last_replay_image_name) {
364 librados::IoCtx io_ctx;
365 librados::IoCtx::from_rados_ioctx_t(ioctx, io_ctx);
366
11fdf7f2
TL
367 uint64_t features;
368 int r = get_features(&features);
369 if (r < 0) {
370 return r;
371 }
372
7c673cae
FG
373 librbd::RBD rbd;
374 if (last_replay_image_name == nullptr) {
11fdf7f2 375 r = rbd.create2(io_ctx, replay_image_name, 0, features, &order);
7c673cae
FG
376 } else {
377 r = rbd.clone2(io_ctx, last_replay_image_name, "snap",
11fdf7f2
TL
378 io_ctx, replay_image_name, features, &order,
379 stripe_unit, stripe_count);
7c673cae
FG
380 }
381
382 if (r < 0) {
383 simple_err("failed to create replay image", r);
384 return r;
385 }
386
387 return 0;
388}
389
390int replay_journal(rados_ioctx_t ioctx, const char *image_name,
391 const char *replay_image_name) {
392 librados::IoCtx io_ctx;
393 librados::IoCtx::from_rados_ioctx_t(ioctx, io_ctx);
394
395 std::string image_id;
396 int r = get_image_id(io_ctx, image_name, &image_id);
397 if (r < 0) {
398 return r;
399 }
400
401 std::string replay_image_id;
402 r = get_image_id(io_ctx, replay_image_name, &replay_image_id);
403 if (r < 0) {
404 return r;
405 }
406
407 journal::Journaler journaler(io_ctx, image_id, JOURNAL_CLIENT_ID, {});
408 C_SaferCond init_ctx;
409 journaler.init(&init_ctx);
410 BOOST_SCOPE_EXIT_ALL( (&journaler) ) {
411 journaler.shut_down();
412 };
413
414 r = init_ctx.wait();
415 if (r < 0) {
416 simple_err("failed to initialize journal", r);
417 return r;
418 }
419
420 journal::Journaler replay_journaler(io_ctx, replay_image_id, "", {});
421
422 C_SaferCond replay_init_ctx;
423 replay_journaler.init(&replay_init_ctx);
424 BOOST_SCOPE_EXIT_ALL( (&replay_journaler) ) {
425 replay_journaler.shut_down();
426 };
427
428 r = replay_init_ctx.wait();
429 if (r < 0) {
430 simple_err("failed to initialize replay journal", r);
431 return r;
432 }
433
494da23a 434 replay_journaler.start_append(0);
7c673cae
FG
435
436 C_SaferCond replay_ctx;
437 ReplayHandler replay_handler(&journaler, &replay_journaler,
438 &replay_ctx);
439
440 // copy journal events from source image to replay image
441 journaler.start_replay(&replay_handler);
442 r = replay_ctx.wait();
443
444 journaler.stop_replay();
445
446 C_SaferCond stop_ctx;
447 replay_journaler.stop_append(&stop_ctx);
448 int stop_r = stop_ctx.wait();
449 if (r == 0 && stop_r < 0) {
450 r = stop_r;
451 }
452
453 if (r < 0) {
454 simple_err("failed to replay journal", r);
455 return r;
456 }
457
458 librbd::RBD rbd;
459 librbd::Image image;
460 r = rbd.open(io_ctx, image, replay_image_name);
461 if (r < 0) {
462 simple_err("failed to open replay image", r);
463 return r;
464 }
465
466 // perform an IO op to initiate the journal replay
467 bufferlist bl;
468 r = static_cast<ssize_t>(image.write(0, 0, bl));
469 if (r < 0) {
470 simple_err("failed to write to replay image", r);
471 return r;
472 }
473 return 0;
474}
475
476int finalize_journal(rados_ioctx_t ioctx, const char *imagename, int clones,
477 int order, uint64_t stripe_unit, int stripe_count) {
478 char replayimagename[1024];
479 replay_imagename(replayimagename, sizeof(replayimagename), clones);
480
481 char lastreplayimagename[1024];
482 if (clones > 0) {
483 replay_imagename(lastreplayimagename,
484 sizeof(lastreplayimagename), clones - 1);
485 }
486
487 int ret = create_replay_image(ioctx, order, stripe_unit,
488 stripe_count, replayimagename,
489 clones > 0 ? lastreplayimagename :
490 nullptr);
491 if (ret < 0) {
492 exit(EXIT_FAILURE);
493 }
494
495 ret = replay_journal(ioctx, imagename, replayimagename);
496 if (ret < 0) {
497 exit(EXIT_FAILURE);
498 }
499 return 0;
500}
501
502} // anonymous namespace
503
504/*
505 * rbd
506 */
507
508struct rbd_ctx {
509 const char *name; /* image name */
510 rbd_image_t image; /* image handle */
511 const char *krbd_name; /* image /dev/rbd<id> name */ /* reused for nbd test */
512 int krbd_fd; /* image /dev/rbd<id> fd */ /* reused for nbd test */
513};
514
515#define RBD_CTX_INIT (struct rbd_ctx) { NULL, NULL, NULL, -1}
516
/*
 * Table of handlers implementing the image operations for one backend.
 * The tables in this file are filled in positionally, so the member
 * order must not change.  A NULL entry means the backend does not
 * provide that operation.
 */
struct rbd_operations {
	/* open/close the image and any backend device */
	int	(*open)(const char *name, struct rbd_ctx *ctx);
	int	(*close)(struct rbd_ctx *ctx);

	/* data path */
	ssize_t	(*read)(struct rbd_ctx *ctx, uint64_t off, size_t len,
			char *buf);
	ssize_t	(*write)(struct rbd_ctx *ctx, uint64_t off, size_t len,
			 const char *buf);
	int	(*flush)(struct rbd_ctx *ctx);
	int	(*discard)(struct rbd_ctx *ctx, uint64_t off, uint64_t len);

	/* size management */
	int	(*get_size)(struct rbd_ctx *ctx, uint64_t *size);
	int	(*resize)(struct rbd_ctx *ctx, uint64_t size);

	/* rbd-specific operations */
	int	(*clone)(struct rbd_ctx *ctx, const char *src_snapname,
			 const char *dst_imagename, int *order,
			 int stripe_unit, int stripe_count);
	int	(*flatten)(struct rbd_ctx *ctx);

	/* optional data-path extensions */
	ssize_t	(*writesame)(struct rbd_ctx *ctx, uint64_t off, size_t len,
			     const char *buf, size_t data_len);
	ssize_t	(*compare_and_write)(struct rbd_ctx *ctx, uint64_t off,
				     size_t len, const char *cmp_buf,
				     const char *buf);
};
535
536char *pool; /* name of the pool our test image is in */
537char *iname; /* name of our test image */
538rados_t cluster; /* handle for our test cluster */
539rados_ioctx_t ioctx; /* handle for our test pool */
11fdf7f2 540#if defined(WITH_KRBD)
7c673cae 541struct krbd_ctx *krbd; /* handle for libkrbd */
11fdf7f2 542#endif
7c673cae
FG
543bool skip_partial_discard; /* rbd_skip_partial_discard config value*/
544
11fdf7f2
TL
545int get_features(uint64_t* features) {
546 char buf[1024];
547 int r = rados_conf_get(cluster, "rbd_default_features", buf,
548 sizeof(buf));
549 if (r < 0) {
550 simple_err("Could not get rbd_default_features value", r);
551 return r;
552 }
553
554 *features = strtol(buf, NULL, 0);
555
556 if (clone_calls) {
557 *features |= RBD_FEATURE_LAYERING;
558 }
559 if (journal_replay) {
560 *features |= (RBD_FEATURE_EXCLUSIVE_LOCK |
561 RBD_FEATURE_JOURNALING);
562 }
563 return 0;
564}
565
7c673cae
FG
566/*
567 * librbd/krbd rbd_operations handlers. Given the rest of fsx.c, no
568 * attempt to do error handling is made in these handlers.
569 */
570
571int
572__librbd_open(const char *name, struct rbd_ctx *ctx)
573{
574 rbd_image_t image;
575 int ret;
576
11fdf7f2 577 ceph_assert(!ctx->name && !ctx->image &&
7c673cae
FG
578 !ctx->krbd_name && ctx->krbd_fd < 0);
579
580 ret = rbd_open(ioctx, name, &image, NULL);
581 if (ret < 0) {
582 prt("rbd_open(%s) failed\n", name);
583 return ret;
584 }
585
586 ctx->name = strdup(name);
587 ctx->image = image;
588 ctx->krbd_name = NULL;
589 ctx->krbd_fd = -1;
590
591 return 0;
592}
593
594int
595librbd_open(const char *name, struct rbd_ctx *ctx)
596{
597 return __librbd_open(name, ctx);
598}
599
600int
601__librbd_close(struct rbd_ctx *ctx)
602{
603 int ret;
604
11fdf7f2 605 ceph_assert(ctx->name && ctx->image);
7c673cae
FG
606
607 ret = rbd_close(ctx->image);
608 if (ret < 0) {
609 prt("rbd_close(%s) failed\n", ctx->name);
610 return ret;
611 }
612
613 free((void *)ctx->name);
614
615 ctx->name = NULL;
616 ctx->image = NULL;
617
618 return 0;
619}
620
621int
622librbd_close(struct rbd_ctx *ctx)
623{
624 return __librbd_close(ctx);
625}
626
627int
628librbd_verify_object_map(struct rbd_ctx *ctx)
629{
630 int n;
631 uint64_t flags;
632 n = rbd_get_flags(ctx->image, &flags);
633 if (n < 0) {
634 prt("rbd_get_flags() failed\n");
635 return n;
636 }
637
638 if ((flags & RBD_FLAG_OBJECT_MAP_INVALID) != 0) {
639 prt("rbd_get_flags() indicates object map is invalid\n");
640 return -EINVAL;
641 }
642 return 0;
643}
644
645ssize_t
646librbd_read(struct rbd_ctx *ctx, uint64_t off, size_t len, char *buf)
647{
648 ssize_t n;
649
650 n = rbd_read(ctx->image, off, len, buf);
651 if (n < 0)
652 prt("rbd_read(%llu, %zu) failed\n", off, len);
653
654 return n;
655}
656
657ssize_t
658librbd_write(struct rbd_ctx *ctx, uint64_t off, size_t len, const char *buf)
659{
660 ssize_t n;
661 int ret;
662
663 n = rbd_write(ctx->image, off, len, buf);
664 if (n < 0) {
665 prt("rbd_write(%llu, %zu) failed\n", off, len);
666 return n;
667 }
668
669 ret = librbd_verify_object_map(ctx);
670 if (ret < 0) {
671 return ret;
672 }
673 return n;
674}
675
676int
677librbd_flush(struct rbd_ctx *ctx)
678{
679 int ret;
680
681 ret = rbd_flush(ctx->image);
682 if (ret < 0) {
683 prt("rbd_flush failed\n");
684 return ret;
685 }
686
687 return librbd_verify_object_map(ctx);
688}
689
690int
691librbd_discard(struct rbd_ctx *ctx, uint64_t off, uint64_t len)
692{
693 int ret;
694
695 ret = rbd_discard(ctx->image, off, len);
696 if (ret < 0) {
697 prt("rbd_discard(%llu, %llu) failed\n", off, len);
698 return ret;
699 }
700
701 return librbd_verify_object_map(ctx);
702}
703
704ssize_t
705librbd_writesame(struct rbd_ctx *ctx, uint64_t off, size_t len,
706 const char *buf, size_t data_len)
707{
708 ssize_t n;
709 int ret;
710
711 n = rbd_writesame(ctx->image, off, len, buf, data_len, 0);
712 if (n < 0) {
713 prt("rbd_writesame(%llu, %zu) failed\n", off, len);
714 return n;
715 }
716
717 ret = librbd_verify_object_map(ctx);
718 if (ret < 0) {
719 return ret;
720 }
721 return n;
722}
723
c07f9fc5
FG
724ssize_t
725librbd_compare_and_write(struct rbd_ctx *ctx, uint64_t off, size_t len,
726 const char *cmp_buf, const char *buf)
727{
728 ssize_t n;
729 int ret;
730 uint64_t mismatch_off = 0;
731
732 n = rbd_compare_and_write(ctx->image, off, len, cmp_buf, buf, &mismatch_off, 0);
733 if (n == -EINVAL) {
734 return n;
735 } else if (n < 0) {
736 prt("rbd_compare_and_write mismatch(%llu, %zu, %llu) failed\n",
737 off, len, mismatch_off);
738 return n;
739 }
740
741 ret = librbd_verify_object_map(ctx);
742 if (ret < 0) {
743 return ret;
744 }
745 return n;
746
747}
748
7c673cae
FG
749int
750librbd_get_size(struct rbd_ctx *ctx, uint64_t *size)
751{
7c673cae
FG
752 int ret;
753
11fdf7f2 754 ret = rbd_get_size(ctx->image, size);
7c673cae 755 if (ret < 0) {
11fdf7f2 756 prt("rbd_get_size failed\n");
7c673cae
FG
757 return ret;
758 }
759
7c673cae
FG
760 return 0;
761}
762
763int
764__librbd_resize(struct rbd_ctx *ctx, uint64_t size)
765{
766 int ret;
767
768 ret = rbd_resize(ctx->image, size);
769 if (ret < 0) {
770 prt("rbd_resize(%llu) failed\n", size);
771 return ret;
772 }
773
774 return librbd_verify_object_map(ctx);
775}
776
777int
778librbd_resize(struct rbd_ctx *ctx, uint64_t size)
779{
780 return __librbd_resize(ctx, size);
781}
782
11fdf7f2
TL
783int
784__librbd_deep_copy(struct rbd_ctx *ctx, const char *src_snapname,
785 const char *dst_imagename, uint64_t features, int *order,
786 int stripe_unit, int stripe_count) {
787 int ret;
788
789 rbd_image_options_t opts;
790 rbd_image_options_create(&opts);
791 BOOST_SCOPE_EXIT_ALL( (&opts) ) {
792 rbd_image_options_destroy(opts);
793 };
794 ret = rbd_image_options_set_uint64(opts, RBD_IMAGE_OPTION_FEATURES,
795 features);
796 ceph_assert(ret == 0);
797 ret = rbd_image_options_set_uint64(opts, RBD_IMAGE_OPTION_ORDER,
798 *order);
799 ceph_assert(ret == 0);
800 ret = rbd_image_options_set_uint64(opts, RBD_IMAGE_OPTION_STRIPE_UNIT,
801 stripe_unit);
802 ceph_assert(ret == 0);
803 ret = rbd_image_options_set_uint64(opts, RBD_IMAGE_OPTION_STRIPE_COUNT,
804 stripe_count);
805 ceph_assert(ret == 0);
806
807 ret = rbd_snap_set(ctx->image, src_snapname);
808 if (ret < 0) {
809 prt("rbd_snap_set(%s@%s) failed\n", ctx->name, src_snapname);
810 return ret;
811 }
812
813 ret = rbd_deep_copy(ctx->image, ioctx, dst_imagename, opts);
814 if (ret < 0) {
815 prt("rbd_deep_copy(%s@%s -> %s) failed\n",
816 ctx->name, src_snapname, dst_imagename);
817 return ret;
818 }
819
820 ret = rbd_snap_set(ctx->image, "");
821 if (ret < 0) {
822 prt("rbd_snap_set(%s@) failed\n", ctx->name);
823 return ret;
824 }
825
826 rbd_image_t image;
827 ret = rbd_open(ioctx, dst_imagename, &image, nullptr);
828 if (ret < 0) {
829 prt("rbd_open(%s) failed\n", dst_imagename);
830 return ret;
831 }
832
833 ret = rbd_snap_unprotect(image, src_snapname);
834 if (ret < 0) {
835 prt("rbd_snap_unprotect(%s@%s) failed\n", dst_imagename,
836 src_snapname);
837 return ret;
838 }
839
840 ret = rbd_snap_remove(image, src_snapname);
841 if (ret < 0) {
842 prt("rbd_snap_remove(%s@%s) failed\n", dst_imagename,
843 src_snapname);
844 return ret;
845 }
846
847 ret = rbd_close(image);
848 if (ret < 0) {
849 prt("rbd_close(%s) failed\n", dst_imagename);
850 return ret;
851 }
852
853 return 0;
854}
855
7c673cae
FG
856int
857__librbd_clone(struct rbd_ctx *ctx, const char *src_snapname,
858 const char *dst_imagename, int *order, int stripe_unit,
859 int stripe_count, bool krbd)
860{
861 int ret;
862
863 ret = rbd_snap_create(ctx->image, src_snapname);
864 if (ret < 0) {
865 prt("rbd_snap_create(%s@%s) failed\n", ctx->name,
866 src_snapname);
867 return ret;
868 }
869
870 ret = rbd_snap_protect(ctx->image, src_snapname);
871 if (ret < 0) {
872 prt("rbd_snap_protect(%s@%s) failed\n", ctx->name,
873 src_snapname);
874 return ret;
875 }
876
11fdf7f2
TL
877 uint64_t features;
878 ret = get_features(&features);
879 if (ret < 0) {
880 return ret;
881 }
882
7c673cae
FG
883 if (krbd) {
884 features &= ~(RBD_FEATURE_OBJECT_MAP |
885 RBD_FEATURE_FAST_DIFF |
886 RBD_FEATURE_DEEP_FLATTEN |
887 RBD_FEATURE_JOURNALING);
888 }
11fdf7f2
TL
889 if (deep_copy) {
890 ret = __librbd_deep_copy(ctx, src_snapname, dst_imagename, features,
891 order, stripe_unit, stripe_count);
892 if (ret < 0) {
893 prt("deep_copy(%s@%s -> %s) failed\n", ctx->name,
894 src_snapname, dst_imagename);
895 return ret;
896 }
897 } else {
898 ret = rbd_clone2(ioctx, ctx->name, src_snapname, ioctx,
899 dst_imagename, features, order,
900 stripe_unit, stripe_count);
901 if (ret < 0) {
902 prt("rbd_clone2(%s@%s -> %s) failed\n", ctx->name,
903 src_snapname, dst_imagename);
904 return ret;
905 }
7c673cae
FG
906 }
907
908 return 0;
909}
910
911int
912librbd_clone(struct rbd_ctx *ctx, const char *src_snapname,
913 const char *dst_imagename, int *order, int stripe_unit,
914 int stripe_count)
915{
916 return __librbd_clone(ctx, src_snapname, dst_imagename, order,
917 stripe_unit, stripe_count, false);
918}
919
920int
921__librbd_flatten(struct rbd_ctx *ctx)
922{
923 int ret;
924
925 ret = rbd_flatten(ctx->image);
926 if (ret < 0) {
927 prt("rbd_flatten failed\n");
928 return ret;
929 }
930
931 return librbd_verify_object_map(ctx);
932}
933
934int
935librbd_flatten(struct rbd_ctx *ctx)
936{
937 return __librbd_flatten(ctx);
938}
939
940const struct rbd_operations librbd_operations = {
941 librbd_open,
942 librbd_close,
943 librbd_read,
944 librbd_write,
945 librbd_flush,
946 librbd_discard,
947 librbd_get_size,
948 librbd_resize,
949 librbd_clone,
950 librbd_flatten,
951 librbd_writesame,
c07f9fc5 952 librbd_compare_and_write,
7c673cae
FG
953};
954
11fdf7f2 955#if defined(WITH_KRBD)
7c673cae
FG
956int
957krbd_open(const char *name, struct rbd_ctx *ctx)
958{
959 char *devnode;
960 int fd;
961 int ret;
962
963 ret = __librbd_open(name, ctx);
964 if (ret < 0)
965 return ret;
966
11fdf7f2 967 ret = krbd_map(krbd, pool, "", name, "", "", &devnode);
7c673cae
FG
968 if (ret < 0) {
969 prt("krbd_map(%s) failed\n", name);
970 return ret;
971 }
972
973 fd = open(devnode, O_RDWR | o_direct);
974 if (fd < 0) {
975 ret = -errno;
976 prt("open(%s) failed\n", devnode);
977 return ret;
978 }
979
980 ctx->krbd_name = devnode;
981 ctx->krbd_fd = fd;
982
983 return 0;
984}
985
986int
987krbd_close(struct rbd_ctx *ctx)
988{
989 int ret;
990
11fdf7f2 991 ceph_assert(ctx->krbd_name && ctx->krbd_fd >= 0);
7c673cae
FG
992
993 if (close(ctx->krbd_fd) < 0) {
994 ret = -errno;
995 prt("close(%s) failed\n", ctx->krbd_name);
996 return ret;
997 }
998
999 ret = krbd_unmap(krbd, ctx->krbd_name, "");
1000 if (ret < 0) {
1001 prt("krbd_unmap(%s) failed\n", ctx->krbd_name);
1002 return ret;
1003 }
1004
1005 free((void *)ctx->krbd_name);
1006
1007 ctx->krbd_name = NULL;
1008 ctx->krbd_fd = -1;
1009
1010 return __librbd_close(ctx);
1011}
11fdf7f2 1012#endif // WITH_KRBD
7c673cae 1013
11fdf7f2 1014#if defined(__linux__)
7c673cae
FG
1015ssize_t
1016krbd_read(struct rbd_ctx *ctx, uint64_t off, size_t len, char *buf)
1017{
1018 ssize_t n;
1019
1020 n = pread(ctx->krbd_fd, buf, len, off);
1021 if (n < 0) {
1022 n = -errno;
1023 prt("pread(%llu, %zu) failed\n", off, len);
1024 return n;
1025 }
1026
1027 return n;
1028}
1029
1030ssize_t
1031krbd_write(struct rbd_ctx *ctx, uint64_t off, size_t len, const char *buf)
1032{
1033 ssize_t n;
1034
1035 n = pwrite(ctx->krbd_fd, buf, len, off);
1036 if (n < 0) {
1037 n = -errno;
1038 prt("pwrite(%llu, %zu) failed\n", off, len);
1039 return n;
1040 }
1041
1042 return n;
1043}
1044
1045int
1046__krbd_flush(struct rbd_ctx *ctx, bool invalidate)
1047{
1048 int ret;
1049
1050 if (o_direct)
1051 return 0;
1052
1053 /*
1054 * BLKFLSBUF will sync the filesystem on top of the device (we
1055 * don't care about that here, since we write directly to it),
1056 * write out any dirty buffers and invalidate the buffer cache.
1057 * It won't do a hardware cache flush.
1058 *
1059 * fsync() will write out any dirty buffers and do a hardware
1060 * cache flush (which we don't care about either, because for
1061 * krbd it's a noop). It won't try to empty the buffer cache
1062 * nor poke the filesystem before writing out.
1063 *
1064 * Given that, for our purposes, fsync is a flush, while
1065 * BLKFLSBUF is a flush+invalidate.
1066 */
1067 if (invalidate)
1068 ret = ioctl(ctx->krbd_fd, BLKFLSBUF, NULL);
1069 else
1070 ret = fsync(ctx->krbd_fd);
1071 if (ret < 0) {
1072 ret = -errno;
1073 prt("%s failed\n", invalidate ? "BLKFLSBUF" : "fsync");
1074 return ret;
1075 }
1076
1077 return 0;
1078}
1079
1080int
1081krbd_flush(struct rbd_ctx *ctx)
1082{
1083 return __krbd_flush(ctx, false);
1084}
1085
1086int
1087krbd_discard(struct rbd_ctx *ctx, uint64_t off, uint64_t len)
1088{
1089 uint64_t range[2] = { off, len };
1090 int ret;
1091
1092 /*
a8e16298 1093 * BLKZEROOUT goes straight to disk and doesn't do anything
7c673cae
FG
1094 * about dirty buffers. This means we need to flush so that
1095 *
1096 * write 0..3M
1097 * discard 1..2M
1098 *
1099 * results in "data 0000 data" rather than "data data data" on
1100 * disk and invalidate so that
1101 *
1102 * discard 1..2M
1103 * read 0..3M
1104 *
1105 * returns "data 0000 data" rather than "data data data" in
1106 * case 1..2M was cached.
a8e16298
TL
1107 *
1108 * Note: These cache coherency issues are supposed to be fixed
1109 * in recent kernels.
7c673cae
FG
1110 */
1111 ret = __krbd_flush(ctx, true);
1112 if (ret < 0)
1113 return ret;
1114
1115 /*
a8e16298 1116 * off and len must be 512-byte aligned, otherwise BLKZEROOUT
7c673cae
FG
1117 * will fail with -EINVAL. This means that -K (enable krbd
1118 * mode) requires -h 512 or similar.
1119 */
a8e16298 1120 if (ioctl(ctx->krbd_fd, BLKZEROOUT, &range) < 0) {
7c673cae 1121 ret = -errno;
a8e16298 1122 prt("BLKZEROOUT(%llu, %llu) failed\n", off, len);
7c673cae
FG
1123 return ret;
1124 }
1125
1126 return 0;
1127}
1128
1129int
1130krbd_get_size(struct rbd_ctx *ctx, uint64_t *size)
1131{
1132 uint64_t bytes;
1133
1134 if (ioctl(ctx->krbd_fd, BLKGETSIZE64, &bytes) < 0) {
1135 int ret = -errno;
1136 prt("BLKGETSIZE64 failed\n");
1137 return ret;
1138 }
1139
1140 *size = bytes;
1141
1142 return 0;
1143}
1144
1145int
1146krbd_resize(struct rbd_ctx *ctx, uint64_t size)
1147{
1148 int ret;
1149
11fdf7f2 1150 ceph_assert(size % truncbdy == 0);
7c673cae
FG
1151
1152 /*
1153 * When krbd detects a size change, it calls revalidate_disk(),
1154 * which ends up calling invalidate_bdev(), which invalidates
1155 * clean pages and does nothing about dirty pages beyond the
1156 * new size. The preceding cache flush makes sure those pages
1157 * are invalidated, which is what we need on shrink so that
1158 *
1159 * write 0..1M
1160 * resize 0
1161 * resize 2M
1162 * read 0..2M
1163 *
1164 * returns "0000 0000" rather than "data 0000".
1165 */
1166 ret = __krbd_flush(ctx, false);
1167 if (ret < 0)
1168 return ret;
1169
1170 return __librbd_resize(ctx, size);
1171}
1172
1173int
1174krbd_clone(struct rbd_ctx *ctx, const char *src_snapname,
1175 const char *dst_imagename, int *order, int stripe_unit,
1176 int stripe_count)
1177{
1178 int ret;
1179
1180 ret = __krbd_flush(ctx, false);
1181 if (ret < 0)
1182 return ret;
1183
1184 return __librbd_clone(ctx, src_snapname, dst_imagename, order,
1185 stripe_unit, stripe_count, true);
1186}
1187
1188int
1189krbd_flatten(struct rbd_ctx *ctx)
1190{
1191 int ret;
1192
1193 ret = __krbd_flush(ctx, false);
1194 if (ret < 0)
1195 return ret;
1196
1197 return __librbd_flatten(ctx);
1198}
11fdf7f2 1199#endif // __linux__
7c673cae 1200
11fdf7f2 1201#if defined(WITH_KRBD)
7c673cae
FG
1202const struct rbd_operations krbd_operations = {
1203 krbd_open,
1204 krbd_close,
1205 krbd_read,
1206 krbd_write,
1207 krbd_flush,
1208 krbd_discard,
1209 krbd_get_size,
1210 krbd_resize,
1211 krbd_clone,
1212 krbd_flatten,
1213 NULL,
1214};
11fdf7f2 1215#endif // WITH_KRBD
7c673cae 1216
11fdf7f2 1217#if defined(__linux__)
7c673cae
FG
1218int
1219nbd_open(const char *name, struct rbd_ctx *ctx)
1220{
1221 int r;
1222 int fd;
1223 char dev[4096];
1224 char *devnode;
1225
1226 SubProcess process("rbd-nbd", SubProcess::KEEP, SubProcess::PIPE,
1227 SubProcess::KEEP);
1228 process.add_cmd_arg("map");
eafe8130 1229 process.add_cmd_arg("--timeout=600");
7c673cae
FG
1230 std::string img;
1231 img.append(pool);
1232 img.append("/");
1233 img.append(name);
1234 process.add_cmd_arg(img.c_str());
1235
1236 r = __librbd_open(name, ctx);
1237 if (r < 0)
1238 return r;
1239
1240 r = process.spawn();
1241 if (r < 0) {
1242 prt("nbd_open failed to run rbd-nbd error: %s\n", process.err().c_str());
1243 return r;
1244 }
1245 r = safe_read(process.get_stdout(), dev, sizeof(dev));
1246 if (r < 0) {
1247 prt("nbd_open failed to get nbd device path\n");
1248 return r;
1249 }
1250 for (int i = 0; i < r; ++i)
1251 if (dev[i] == 10 || dev[i] == 13)
1252 dev[i] = 0;
1253 dev[r] = 0;
1254 r = process.join();
1255 if (r) {
1256 prt("rbd-nbd failed with error: %s", process.err().c_str());
1257 return -EINVAL;
1258 }
1259
1260 devnode = strdup(dev);
1261 if (!devnode)
1262 return -ENOMEM;
1263
1264 fd = open(devnode, O_RDWR | o_direct);
1265 if (fd < 0) {
1266 r = -errno;
1267 prt("open(%s) failed\n", devnode);
1268 return r;
1269 }
1270
1271 ctx->krbd_name = devnode;
1272 ctx->krbd_fd = fd;
1273
1274 return 0;
1275}
1276
1277int
1278nbd_close(struct rbd_ctx *ctx)
1279{
1280 int r;
1281
11fdf7f2 1282 ceph_assert(ctx->krbd_name && ctx->krbd_fd >= 0);
7c673cae
FG
1283
1284 if (close(ctx->krbd_fd) < 0) {
1285 r = -errno;
1286 prt("close(%s) failed\n", ctx->krbd_name);
1287 return r;
1288 }
1289
1290 SubProcess process("rbd-nbd");
1291 process.add_cmd_arg("unmap");
1292 process.add_cmd_arg(ctx->krbd_name);
1293
1294 r = process.spawn();
1295 if (r < 0) {
1296 prt("nbd_close failed to run rbd-nbd error: %s\n", process.err().c_str());
1297 return r;
1298 }
1299 r = process.join();
1300 if (r) {
1301 prt("rbd-nbd failed with error: %d", process.err().c_str());
1302 return -EINVAL;
1303 }
1304
1305 free((void *)ctx->krbd_name);
1306
1307 ctx->krbd_name = NULL;
1308 ctx->krbd_fd = -1;
1309
1310 return __librbd_close(ctx);
1311}
1312
1313int
1314nbd_clone(struct rbd_ctx *ctx, const char *src_snapname,
1315 const char *dst_imagename, int *order, int stripe_unit,
1316 int stripe_count)
1317{
1318 int ret;
1319
1320 ret = __krbd_flush(ctx, false);
1321 if (ret < 0)
1322 return ret;
1323
1324 return __librbd_clone(ctx, src_snapname, dst_imagename, order,
1325 stripe_unit, stripe_count, false);
1326}
1327
/*
 * Operation table for the rbd-nbd backend.
 *
 * Only open, close and clone are nbd-specific; every other slot reuses
 * the krbd_* implementation.  The initializers are positional and must
 * follow the member order of struct rbd_operations (declared
 * elsewhere).  The trailing NULL leaves one operation unimplemented --
 * presumably writesame; TODO confirm against the struct declaration.
 */
const struct rbd_operations nbd_operations = {
	nbd_open,
	nbd_close,
	krbd_read,
	krbd_write,
	krbd_flush,
	krbd_discard,
	krbd_get_size,
	krbd_resize,
	nbd_clone,
	krbd_flatten,
	NULL,
};
11fdf7f2
TL
1341#endif // __linux__
1342
1343#if defined(__FreeBSD__)
1344int
1345ggate_open(const char *name, struct rbd_ctx *ctx)
1346{
1347 int r;
1348 int fd;
1349 char dev[4096];
1350 char *devnode;
1351
1352 SubProcess process("rbd-ggate", SubProcess::KEEP, SubProcess::PIPE,
1353 SubProcess::KEEP);
1354 process.add_cmd_arg("map");
1355 std::string img;
1356 img.append(pool);
1357 img.append("/");
1358 img.append(name);
1359 process.add_cmd_arg(img.c_str());
1360
1361 r = __librbd_open(name, ctx);
1362 if (r < 0) {
1363 return r;
1364 }
1365
1366 r = process.spawn();
1367 if (r < 0) {
1368 prt("ggate_open failed to run rbd-ggate: %s\n",
1369 process.err().c_str());
1370 return r;
1371 }
1372 r = safe_read(process.get_stdout(), dev, sizeof(dev));
1373 if (r < 0) {
1374 prt("ggate_open failed to get ggate device path\n");
1375 return r;
1376 }
1377 for (int i = 0; i < r; ++i) {
1378 if (dev[i] == '\r' || dev[i] == '\n') {
1379 dev[i] = 0;
1380 }
1381 }
1382 dev[r] = 0;
1383 r = process.join();
1384 if (r) {
1385 prt("rbd-ggate failed with error: %s", process.err().c_str());
1386 return -EINVAL;
1387 }
1388
1389 devnode = strdup(dev);
1390 if (!devnode) {
1391 return -ENOMEM;
1392 }
1393
1394 for (int i = 0; i < 100; i++) {
1395 fd = open(devnode, O_RDWR | o_direct);
1396 if (fd >= 0 || errno != ENOENT) {
1397 break;
1398 }
1399 usleep(100000);
1400 }
1401 if (fd < 0) {
1402 r = -errno;
1403 prt("open(%s) failed\n", devnode);
1404 return r;
1405 }
1406
1407 ctx->krbd_name = devnode;
1408 ctx->krbd_fd = fd;
1409
1410 return 0;
1411}
1412
1413int
1414ggate_close(struct rbd_ctx *ctx)
1415{
1416 int r;
1417
1418 ceph_assert(ctx->krbd_name && ctx->krbd_fd >= 0);
1419
1420 if (close(ctx->krbd_fd) < 0) {
1421 r = -errno;
1422 prt("close(%s) failed\n", ctx->krbd_name);
1423 return r;
1424 }
1425
1426 SubProcess process("rbd-ggate");
1427 process.add_cmd_arg("unmap");
1428 process.add_cmd_arg(ctx->krbd_name);
1429
1430 r = process.spawn();
1431 if (r < 0) {
1432 prt("ggate_close failed to run rbd-nbd: %s\n",
1433 process.err().c_str());
1434 return r;
1435 }
1436 r = process.join();
1437 if (r) {
1438 prt("rbd-ggate failed with error: %d", process.err().c_str());
1439 return -EINVAL;
1440 }
1441
1442 free((void *)ctx->krbd_name);
1443
1444 ctx->krbd_name = NULL;
1445 ctx->krbd_fd = -1;
1446
1447 return __librbd_close(ctx);
1448}
1449
1450ssize_t
1451ggate_read(struct rbd_ctx *ctx, uint64_t off, size_t len, char *buf)
1452{
1453 ssize_t n;
1454
1455 n = pread(ctx->krbd_fd, buf, len, off);
1456 if (n < 0) {
1457 n = -errno;
1458 prt("pread(%llu, %zu) failed\n", off, len);
1459 return n;
1460 }
1461
1462 return n;
1463}
1464
1465ssize_t
1466ggate_write(struct rbd_ctx *ctx, uint64_t off, size_t len, const char *buf)
1467{
1468 ssize_t n;
1469
1470 n = pwrite(ctx->krbd_fd, buf, len, off);
1471 if (n < 0) {
1472 n = -errno;
1473 prt("pwrite(%llu, %zu) failed\n", off, len);
1474 return n;
1475 }
1476
1477 return n;
1478}
1479
1480int
1481__ggate_flush(struct rbd_ctx *ctx, bool invalidate)
1482{
1483 int ret;
1484
1485 if (o_direct) {
1486 return 0;
1487 }
1488
1489 if (invalidate) {
1490 ret = ioctl(ctx->krbd_fd, DIOCGFLUSH, NULL);
1491 } else {
1492 ret = fsync(ctx->krbd_fd);
1493 }
1494 if (ret < 0) {
1495 ret = -errno;
1496 prt("%s failed\n", invalidate ? "DIOCGFLUSH" : "fsync");
1497 return ret;
1498 }
1499
1500 return 0;
1501}
1502
/*
 * Flush entry point for the ggate operation table: forwards to
 * __ggate_flush() with invalidate == false (the fsync() path).
 */
int
ggate_flush(struct rbd_ctx *ctx)
{
	return __ggate_flush(ctx, false);
}
1508
1509int
1510ggate_discard(struct rbd_ctx *ctx, uint64_t off, uint64_t len)
1511{
1512 off_t range[2] = {static_cast<off_t>(off), static_cast<off_t>(len)};
1513 int ret;
1514
1515 ret = __ggate_flush(ctx, true);
1516 if (ret < 0) {
1517 return ret;
1518 }
1519
1520 if (ioctl(ctx->krbd_fd, DIOCGDELETE, &range) < 0) {
1521 ret = -errno;
1522 prt("DIOCGDELETE(%llu, %llu) failed\n", off, len);
1523 return ret;
1524 }
1525
1526 return 0;
1527}
1528
1529int
1530ggate_get_size(struct rbd_ctx *ctx, uint64_t *size)
1531{
1532 off_t bytes;
1533
1534 if (ioctl(ctx->krbd_fd, DIOCGMEDIASIZE, &bytes) < 0) {
1535 int ret = -errno;
1536 prt("DIOCGMEDIASIZE failed\n");
1537 return ret;
1538 }
1539
1540 *size = bytes;
1541
1542 return 0;
1543}
1544
1545int
1546ggate_resize(struct rbd_ctx *ctx, uint64_t size)
1547{
1548 int ret;
1549
1550 ceph_assert(size % truncbdy == 0);
1551
1552 ret = __ggate_flush(ctx, false);
1553 if (ret < 0) {
1554 return ret;
1555 }
1556
1557 return __librbd_resize(ctx, size);
1558}
1559
1560int
1561ggate_clone(struct rbd_ctx *ctx, const char *src_snapname,
1562 const char *dst_imagename, int *order, int stripe_unit,
1563 int stripe_count)
1564{
1565 int ret;
1566
1567 ret = __ggate_flush(ctx, false);
1568 if (ret < 0) {
1569 return ret;
1570 }
1571
1572 return __librbd_clone(ctx, src_snapname, dst_imagename, order,
1573 stripe_unit, stripe_count, false);
1574}
1575
1576int
1577ggate_flatten(struct rbd_ctx *ctx)
1578{
1579 int ret;
1580
1581 ret = __ggate_flush(ctx, false);
1582 if (ret < 0) {
1583 return ret;
1584 }
1585
1586 return __librbd_flatten(ctx);
1587}
1588
/*
 * Operation table for the rbd-ggate (FreeBSD) backend.
 *
 * The initializers are positional and must follow the member order of
 * struct rbd_operations (declared elsewhere).  The trailing NULL leaves
 * one operation unimplemented -- presumably writesame; TODO confirm
 * against the struct declaration.
 */
const struct rbd_operations ggate_operations = {
	ggate_open,
	ggate_close,
	ggate_read,
	ggate_write,
	ggate_flush,
	ggate_discard,
	ggate_get_size,
	ggate_resize,
	ggate_clone,
	ggate_flatten,
	NULL,
};
1602#endif // __FreeBSD__
7c673cae
FG
1603
/* Context of the image currently under test; passed to every ops-> call. */
struct rbd_ctx ctx = RBD_CTX_INIT;
/* Backend operation table in use; initialized to the librbd implementation. */
const struct rbd_operations *ops = &librbd_operations;
1606
1607static bool rbd_image_has_parent(struct rbd_ctx *ctx)
1608{
1609 int ret;
11fdf7f2
TL
1610 rbd_linked_image_spec_t parent_image;
1611 rbd_snap_spec_t parent_snap;
7c673cae 1612
11fdf7f2
TL
1613 ret = rbd_get_parent(ctx->image, &parent_image, &parent_snap);
1614 if (ret < 0 && ret != -ENOENT) {
7c673cae
FG
1615 prterrcode("rbd_get_parent_info", ret);
1616 exit(1);
1617 }
11fdf7f2
TL
1618 rbd_linked_image_spec_cleanup(&parent_image);
1619 rbd_snap_spec_cleanup(&parent_snap);
7c673cae
FG
1620
1621 return !ret;
1622}
1623
1624/*
1625 * fsx
1626 */
1627
1628void
1629log4(int operation, int arg0, int arg1, int arg2)
1630{
1631 struct log_entry *le;
1632
1633 le = &oplog[logptr];
1634 le->operation = operation;
1635 if (closeopen)
1636 le->operation = ~ le->operation;
1637 le->args[0] = arg0;
1638 le->args[1] = arg1;
1639 le->args[2] = arg2;
1640 logptr++;
1641 logcount++;
1642 if (logptr >= LOGSIZE)
1643 logptr = 0;
1644}
1645
1646void
1647logdump(void)
1648{
1649 int i, count, down;
1650 struct log_entry *lp;
1651 const char *falloc_type[3] = {"PAST_EOF", "EXTENDING", "INTERIOR"};
1652
1653 prt("LOG DUMP (%d total operations):\n", logcount);
1654 if (logcount < LOGSIZE) {
1655 i = 0;
1656 count = logcount;
1657 } else {
1658 i = logptr;
1659 count = LOGSIZE;
1660 }
1661 for ( ; count > 0; count--) {
1662 int opnum;
1663
1664 opnum = i+1 + (logcount/LOGSIZE)*LOGSIZE;
1665 prt("%d(%3d mod 256): ", opnum, opnum%256);
1666 lp = &oplog[i];
1667 if ((closeopen = lp->operation < 0))
1668 lp->operation = ~ lp->operation;
1669
1670 switch (lp->operation) {
1671 case OP_MAPREAD:
1672 prt("MAPREAD 0x%x thru 0x%x\t(0x%x bytes)",
1673 lp->args[0], lp->args[0] + lp->args[1] - 1,
1674 lp->args[1]);
1675 if (badoff >= lp->args[0] && badoff <
1676 lp->args[0] + lp->args[1])
1677 prt("\t***RRRR***");
1678 break;
1679 case OP_MAPWRITE:
1680 prt("MAPWRITE 0x%x thru 0x%x\t(0x%x bytes)",
1681 lp->args[0], lp->args[0] + lp->args[1] - 1,
1682 lp->args[1]);
1683 if (badoff >= lp->args[0] && badoff <
1684 lp->args[0] + lp->args[1])
1685 prt("\t******WWWW");
1686 break;
1687 case OP_READ:
1688 prt("READ 0x%x thru 0x%x\t(0x%x bytes)",
1689 lp->args[0], lp->args[0] + lp->args[1] - 1,
1690 lp->args[1]);
1691 if (badoff >= lp->args[0] &&
1692 badoff < lp->args[0] + lp->args[1])
1693 prt("\t***RRRR***");
1694 break;
1695 case OP_WRITE:
1696 prt("WRITE 0x%x thru 0x%x\t(0x%x bytes)",
1697 lp->args[0], lp->args[0] + lp->args[1] - 1,
1698 lp->args[1]);
1699 if (lp->args[0] > lp->args[2])
1700 prt(" HOLE");
1701 else if (lp->args[0] + lp->args[1] > lp->args[2])
1702 prt(" EXTEND");
1703 if ((badoff >= lp->args[0] || badoff >=lp->args[2]) &&
1704 badoff < lp->args[0] + lp->args[1])
1705 prt("\t***WWWW");
1706 break;
1707 case OP_TRUNCATE:
1708 down = lp->args[0] < lp->args[1];
1709 prt("TRUNCATE %s\tfrom 0x%x to 0x%x",
1710 down ? "DOWN" : "UP", lp->args[1], lp->args[0]);
1711 if (badoff >= lp->args[!down] &&
1712 badoff < lp->args[!!down])
1713 prt("\t******WWWW");
1714 break;
1715 case OP_FALLOCATE:
1716 /* 0: offset 1: length 2: where alloced */
1717 prt("FALLOC 0x%x thru 0x%x\t(0x%x bytes) %s",
1718 lp->args[0], lp->args[0] + lp->args[1],
1719 lp->args[1], falloc_type[lp->args[2]]);
1720 if (badoff >= lp->args[0] &&
1721 badoff < lp->args[0] + lp->args[1])
1722 prt("\t******FFFF");
1723 break;
1724 case OP_PUNCH_HOLE:
1725 prt("PUNCH 0x%x thru 0x%x\t(0x%x bytes)",
1726 lp->args[0], lp->args[0] + lp->args[1] - 1,
1727 lp->args[1]);
1728 if (badoff >= lp->args[0] && badoff <
1729 lp->args[0] + lp->args[1])
1730 prt("\t******PPPP");
1731 break;
1732 case OP_WRITESAME:
1733 prt("WRITESAME 0x%x thru 0x%x\t(0x%x bytes) data_size 0x%x",
1734 lp->args[0], lp->args[0] + lp->args[1] - 1,
1735 lp->args[1], lp->args[2]);
1736 if (badoff >= lp->args[0] &&
1737 badoff < lp->args[0] + lp->args[1])
1738 prt("\t***WSWSWSWS");
1739 break;
c07f9fc5
FG
1740 case OP_COMPARE_AND_WRITE:
1741 prt("COMPARE_AND_WRITE 0x%x thru 0x%x\t(0x%x bytes)",
1742 lp->args[0], lp->args[0] + lp->args[1] - 1,
1743 lp->args[1]);
1744 if (lp->args[0] > lp->args[2])
1745 prt(" HOLE");
1746 else if (lp->args[0] + lp->args[1] > lp->args[2])
1747 prt(" EXTEND");
1748 if ((badoff >= lp->args[0] || badoff >=lp->args[2]) &&
1749 badoff < lp->args[0] + lp->args[1])
1750 prt("\t***WWWW");
1751 break;
7c673cae
FG
1752 case OP_CLONE:
1753 prt("CLONE");
1754 break;
1755 case OP_FLATTEN:
1756 prt("FLATTEN");
1757 break;
1758 case OP_SKIPPED:
1759 prt("SKIPPED (no operation)");
1760 break;
1761 default:
1762 prt("BOGUS LOG ENTRY (operation code = %d)!",
1763 lp->operation);
1764 }
1765 if (closeopen)
1766 prt("\n\t\tCLOSE/OPEN");
1767 prt("\n");
1768 i++;
1769 if (i == LOGSIZE)
1770 i = 0;
1771 }
1772}
1773
1774void
1775save_buffer(char *buffer, off_t bufferlength, int fd)
1776{
1777 off_t ret;
1778 ssize_t byteswritten;
1779
1780 if (fd <= 0 || bufferlength == 0)
1781 return;
1782
1783 if (bufferlength > SSIZE_MAX) {
1784 prt("fsx flaw: overflow in save_buffer\n");
1785 exit(67);
1786 }
1787
1788 ret = lseek(fd, (off_t)0, SEEK_SET);
1789 if (ret == (off_t)-1)
1790 prterr("save_buffer: lseek 0");
1791
1792 byteswritten = write(fd, buffer, (size_t)bufferlength);
1793 if (byteswritten != bufferlength) {
1794 if (byteswritten == -1)
1795 prterr("save_buffer write");
1796 else
1797 warn("save_buffer: short write, 0x%x bytes instead of 0x%llx\n",
1798 (unsigned)byteswritten,
1799 (unsigned long long)bufferlength);
1800 }
1801}
1802
1803
1804void
1805report_failure(int status)
1806{
1807 logdump();
1808
1809 if (fsxgoodfd) {
1810 if (good_buf) {
1811 save_buffer(good_buf, file_size, fsxgoodfd);
1812 prt("Correct content saved for comparison\n");
1813 prt("(maybe hexdump \"%s\" vs \"%s.fsxgood\")\n",
1814 iname, iname);
1815 }
1816 close(fsxgoodfd);
1817 }
1818 sleep(3); // so the log can flush to disk. KLUDGEY!
1819 exit(status);
1820}
1821
/*
 * Read two consecutive bytes at `cp` as one 16-bit value, first byte in
 * the high half (i.e. big-endian), independent of host byte order and of
 * the signedness of char.
 */
#define short_at(cp) \
	((unsigned short)((((unsigned char *)(cp))[0] << 8) | \
			  ((unsigned char *)(cp))[1]))
1824
1825int
1826fsxcmp(char *good_buf, char *temp_buf, unsigned size)
1827{
1828 if (!skip_partial_discard) {
1829 return memcmp(good_buf, temp_buf, size);
1830 }
1831
1832 for (unsigned i = 0; i < size; i++) {
1833 if (good_buf[i] != temp_buf[i] && good_buf[i] != 0) {
1834 return good_buf[i] - temp_buf[i];
1835 }
1836 }
1837 return 0;
1838}
1839
1840void
1841check_buffers(char *good_buf, char *temp_buf, unsigned offset, unsigned size)
1842{
1843 if (fsxcmp(good_buf + offset, temp_buf, size) != 0) {
1844 unsigned i = 0;
1845 unsigned n = 0;
1846
1847 prt("READ BAD DATA: offset = 0x%x, size = 0x%x, fname = %s\n",
1848 offset, size, iname);
1849 prt("OFFSET\tGOOD\tBAD\tRANGE\n");
1850 while (size > 0) {
1851 unsigned char c = good_buf[offset];
1852 unsigned char t = temp_buf[i];
1853 if (c != t) {
1854 if (n < 16) {
1855 unsigned bad = short_at(&temp_buf[i]);
1856 prt("0x%5x\t0x%04x\t0x%04x", offset,
1857 short_at(&good_buf[offset]), bad);
1858 unsigned op = temp_buf[(offset & 1) ? i+1 : i];
1859 prt("\t0x%5x\n", n);
1860 if (op)
1861 prt("operation# (mod 256) for "
1862 "the bad data may be %u\n",
1863 ((unsigned)op & 0xff));
1864 else
1865 prt("operation# (mod 256) for "
1866 "the bad data unknown, check"
1867 " HOLE and EXTEND ops\n");
1868 }
1869 n++;
1870 badoff = offset;
1871 }
1872 offset++;
1873 i++;
1874 size--;
1875 }
1876 report_failure(110);
1877 }
1878}
1879
1880
1881void
1882check_size(void)
1883{
1884 uint64_t size;
1885 int ret;
1886
1887 ret = ops->get_size(&ctx, &size);
1888 if (ret < 0)
1889 prterrcode("check_size: ops->get_size", ret);
1890
1891 if ((uint64_t)file_size != size) {
1892 prt("Size error: expected 0x%llx stat 0x%llx\n",
1893 (unsigned long long)file_size,
1894 (unsigned long long)size);
1895 report_failure(120);
1896 }
1897}
1898
1899#define TRUNC_HACK_SIZE (200ULL << 9) /* 512-byte aligned for krbd */
1900
1901void
1902check_trunc_hack(void)
1903{
1904 uint64_t size;
1905 int ret;
1906
1907 ret = ops->resize(&ctx, 0ULL);
1908 if (ret < 0)
1909 prterrcode("check_trunc_hack: ops->resize pre", ret);
1910
1911 ret = ops->resize(&ctx, TRUNC_HACK_SIZE);
1912 if (ret < 0)
1913 prterrcode("check_trunc_hack: ops->resize actual", ret);
1914
1915 ret = ops->get_size(&ctx, &size);
1916 if (ret < 0)
1917 prterrcode("check_trunc_hack: ops->get_size", ret);
1918
1919 if (size != TRUNC_HACK_SIZE) {
1920 prt("no extend on truncate! not posix!\n");
1921 exit(130);
1922 }
1923
1924 ret = ops->resize(&ctx, 0ULL);
1925 if (ret < 0)
1926 prterrcode("check_trunc_hack: ops->resize post", ret);
1927}
1928
1929int
1930create_image()
1931{
1932 int r;
1933 int order = 0;
1934 char buf[32];
11fdf7f2
TL
1935 char client_name[256];
1936
1937 sprintf(client_name, "client.%s", client_id);
7c673cae 1938
11fdf7f2 1939 r = rados_create2(&cluster, cluster_name, client_name, 0);
7c673cae
FG
1940 if (r < 0) {
1941 simple_err("Could not create cluster handle", r);
1942 return r;
1943 }
1944 rados_conf_parse_env(cluster, NULL);
1945 r = rados_conf_read_file(cluster, NULL);
1946 if (r < 0) {
1947 simple_err("Error reading ceph config file", r);
1948 goto failed_shutdown;
1949 }
1950 r = rados_connect(cluster);
1951 if (r < 0) {
1952 simple_err("Error connecting to cluster", r);
1953 goto failed_shutdown;
1954 }
11fdf7f2 1955#if defined(WITH_KRBD)
7c673cae
FG
1956 r = krbd_create_from_context(rados_cct(cluster), &krbd);
1957 if (r < 0) {
1958 simple_err("Could not create libkrbd handle", r);
1959 goto failed_shutdown;
1960 }
11fdf7f2 1961#endif
7c673cae
FG
1962
1963 r = rados_pool_create(cluster, pool);
1964 if (r < 0 && r != -EEXIST) {
1965 simple_err("Error creating pool", r);
1966 goto failed_krbd;
1967 }
1968 r = rados_ioctx_create(cluster, pool, &ioctx);
1969 if (r < 0) {
1970 simple_err("Error creating ioctx", r);
1971 goto failed_krbd;
1972 }
c07f9fc5
FG
1973 rados_application_enable(ioctx, "rbd", 1);
1974
7c673cae 1975 if (clone_calls || journal_replay) {
11fdf7f2
TL
1976 uint64_t features;
1977 r = get_features(&features);
1978 if (r < 0) {
1979 goto failed_open;
7c673cae 1980 }
11fdf7f2
TL
1981
1982 r = rbd_create2(ioctx, iname, file_size, features, &order);
7c673cae 1983 } else {
11fdf7f2 1984 r = rbd_create(ioctx, iname, file_size, &order);
7c673cae
FG
1985 }
1986 if (r < 0) {
1987 simple_err("Error creating image", r);
1988 goto failed_open;
1989 }
1990
1991 if (journal_replay) {
1992 r = register_journal(ioctx, iname);
1993 if (r < 0) {
1994 goto failed_open;
1995 }
1996 }
1997
1998 r = rados_conf_get(cluster, "rbd_skip_partial_discard", buf,
1999 sizeof(buf));
2000 if (r < 0) {
2001 simple_err("Could not get rbd_skip_partial_discard value", r);
2002 goto failed_open;
2003 }
2004 skip_partial_discard = (strcmp(buf, "true") == 0);
2005
2006 return 0;
2007
2008 failed_open:
2009 rados_ioctx_destroy(ioctx);
2010 failed_krbd:
11fdf7f2 2011#if defined(WITH_KRBD)
7c673cae 2012 krbd_destroy(krbd);
11fdf7f2 2013#endif
7c673cae
FG
2014 failed_shutdown:
2015 rados_shutdown(cluster);
2016 return r;
2017}
2018
2019void
2020doflush(unsigned offset, unsigned size)
2021{
2022 int ret;
2023
2024 if (o_direct)
2025 return;
2026
2027 ret = ops->flush(&ctx);
2028 if (ret < 0)
2029 prterrcode("doflush: ops->flush", ret);
2030}
2031
2032void
2033doread(unsigned offset, unsigned size)
2034{
2035 int ret;
2036
2037 offset -= offset % readbdy;
2038 if (o_direct)
2039 size -= size % readbdy;
2040 if (size == 0) {
2041 if (!quiet && testcalls > simulatedopcount && !o_direct)
2042 prt("skipping zero size read\n");
2043 log4(OP_SKIPPED, OP_READ, offset, size);
2044 return;
2045 }
2046 if (size + offset > file_size) {
2047 if (!quiet && testcalls > simulatedopcount)
2048 prt("skipping seek/read past end of file\n");
2049 log4(OP_SKIPPED, OP_READ, offset, size);
2050 return;
2051 }
2052
2053 log4(OP_READ, offset, size, 0);
2054
2055 if (testcalls <= simulatedopcount)
2056 return;
2057
2058 if (!quiet &&
2059 ((progressinterval && testcalls % progressinterval == 0) ||
2060 (debug &&
2061 (monitorstart == -1 ||
2062 (static_cast<long>(offset + size) > monitorstart &&
2063 (monitorend == -1 ||
2064 static_cast<long>(offset) <= monitorend))))))
2065 prt("%lu read\t0x%x thru\t0x%x\t(0x%x bytes)\n", testcalls,
2066 offset, offset + size - 1, size);
2067
2068 ret = ops->read(&ctx, offset, size, temp_buf);
2069 if (ret != (int)size) {
2070 if (ret < 0)
2071 prterrcode("doread: ops->read", ret);
2072 else
2073 prt("short read: 0x%x bytes instead of 0x%x\n",
2074 ret, size);
2075 report_failure(141);
2076 }
2077
2078 check_buffers(good_buf, temp_buf, offset, size);
2079}
2080
2081
2082void
2083check_eofpage(char *s, unsigned offset, char *p, int size)
2084{
2085 unsigned long last_page, should_be_zero;
2086
2087 if (offset + size <= (file_size & ~page_mask))
2088 return;
2089 /*
2090 * we landed in the last page of the file
2091 * test to make sure the VM system provided 0's
2092 * beyond the true end of the file mapping
2093 * (as required by mmap def in 1996 posix 1003.1)
2094 */
2095 last_page = ((unsigned long)p + (offset & page_mask) + size) & ~page_mask;
2096
2097 for (should_be_zero = last_page + (file_size & page_mask);
2098 should_be_zero < last_page + page_size;
2099 should_be_zero++)
2100 if (*(char *)should_be_zero) {
2101 prt("Mapped %s: non-zero data past EOF (0x%llx) page offset 0x%x is 0x%04x\n",
2102 s, file_size - 1, should_be_zero & page_mask,
2103 short_at(should_be_zero));
2104 report_failure(205);
2105 }
2106}
2107
2108
2109void
2110gendata(char *original_buf, char *good_buf, unsigned offset, unsigned size)
2111{
2112 while (size--) {
2113 good_buf[offset] = testcalls % 256;
2114 if (offset % 2)
2115 good_buf[offset] += original_buf[offset];
2116 offset++;
2117 }
2118}
2119
2120
2121void
2122dowrite(unsigned offset, unsigned size)
2123{
2124 ssize_t ret;
2125 off_t newsize;
2126
2127 offset -= offset % writebdy;
2128 if (o_direct)
2129 size -= size % writebdy;
2130 if (size == 0) {
2131 if (!quiet && testcalls > simulatedopcount && !o_direct)
2132 prt("skipping zero size write\n");
2133 log4(OP_SKIPPED, OP_WRITE, offset, size);
2134 return;
2135 }
2136
2137 log4(OP_WRITE, offset, size, file_size);
2138
2139 gendata(original_buf, good_buf, offset, size);
2140 if (file_size < offset + size) {
2141 newsize = ceil(((double)offset + size) / truncbdy) * truncbdy;
2142 if (file_size < newsize)
2143 memset(good_buf + file_size, '\0', newsize - file_size);
2144 file_size = newsize;
2145 if (lite) {
2146 warn("Lite file size bug in fsx!");
2147 report_failure(149);
2148 }
2149 ret = ops->resize(&ctx, newsize);
2150 if (ret < 0) {
2151 prterrcode("dowrite: ops->resize", ret);
2152 report_failure(150);
2153 }
2154 }
2155
2156 if (testcalls <= simulatedopcount)
2157 return;
2158
2159 if (!quiet &&
2160 ((progressinterval && testcalls % progressinterval == 0) ||
2161 (debug &&
2162 (monitorstart == -1 ||
2163 (static_cast<long>(offset + size) > monitorstart &&
2164 (monitorend == -1 ||
2165 static_cast<long>(offset) <= monitorend))))))
2166 prt("%lu write\t0x%x thru\t0x%x\t(0x%x bytes)\n", testcalls,
2167 offset, offset + size - 1, size);
2168
2169 ret = ops->write(&ctx, offset, size, good_buf + offset);
2170 if (ret != (ssize_t)size) {
2171 if (ret < 0)
2172 prterrcode("dowrite: ops->write", ret);
2173 else
2174 prt("short write: 0x%x bytes instead of 0x%x\n",
2175 ret, size);
2176 report_failure(151);
2177 }
2178
2179 if (flush_enabled)
2180 doflush(offset, size);
2181}
2182
2183
2184void
2185dotruncate(unsigned size)
2186{
2187 int oldsize = file_size;
2188 int ret;
2189
2190 size -= size % truncbdy;
2191 if (size > biggest) {
2192 biggest = size;
2193 if (!quiet && testcalls > simulatedopcount)
2194 prt("truncating to largest ever: 0x%x\n", size);
2195 }
2196
2197 log4(OP_TRUNCATE, size, (unsigned)file_size, 0);
2198
2199 if (size > file_size)
2200 memset(good_buf + file_size, '\0', size - file_size);
2201 else if (size < file_size)
2202 memset(good_buf + size, '\0', file_size - size);
2203 file_size = size;
2204
2205 if (testcalls <= simulatedopcount)
2206 return;
2207
2208 if ((progressinterval && testcalls % progressinterval == 0) ||
2209 (debug && (monitorstart == -1 || monitorend == -1 ||
2210 (long)size <= monitorend)))
2211 prt("%lu trunc\tfrom 0x%x to 0x%x\n", testcalls, oldsize, size);
2212
2213 ret = ops->resize(&ctx, size);
2214 if (ret < 0) {
2215 prterrcode("dotruncate: ops->resize", ret);
2216 report_failure(160);
2217 }
2218}
2219
2220void
2221do_punch_hole(unsigned offset, unsigned length)
2222{
2223 unsigned end_offset;
2224 int max_offset = 0;
2225 int max_len = 0;
2226 int ret;
2227
2228 offset -= offset % holebdy;
2229 length -= length % holebdy;
2230 if (length == 0) {
2231 if (!quiet && testcalls > simulatedopcount)
2232 prt("skipping zero length punch hole\n");
2233 log4(OP_SKIPPED, OP_PUNCH_HOLE, offset, length);
2234 return;
2235 }
2236
2237 if (file_size <= (loff_t)offset) {
2238 if (!quiet && testcalls > simulatedopcount)
2239 prt("skipping hole punch off the end of the file\n");
2240 log4(OP_SKIPPED, OP_PUNCH_HOLE, offset, length);
2241 return;
2242 }
2243
2244 end_offset = offset + length;
2245
2246 log4(OP_PUNCH_HOLE, offset, length, 0);
2247
2248 if (testcalls <= simulatedopcount)
2249 return;
2250
2251 if ((progressinterval && testcalls % progressinterval == 0) ||
2252 (debug && (monitorstart == -1 || monitorend == -1 ||
2253 (long)end_offset <= monitorend))) {
2254 prt("%lu punch\tfrom 0x%x to 0x%x, (0x%x bytes)\n", testcalls,
2255 offset, offset+length, length);
2256 }
2257
2258 ret = ops->discard(&ctx, (unsigned long long)offset,
2259 (unsigned long long)length);
2260 if (ret < 0) {
2261 prterrcode("do_punch_hole: ops->discard", ret);
2262 report_failure(161);
2263 }
2264
2265 max_offset = offset < file_size ? offset : file_size;
2266 max_len = max_offset + length <= file_size ? length :
2267 file_size - max_offset;
2268 memset(good_buf + max_offset, '\0', max_len);
2269}
2270
2271unsigned get_data_size(unsigned size)
2272{
2273 unsigned i;
2274 unsigned hint;
2275 unsigned max = sqrt((double)size) + 1;
2276 unsigned good = 1;
2277 unsigned curr = good;
2278
2279 hint = get_random() % max;
2280
2281 for (i = 1; i < max && curr < hint; i++) {
2282 if (size % i == 0) {
2283 good = curr;
2284 curr = i;
2285 }
2286 }
2287
2288 if (curr == hint)
2289 good = curr;
2290
2291 return good;
2292}
2293
2294void
2295dowritesame(unsigned offset, unsigned size)
2296{
2297 ssize_t ret;
2298 off_t newsize;
2299 unsigned buf_off;
2300 unsigned data_size;
2301 int n;
2302
2303 offset -= offset % writebdy;
2304 if (o_direct)
2305 size -= size % writebdy;
2306 if (size == 0) {
2307 if (!quiet && testcalls > simulatedopcount && !o_direct)
2308 prt("skipping zero size writesame\n");
2309 log4(OP_SKIPPED, OP_WRITESAME, offset, size);
2310 return;
2311 }
2312
2313 data_size = get_data_size(size);
2314
2315 log4(OP_WRITESAME, offset, size, data_size);
2316
2317 gendata(original_buf, good_buf, offset, data_size);
2318 if (file_size < offset + size) {
2319 newsize = ceil(((double)offset + size) / truncbdy) * truncbdy;
2320 if (file_size < newsize)
2321 memset(good_buf + file_size, '\0', newsize - file_size);
2322 file_size = newsize;
2323 if (lite) {
2324 warn("Lite file size bug in fsx!");
2325 report_failure(162);
2326 }
2327 ret = ops->resize(&ctx, newsize);
2328 if (ret < 0) {
2329 prterrcode("dowritesame: ops->resize", ret);
2330 report_failure(163);
2331 }
2332 }
2333
2334 for (n = size / data_size, buf_off = data_size; n > 1; n--) {
2335 memcpy(good_buf + offset + buf_off, good_buf + offset, data_size);
2336 buf_off += data_size;
2337 }
2338
2339 if (testcalls <= simulatedopcount)
2340 return;
2341
2342 if (!quiet &&
2343 ((progressinterval && testcalls % progressinterval == 0) ||
2344 (debug &&
2345 (monitorstart == -1 ||
2346 (static_cast<long>(offset + size) > monitorstart &&
2347 (monitorend == -1 ||
2348 static_cast<long>(offset) <= monitorend))))))
2349 prt("%lu writesame\t0x%x thru\t0x%x\tdata_size\t0x%x(0x%x bytes)\n", testcalls,
2350 offset, offset + size - 1, data_size, size);
2351
2352 ret = ops->writesame(&ctx, offset, size, good_buf + offset, data_size);
2353 if (ret != (ssize_t)size) {
2354 if (ret < 0)
2355 prterrcode("dowritesame: ops->writesame", ret);
2356 else
2357 prt("short writesame: 0x%x bytes instead of 0x%x\n",
2358 ret, size);
2359 report_failure(164);
2360 }
2361
2362 if (flush_enabled)
2363 doflush(offset, size);
2364}
2365
c07f9fc5
FG
2366void
2367docompareandwrite(unsigned offset, unsigned size)
2368{
2369 int ret;
2370
b32b8144
FG
2371 if (skip_partial_discard) {
2372 if (!quiet && testcalls > simulatedopcount)
2373 prt("compare and write disabled\n");
2374 log4(OP_SKIPPED, OP_COMPARE_AND_WRITE, offset, size);
2375 return;
2376 }
2377
c07f9fc5
FG
2378 offset -= offset % writebdy;
2379 if (o_direct)
2380 size -= size % writebdy;
2381
2382 if (size == 0) {
2383 if (!quiet && testcalls > simulatedopcount && !o_direct)
2384 prt("skipping zero size read\n");
2385 log4(OP_SKIPPED, OP_READ, offset, size);
2386 return;
2387 }
2388
2389 if (size + offset > file_size) {
2390 if (!quiet && testcalls > simulatedopcount)
2391 prt("skipping seek/compare past end of file\n");
2392 log4(OP_SKIPPED, OP_COMPARE_AND_WRITE, offset, size);
2393 return;
2394 }
2395
2396 memcpy(temp_buf + offset, good_buf + offset, size);
2397 gendata(original_buf, good_buf, offset, size);
2398 log4(OP_COMPARE_AND_WRITE, offset, size, 0);
2399
2400 if (testcalls <= simulatedopcount)
2401 return;
2402
2403 if (!quiet &&
2404 ((progressinterval && testcalls % progressinterval == 0) ||
2405 (debug &&
2406 (monitorstart == -1 ||
2407 (static_cast<long>(offset + size) > monitorstart &&
2408 (monitorend == -1 ||
2409 static_cast<long>(offset) <= monitorend))))))
2410 prt("%lu compareandwrite\t0x%x thru\t0x%x\t(0x%x bytes)\n", testcalls,
2411 offset, offset + size - 1, size);
2412
2413 ret = ops->compare_and_write(&ctx, offset, size, temp_buf + offset,
2414 good_buf + offset);
2415 if (ret != (ssize_t)size) {
2416 if (ret == -EINVAL) {
2417 memcpy(good_buf + offset, temp_buf + offset, size);
2418 return;
2419 }
2420 if (ret < 0)
2421 prterrcode("docompareandwrite: ops->compare_and_write", ret);
2422 else
2423 prt("short write: 0x%x bytes instead of 0x%x\n", ret, size);
2424 report_failure(151);
2425 return;
2426 }
2427
2428 if (flush_enabled)
2429 doflush(offset, size);
2430}
2431
7c673cae
FG
2432void clone_filename(char *buf, size_t len, int clones)
2433{
11fdf7f2
TL
2434#if __GNUC__ && __GNUC__ >= 8
2435#pragma GCC diagnostic push
2436#pragma GCC diagnostic ignored "-Wformat-truncation"
2437#endif
7c673cae
FG
2438 snprintf(buf, len, "%s/fsx-%s-parent%d",
2439 dirpath, iname, clones);
11fdf7f2
TL
2440#if __GNUC__ && __GNUC__ >= 8
2441#pragma GCC diagnostic pop
2442#endif
7c673cae
FG
2443}
2444
2445void clone_imagename(char *buf, size_t len, int clones)
2446{
2447 if (clones > 0)
2448 snprintf(buf, len, "%s-clone%d", iname, clones);
2449 else
2450 strncpy(buf, iname, len);
2451 buf[len - 1] = '\0';
2452}
2453
2454void replay_imagename(char *buf, size_t len, int clones)
2455{
2456 clone_imagename(buf, len, clones);
2457 strncat(buf, "-replay", len - strlen(buf));
2458 buf[len - 1] = '\0';
2459}
2460
2461void check_clone(int clonenum, bool replay_image);
2462
2463void
2464do_clone()
2465{
2466 char filename[1024];
2467 char imagename[1024];
2468 char lastimagename[1024];
2469 int ret, fd;
2470 int order = 0, stripe_unit = 0, stripe_count = 0;
2471 uint64_t newsize = file_size;
2472
2473 log4(OP_CLONE, 0, 0, 0);
2474 ++num_clones;
2475
2476 if (randomize_striping) {
2477 order = 18 + get_random() % 8;
2478 stripe_unit = 1ull << (order - 1 - (get_random() % 8));
2479 stripe_count = 2 + get_random() % 14;
2480 }
2481
2482 prt("%lu clone\t%d order %d su %d sc %d\n", testcalls, num_clones,
2483 order, stripe_unit, stripe_count);
2484
2485 clone_imagename(imagename, sizeof(imagename), num_clones);
2486 clone_imagename(lastimagename, sizeof(lastimagename),
2487 num_clones - 1);
11fdf7f2 2488 ceph_assert(strcmp(lastimagename, ctx.name) == 0);
7c673cae
FG
2489
2490 ret = ops->clone(&ctx, "snap", imagename, &order, stripe_unit,
2491 stripe_count);
2492 if (ret < 0) {
2493 prterrcode("do_clone: ops->clone", ret);
2494 exit(165);
2495 }
2496
2497 if (randomize_parent_overlap && rbd_image_has_parent(&ctx)) {
2498 int rand = get_random() % 16 + 1; // [1..16]
2499
2500 if (rand < 13) {
2501 uint64_t overlap;
2502
2503 ret = rbd_get_overlap(ctx.image, &overlap);
2504 if (ret < 0) {
2505 prterrcode("do_clone: rbd_get_overlap", ret);
2506 exit(1);
2507 }
2508
2509 if (rand < 10) { // 9/16
2510 newsize = overlap * ((double)rand / 10);
2511 newsize -= newsize % truncbdy;
2512 } else { // 3/16
2513 newsize = 0;
2514 }
2515
11fdf7f2 2516 ceph_assert(newsize != (uint64_t)file_size);
7c673cae
FG
2517 prt("truncating image %s from 0x%llx (overlap 0x%llx) to 0x%llx\n",
2518 ctx.name, file_size, overlap, newsize);
2519
2520 ret = ops->resize(&ctx, newsize);
2521 if (ret < 0) {
2522 prterrcode("do_clone: ops->resize", ret);
2523 exit(1);
2524 }
2525 } else if (rand < 15) { // 2/16
2526 prt("flattening image %s\n", ctx.name);
2527
2528 ret = ops->flatten(&ctx);
2529 if (ret < 0) {
2530 prterrcode("do_clone: ops->flatten", ret);
2531 exit(1);
2532 }
2533 } else { // 2/16
2534 prt("leaving image %s intact\n", ctx.name);
2535 }
2536 }
2537
2538 clone_filename(filename, sizeof(filename), num_clones);
2539 if ((fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, 0666)) < 0) {
2540 simple_err("do_clone: open", -errno);
2541 exit(162);
2542 }
2543 save_buffer(good_buf, newsize, fd);
2544 if ((ret = close(fd)) < 0) {
2545 simple_err("do_clone: close", -errno);
2546 exit(163);
2547 }
2548
2549 /*
2550 * Close parent.
2551 */
2552 if ((ret = ops->close(&ctx)) < 0) {
2553 prterrcode("do_clone: ops->close", ret);
2554 exit(174);
2555 }
2556
2557 if (journal_replay) {
2558 ret = finalize_journal(ioctx, lastimagename, num_clones - 1,
2559 order, stripe_unit, stripe_count);
2560 if (ret < 0) {
2561 exit(EXIT_FAILURE);
2562 }
2563
2564 ret = register_journal(ioctx, imagename);
2565 if (ret < 0) {
2566 exit(EXIT_FAILURE);
2567 }
2568 }
2569
2570 /*
2571 * Open freshly made clone.
2572 */
2573 if ((ret = ops->open(imagename, &ctx)) < 0) {
2574 prterrcode("do_clone: ops->open", ret);
2575 exit(166);
2576 }
2577
2578 if (num_clones > 1) {
2579 if (journal_replay) {
2580 check_clone(num_clones - 2, true);
2581 }
2582 check_clone(num_clones - 2, false);
2583 }
2584}
2585
2586void
2587check_clone(int clonenum, bool replay_image)
2588{
2589 char filename[128];
2590 char imagename[128];
2591 int ret, fd;
2592 struct rbd_ctx cur_ctx = RBD_CTX_INIT;
2593 struct stat file_info;
2594 char *good_buf, *temp_buf;
2595
2596 if (replay_image) {
2597 replay_imagename(imagename, sizeof(imagename), clonenum);
2598 } else {
2599 clone_imagename(imagename, sizeof(imagename), clonenum);
2600 }
2601
2602 if ((ret = ops->open(imagename, &cur_ctx)) < 0) {
2603 prterrcode("check_clone: ops->open", ret);
2604 exit(167);
2605 }
2606
2607 clone_filename(filename, sizeof(filename), clonenum + 1);
2608 if ((fd = open(filename, O_RDONLY)) < 0) {
2609 simple_err("check_clone: open", -errno);
2610 exit(168);
2611 }
2612
2613 prt("checking clone #%d, image %s against file %s\n",
2614 clonenum, imagename, filename);
2615 if ((ret = fstat(fd, &file_info)) < 0) {
2616 simple_err("check_clone: fstat", -errno);
2617 exit(169);
2618 }
2619
2620 good_buf = NULL;
2621 ret = posix_memalign((void **)&good_buf,
11fdf7f2 2622 std::max(writebdy, (int)sizeof(void *)),
7c673cae
FG
2623 file_info.st_size);
2624 if (ret > 0) {
2625 prterrcode("check_clone: posix_memalign(good_buf)", -ret);
2626 exit(96);
2627 }
2628
2629 temp_buf = NULL;
2630 ret = posix_memalign((void **)&temp_buf,
11fdf7f2 2631 std::max(readbdy, (int)sizeof(void *)),
7c673cae
FG
2632 file_info.st_size);
2633 if (ret > 0) {
2634 prterrcode("check_clone: posix_memalign(temp_buf)", -ret);
2635 exit(97);
2636 }
2637
2638 if ((ret = pread(fd, good_buf, file_info.st_size, 0)) < 0) {
2639 simple_err("check_clone: pread", -errno);
2640 exit(170);
2641 }
2642 if ((ret = ops->read(&cur_ctx, 0, file_info.st_size, temp_buf)) < 0) {
2643 prterrcode("check_clone: ops->read", ret);
2644 exit(171);
2645 }
2646 close(fd);
2647 if ((ret = ops->close(&cur_ctx)) < 0) {
2648 prterrcode("check_clone: ops->close", ret);
2649 exit(174);
2650 }
2651 check_buffers(good_buf, temp_buf, 0, file_info.st_size);
2652
2653 if (!replay_image) {
2654 unlink(filename);
2655 }
2656
2657 free(good_buf);
2658 free(temp_buf);
2659}
2660
2661void
2662writefileimage()
2663{
2664 ssize_t ret;
2665
2666 ret = ops->write(&ctx, 0, file_size, good_buf);
2667 if (ret != file_size) {
2668 if (ret < 0)
2669 prterrcode("writefileimage: ops->write", ret);
2670 else
2671 prt("short write: 0x%x bytes instead of 0x%llx\n",
2672 ret, (unsigned long long)file_size);
2673 report_failure(172);
2674 }
2675
2676 if (!lite) {
2677 ret = ops->resize(&ctx, file_size);
2678 if (ret < 0) {
2679 prterrcode("writefileimage: ops->resize", ret);
2680 report_failure(173);
2681 }
2682 }
2683}
2684
2685void
2686do_flatten()
2687{
2688 int ret;
2689
2690 if (!rbd_image_has_parent(&ctx)) {
2691 log4(OP_SKIPPED, OP_FLATTEN, 0, 0);
2692 return;
2693 }
2694 log4(OP_FLATTEN, 0, 0, 0);
2695 prt("%lu flatten\n", testcalls);
2696
2697 ret = ops->flatten(&ctx);
2698 if (ret < 0) {
2699 prterrcode("writefileimage: ops->flatten", ret);
2700 exit(177);
2701 }
2702}
2703
2704void
2705docloseopen(void)
2706{
2707 char *name;
2708 int ret;
2709
2710 if (testcalls <= simulatedopcount)
2711 return;
2712
2713 name = strdup(ctx.name);
2714
2715 if (debug)
2716 prt("%lu close/open\n", testcalls);
2717
2718 ret = ops->close(&ctx);
2719 if (ret < 0) {
2720 prterrcode("docloseopen: ops->close", ret);
2721 report_failure(180);
2722 }
2723
2724 ret = ops->open(name, &ctx);
2725 if (ret < 0) {
2726 prterrcode("docloseopen: ops->open", ret);
2727 report_failure(181);
2728 }
2729
2730 free(name);
2731}
2732
/*
 * Fit an (off, len) pair into an object of `size` bytes: off is reduced
 * modulo size (or forced to 0 when size is 0), and len is shortened so
 * that off + len does not run past size.  off and len are modified in
 * place.  The bounds comparison is done in unsigned int arithmetic.
 */
#define TRIM_OFF_LEN(off, len, size)					\
do {									\
	(off) = (size) ? (off) % (size) : 0;				\
	if ((unsigned)(size) < (unsigned)(off) + (unsigned)(len))	\
		(len) = (size) - (off);					\
} while (0)
2742
2743void
2744test(void)
2745{
2746 unsigned long offset;
2747 unsigned long size = maxoplen;
2748 unsigned long rv = get_random();
2749 unsigned long op;
2750
2751 if (simulatedopcount > 0 && testcalls == simulatedopcount)
2752 writefileimage();
2753
2754 testcalls++;
2755
2756 if (closeprob)
2757 closeopen = (rv >> 3) < (1u << 28) / (unsigned)closeprob;
2758
2759 if (debugstart > 0 && testcalls >= debugstart)
2760 debug = 1;
2761
2762 if (!quiet && testcalls < simulatedopcount && testcalls % 100000 == 0)
2763 prt("%lu...\n", testcalls);
2764
2765 offset = get_random();
2766 if (randomoplen)
2767 size = get_random() % (maxoplen + 1);
2768
2769 /* calculate appropriate op to run */
2770 if (lite)
2771 op = rv % OP_MAX_LITE;
2772 else
2773 op = rv % OP_MAX_FULL;
2774
2775 switch (op) {
2776 case OP_MAPREAD:
2777 if (!mapped_reads)
2778 op = OP_READ;
2779 break;
2780 case OP_MAPWRITE:
2781 if (!mapped_writes)
2782 op = OP_WRITE;
2783 break;
2784 case OP_FALLOCATE:
2785 if (!fallocate_calls) {
2786 log4(OP_SKIPPED, OP_FALLOCATE, offset, size);
2787 goto out;
2788 }
2789 break;
2790 case OP_PUNCH_HOLE:
2791 if (!punch_hole_calls) {
2792 log4(OP_SKIPPED, OP_PUNCH_HOLE, offset, size);
2793 goto out;
2794 }
2795 break;
2796 case OP_CLONE:
2797 /* clone, 8% chance */
2798 if (!clone_calls || file_size == 0 || get_random() % 100 >= 8) {
2799 log4(OP_SKIPPED, OP_CLONE, 0, 0);
2800 goto out;
2801 }
2802 break;
2803 case OP_FLATTEN:
2804 /* flatten four times as rarely as clone, 2% chance */
2805 if (get_random() % 100 >= 2) {
2806 log4(OP_SKIPPED, OP_FLATTEN, 0, 0);
2807 goto out;
2808 }
2809 break;
2810 case OP_WRITESAME:
2811 /* writesame not implemented */
2812 if (!ops->writesame) {
2813 log4(OP_SKIPPED, OP_WRITESAME, offset, size);
2814 goto out;
2815 }
d2e6a577 2816 break;
c07f9fc5
FG
2817 case OP_COMPARE_AND_WRITE:
2818 /* compare_and_write not implemented */
2819 if (!ops->compare_and_write) {
2820 log4(OP_SKIPPED, OP_COMPARE_AND_WRITE, offset, size);
2821 goto out;
2822 }
d2e6a577 2823 break;
7c673cae
FG
2824 }
2825
2826 switch (op) {
2827 case OP_READ:
2828 TRIM_OFF_LEN(offset, size, file_size);
2829 doread(offset, size);
2830 break;
2831
2832 case OP_WRITE:
2833 TRIM_OFF_LEN(offset, size, maxfilelen);
2834 dowrite(offset, size);
2835 break;
2836
2837 case OP_MAPREAD:
2838 TRIM_OFF_LEN(offset, size, file_size);
2839 exit(183);
2840 break;
2841
2842 case OP_MAPWRITE:
2843 TRIM_OFF_LEN(offset, size, maxfilelen);
2844 exit(182);
2845 break;
2846
2847 case OP_TRUNCATE:
2848 if (!style)
2849 size = get_random() % maxfilelen;
2850 dotruncate(size);
2851 break;
2852
2853 case OP_PUNCH_HOLE:
2854 TRIM_OFF_LEN(offset, size, file_size);
2855 do_punch_hole(offset, size);
2856 break;
2857
2858 case OP_WRITESAME:
2859 TRIM_OFF_LEN(offset, size, maxfilelen);
2860 dowritesame(offset, size);
2861 break;
c07f9fc5
FG
2862 case OP_COMPARE_AND_WRITE:
2863 TRIM_OFF_LEN(offset, size, file_size);
2864 docompareandwrite(offset, size);
2865 break;
7c673cae
FG
2866
2867 case OP_CLONE:
2868 do_clone();
2869 break;
2870
2871 case OP_FLATTEN:
2872 do_flatten();
2873 break;
2874
2875 default:
2876 prterr("test: unknown operation");
2877 report_failure(42);
2878 break;
2879 }
2880
2881out:
2882 if (sizechecks && testcalls > simulatedopcount)
2883 check_size();
2884 if (closeopen)
2885 docloseopen();
2886}
2887
2888
2889void
2890cleanup(int sig)
2891{
2892 if (sig)
2893 prt("signal %d\n", sig);
2894 prt("testcalls = %lu\n", testcalls);
2895 exit(sig);
2896}
2897
2898
/*
 * Print the command-line synopsis and option descriptions to stdout and
 * exit with status 89.
 *
 * The text is kept in step with the option string and long options that
 * main() parses: -g is listed in the synopsis and the --cluster / --id
 * long options are described.
 */
void
usage(void)
{
	fprintf(stdout, "usage: %s",
"fsx [-dfgjknqxyACFHKLORUWZ] [-b opnum] [-c Prob] [-h holebdy] [-l flen] [-m start:end] [-o oplen] [-p progressinterval] [-r readbdy] [-s style] [-t truncbdy] [-w writebdy] [-D startingop] [-N numops] [-P dirpath] [-S seed] [--cluster name] [--id id] pname iname\n"
"\t-b opnum: beginning operation number (default 1)\n"
"\t-c P: 1 in P chance of file close+open at each op (default infinity)\n"
"\t-d: debug output for all operations\n"
"\t-f: flush and invalidate cache after I/O\n"
"\t-g: deep copy instead of clone\n"
"\t-h holebdy: 4096 would make discards page aligned (default 1)\n"
"\t-j: journal replay stress test\n"
"\t-k: keep data on success (default 0)\n"
"\t-l flen: the upper bound on file size (default 262144)\n"
"\t-m startop:endop: monitor (print debug output) specified byte range (default 0:infinity)\n"
"\t-n: no verifications of file size\n"
"\t-o oplen: the upper bound on operation size (default 65536)\n"
"\t-p progressinterval: debug output at specified operation interval\n"
"\t-q: quieter operation\n"
"\t-r readbdy: 4096 would make reads page aligned (default 1)\n"
"\t-s style: 1 gives smaller truncates (default 0)\n"
"\t-t truncbdy: 4096 would make truncates page aligned (default 1)\n"
"\t-w writebdy: 4096 would make writes page aligned (default 1)\n"
"\t-x: preallocate file space before starting, XFS only (default 0)\n"
"\t-y: synchronize changes to a file\n"

"\t-C: do not use clone calls\n"
"\t-D startingop: debug output starting at specified operation\n"
#ifdef FALLOCATE
"\t-F: Do not use fallocate (preallocation) calls\n"
#endif
#if defined(__FreeBSD__)
"\t-G: enable rbd-ggate mode (use -L, -r and -w too)\n"
#endif
"\t-H: do not use punch hole calls\n"
#if defined(WITH_KRBD)
"\t-K: enable krbd mode (use -t and -h too)\n"
#endif
#if defined(__linux__)
"\t-M: enable rbd-nbd mode (use -t and -h too)\n"
#endif
"\t-L: fsxLite - no file creations & no file size changes\n"
"\t-N numops: total # operations to do (default infinity)\n"
"\t-O: use oplen (see -o flag) for every op (default random)\n"
"\t-P dirpath: save .fsxlog and .fsxgood files in dirpath (default ./)\n"
"\t-R: read() system calls only (mapped reads disabled)\n"
"\t-S seed: for random # generator (default 1) 0 gets timestamp\n"
"\t-U: disable randomized striping\n"
"\t-W: mapped write operations DISabled\n"
"\t-Z: O_DIRECT (use -R, -W, -r and -w too)\n"
"\t--cluster name: cluster name to use\n"
"\t--id id: client id to use\n"
"\tpoolname: this is REQUIRED (no default)\n"
"\timagename: this is REQUIRED (no default)\n");
	exit(89);
}
2953
2954
/* Saturate a wide intermediate value to the range of int. */
static int
getnum_clamp(long long v)
{
	if (v > INT_MAX)
		return INT_MAX;
	if (v < INT_MIN)
		return INT_MIN;
	return (int)v;
}

/*
 * Parse an integer with an optional one-letter multiplier suffix.
 *
 * s  string to parse; the number is read with strtol() base 0, so
 *    decimal, octal (leading 0) and hex (leading 0x) are accepted
 * e  receives a pointer to the first character that was not consumed
 *
 * Recognised suffixes (either case): b = 512, k = 1024, m = 1024*1024,
 * w = 4.  A recognised suffix is consumed; any other character is left
 * for the caller to inspect through *e.
 *
 * Returns the scaled value.  Values that do not fit in an int are
 * saturated to INT_MAX / INT_MIN instead of being silently truncated or
 * overflowing.
 */
int
getnum(char *s, char **e)
{
	long long value;
	long long scale = 0;	/* 0 means "no suffix recognised" */

	*e = (char *) 0;
	/* Clamp first so the multiplication below cannot overflow. */
	value = getnum_clamp(strtol(s, e, 0));
	if (*e) {
		switch (**e) {
		case 'b':
		case 'B':
			scale = 512;
			break;
		case 'k':
		case 'K':
			scale = 1024;
			break;
		case 'm':
		case 'M':
			scale = 1024 * 1024;
			break;
		case 'w':
		case 'W':
			scale = 4;
			break;
		default:
			break;
		}
		if (scale) {
			value *= scale;
			*e = *e + 1;
		}
	}
	return getnum_clamp(value);
}
2987
2988void
2989test_fallocate()
2990{
2991#ifdef FALLOCATE
2992 if (!lite && fallocate_calls) {
2993 if (fallocate(fd, 0, 0, 1) && errno == EOPNOTSUPP) {
2994 if(!quiet)
2995 warn("main: filesystem does not support fallocate, disabling\n");
2996 fallocate_calls = 0;
2997 } else {
2998 ftruncate(fd, 0);
2999 }
3000 }
3001#else /* ! FALLOCATE */
3002 fallocate_calls = 0;
3003#endif
3004
3005}
3006
3007void remove_image(rados_ioctx_t ioctx, char *imagename, bool remove_snap,
3008 bool unregister) {
3009 rbd_image_t image;
3010 char errmsg[128];
3011 int ret;
3012
3013 if ((ret = rbd_open(ioctx, imagename, &image, NULL)) < 0) {
3014 sprintf(errmsg, "rbd_open %s", imagename);
3015 prterrcode(errmsg, ret);
3016 report_failure(101);
3017 }
3018 if (remove_snap) {
3019 if ((ret = rbd_snap_unprotect(image, "snap")) < 0) {
3020 sprintf(errmsg, "rbd_snap_unprotect %s@snap",
3021 imagename);
3022 prterrcode(errmsg, ret);
3023 report_failure(102);
3024 }
3025 if ((ret = rbd_snap_remove(image, "snap")) < 0) {
3026 sprintf(errmsg, "rbd_snap_remove %s@snap",
3027 imagename);
3028 prterrcode(errmsg, ret);
3029 report_failure(103);
3030 }
3031 }
3032 if ((ret = rbd_close(image)) < 0) {
3033 sprintf(errmsg, "rbd_close %s", imagename);
3034 prterrcode(errmsg, ret);
3035 report_failure(104);
3036 }
3037
3038 if (unregister &&
3039 (ret = unregister_journal(ioctx, imagename)) < 0) {
3040 report_failure(105);
3041 }
3042
3043 if ((ret = rbd_remove(ioctx, imagename)) < 0) {
3044 sprintf(errmsg, "rbd_remove %s", imagename);
3045 prterrcode(errmsg, ret);
3046 report_failure(106);
3047 }
3048}
3049
3050int
3051main(int argc, char **argv)
3052{
11fdf7f2
TL
3053 enum {
3054 LONG_OPT_CLUSTER = 1000,
3055 LONG_OPT_ID = 1001
3056 };
3057
7c673cae
FG
3058 int i, style, ch, ret;
3059 char *endp;
3060 char goodfile[1024];
3061 char logfile[1024];
3062
11fdf7f2
TL
3063 const char* optstring = "b:c:dfgh:jkl:m:no:p:qr:s:t:w:xyCD:FGHKMLN:OP:RS:UWZ";
3064 const struct option longopts[] = {
3065 {"cluster", 1, NULL, LONG_OPT_CLUSTER},
3066 {"id", 1, NULL, LONG_OPT_ID}};
3067
7c673cae
FG
3068 goodfile[0] = 0;
3069 logfile[0] = 0;
3070
3071 page_size = getpagesize();
3072 page_mask = page_size - 1;
3073 mmap_mask = page_mask;
3074
3075 setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
3076
11fdf7f2 3077 while ((ch = getopt_long(argc, argv, optstring, longopts, NULL)) != EOF) {
7c673cae 3078 switch (ch) {
11fdf7f2
TL
3079 case LONG_OPT_CLUSTER:
3080 cluster_name = optarg;
3081 break;
3082 case LONG_OPT_ID:
3083 client_id = optarg;
3084 break;
7c673cae
FG
3085 case 'b':
3086 simulatedopcount = getnum(optarg, &endp);
3087 if (!quiet)
3088 fprintf(stdout, "Will begin at operation %lu\n",
3089 simulatedopcount);
3090 if (simulatedopcount == 0)
3091 usage();
3092 simulatedopcount -= 1;
3093 break;
3094 case 'c':
3095 closeprob = getnum(optarg, &endp);
3096 if (!quiet)
3097 fprintf(stdout,
3098 "Chance of close/open is 1 in %d\n",
3099 closeprob);
3100 if (closeprob <= 0)
3101 usage();
3102 break;
3103 case 'd':
3104 debug = 1;
3105 break;
3106 case 'f':
3107 flush_enabled = 1;
3108 break;
11fdf7f2
TL
3109 case 'g':
3110 deep_copy = 1;
3111 break;
7c673cae
FG
3112 case 'h':
3113 holebdy = getnum(optarg, &endp);
3114 if (holebdy <= 0)
3115 usage();
3116 break;
3117 case 'j':
3118 journal_replay = true;
3119 break;
3120 case 'k':
3121 keep_on_success = 1;
3122 break;
3123 case 'l':
3124 {
3125 int _num = getnum(optarg, &endp);
3126 if (_num <= 0)
3127 usage();
3128 maxfilelen = _num;
3129 }
3130 break;
3131 case 'm':
3132 monitorstart = getnum(optarg, &endp);
3133 if (monitorstart < 0)
3134 usage();
3135 if (!endp || *endp++ != ':')
3136 usage();
3137 monitorend = getnum(endp, &endp);
3138 if (monitorend < 0)
3139 usage();
3140 if (monitorend == 0)
3141 monitorend = -1; /* aka infinity */
3142 debug = 1;
3143 break;
3144 case 'n':
3145 sizechecks = 0;
3146 break;
3147 case 'o':
3148 maxoplen = getnum(optarg, &endp);
3149 if (maxoplen <= 0)
3150 usage();
3151 break;
3152 case 'p':
3153 progressinterval = getnum(optarg, &endp);
3154 if (progressinterval == 0)
3155 usage();
3156 break;
3157 case 'q':
3158 quiet = 1;
3159 break;
3160 case 'r':
3161 readbdy = getnum(optarg, &endp);
3162 if (readbdy <= 0)
3163 usage();
3164 break;
3165 case 's':
3166 style = getnum(optarg, &endp);
3167 if (style < 0 || style > 1)
3168 usage();
3169 break;
3170 case 't':
3171 truncbdy = getnum(optarg, &endp);
3172 if (truncbdy <= 0)
3173 usage();
3174 break;
3175 case 'w':
3176 writebdy = getnum(optarg, &endp);
3177 if (writebdy <= 0)
3178 usage();
3179 break;
3180 case 'x':
3181 prealloc = 1;
3182 break;
3183 case 'y':
3184 do_fsync = 1;
3185 break;
3186 case 'C':
3187 clone_calls = 0;
3188 break;
3189 case 'D':
3190 debugstart = getnum(optarg, &endp);
3191 if (debugstart < 1)
3192 usage();
3193 break;
3194 case 'F':
3195 fallocate_calls = 0;
3196 break;
11fdf7f2
TL
3197#if defined(__FreeBSD__)
3198 case 'G':
3199 prt("rbd-ggate mode enabled\n");
3200 ops = &ggate_operations;
3201 break;
3202#endif
7c673cae
FG
3203 case 'H':
3204 punch_hole_calls = 0;
3205 break;
11fdf7f2 3206#if defined(WITH_KRBD)
7c673cae
FG
3207 case 'K':
3208 prt("krbd mode enabled\n");
3209 ops = &krbd_operations;
3210 break;
11fdf7f2
TL
3211#endif
3212#if defined(__linux__)
7c673cae
FG
3213 case 'M':
3214 prt("rbd-nbd mode enabled\n");
3215 ops = &nbd_operations;
3216 break;
11fdf7f2 3217#endif
7c673cae 3218 case 'L':
11fdf7f2 3219 lite = 1;
7c673cae
FG
3220 break;
3221 case 'N':
3222 numops = getnum(optarg, &endp);
3223 if (numops < 0)
3224 usage();
3225 break;
3226 case 'O':
3227 randomoplen = 0;
3228 break;
3229 case 'P':
3230 strncpy(dirpath, optarg, sizeof(dirpath)-1);
3231 dirpath[sizeof(dirpath)-1] = '\0';
3232 strncpy(goodfile, dirpath, sizeof(goodfile)-1);
3233 goodfile[sizeof(goodfile)-1] = '\0';
3234 if (strlen(goodfile) < sizeof(goodfile)-2) {
3235 strcat(goodfile, "/");
3236 } else {
3237 prt("file name to long\n");
3238 exit(1);
3239 }
3240 strncpy(logfile, dirpath, sizeof(logfile)-1);
3241 logfile[sizeof(logfile)-1] = '\0';
3242 if (strlen(logfile) < sizeof(logfile)-2) {
3243 strcat(logfile, "/");
3244 } else {
3245 prt("file path to long\n");
3246 exit(1);
3247 }
3248 break;
3249 case 'R':
3250 mapped_reads = 0;
3251 if (!quiet)
3252 fprintf(stdout, "mapped reads DISABLED\n");
3253 break;
3254 case 'S':
3255 seed = getnum(optarg, &endp);
3256 if (seed == 0)
11fdf7f2 3257 seed = std::random_device()() % 10000;
7c673cae
FG
3258 if (!quiet)
3259 fprintf(stdout, "Seed set to %d\n", seed);
3260 if (seed < 0)
3261 usage();
3262 break;
3263 case 'U':
3264 randomize_striping = 0;
3265 break;
3266 case 'W':
3267 mapped_writes = 0;
3268 if (!quiet)
3269 fprintf(stdout, "mapped writes DISABLED\n");
3270 break;
3271 case 'Z':
3272 o_direct = O_DIRECT;
3273 break;
3274 default:
3275 usage();
3276 /* NOTREACHED */
3277 }
11fdf7f2 3278 }
7c673cae
FG
3279 argc -= optind;
3280 argv += optind;
3281 if (argc != 2)
3282 usage();
3283 pool = argv[0];
3284 iname = argv[1];
3285
3286 signal(SIGHUP, cleanup);
3287 signal(SIGINT, cleanup);
3288 signal(SIGPIPE, cleanup);
3289 signal(SIGALRM, cleanup);
3290 signal(SIGTERM, cleanup);
3291 signal(SIGXCPU, cleanup);
3292 signal(SIGXFSZ, cleanup);
3293 signal(SIGVTALRM, cleanup);
3294 signal(SIGUSR1, cleanup);
3295 signal(SIGUSR2, cleanup);
3296
3297 random_generator.seed(seed);
3298
11fdf7f2
TL
3299 if (lite) {
3300 file_size = maxfilelen;
3301 }
3302
7c673cae
FG
3303 ret = create_image();
3304 if (ret < 0) {
3305 prterrcode(iname, ret);
3306 exit(90);
3307 }
3308 ret = ops->open(iname, &ctx);
3309 if (ret < 0) {
3310 simple_err("Error opening image", ret);
3311 exit(91);
3312 }
3313 if (!dirpath[0])
3314 strcat(dirpath, ".");
3315 strncat(goodfile, iname, 256);
3316 strcat (goodfile, ".fsxgood");
3317 fsxgoodfd = open(goodfile, O_RDWR|O_CREAT|O_TRUNC, 0666);
3318 if (fsxgoodfd < 0) {
3319 prterr(goodfile);
3320 exit(92);
3321 }
3322 strncat(logfile, iname, 256);
3323 strcat (logfile, ".fsxlog");
3324 fsxlogf = fopen(logfile, "w");
3325 if (fsxlogf == NULL) {
3326 prterr(logfile);
3327 exit(93);
3328 }
3329
3330 original_buf = (char *) malloc(maxfilelen);
3331 for (i = 0; i < (int)maxfilelen; i++)
3332 original_buf[i] = get_random() % 256;
3333
3334 ret = posix_memalign((void **)&good_buf,
11fdf7f2 3335 std::max(writebdy, (int)sizeof(void *)), maxfilelen);
7c673cae
FG
3336 if (ret > 0) {
3337 if (ret == EINVAL)
3338 prt("writebdy is not a suitable power of two\n");
3339 else
3340 prterrcode("main: posix_memalign(good_buf)", -ret);
3341 exit(94);
3342 }
3343 memset(good_buf, '\0', maxfilelen);
3344
3345 ret = posix_memalign((void **)&temp_buf,
11fdf7f2 3346 std::max(readbdy, (int)sizeof(void *)), maxfilelen);
7c673cae
FG
3347 if (ret > 0) {
3348 if (ret == EINVAL)
3349 prt("readbdy is not a suitable power of two\n");
3350 else
3351 prterrcode("main: posix_memalign(temp_buf)", -ret);
3352 exit(95);
3353 }
3354 memset(temp_buf, '\0', maxfilelen);
3355
3356 if (lite) { /* zero entire existing file */
3357 ssize_t written;
3358
3359 written = ops->write(&ctx, 0, (size_t)maxfilelen, good_buf);
3360 if (written != (ssize_t)maxfilelen) {
3361 if (written < 0) {
3362 prterrcode(iname, written);
3363 warn("main: error on write");
3364 } else
3365 warn("main: short write, 0x%x bytes instead "
3366 "of 0x%lx\n",
3367 (unsigned)written,
3368 maxfilelen);
3369 exit(98);
3370 }
3371 } else
3372 check_trunc_hack();
3373
3374 //test_fallocate();
3375
3376 while (numops == -1 || numops--)
3377 test();
3378
3379 ret = ops->close(&ctx);
3380 if (ret < 0) {
3381 prterrcode("ops->close", ret);
3382 report_failure(99);
3383 }
3384
3385 if (journal_replay) {
3386 char imagename[1024];
3387 clone_imagename(imagename, sizeof(imagename), num_clones);
3388 ret = finalize_journal(ioctx, imagename, num_clones, 0, 0, 0);
3389 if (ret < 0) {
3390 report_failure(100);
3391 }
3392 }
3393
3394 if (num_clones > 0) {
3395 if (journal_replay) {
3396 check_clone(num_clones - 1, true);
3397 }
3398 check_clone(num_clones - 1, false);
3399 }
3400
3401 if (!keep_on_success) {
3402 while (num_clones >= 0) {
3403 static bool remove_snap = false;
3404
3405 if (journal_replay) {
3406 char replayimagename[1024];
3407 replay_imagename(replayimagename,
3408 sizeof(replayimagename),
3409 num_clones);
3410 remove_image(ioctx, replayimagename,
3411 remove_snap,
3412 false);
3413 }
3414
3415 char clonename[128];
3416 clone_imagename(clonename, 128, num_clones);
3417 remove_image(ioctx, clonename, remove_snap,
3418 journal_replay);
3419
3420 remove_snap = true;
3421 num_clones--;
3422 }
3423 }
3424
3425 prt("All operations completed A-OK!\n");
3426 fclose(fsxlogf);
3427
3428 rados_ioctx_destroy(ioctx);
11fdf7f2 3429#if defined(WITH_KRBD)
7c673cae 3430 krbd_destroy(krbd);
11fdf7f2 3431#endif
7c673cae
FG
3432 rados_shutdown(cluster);
3433
3434 free(original_buf);
3435 free(good_buf);
3436 free(temp_buf);
3437
3438 exit(0);
3439 return 0;
3440}