1 // -*- mode:C++; tab-width:8; c-basic-offset:8; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=8 smarttab
4 * Copyright (C) 1991, NeXT Computer, Inc. All Rights Reserverd.
7 * Author: Avadis Tevanian, Jr.
9 * File system exerciser.
11 * Rewritten 8/98 by Conrad Minshall.
13 * Small changes to work under Linux -- davej.
15 * Checks for mmap last-page zero fill.
18 #include <sys/types.h>
27 #include <sys/ioctl.h>
44 #include "include/intarith.h"
45 #include "include/krbd.h"
46 #include "include/rados/librados.h"
47 #include "include/rados/librados.hpp"
48 #include "include/rbd/librbd.h"
49 #include "include/rbd/librbd.hpp"
50 #include "common/Cond.h"
51 #include "common/SubProcess.h"
52 #include "common/safe_io.h"
53 #include "journal/Journaler.h"
54 #include "journal/ReplayEntry.h"
55 #include "journal/ReplayHandler.h"
56 #include "journal/Settings.h"
58 #include <boost/scope_exit.hpp>
60 #define NUMPRINTCOLUMNS 32 /* # columns of data to print on each line */
63 * A log entry is an operation and a bunch of arguments.
73 struct log_entry oplog
[LOGSIZE
]; /* the log */
74 int logptr
= 0; /* current position in log */
75 int logcount
= 0; /* total ops */
78 * The operation matrix is complex due to conditional execution of different
79 * features. Hence when we come to deciding what operation to run, we need to
80 * be careful in how we select the different operations. The active operations
81 * are mapped to numbers as follows:
92 * COMPAREANDWRITE: - 8
94 * When mapped read/writes are disabled, they are simply converted to normal
95 * reads and writes. When fallocate/fpunch calls are disabled, they are
96 * converted to OP_SKIPPED. Hence OP_SKIPPED needs to have a number higher than
97 * the operation selection matrix, as does the OP_CLOSEOPEN which is an
98 * operation modifier rather than an operation in itself.
100 * Because of the "lite" version, we also need to have different "maximum
101 * operation" defines to allow the ops to be selected correctly based on the
105 /* common operations */
109 #define OP_MAPWRITE 3
110 #define OP_MAX_LITE 4
112 /* !lite operations */
113 #define OP_TRUNCATE 4
114 #define OP_FALLOCATE 5
115 #define OP_PUNCH_HOLE 6
116 #define OP_WRITESAME 7
117 #define OP_COMPARE_AND_WRITE 8
118 /* rbd-specific operations */
120 #define OP_FLATTEN 10
121 #define OP_MAX_FULL 11
123 /* operation modifiers */
124 #define OP_CLOSEOPEN 100
125 #define OP_SKIPPED 101
128 #define PAGE_SIZE getpagesize()
130 #define PAGE_MASK (PAGE_SIZE - 1)
133 char *original_buf
; /* a pointer to the original data */
134 char *good_buf
; /* a pointer to the correct data */
135 char *temp_buf
; /* a pointer to the current data */
141 unsigned long testcalls
= 0; /* calls to function "test" */
143 unsigned long simulatedopcount
= 0; /* -b flag */
144 int closeprob
= 0; /* -c flag */
145 int debug
= 0; /* -d flag */
146 unsigned long debugstart
= 0; /* -D flag */
147 int flush_enabled
= 0; /* -f flag */
148 int holebdy
= 1; /* -h flag */
149 bool journal_replay
= false; /* -j flag */
150 int keep_on_success
= 0; /* -k flag */
151 int do_fsync
= 0; /* -y flag */
152 unsigned long maxfilelen
= 256 * 1024; /* -l flag */
153 int sizechecks
= 1; /* -n flag disables them */
154 int maxoplen
= 64 * 1024; /* -o flag */
155 int quiet
= 0; /* -q flag */
156 unsigned long progressinterval
= 0; /* -p flag */
157 int readbdy
= 1; /* -r flag */
158 int style
= 0; /* -s flag */
159 int prealloc
= 0; /* -x flag */
160 int truncbdy
= 1; /* -t flag */
161 int writebdy
= 1; /* -w flag */
162 long monitorstart
= -1; /* -m flag */
163 long monitorend
= -1; /* -m flag */
164 int lite
= 0; /* -L flag */
165 long numops
= -1; /* -N flag */
166 int randomoplen
= 1; /* -O flag disables it */
167 int seed
= 1; /* -S flag */
168 int mapped_writes
= 0; /* -W flag disables */
169 int fallocate_calls
= 0; /* -F flag disables */
170 int punch_hole_calls
= 1; /* -H flag disables */
171 int clone_calls
= 1; /* -C flag disables */
172 int randomize_striping
= 1; /* -U flag disables */
173 int randomize_parent_overlap
= 1;
174 int mapped_reads
= 0; /* -R flag disables it */
176 int o_direct
= 0; /* -Z flag */
184 FILE * fsxlogf
= NULL
;
189 vwarnc(int code
, const char *fmt
, va_list ap
) {
190 fprintf(stderr
, "fsx: ");
192 vfprintf(stderr
, fmt
, ap
);
193 fprintf(stderr
, ": ");
195 fprintf(stderr
, "%s\n", strerror(code
));
199 warn(const char * fmt
, ...) {
202 vwarnc(errno
, fmt
, ap
);
206 #define BUF_SIZE 1024
209 prt(const char *fmt
, ...)
212 char buffer
[BUF_SIZE
];
215 vsnprintf(buffer
, BUF_SIZE
, fmt
, args
);
217 fprintf(stdout
, "%s", buffer
);
219 fprintf(fsxlogf
, "%s", buffer
);
223 prterr(const char *prefix
)
225 prt("%s%s%s\n", prefix
, prefix
? ": " : "", strerror(errno
));
229 prterrcode(const char *prefix
, int code
)
231 prt("%s%s%s\n", prefix
, prefix
? ": " : "", strerror(-code
));
235 simple_err(const char *msg
, int err
)
237 fprintf(stderr
, "%s: %s\n", msg
, strerror(-err
));
243 std::mt19937 random_generator
;
248 return random_generator();
251 void replay_imagename(char *buf
, size_t len
, int clones
);
255 static const std::string
JOURNAL_CLIENT_ID("fsx");
257 struct ReplayHandler
: public journal::ReplayHandler
{
258 journal::Journaler
*journaler
;
259 journal::Journaler
*replay_journaler
;
262 ReplayHandler(journal::Journaler
*journaler
,
263 journal::Journaler
*replay_journaler
, Context
*on_finish
)
264 : journaler(journaler
), replay_journaler(replay_journaler
),
265 on_finish(on_finish
) {
268 void get() override
{
270 void put() override
{
273 void handle_entries_available() override
{
275 journal::ReplayEntry replay_entry
;
276 if (!journaler
->try_pop_front(&replay_entry
)) {
280 replay_journaler
->append(0, replay_entry
.get_data());
284 void handle_complete(int r
) override
{
285 on_finish
->complete(r
);
289 int get_image_id(librados::IoCtx
&io_ctx
, const char *image_name
,
290 std::string
*image_id
) {
293 int r
= rbd
.open(io_ctx
, image
, image_name
);
295 simple_err("failed to open image", r
);
299 rbd_image_info_t info
;
300 r
= image
.stat(info
, sizeof(info
));
302 simple_err("failed to stat image", r
);
306 *image_id
= std::string(&info
.block_name_prefix
[strlen(RBD_DATA_PREFIX
)]);
310 int register_journal(rados_ioctx_t ioctx
, const char *image_name
) {
311 librados::IoCtx io_ctx
;
312 librados::IoCtx::from_rados_ioctx_t(ioctx
, io_ctx
);
314 std::string image_id
;
315 int r
= get_image_id(io_ctx
, image_name
, &image_id
);
320 journal::Journaler
journaler(io_ctx
, image_id
, JOURNAL_CLIENT_ID
, {});
321 r
= journaler
.register_client(bufferlist());
323 simple_err("failed to register journal client", r
);
329 int unregister_journal(rados_ioctx_t ioctx
, const char *image_name
) {
330 librados::IoCtx io_ctx
;
331 librados::IoCtx::from_rados_ioctx_t(ioctx
, io_ctx
);
333 std::string image_id
;
334 int r
= get_image_id(io_ctx
, image_name
, &image_id
);
339 journal::Journaler
journaler(io_ctx
, image_id
, JOURNAL_CLIENT_ID
, {});
340 r
= journaler
.unregister_client();
342 simple_err("failed to unregister journal client", r
);
348 int create_replay_image(rados_ioctx_t ioctx
, int order
,
349 uint64_t stripe_unit
, int stripe_count
,
350 const char *replay_image_name
,
351 const char *last_replay_image_name
) {
352 librados::IoCtx io_ctx
;
353 librados::IoCtx::from_rados_ioctx_t(ioctx
, io_ctx
);
357 if (last_replay_image_name
== nullptr) {
358 r
= rbd
.create2(io_ctx
, replay_image_name
, 0,
359 RBD_FEATURES_ALL
, &order
);
361 r
= rbd
.clone2(io_ctx
, last_replay_image_name
, "snap",
362 io_ctx
, replay_image_name
, RBD_FEATURES_ALL
,
363 &order
, stripe_unit
, stripe_count
);
367 simple_err("failed to create replay image", r
);
374 int replay_journal(rados_ioctx_t ioctx
, const char *image_name
,
375 const char *replay_image_name
) {
376 librados::IoCtx io_ctx
;
377 librados::IoCtx::from_rados_ioctx_t(ioctx
, io_ctx
);
379 std::string image_id
;
380 int r
= get_image_id(io_ctx
, image_name
, &image_id
);
385 std::string replay_image_id
;
386 r
= get_image_id(io_ctx
, replay_image_name
, &replay_image_id
);
391 journal::Journaler
journaler(io_ctx
, image_id
, JOURNAL_CLIENT_ID
, {});
392 C_SaferCond init_ctx
;
393 journaler
.init(&init_ctx
);
394 BOOST_SCOPE_EXIT_ALL( (&journaler
) ) {
395 journaler
.shut_down();
400 simple_err("failed to initialize journal", r
);
404 journal::Journaler
replay_journaler(io_ctx
, replay_image_id
, "", {});
406 C_SaferCond replay_init_ctx
;
407 replay_journaler
.init(&replay_init_ctx
);
408 BOOST_SCOPE_EXIT_ALL( (&replay_journaler
) ) {
409 replay_journaler
.shut_down();
412 r
= replay_init_ctx
.wait();
414 simple_err("failed to initialize replay journal", r
);
418 replay_journaler
.start_append(0, 0, 0);
420 C_SaferCond replay_ctx
;
421 ReplayHandler
replay_handler(&journaler
, &replay_journaler
,
424 // copy journal events from source image to replay image
425 journaler
.start_replay(&replay_handler
);
426 r
= replay_ctx
.wait();
428 journaler
.stop_replay();
430 C_SaferCond stop_ctx
;
431 replay_journaler
.stop_append(&stop_ctx
);
432 int stop_r
= stop_ctx
.wait();
433 if (r
== 0 && stop_r
< 0) {
438 simple_err("failed to replay journal", r
);
444 r
= rbd
.open(io_ctx
, image
, replay_image_name
);
446 simple_err("failed to open replay image", r
);
450 // perform an IO op to initiate the journal replay
452 r
= static_cast<ssize_t
>(image
.write(0, 0, bl
));
454 simple_err("failed to write to replay image", r
);
460 int finalize_journal(rados_ioctx_t ioctx
, const char *imagename
, int clones
,
461 int order
, uint64_t stripe_unit
, int stripe_count
) {
462 char replayimagename
[1024];
463 replay_imagename(replayimagename
, sizeof(replayimagename
), clones
);
465 char lastreplayimagename
[1024];
467 replay_imagename(lastreplayimagename
,
468 sizeof(lastreplayimagename
), clones
- 1);
471 int ret
= create_replay_image(ioctx
, order
, stripe_unit
,
472 stripe_count
, replayimagename
,
473 clones
> 0 ? lastreplayimagename
:
479 ret
= replay_journal(ioctx
, imagename
, replayimagename
);
486 } // anonymous namespace
493 const char *name
; /* image name */
494 rbd_image_t image
; /* image handle */
495 const char *krbd_name
; /* image /dev/rbd<id> name */ /* reused for nbd test */
496 int krbd_fd
; /* image /dev/rbd<id> fd */ /* reused for nbd test */
499 #define RBD_CTX_INIT (struct rbd_ctx) { NULL, NULL, NULL, -1}
501 struct rbd_operations
{
502 int (*open
)(const char *name
, struct rbd_ctx
*ctx
);
503 int (*close
)(struct rbd_ctx
*ctx
);
504 ssize_t (*read
)(struct rbd_ctx
*ctx
, uint64_t off
, size_t len
, char *buf
);
505 ssize_t (*write
)(struct rbd_ctx
*ctx
, uint64_t off
, size_t len
, const char *buf
);
506 int (*flush
)(struct rbd_ctx
*ctx
);
507 int (*discard
)(struct rbd_ctx
*ctx
, uint64_t off
, uint64_t len
);
508 int (*get_size
)(struct rbd_ctx
*ctx
, uint64_t *size
);
509 int (*resize
)(struct rbd_ctx
*ctx
, uint64_t size
);
510 int (*clone
)(struct rbd_ctx
*ctx
, const char *src_snapname
,
511 const char *dst_imagename
, int *order
, int stripe_unit
,
513 int (*flatten
)(struct rbd_ctx
*ctx
);
514 ssize_t (*writesame
)(struct rbd_ctx
*ctx
, uint64_t off
, size_t len
,
515 const char *buf
, size_t data_len
);
516 ssize_t (*compare_and_write
)(struct rbd_ctx
*ctx
, uint64_t off
, size_t len
,
517 const char *cmp_buf
, const char *buf
);
520 char *pool
; /* name of the pool our test image is in */
521 char *iname
; /* name of our test image */
522 rados_t cluster
; /* handle for our test cluster */
523 rados_ioctx_t ioctx
; /* handle for our test pool */
524 struct krbd_ctx
*krbd
; /* handle for libkrbd */
525 bool skip_partial_discard
; /* rbd_skip_partial_discard config value*/
528 * librbd/krbd rbd_operations handlers. Given the rest of fsx.c, no
529 * attempt to do error handling is made in these handlers.
533 __librbd_open(const char *name
, struct rbd_ctx
*ctx
)
538 assert(!ctx
->name
&& !ctx
->image
&&
539 !ctx
->krbd_name
&& ctx
->krbd_fd
< 0);
541 ret
= rbd_open(ioctx
, name
, &image
, NULL
);
543 prt("rbd_open(%s) failed\n", name
);
547 ctx
->name
= strdup(name
);
549 ctx
->krbd_name
= NULL
;
556 librbd_open(const char *name
, struct rbd_ctx
*ctx
)
558 return __librbd_open(name
, ctx
);
562 __librbd_close(struct rbd_ctx
*ctx
)
566 assert(ctx
->name
&& ctx
->image
);
568 ret
= rbd_close(ctx
->image
);
570 prt("rbd_close(%s) failed\n", ctx
->name
);
574 free((void *)ctx
->name
);
583 librbd_close(struct rbd_ctx
*ctx
)
585 return __librbd_close(ctx
);
589 librbd_verify_object_map(struct rbd_ctx
*ctx
)
593 n
= rbd_get_flags(ctx
->image
, &flags
);
595 prt("rbd_get_flags() failed\n");
599 if ((flags
& RBD_FLAG_OBJECT_MAP_INVALID
) != 0) {
600 prt("rbd_get_flags() indicates object map is invalid\n");
607 librbd_read(struct rbd_ctx
*ctx
, uint64_t off
, size_t len
, char *buf
)
611 n
= rbd_read(ctx
->image
, off
, len
, buf
);
613 prt("rbd_read(%llu, %zu) failed\n", off
, len
);
619 librbd_write(struct rbd_ctx
*ctx
, uint64_t off
, size_t len
, const char *buf
)
624 n
= rbd_write(ctx
->image
, off
, len
, buf
);
626 prt("rbd_write(%llu, %zu) failed\n", off
, len
);
630 ret
= librbd_verify_object_map(ctx
);
638 librbd_flush(struct rbd_ctx
*ctx
)
642 ret
= rbd_flush(ctx
->image
);
644 prt("rbd_flush failed\n");
648 return librbd_verify_object_map(ctx
);
652 librbd_discard(struct rbd_ctx
*ctx
, uint64_t off
, uint64_t len
)
656 ret
= rbd_discard(ctx
->image
, off
, len
);
658 prt("rbd_discard(%llu, %llu) failed\n", off
, len
);
662 return librbd_verify_object_map(ctx
);
666 librbd_writesame(struct rbd_ctx
*ctx
, uint64_t off
, size_t len
,
667 const char *buf
, size_t data_len
)
672 n
= rbd_writesame(ctx
->image
, off
, len
, buf
, data_len
, 0);
674 prt("rbd_writesame(%llu, %zu) failed\n", off
, len
);
678 ret
= librbd_verify_object_map(ctx
);
686 librbd_compare_and_write(struct rbd_ctx
*ctx
, uint64_t off
, size_t len
,
687 const char *cmp_buf
, const char *buf
)
691 uint64_t mismatch_off
= 0;
693 n
= rbd_compare_and_write(ctx
->image
, off
, len
, cmp_buf
, buf
, &mismatch_off
, 0);
697 prt("rbd_compare_and_write mismatch(%llu, %zu, %llu) failed\n",
698 off
, len
, mismatch_off
);
702 ret
= librbd_verify_object_map(ctx
);
711 librbd_get_size(struct rbd_ctx
*ctx
, uint64_t *size
)
713 rbd_image_info_t info
;
716 ret
= rbd_stat(ctx
->image
, &info
, sizeof(info
));
718 prt("rbd_stat failed\n");
728 __librbd_resize(struct rbd_ctx
*ctx
, uint64_t size
)
732 ret
= rbd_resize(ctx
->image
, size
);
734 prt("rbd_resize(%llu) failed\n", size
);
738 return librbd_verify_object_map(ctx
);
742 librbd_resize(struct rbd_ctx
*ctx
, uint64_t size
)
744 return __librbd_resize(ctx
, size
);
748 __librbd_clone(struct rbd_ctx
*ctx
, const char *src_snapname
,
749 const char *dst_imagename
, int *order
, int stripe_unit
,
750 int stripe_count
, bool krbd
)
754 ret
= rbd_snap_create(ctx
->image
, src_snapname
);
756 prt("rbd_snap_create(%s@%s) failed\n", ctx
->name
,
761 ret
= rbd_snap_protect(ctx
->image
, src_snapname
);
763 prt("rbd_snap_protect(%s@%s) failed\n", ctx
->name
,
768 uint64_t features
= RBD_FEATURES_ALL
;
770 features
&= ~(RBD_FEATURE_OBJECT_MAP
|
771 RBD_FEATURE_FAST_DIFF
|
772 RBD_FEATURE_DEEP_FLATTEN
|
773 RBD_FEATURE_JOURNALING
);
775 ret
= rbd_clone2(ioctx
, ctx
->name
, src_snapname
, ioctx
,
776 dst_imagename
, features
, order
,
777 stripe_unit
, stripe_count
);
779 prt("rbd_clone2(%s@%s -> %s) failed\n", ctx
->name
,
780 src_snapname
, dst_imagename
);
788 librbd_clone(struct rbd_ctx
*ctx
, const char *src_snapname
,
789 const char *dst_imagename
, int *order
, int stripe_unit
,
792 return __librbd_clone(ctx
, src_snapname
, dst_imagename
, order
,
793 stripe_unit
, stripe_count
, false);
797 __librbd_flatten(struct rbd_ctx
*ctx
)
801 ret
= rbd_flatten(ctx
->image
);
803 prt("rbd_flatten failed\n");
807 return librbd_verify_object_map(ctx
);
811 librbd_flatten(struct rbd_ctx
*ctx
)
813 return __librbd_flatten(ctx
);
816 const struct rbd_operations librbd_operations
= {
828 librbd_compare_and_write
,
832 krbd_open(const char *name
, struct rbd_ctx
*ctx
)
838 ret
= __librbd_open(name
, ctx
);
842 ret
= krbd_map(krbd
, pool
, name
, "", "", &devnode
);
844 prt("krbd_map(%s) failed\n", name
);
848 fd
= open(devnode
, O_RDWR
| o_direct
);
851 prt("open(%s) failed\n", devnode
);
855 ctx
->krbd_name
= devnode
;
862 krbd_close(struct rbd_ctx
*ctx
)
866 assert(ctx
->krbd_name
&& ctx
->krbd_fd
>= 0);
868 if (close(ctx
->krbd_fd
) < 0) {
870 prt("close(%s) failed\n", ctx
->krbd_name
);
874 ret
= krbd_unmap(krbd
, ctx
->krbd_name
, "");
876 prt("krbd_unmap(%s) failed\n", ctx
->krbd_name
);
880 free((void *)ctx
->krbd_name
);
882 ctx
->krbd_name
= NULL
;
885 return __librbd_close(ctx
);
889 krbd_read(struct rbd_ctx
*ctx
, uint64_t off
, size_t len
, char *buf
)
893 n
= pread(ctx
->krbd_fd
, buf
, len
, off
);
896 prt("pread(%llu, %zu) failed\n", off
, len
);
904 krbd_write(struct rbd_ctx
*ctx
, uint64_t off
, size_t len
, const char *buf
)
908 n
= pwrite(ctx
->krbd_fd
, buf
, len
, off
);
911 prt("pwrite(%llu, %zu) failed\n", off
, len
);
919 __krbd_flush(struct rbd_ctx
*ctx
, bool invalidate
)
927 * BLKFLSBUF will sync the filesystem on top of the device (we
928 * don't care about that here, since we write directly to it),
929 * write out any dirty buffers and invalidate the buffer cache.
930 * It won't do a hardware cache flush.
932 * fsync() will write out any dirty buffers and do a hardware
933 * cache flush (which we don't care about either, because for
934 * krbd it's a noop). It won't try to empty the buffer cache
935 * nor poke the filesystem before writing out.
937 * Given that, for our purposes, fsync is a flush, while
938 * BLKFLSBUF is a flush+invalidate.
941 ret
= ioctl(ctx
->krbd_fd
, BLKFLSBUF
, NULL
);
943 ret
= fsync(ctx
->krbd_fd
);
946 prt("%s failed\n", invalidate
? "BLKFLSBUF" : "fsync");
954 krbd_flush(struct rbd_ctx
*ctx
)
956 return __krbd_flush(ctx
, false);
960 krbd_discard(struct rbd_ctx
*ctx
, uint64_t off
, uint64_t len
)
962 uint64_t range
[2] = { off
, len
};
966 * BLKZEROOUT goes straight to disk and doesn't do anything
967 * about dirty buffers. This means we need to flush so that
972 * results in "data 0000 data" rather than "data data data" on
973 * disk and invalidate so that
978 * returns "data 0000 data" rather than "data data data" in
979 * case 1..2M was cached.
981 * Note: These cache coherency issues are supposed to be fixed
984 ret
= __krbd_flush(ctx
, true);
989 * off and len must be 512-byte aligned, otherwise BLKZEROOUT
990 * will fail with -EINVAL. This means that -K (enable krbd
991 * mode) requires -h 512 or similar.
993 if (ioctl(ctx
->krbd_fd
, BLKZEROOUT
, &range
) < 0) {
995 prt("BLKZEROOUT(%llu, %llu) failed\n", off
, len
);
1003 krbd_get_size(struct rbd_ctx
*ctx
, uint64_t *size
)
1007 if (ioctl(ctx
->krbd_fd
, BLKGETSIZE64
, &bytes
) < 0) {
1009 prt("BLKGETSIZE64 failed\n");
1019 krbd_resize(struct rbd_ctx
*ctx
, uint64_t size
)
1023 assert(size
% truncbdy
== 0);
1026 * When krbd detects a size change, it calls revalidate_disk(),
1027 * which ends up calling invalidate_bdev(), which invalidates
1028 * clean pages and does nothing about dirty pages beyond the
1029 * new size. The preceding cache flush makes sure those pages
1030 * are invalidated, which is what we need on shrink so that
1037 * returns "0000 0000" rather than "data 0000".
1039 ret
= __krbd_flush(ctx
, false);
1043 return __librbd_resize(ctx
, size
);
1047 krbd_clone(struct rbd_ctx
*ctx
, const char *src_snapname
,
1048 const char *dst_imagename
, int *order
, int stripe_unit
,
1053 ret
= __krbd_flush(ctx
, false);
1057 return __librbd_clone(ctx
, src_snapname
, dst_imagename
, order
,
1058 stripe_unit
, stripe_count
, true);
1062 krbd_flatten(struct rbd_ctx
*ctx
)
1066 ret
= __krbd_flush(ctx
, false);
1070 return __librbd_flatten(ctx
);
1073 const struct rbd_operations krbd_operations
= {
1088 nbd_open(const char *name
, struct rbd_ctx
*ctx
)
1095 SubProcess
process("rbd-nbd", SubProcess::KEEP
, SubProcess::PIPE
,
1097 process
.add_cmd_arg("map");
1102 process
.add_cmd_arg(img
.c_str());
1104 r
= __librbd_open(name
, ctx
);
1108 r
= process
.spawn();
1110 prt("nbd_open failed to run rbd-nbd error: %s\n", process
.err().c_str());
1113 r
= safe_read(process
.get_stdout(), dev
, sizeof(dev
));
1115 prt("nbd_open failed to get nbd device path\n");
1118 for (int i
= 0; i
< r
; ++i
)
1119 if (dev
[i
] == 10 || dev
[i
] == 13)
1124 prt("rbd-nbd failed with error: %s", process
.err().c_str());
1128 devnode
= strdup(dev
);
1132 fd
= open(devnode
, O_RDWR
| o_direct
);
1135 prt("open(%s) failed\n", devnode
);
1139 ctx
->krbd_name
= devnode
;
1146 nbd_close(struct rbd_ctx
*ctx
)
1150 assert(ctx
->krbd_name
&& ctx
->krbd_fd
>= 0);
1152 if (close(ctx
->krbd_fd
) < 0) {
1154 prt("close(%s) failed\n", ctx
->krbd_name
);
1158 SubProcess
process("rbd-nbd");
1159 process
.add_cmd_arg("unmap");
1160 process
.add_cmd_arg(ctx
->krbd_name
);
1162 r
= process
.spawn();
1164 prt("nbd_close failed to run rbd-nbd error: %s\n", process
.err().c_str());
1169 prt("rbd-nbd failed with error: %d", process
.err().c_str());
1173 free((void *)ctx
->krbd_name
);
1175 ctx
->krbd_name
= NULL
;
1178 return __librbd_close(ctx
);
1182 nbd_clone(struct rbd_ctx
*ctx
, const char *src_snapname
,
1183 const char *dst_imagename
, int *order
, int stripe_unit
,
1188 ret
= __krbd_flush(ctx
, false);
1192 return __librbd_clone(ctx
, src_snapname
, dst_imagename
, order
,
1193 stripe_unit
, stripe_count
, false);
1196 const struct rbd_operations nbd_operations
= {
1210 struct rbd_ctx ctx
= RBD_CTX_INIT
;
1211 const struct rbd_operations
*ops
= &librbd_operations
;
1213 static bool rbd_image_has_parent(struct rbd_ctx
*ctx
)
1217 ret
= rbd_get_parent_info(ctx
->image
, NULL
, 0, NULL
, 0, NULL
, 0);
1218 if (ret
< 0 && ret
!= -ENOENT
) {
1219 prterrcode("rbd_get_parent_info", ret
);
1231 log4(int operation
, int arg0
, int arg1
, int arg2
)
1233 struct log_entry
*le
;
1235 le
= &oplog
[logptr
];
1236 le
->operation
= operation
;
1238 le
->operation
= ~ le
->operation
;
1244 if (logptr
>= LOGSIZE
)
1252 struct log_entry
*lp
;
1253 const char *falloc_type
[3] = {"PAST_EOF", "EXTENDING", "INTERIOR"};
1255 prt("LOG DUMP (%d total operations):\n", logcount
);
1256 if (logcount
< LOGSIZE
) {
1263 for ( ; count
> 0; count
--) {
1266 opnum
= i
+1 + (logcount
/LOGSIZE
)*LOGSIZE
;
1267 prt("%d(%3d mod 256): ", opnum
, opnum
%256);
1269 if ((closeopen
= lp
->operation
< 0))
1270 lp
->operation
= ~ lp
->operation
;
1272 switch (lp
->operation
) {
1274 prt("MAPREAD 0x%x thru 0x%x\t(0x%x bytes)",
1275 lp
->args
[0], lp
->args
[0] + lp
->args
[1] - 1,
1277 if (badoff
>= lp
->args
[0] && badoff
<
1278 lp
->args
[0] + lp
->args
[1])
1279 prt("\t***RRRR***");
1282 prt("MAPWRITE 0x%x thru 0x%x\t(0x%x bytes)",
1283 lp
->args
[0], lp
->args
[0] + lp
->args
[1] - 1,
1285 if (badoff
>= lp
->args
[0] && badoff
<
1286 lp
->args
[0] + lp
->args
[1])
1287 prt("\t******WWWW");
1290 prt("READ 0x%x thru 0x%x\t(0x%x bytes)",
1291 lp
->args
[0], lp
->args
[0] + lp
->args
[1] - 1,
1293 if (badoff
>= lp
->args
[0] &&
1294 badoff
< lp
->args
[0] + lp
->args
[1])
1295 prt("\t***RRRR***");
1298 prt("WRITE 0x%x thru 0x%x\t(0x%x bytes)",
1299 lp
->args
[0], lp
->args
[0] + lp
->args
[1] - 1,
1301 if (lp
->args
[0] > lp
->args
[2])
1303 else if (lp
->args
[0] + lp
->args
[1] > lp
->args
[2])
1305 if ((badoff
>= lp
->args
[0] || badoff
>=lp
->args
[2]) &&
1306 badoff
< lp
->args
[0] + lp
->args
[1])
1310 down
= lp
->args
[0] < lp
->args
[1];
1311 prt("TRUNCATE %s\tfrom 0x%x to 0x%x",
1312 down
? "DOWN" : "UP", lp
->args
[1], lp
->args
[0]);
1313 if (badoff
>= lp
->args
[!down
] &&
1314 badoff
< lp
->args
[!!down
])
1315 prt("\t******WWWW");
1318 /* 0: offset 1: length 2: where alloced */
1319 prt("FALLOC 0x%x thru 0x%x\t(0x%x bytes) %s",
1320 lp
->args
[0], lp
->args
[0] + lp
->args
[1],
1321 lp
->args
[1], falloc_type
[lp
->args
[2]]);
1322 if (badoff
>= lp
->args
[0] &&
1323 badoff
< lp
->args
[0] + lp
->args
[1])
1324 prt("\t******FFFF");
1327 prt("PUNCH 0x%x thru 0x%x\t(0x%x bytes)",
1328 lp
->args
[0], lp
->args
[0] + lp
->args
[1] - 1,
1330 if (badoff
>= lp
->args
[0] && badoff
<
1331 lp
->args
[0] + lp
->args
[1])
1332 prt("\t******PPPP");
1335 prt("WRITESAME 0x%x thru 0x%x\t(0x%x bytes) data_size 0x%x",
1336 lp
->args
[0], lp
->args
[0] + lp
->args
[1] - 1,
1337 lp
->args
[1], lp
->args
[2]);
1338 if (badoff
>= lp
->args
[0] &&
1339 badoff
< lp
->args
[0] + lp
->args
[1])
1340 prt("\t***WSWSWSWS");
1342 case OP_COMPARE_AND_WRITE
:
1343 prt("COMPARE_AND_WRITE 0x%x thru 0x%x\t(0x%x bytes)",
1344 lp
->args
[0], lp
->args
[0] + lp
->args
[1] - 1,
1346 if (lp
->args
[0] > lp
->args
[2])
1348 else if (lp
->args
[0] + lp
->args
[1] > lp
->args
[2])
1350 if ((badoff
>= lp
->args
[0] || badoff
>=lp
->args
[2]) &&
1351 badoff
< lp
->args
[0] + lp
->args
[1])
1361 prt("SKIPPED (no operation)");
1364 prt("BOGUS LOG ENTRY (operation code = %d)!",
1368 prt("\n\t\tCLOSE/OPEN");
1377 save_buffer(char *buffer
, off_t bufferlength
, int fd
)
1380 ssize_t byteswritten
;
1382 if (fd
<= 0 || bufferlength
== 0)
1385 if (bufferlength
> SSIZE_MAX
) {
1386 prt("fsx flaw: overflow in save_buffer\n");
1390 ret
= lseek(fd
, (off_t
)0, SEEK_SET
);
1391 if (ret
== (off_t
)-1)
1392 prterr("save_buffer: lseek 0");
1394 byteswritten
= write(fd
, buffer
, (size_t)bufferlength
);
1395 if (byteswritten
!= bufferlength
) {
1396 if (byteswritten
== -1)
1397 prterr("save_buffer write");
1399 warn("save_buffer: short write, 0x%x bytes instead of 0x%llx\n",
1400 (unsigned)byteswritten
,
1401 (unsigned long long)bufferlength
);
1407 report_failure(int status
)
1413 save_buffer(good_buf
, file_size
, fsxgoodfd
);
1414 prt("Correct content saved for comparison\n");
1415 prt("(maybe hexdump \"%s\" vs \"%s.fsxgood\")\n",
1420 sleep(3); // so the log can flush to disk. KLUDGEY!
1424 #define short_at(cp) ((unsigned short)((*((unsigned char *)(cp)) << 8) | \
1425 *(((unsigned char *)(cp)) + 1)))
1428 fsxcmp(char *good_buf
, char *temp_buf
, unsigned size
)
1430 if (!skip_partial_discard
) {
1431 return memcmp(good_buf
, temp_buf
, size
);
1434 for (unsigned i
= 0; i
< size
; i
++) {
1435 if (good_buf
[i
] != temp_buf
[i
] && good_buf
[i
] != 0) {
1436 return good_buf
[i
] - temp_buf
[i
];
1443 check_buffers(char *good_buf
, char *temp_buf
, unsigned offset
, unsigned size
)
1445 if (fsxcmp(good_buf
+ offset
, temp_buf
, size
) != 0) {
1449 prt("READ BAD DATA: offset = 0x%x, size = 0x%x, fname = %s\n",
1450 offset
, size
, iname
);
1451 prt("OFFSET\tGOOD\tBAD\tRANGE\n");
1453 unsigned char c
= good_buf
[offset
];
1454 unsigned char t
= temp_buf
[i
];
1457 unsigned bad
= short_at(&temp_buf
[i
]);
1458 prt("0x%5x\t0x%04x\t0x%04x", offset
,
1459 short_at(&good_buf
[offset
]), bad
);
1460 unsigned op
= temp_buf
[(offset
& 1) ? i
+1 : i
];
1461 prt("\t0x%5x\n", n
);
1463 prt("operation# (mod 256) for "
1464 "the bad data may be %u\n",
1465 ((unsigned)op
& 0xff));
1467 prt("operation# (mod 256) for "
1468 "the bad data unknown, check"
1469 " HOLE and EXTEND ops\n");
1478 report_failure(110);
1489 ret
= ops
->get_size(&ctx
, &size
);
1491 prterrcode("check_size: ops->get_size", ret
);
1493 if ((uint64_t)file_size
!= size
) {
1494 prt("Size error: expected 0x%llx stat 0x%llx\n",
1495 (unsigned long long)file_size
,
1496 (unsigned long long)size
);
1497 report_failure(120);
1501 #define TRUNC_HACK_SIZE (200ULL << 9) /* 512-byte aligned for krbd */
1504 check_trunc_hack(void)
1509 ret
= ops
->resize(&ctx
, 0ULL);
1511 prterrcode("check_trunc_hack: ops->resize pre", ret
);
1513 ret
= ops
->resize(&ctx
, TRUNC_HACK_SIZE
);
1515 prterrcode("check_trunc_hack: ops->resize actual", ret
);
1517 ret
= ops
->get_size(&ctx
, &size
);
1519 prterrcode("check_trunc_hack: ops->get_size", ret
);
1521 if (size
!= TRUNC_HACK_SIZE
) {
1522 prt("no extend on truncate! not posix!\n");
1526 ret
= ops
->resize(&ctx
, 0ULL);
1528 prterrcode("check_trunc_hack: ops->resize post", ret
);
1538 r
= rados_create(&cluster
, NULL
);
1540 simple_err("Could not create cluster handle", r
);
1543 rados_conf_parse_env(cluster
, NULL
);
1544 r
= rados_conf_read_file(cluster
, NULL
);
1546 simple_err("Error reading ceph config file", r
);
1547 goto failed_shutdown
;
1549 r
= rados_connect(cluster
);
1551 simple_err("Error connecting to cluster", r
);
1552 goto failed_shutdown
;
1554 r
= krbd_create_from_context(rados_cct(cluster
), &krbd
);
1556 simple_err("Could not create libkrbd handle", r
);
1557 goto failed_shutdown
;
1560 r
= rados_pool_create(cluster
, pool
);
1561 if (r
< 0 && r
!= -EEXIST
) {
1562 simple_err("Error creating pool", r
);
1565 r
= rados_ioctx_create(cluster
, pool
, &ioctx
);
1567 simple_err("Error creating ioctx", r
);
1570 rados_application_enable(ioctx
, "rbd", 1);
1572 if (clone_calls
|| journal_replay
) {
1573 uint64_t features
= 0;
1575 features
|= RBD_FEATURE_LAYERING
;
1577 if (journal_replay
) {
1578 features
|= (RBD_FEATURE_EXCLUSIVE_LOCK
|
1579 RBD_FEATURE_JOURNALING
);
1581 r
= rbd_create2(ioctx
, iname
, 0, features
, &order
);
1583 r
= rbd_create(ioctx
, iname
, 0, &order
);
1586 simple_err("Error creating image", r
);
1590 if (journal_replay
) {
1591 r
= register_journal(ioctx
, iname
);
1597 r
= rados_conf_get(cluster
, "rbd_skip_partial_discard", buf
,
1600 simple_err("Could not get rbd_skip_partial_discard value", r
);
1603 skip_partial_discard
= (strcmp(buf
, "true") == 0);
1608 rados_ioctx_destroy(ioctx
);
1612 rados_shutdown(cluster
);
1617 doflush(unsigned offset
, unsigned size
)
1624 ret
= ops
->flush(&ctx
);
1626 prterrcode("doflush: ops->flush", ret
);
1630 doread(unsigned offset
, unsigned size
)
1634 offset
-= offset
% readbdy
;
1636 size
-= size
% readbdy
;
1638 if (!quiet
&& testcalls
> simulatedopcount
&& !o_direct
)
1639 prt("skipping zero size read\n");
1640 log4(OP_SKIPPED
, OP_READ
, offset
, size
);
1643 if (size
+ offset
> file_size
) {
1644 if (!quiet
&& testcalls
> simulatedopcount
)
1645 prt("skipping seek/read past end of file\n");
1646 log4(OP_SKIPPED
, OP_READ
, offset
, size
);
1650 log4(OP_READ
, offset
, size
, 0);
1652 if (testcalls
<= simulatedopcount
)
1656 ((progressinterval
&& testcalls
% progressinterval
== 0) ||
1658 (monitorstart
== -1 ||
1659 (static_cast<long>(offset
+ size
) > monitorstart
&&
1660 (monitorend
== -1 ||
1661 static_cast<long>(offset
) <= monitorend
))))))
1662 prt("%lu read\t0x%x thru\t0x%x\t(0x%x bytes)\n", testcalls
,
1663 offset
, offset
+ size
- 1, size
);
1665 ret
= ops
->read(&ctx
, offset
, size
, temp_buf
);
1666 if (ret
!= (int)size
) {
1668 prterrcode("doread: ops->read", ret
);
1670 prt("short read: 0x%x bytes instead of 0x%x\n",
1672 report_failure(141);
1675 check_buffers(good_buf
, temp_buf
, offset
, size
);
1680 check_eofpage(char *s
, unsigned offset
, char *p
, int size
)
1682 unsigned long last_page
, should_be_zero
;
1684 if (offset
+ size
<= (file_size
& ~page_mask
))
1687 * we landed in the last page of the file
1688 * test to make sure the VM system provided 0's
1689 * beyond the true end of the file mapping
1690 * (as required by mmap def in 1996 posix 1003.1)
1692 last_page
= ((unsigned long)p
+ (offset
& page_mask
) + size
) & ~page_mask
;
1694 for (should_be_zero
= last_page
+ (file_size
& page_mask
);
1695 should_be_zero
< last_page
+ page_size
;
1697 if (*(char *)should_be_zero
) {
1698 prt("Mapped %s: non-zero data past EOF (0x%llx) page offset 0x%x is 0x%04x\n",
1699 s
, file_size
- 1, should_be_zero
& page_mask
,
1700 short_at(should_be_zero
));
1701 report_failure(205);
1707 gendata(char *original_buf
, char *good_buf
, unsigned offset
, unsigned size
)
1710 good_buf
[offset
] = testcalls
% 256;
1712 good_buf
[offset
] += original_buf
[offset
];
1719 dowrite(unsigned offset
, unsigned size
)
1724 offset
-= offset
% writebdy
;
1726 size
-= size
% writebdy
;
1728 if (!quiet
&& testcalls
> simulatedopcount
&& !o_direct
)
1729 prt("skipping zero size write\n");
1730 log4(OP_SKIPPED
, OP_WRITE
, offset
, size
);
1734 log4(OP_WRITE
, offset
, size
, file_size
);
1736 gendata(original_buf
, good_buf
, offset
, size
);
1737 if (file_size
< offset
+ size
) {
1738 newsize
= ceil(((double)offset
+ size
) / truncbdy
) * truncbdy
;
1739 if (file_size
< newsize
)
1740 memset(good_buf
+ file_size
, '\0', newsize
- file_size
);
1741 file_size
= newsize
;
1743 warn("Lite file size bug in fsx!");
1744 report_failure(149);
1746 ret
= ops
->resize(&ctx
, newsize
);
1748 prterrcode("dowrite: ops->resize", ret
);
1749 report_failure(150);
1753 if (testcalls
<= simulatedopcount
)
1757 ((progressinterval
&& testcalls
% progressinterval
== 0) ||
1759 (monitorstart
== -1 ||
1760 (static_cast<long>(offset
+ size
) > monitorstart
&&
1761 (monitorend
== -1 ||
1762 static_cast<long>(offset
) <= monitorend
))))))
1763 prt("%lu write\t0x%x thru\t0x%x\t(0x%x bytes)\n", testcalls
,
1764 offset
, offset
+ size
- 1, size
);
1766 ret
= ops
->write(&ctx
, offset
, size
, good_buf
+ offset
);
1767 if (ret
!= (ssize_t
)size
) {
1769 prterrcode("dowrite: ops->write", ret
);
1771 prt("short write: 0x%x bytes instead of 0x%x\n",
1773 report_failure(151);
1777 doflush(offset
, size
);
1782 dotruncate(unsigned size
)
1784 int oldsize
= file_size
;
1787 size
-= size
% truncbdy
;
1788 if (size
> biggest
) {
1790 if (!quiet
&& testcalls
> simulatedopcount
)
1791 prt("truncating to largest ever: 0x%x\n", size
);
1794 log4(OP_TRUNCATE
, size
, (unsigned)file_size
, 0);
1796 if (size
> file_size
)
1797 memset(good_buf
+ file_size
, '\0', size
- file_size
);
1798 else if (size
< file_size
)
1799 memset(good_buf
+ size
, '\0', file_size
- size
);
1802 if (testcalls
<= simulatedopcount
)
1805 if ((progressinterval
&& testcalls
% progressinterval
== 0) ||
1806 (debug
&& (monitorstart
== -1 || monitorend
== -1 ||
1807 (long)size
<= monitorend
)))
1808 prt("%lu trunc\tfrom 0x%x to 0x%x\n", testcalls
, oldsize
, size
);
1810 ret
= ops
->resize(&ctx
, size
);
1812 prterrcode("dotruncate: ops->resize", ret
);
1813 report_failure(160);
1818 do_punch_hole(unsigned offset
, unsigned length
)
1820 unsigned end_offset
;
1825 offset
-= offset
% holebdy
;
1826 length
-= length
% holebdy
;
1828 if (!quiet
&& testcalls
> simulatedopcount
)
1829 prt("skipping zero length punch hole\n");
1830 log4(OP_SKIPPED
, OP_PUNCH_HOLE
, offset
, length
);
1834 if (file_size
<= (loff_t
)offset
) {
1835 if (!quiet
&& testcalls
> simulatedopcount
)
1836 prt("skipping hole punch off the end of the file\n");
1837 log4(OP_SKIPPED
, OP_PUNCH_HOLE
, offset
, length
);
1841 end_offset
= offset
+ length
;
1843 log4(OP_PUNCH_HOLE
, offset
, length
, 0);
1845 if (testcalls
<= simulatedopcount
)
1848 if ((progressinterval
&& testcalls
% progressinterval
== 0) ||
1849 (debug
&& (monitorstart
== -1 || monitorend
== -1 ||
1850 (long)end_offset
<= monitorend
))) {
1851 prt("%lu punch\tfrom 0x%x to 0x%x, (0x%x bytes)\n", testcalls
,
1852 offset
, offset
+length
, length
);
1855 ret
= ops
->discard(&ctx
, (unsigned long long)offset
,
1856 (unsigned long long)length
);
1858 prterrcode("do_punch_hole: ops->discard", ret
);
1859 report_failure(161);
1862 max_offset
= offset
< file_size
? offset
: file_size
;
1863 max_len
= max_offset
+ length
<= file_size
? length
:
1864 file_size
- max_offset
;
1865 memset(good_buf
+ max_offset
, '\0', max_len
);
1868 unsigned get_data_size(unsigned size
)
1872 unsigned max
= sqrt((double)size
) + 1;
1874 unsigned curr
= good
;
1876 hint
= get_random() % max
;
1878 for (i
= 1; i
< max
&& curr
< hint
; i
++) {
1879 if (size
% i
== 0) {
1892 dowritesame(unsigned offset
, unsigned size
)
1900 offset
-= offset
% writebdy
;
1902 size
-= size
% writebdy
;
1904 if (!quiet
&& testcalls
> simulatedopcount
&& !o_direct
)
1905 prt("skipping zero size writesame\n");
1906 log4(OP_SKIPPED
, OP_WRITESAME
, offset
, size
);
1910 data_size
= get_data_size(size
);
1912 log4(OP_WRITESAME
, offset
, size
, data_size
);
1914 gendata(original_buf
, good_buf
, offset
, data_size
);
1915 if (file_size
< offset
+ size
) {
1916 newsize
= ceil(((double)offset
+ size
) / truncbdy
) * truncbdy
;
1917 if (file_size
< newsize
)
1918 memset(good_buf
+ file_size
, '\0', newsize
- file_size
);
1919 file_size
= newsize
;
1921 warn("Lite file size bug in fsx!");
1922 report_failure(162);
1924 ret
= ops
->resize(&ctx
, newsize
);
1926 prterrcode("dowritesame: ops->resize", ret
);
1927 report_failure(163);
1931 for (n
= size
/ data_size
, buf_off
= data_size
; n
> 1; n
--) {
1932 memcpy(good_buf
+ offset
+ buf_off
, good_buf
+ offset
, data_size
);
1933 buf_off
+= data_size
;
1936 if (testcalls
<= simulatedopcount
)
1940 ((progressinterval
&& testcalls
% progressinterval
== 0) ||
1942 (monitorstart
== -1 ||
1943 (static_cast<long>(offset
+ size
) > monitorstart
&&
1944 (monitorend
== -1 ||
1945 static_cast<long>(offset
) <= monitorend
))))))
1946 prt("%lu writesame\t0x%x thru\t0x%x\tdata_size\t0x%x(0x%x bytes)\n", testcalls
,
1947 offset
, offset
+ size
- 1, data_size
, size
);
1949 ret
= ops
->writesame(&ctx
, offset
, size
, good_buf
+ offset
, data_size
);
1950 if (ret
!= (ssize_t
)size
) {
1952 prterrcode("dowritesame: ops->writesame", ret
);
1954 prt("short writesame: 0x%x bytes instead of 0x%x\n",
1956 report_failure(164);
1960 doflush(offset
, size
);
1964 docompareandwrite(unsigned offset
, unsigned size
)
1968 if (skip_partial_discard
) {
1969 if (!quiet
&& testcalls
> simulatedopcount
)
1970 prt("compare and write disabled\n");
1971 log4(OP_SKIPPED
, OP_COMPARE_AND_WRITE
, offset
, size
);
1975 offset
-= offset
% writebdy
;
1977 size
-= size
% writebdy
;
1980 if (!quiet
&& testcalls
> simulatedopcount
&& !o_direct
)
1981 prt("skipping zero size read\n");
1982 log4(OP_SKIPPED
, OP_READ
, offset
, size
);
1986 if (size
+ offset
> file_size
) {
1987 if (!quiet
&& testcalls
> simulatedopcount
)
1988 prt("skipping seek/compare past end of file\n");
1989 log4(OP_SKIPPED
, OP_COMPARE_AND_WRITE
, offset
, size
);
1993 memcpy(temp_buf
+ offset
, good_buf
+ offset
, size
);
1994 gendata(original_buf
, good_buf
, offset
, size
);
1995 log4(OP_COMPARE_AND_WRITE
, offset
, size
, 0);
1997 if (testcalls
<= simulatedopcount
)
2001 ((progressinterval
&& testcalls
% progressinterval
== 0) ||
2003 (monitorstart
== -1 ||
2004 (static_cast<long>(offset
+ size
) > monitorstart
&&
2005 (monitorend
== -1 ||
2006 static_cast<long>(offset
) <= monitorend
))))))
2007 prt("%lu compareandwrite\t0x%x thru\t0x%x\t(0x%x bytes)\n", testcalls
,
2008 offset
, offset
+ size
- 1, size
);
2010 ret
= ops
->compare_and_write(&ctx
, offset
, size
, temp_buf
+ offset
,
2012 if (ret
!= (ssize_t
)size
) {
2013 if (ret
== -EINVAL
) {
2014 memcpy(good_buf
+ offset
, temp_buf
+ offset
, size
);
2018 prterrcode("docompareandwrite: ops->compare_and_write", ret
);
2020 prt("short write: 0x%x bytes instead of 0x%x\n", ret
, size
);
2021 report_failure(151);
2026 doflush(offset
, size
);
2029 void clone_filename(char *buf
, size_t len
, int clones
)
2031 snprintf(buf
, len
, "%s/fsx-%s-parent%d",
2032 dirpath
, iname
, clones
);
2035 void clone_imagename(char *buf
, size_t len
, int clones
)
2038 snprintf(buf
, len
, "%s-clone%d", iname
, clones
);
2040 strncpy(buf
, iname
, len
);
2041 buf
[len
- 1] = '\0';
2044 void replay_imagename(char *buf
, size_t len
, int clones
)
2046 clone_imagename(buf
, len
, clones
);
2047 strncat(buf
, "-replay", len
- strlen(buf
));
2048 buf
[len
- 1] = '\0';
2051 void check_clone(int clonenum
, bool replay_image
);
2056 char filename
[1024];
2057 char imagename
[1024];
2058 char lastimagename
[1024];
2060 int order
= 0, stripe_unit
= 0, stripe_count
= 0;
2061 uint64_t newsize
= file_size
;
2063 log4(OP_CLONE
, 0, 0, 0);
2066 if (randomize_striping
) {
2067 order
= 18 + get_random() % 8;
2068 stripe_unit
= 1ull << (order
- 1 - (get_random() % 8));
2069 stripe_count
= 2 + get_random() % 14;
2072 prt("%lu clone\t%d order %d su %d sc %d\n", testcalls
, num_clones
,
2073 order
, stripe_unit
, stripe_count
);
2075 clone_imagename(imagename
, sizeof(imagename
), num_clones
);
2076 clone_imagename(lastimagename
, sizeof(lastimagename
),
2078 assert(strcmp(lastimagename
, ctx
.name
) == 0);
2080 ret
= ops
->clone(&ctx
, "snap", imagename
, &order
, stripe_unit
,
2083 prterrcode("do_clone: ops->clone", ret
);
2087 if (randomize_parent_overlap
&& rbd_image_has_parent(&ctx
)) {
2088 int rand
= get_random() % 16 + 1; // [1..16]
2093 ret
= rbd_get_overlap(ctx
.image
, &overlap
);
2095 prterrcode("do_clone: rbd_get_overlap", ret
);
2099 if (rand
< 10) { // 9/16
2100 newsize
= overlap
* ((double)rand
/ 10);
2101 newsize
-= newsize
% truncbdy
;
2106 assert(newsize
!= (uint64_t)file_size
);
2107 prt("truncating image %s from 0x%llx (overlap 0x%llx) to 0x%llx\n",
2108 ctx
.name
, file_size
, overlap
, newsize
);
2110 ret
= ops
->resize(&ctx
, newsize
);
2112 prterrcode("do_clone: ops->resize", ret
);
2115 } else if (rand
< 15) { // 2/16
2116 prt("flattening image %s\n", ctx
.name
);
2118 ret
= ops
->flatten(&ctx
);
2120 prterrcode("do_clone: ops->flatten", ret
);
2124 prt("leaving image %s intact\n", ctx
.name
);
2128 clone_filename(filename
, sizeof(filename
), num_clones
);
2129 if ((fd
= open(filename
, O_WRONLY
|O_CREAT
|O_TRUNC
, 0666)) < 0) {
2130 simple_err("do_clone: open", -errno
);
2133 save_buffer(good_buf
, newsize
, fd
);
2134 if ((ret
= close(fd
)) < 0) {
2135 simple_err("do_clone: close", -errno
);
2142 if ((ret
= ops
->close(&ctx
)) < 0) {
2143 prterrcode("do_clone: ops->close", ret
);
2147 if (journal_replay
) {
2148 ret
= finalize_journal(ioctx
, lastimagename
, num_clones
- 1,
2149 order
, stripe_unit
, stripe_count
);
2154 ret
= register_journal(ioctx
, imagename
);
2161 * Open freshly made clone.
2163 if ((ret
= ops
->open(imagename
, &ctx
)) < 0) {
2164 prterrcode("do_clone: ops->open", ret
);
2168 if (num_clones
> 1) {
2169 if (journal_replay
) {
2170 check_clone(num_clones
- 2, true);
2172 check_clone(num_clones
- 2, false);
2177 check_clone(int clonenum
, bool replay_image
)
2180 char imagename
[128];
2182 struct rbd_ctx cur_ctx
= RBD_CTX_INIT
;
2183 struct stat file_info
;
2184 char *good_buf
, *temp_buf
;
2187 replay_imagename(imagename
, sizeof(imagename
), clonenum
);
2189 clone_imagename(imagename
, sizeof(imagename
), clonenum
);
2192 if ((ret
= ops
->open(imagename
, &cur_ctx
)) < 0) {
2193 prterrcode("check_clone: ops->open", ret
);
2197 clone_filename(filename
, sizeof(filename
), clonenum
+ 1);
2198 if ((fd
= open(filename
, O_RDONLY
)) < 0) {
2199 simple_err("check_clone: open", -errno
);
2203 prt("checking clone #%d, image %s against file %s\n",
2204 clonenum
, imagename
, filename
);
2205 if ((ret
= fstat(fd
, &file_info
)) < 0) {
2206 simple_err("check_clone: fstat", -errno
);
2211 ret
= posix_memalign((void **)&good_buf
,
2212 MAX(writebdy
, (int)sizeof(void *)),
2215 prterrcode("check_clone: posix_memalign(good_buf)", -ret
);
2220 ret
= posix_memalign((void **)&temp_buf
,
2221 MAX(readbdy
, (int)sizeof(void *)),
2224 prterrcode("check_clone: posix_memalign(temp_buf)", -ret
);
2228 if ((ret
= pread(fd
, good_buf
, file_info
.st_size
, 0)) < 0) {
2229 simple_err("check_clone: pread", -errno
);
2232 if ((ret
= ops
->read(&cur_ctx
, 0, file_info
.st_size
, temp_buf
)) < 0) {
2233 prterrcode("check_clone: ops->read", ret
);
2237 if ((ret
= ops
->close(&cur_ctx
)) < 0) {
2238 prterrcode("check_clone: ops->close", ret
);
2241 check_buffers(good_buf
, temp_buf
, 0, file_info
.st_size
);
2243 if (!replay_image
) {
2256 ret
= ops
->write(&ctx
, 0, file_size
, good_buf
);
2257 if (ret
!= file_size
) {
2259 prterrcode("writefileimage: ops->write", ret
);
2261 prt("short write: 0x%x bytes instead of 0x%llx\n",
2262 ret
, (unsigned long long)file_size
);
2263 report_failure(172);
2267 ret
= ops
->resize(&ctx
, file_size
);
2269 prterrcode("writefileimage: ops->resize", ret
);
2270 report_failure(173);
2280 if (!rbd_image_has_parent(&ctx
)) {
2281 log4(OP_SKIPPED
, OP_FLATTEN
, 0, 0);
2284 log4(OP_FLATTEN
, 0, 0, 0);
2285 prt("%lu flatten\n", testcalls
);
2287 ret
= ops
->flatten(&ctx
);
2289 prterrcode("writefileimage: ops->flatten", ret
);
2300 if (testcalls
<= simulatedopcount
)
2303 name
= strdup(ctx
.name
);
2306 prt("%lu close/open\n", testcalls
);
2308 ret
= ops
->close(&ctx
);
2310 prterrcode("docloseopen: ops->close", ret
);
2311 report_failure(180);
2314 ret
= ops
->open(name
, &ctx
);
2316 prterrcode("docloseopen: ops->open", ret
);
2317 report_failure(181);
2323 #define TRIM_OFF_LEN(off, len, size) \
2329 if ((unsigned)(off) + (unsigned)(len) > (unsigned)(size)) \
2330 (len) = (size) - (off); \
2336 unsigned long offset
;
2337 unsigned long size
= maxoplen
;
2338 unsigned long rv
= get_random();
2341 if (simulatedopcount
> 0 && testcalls
== simulatedopcount
)
2347 closeopen
= (rv
>> 3) < (1u << 28) / (unsigned)closeprob
;
2349 if (debugstart
> 0 && testcalls
>= debugstart
)
2352 if (!quiet
&& testcalls
< simulatedopcount
&& testcalls
% 100000 == 0)
2353 prt("%lu...\n", testcalls
);
2355 offset
= get_random();
2357 size
= get_random() % (maxoplen
+ 1);
2359 /* calculate appropriate op to run */
2361 op
= rv
% OP_MAX_LITE
;
2363 op
= rv
% OP_MAX_FULL
;
2375 if (!fallocate_calls
) {
2376 log4(OP_SKIPPED
, OP_FALLOCATE
, offset
, size
);
2381 if (!punch_hole_calls
) {
2382 log4(OP_SKIPPED
, OP_PUNCH_HOLE
, offset
, size
);
2387 /* clone, 8% chance */
2388 if (!clone_calls
|| file_size
== 0 || get_random() % 100 >= 8) {
2389 log4(OP_SKIPPED
, OP_CLONE
, 0, 0);
2394 /* flatten four times as rarely as clone, 2% chance */
2395 if (get_random() % 100 >= 2) {
2396 log4(OP_SKIPPED
, OP_FLATTEN
, 0, 0);
2401 /* writesame not implemented */
2402 if (!ops
->writesame
) {
2403 log4(OP_SKIPPED
, OP_WRITESAME
, offset
, size
);
2407 case OP_COMPARE_AND_WRITE
:
2408 /* compare_and_write not implemented */
2409 if (!ops
->compare_and_write
) {
2410 log4(OP_SKIPPED
, OP_COMPARE_AND_WRITE
, offset
, size
);
2418 TRIM_OFF_LEN(offset
, size
, file_size
);
2419 doread(offset
, size
);
2423 TRIM_OFF_LEN(offset
, size
, maxfilelen
);
2424 dowrite(offset
, size
);
2428 TRIM_OFF_LEN(offset
, size
, file_size
);
2433 TRIM_OFF_LEN(offset
, size
, maxfilelen
);
2439 size
= get_random() % maxfilelen
;
2444 TRIM_OFF_LEN(offset
, size
, file_size
);
2445 do_punch_hole(offset
, size
);
2449 TRIM_OFF_LEN(offset
, size
, maxfilelen
);
2450 dowritesame(offset
, size
);
2452 case OP_COMPARE_AND_WRITE
:
2453 TRIM_OFF_LEN(offset
, size
, file_size
);
2454 docompareandwrite(offset
, size
);
2466 prterr("test: unknown operation");
2472 if (sizechecks
&& testcalls
> simulatedopcount
)
2483 prt("signal %d\n", sig
);
2484 prt("testcalls = %lu\n", testcalls
);
2492 fprintf(stdout
, "usage: %s",
2493 "fsx [-dfjknqxyACFHKLORUWZ] [-b opnum] [-c Prob] [-h holebdy] [-l flen] [-m start:end] [-o oplen] [-p progressinterval] [-r readbdy] [-s style] [-t truncbdy] [-w writebdy] [-D startingop] [-N numops] [-P dirpath] [-S seed] pname iname\n\
2494 -b opnum: beginning operation number (default 1)\n\
2495 -c P: 1 in P chance of file close+open at each op (default infinity)\n\
2496 -d: debug output for all operations\n\
2497 -f: flush and invalidate cache after I/O\n\
2498 -h holebdy: 4096 would make discards page aligned (default 1)\n\
2499 -j: journal replay stress test\n\
2500 -k: keep data on success (default 0)\n\
2501 -l flen: the upper bound on file size (default 262144)\n\
2502 -m startop:endop: monitor (print debug output) specified byte range (default 0:infinity)\n\
2503 -n: no verifications of file size\n\
2504 -o oplen: the upper bound on operation size (default 65536)\n\
2505 -p progressinterval: debug output at specified operation interval\n\
2506 -q: quieter operation\n\
2507 -r readbdy: 4096 would make reads page aligned (default 1)\n\
2508 -s style: 1 gives smaller truncates (default 0)\n\
2509 -t truncbdy: 4096 would make truncates page aligned (default 1)\n\
2510 -w writebdy: 4096 would make writes page aligned (default 1)\n\
2511 -x: preallocate file space before starting, XFS only (default 0)\n\
2512 -y: synchronize changes to a file\n"
2514 " -C: do not use clone calls\n\
2515 -D startingop: debug output starting at specified operation\n"
2517 " -F: Do not use fallocate (preallocation) calls\n"
2519 " -H: do not use punch hole calls\n\
2520 -K: enable krbd mode (use -t and -h too)\n\
2521 -M: enable rbd-nbd mode (use -t and -h too)\n\
2522 -L: fsxLite - no file creations & no file size changes\n\
2523 -N numops: total # operations to do (default infinity)\n\
2524 -O: use oplen (see -o flag) for every op (default random)\n\
2525 -P dirpath: save .fsxlog and .fsxgood files in dirpath (default ./)\n\
2526 -R: read() system calls only (mapped reads disabled)\n\
2527 -S seed: for random # generator (default 1) 0 gets timestamp\n\
2528 -U: disable randomized striping\n\
2529 -W: mapped write operations DISabled\n\
2530 -Z: O_DIRECT (use -R, -W, -r and -w too)\n\
2531 poolname: this is REQUIRED (no default)\n\
2532 imagename: this is REQUIRED (no default)\n");
2538 getnum(char *s
, char **e
)
2543 ret
= strtol(s
, e
, 0);
2574 if (!lite
&& fallocate_calls
) {
2575 if (fallocate(fd
, 0, 0, 1) && errno
== EOPNOTSUPP
) {
2577 warn("main: filesystem does not support fallocate, disabling\n");
2578 fallocate_calls
= 0;
2583 #else /* ! FALLOCATE */
2584 fallocate_calls
= 0;
2589 void remove_image(rados_ioctx_t ioctx
, char *imagename
, bool remove_snap
,
2595 if ((ret
= rbd_open(ioctx
, imagename
, &image
, NULL
)) < 0) {
2596 sprintf(errmsg
, "rbd_open %s", imagename
);
2597 prterrcode(errmsg
, ret
);
2598 report_failure(101);
2601 if ((ret
= rbd_snap_unprotect(image
, "snap")) < 0) {
2602 sprintf(errmsg
, "rbd_snap_unprotect %s@snap",
2604 prterrcode(errmsg
, ret
);
2605 report_failure(102);
2607 if ((ret
= rbd_snap_remove(image
, "snap")) < 0) {
2608 sprintf(errmsg
, "rbd_snap_remove %s@snap",
2610 prterrcode(errmsg
, ret
);
2611 report_failure(103);
2614 if ((ret
= rbd_close(image
)) < 0) {
2615 sprintf(errmsg
, "rbd_close %s", imagename
);
2616 prterrcode(errmsg
, ret
);
2617 report_failure(104);
2621 (ret
= unregister_journal(ioctx
, imagename
)) < 0) {
2622 report_failure(105);
2625 if ((ret
= rbd_remove(ioctx
, imagename
)) < 0) {
2626 sprintf(errmsg
, "rbd_remove %s", imagename
);
2627 prterrcode(errmsg
, ret
);
2628 report_failure(106);
2633 main(int argc
, char **argv
)
2635 int i
, style
, ch
, ret
;
2637 char goodfile
[1024];
2643 page_size
= getpagesize();
2644 page_mask
= page_size
- 1;
2645 mmap_mask
= page_mask
;
2647 setvbuf(stdout
, (char *)0, _IOLBF
, 0); /* line buffered stdout */
2649 while ((ch
= getopt(argc
, argv
, "b:c:dfh:jkl:m:no:p:qr:s:t:w:xyCD:FHKMLN:OP:RS:UWZ"))
2653 simulatedopcount
= getnum(optarg
, &endp
);
2655 fprintf(stdout
, "Will begin at operation %lu\n",
2657 if (simulatedopcount
== 0)
2659 simulatedopcount
-= 1;
2662 closeprob
= getnum(optarg
, &endp
);
2665 "Chance of close/open is 1 in %d\n",
2677 holebdy
= getnum(optarg
, &endp
);
2682 journal_replay
= true;
2685 keep_on_success
= 1;
2689 int _num
= getnum(optarg
, &endp
);
2696 monitorstart
= getnum(optarg
, &endp
);
2697 if (monitorstart
< 0)
2699 if (!endp
|| *endp
++ != ':')
2701 monitorend
= getnum(endp
, &endp
);
2704 if (monitorend
== 0)
2705 monitorend
= -1; /* aka infinity */
2712 maxoplen
= getnum(optarg
, &endp
);
2717 progressinterval
= getnum(optarg
, &endp
);
2718 if (progressinterval
== 0)
2725 readbdy
= getnum(optarg
, &endp
);
2730 style
= getnum(optarg
, &endp
);
2731 if (style
< 0 || style
> 1)
2735 truncbdy
= getnum(optarg
, &endp
);
2740 writebdy
= getnum(optarg
, &endp
);
2754 debugstart
= getnum(optarg
, &endp
);
2759 fallocate_calls
= 0;
2762 punch_hole_calls
= 0;
2765 prt("krbd mode enabled\n");
2766 ops
= &krbd_operations
;
2769 prt("rbd-nbd mode enabled\n");
2770 ops
= &nbd_operations
;
2773 prt("lite mode not supported for rbd\n");
2777 numops
= getnum(optarg
, &endp
);
2785 strncpy(dirpath
, optarg
, sizeof(dirpath
)-1);
2786 dirpath
[sizeof(dirpath
)-1] = '\0';
2787 strncpy(goodfile
, dirpath
, sizeof(goodfile
)-1);
2788 goodfile
[sizeof(goodfile
)-1] = '\0';
2789 if (strlen(goodfile
) < sizeof(goodfile
)-2) {
2790 strcat(goodfile
, "/");
2792 prt("file name to long\n");
2795 strncpy(logfile
, dirpath
, sizeof(logfile
)-1);
2796 logfile
[sizeof(logfile
)-1] = '\0';
2797 if (strlen(logfile
) < sizeof(logfile
)-2) {
2798 strcat(logfile
, "/");
2800 prt("file path to long\n");
2807 fprintf(stdout
, "mapped reads DISABLED\n");
2810 seed
= getnum(optarg
, &endp
);
2812 seed
= time(0) % 10000;
2814 fprintf(stdout
, "Seed set to %d\n", seed
);
2819 randomize_striping
= 0;
2824 fprintf(stdout
, "mapped writes DISABLED\n");
2827 o_direct
= O_DIRECT
;
2840 signal(SIGHUP
, cleanup
);
2841 signal(SIGINT
, cleanup
);
2842 signal(SIGPIPE
, cleanup
);
2843 signal(SIGALRM
, cleanup
);
2844 signal(SIGTERM
, cleanup
);
2845 signal(SIGXCPU
, cleanup
);
2846 signal(SIGXFSZ
, cleanup
);
2847 signal(SIGVTALRM
, cleanup
);
2848 signal(SIGUSR1
, cleanup
);
2849 signal(SIGUSR2
, cleanup
);
2851 random_generator
.seed(seed
);
2853 ret
= create_image();
2855 prterrcode(iname
, ret
);
2858 ret
= ops
->open(iname
, &ctx
);
2860 simple_err("Error opening image", ret
);
2864 strcat(dirpath
, ".");
2865 strncat(goodfile
, iname
, 256);
2866 strcat (goodfile
, ".fsxgood");
2867 fsxgoodfd
= open(goodfile
, O_RDWR
|O_CREAT
|O_TRUNC
, 0666);
2868 if (fsxgoodfd
< 0) {
2872 strncat(logfile
, iname
, 256);
2873 strcat (logfile
, ".fsxlog");
2874 fsxlogf
= fopen(logfile
, "w");
2875 if (fsxlogf
== NULL
) {
2880 original_buf
= (char *) malloc(maxfilelen
);
2881 for (i
= 0; i
< (int)maxfilelen
; i
++)
2882 original_buf
[i
] = get_random() % 256;
2884 ret
= posix_memalign((void **)&good_buf
,
2885 MAX(writebdy
, (int)sizeof(void *)), maxfilelen
);
2888 prt("writebdy is not a suitable power of two\n");
2890 prterrcode("main: posix_memalign(good_buf)", -ret
);
2893 memset(good_buf
, '\0', maxfilelen
);
2895 ret
= posix_memalign((void **)&temp_buf
,
2896 MAX(readbdy
, (int)sizeof(void *)), maxfilelen
);
2899 prt("readbdy is not a suitable power of two\n");
2901 prterrcode("main: posix_memalign(temp_buf)", -ret
);
2904 memset(temp_buf
, '\0', maxfilelen
);
2906 if (lite
) { /* zero entire existing file */
2909 written
= ops
->write(&ctx
, 0, (size_t)maxfilelen
, good_buf
);
2910 if (written
!= (ssize_t
)maxfilelen
) {
2912 prterrcode(iname
, written
);
2913 warn("main: error on write");
2915 warn("main: short write, 0x%x bytes instead "
2926 while (numops
== -1 || numops
--)
2929 ret
= ops
->close(&ctx
);
2931 prterrcode("ops->close", ret
);
2935 if (journal_replay
) {
2936 char imagename
[1024];
2937 clone_imagename(imagename
, sizeof(imagename
), num_clones
);
2938 ret
= finalize_journal(ioctx
, imagename
, num_clones
, 0, 0, 0);
2940 report_failure(100);
2944 if (num_clones
> 0) {
2945 if (journal_replay
) {
2946 check_clone(num_clones
- 1, true);
2948 check_clone(num_clones
- 1, false);
2951 if (!keep_on_success
) {
2952 while (num_clones
>= 0) {
2953 static bool remove_snap
= false;
2955 if (journal_replay
) {
2956 char replayimagename
[1024];
2957 replay_imagename(replayimagename
,
2958 sizeof(replayimagename
),
2960 remove_image(ioctx
, replayimagename
,
2965 char clonename
[128];
2966 clone_imagename(clonename
, 128, num_clones
);
2967 remove_image(ioctx
, clonename
, remove_snap
,
2975 prt("All operations completed A-OK!\n");
2978 rados_ioctx_destroy(ioctx
);
2980 rados_shutdown(cluster
);