]> git.proxmox.com Git - ceph.git/blob - ceph/src/test/librbd/fsx.cc
cee2a08960223fac08c2017b672c0be748bfcb94
[ceph.git] / ceph / src / test / librbd / fsx.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:8; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=8 smarttab
3 /*
4 * Copyright (C) 1991, NeXT Computer, Inc. All Rights Reserverd.
5 *
6 * File: fsx.cc
7 * Author: Avadis Tevanian, Jr.
8 *
9 * File system exerciser.
10 *
11 * Rewritten 8/98 by Conrad Minshall.
12 *
13 * Small changes to work under Linux -- davej.
14 *
15 * Checks for mmap last-page zero fill.
16 */
17
18 #include <sys/types.h>
19 #include <unistd.h>
20 #include <getopt.h>
21 #include <limits.h>
22 #include <strings.h>
23 #if defined(__FreeBSD__)
24 #include <sys/disk.h>
25 #endif
26 #include <sys/file.h>
27 #include <sys/stat.h>
28 #include <sys/mman.h>
29 #if defined(__linux__)
30 #include <linux/fs.h>
31 #endif
32 #include <sys/ioctl.h>
33 #ifdef HAVE_ERR_H
34 #include <err.h>
35 #endif
36 #include <signal.h>
37 #include <stddef.h>
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <string.h>
41 #include <stdarg.h>
42 #include <assert.h>
43 #include <errno.h>
44 #include <math.h>
45 #include <fcntl.h>
46 #include <random>
47
48 #include "include/compat.h"
49 #include "include/intarith.h"
50 #if defined(WITH_KRBD)
51 #include "include/krbd.h"
52 #endif
53 #include "include/rados/librados.h"
54 #include "include/rados/librados.hpp"
55 #include "include/rbd/librbd.h"
56 #include "include/rbd/librbd.hpp"
57 #include "common/Cond.h"
58 #include "common/SubProcess.h"
59 #include "common/safe_io.h"
60 #include "journal/Journaler.h"
61 #include "journal/ReplayEntry.h"
62 #include "journal/ReplayHandler.h"
63 #include "journal/Settings.h"
64
65 #include <boost/scope_exit.hpp>
66
67 #define NUMPRINTCOLUMNS 32 /* # columns of data to print on each line */
68
69 /*
70 * A log entry is an operation and a bunch of arguments.
71 */
72
/* One recorded operation: an operation code plus up to three integer arguments. */
struct log_entry {
	int operation;
	int args[3];
};

/* Capacity of the oplog array below. */
#define LOGSIZE 1000

struct log_entry oplog[LOGSIZE]; /* the log */
int logptr = 0; /* current position in log */
int logcount = 0; /* total ops */
83
84 /*
85 * The operation matrix is complex due to conditional execution of different
86 * features. Hence when we come to deciding what operation to run, we need to
87 * be careful in how we select the different operations. The active operations
88 * are mapped to numbers as follows:
89 *
90 * lite !lite
91 * READ: 0 0
92 * WRITE: 1 1
93 * MAPREAD: 2 2
94 * MAPWRITE: 3 3
95 * TRUNCATE: - 4
96 * FALLOCATE: - 5
97 * PUNCH HOLE: - 6
98 * WRITESAME: - 7
99 * COMPAREANDWRITE: - 8
100 *
101 * When mapped read/writes are disabled, they are simply converted to normal
102 * reads and writes. When fallocate/fpunch calls are disabled, they are
103 * converted to OP_SKIPPED. Hence OP_SKIPPED needs to have a number higher than
104 * the operation selection matrix, as does the OP_CLOSEOPEN which is an
105 * operation modifier rather than an operation in itself.
106 *
107 * Because of the "lite" version, we also need to have different "maximum
108 * operation" defines to allow the ops to be selected correctly based on the
109 * mode being run.
110 */
111
/* common operations */
#define OP_READ 0
#define OP_WRITE 1
#define OP_MAPREAD 2
#define OP_MAPWRITE 3
/* one past the last operation number available in "lite" mode */
#define OP_MAX_LITE 4

/* !lite operations */
#define OP_TRUNCATE 4
#define OP_FALLOCATE 5
#define OP_PUNCH_HOLE 6
#define OP_WRITESAME 7
#define OP_COMPARE_AND_WRITE 8
/* rbd-specific operations */
#define OP_CLONE 9
#define OP_FLATTEN 10
/* one past the last operation number available in full (!lite) mode */
#define OP_MAX_FULL 11

/* operation modifiers (deliberately numbered above the selection range) */
#define OP_CLOSEOPEN 100
#define OP_SKIPPED 101
133
/*
 * Override any system-provided PAGE_SIZE/PAGE_MASK: PAGE_SIZE expands to a
 * getpagesize() call at every use, and PAGE_MASK is that value minus one.
 */
#undef PAGE_SIZE
#define PAGE_SIZE getpagesize()
#undef PAGE_MASK
#define PAGE_MASK (PAGE_SIZE - 1)


/* Data buffers used by the exerciser. */
char *original_buf; /* a pointer to the original data */
char *good_buf; /* a pointer to the correct data */
char *temp_buf; /* a pointer to the current data */

char dirpath[1024];

off_t file_size = 0;
off_t biggest = 0;
unsigned long testcalls = 0; /* calls to function "test" */

const char* cluster_name = "ceph"; /* --cluster optional */
const char* client_id = "admin"; /* --id optional */

/* Command-line tunables; the comment on each names the flag that sets it. */
unsigned long simulatedopcount = 0; /* -b flag */
int closeprob = 0; /* -c flag */
int debug = 0; /* -d flag */
unsigned long debugstart = 0; /* -D flag */
int flush_enabled = 0; /* -f flag */
int deep_copy = 0; /* -g flag */
int holebdy = 1; /* -h flag */
bool journal_replay = false; /* -j flag */
int keep_on_success = 0; /* -k flag */
int do_fsync = 0; /* -y flag */
unsigned long maxfilelen = 256 * 1024; /* -l flag */
int sizechecks = 1; /* -n flag disables them */
int maxoplen = 64 * 1024; /* -o flag */
int quiet = 0; /* -q flag */
unsigned long progressinterval = 0; /* -p flag */
int readbdy = 1; /* -r flag */
int style = 0; /* -s flag */
int prealloc = 0; /* -x flag */
int truncbdy = 1; /* -t flag */
int writebdy = 1; /* -w flag */
long monitorstart = -1; /* -m flag */
long monitorend = -1; /* -m flag */
int lite = 0; /* -L flag */
long numops = -1; /* -N flag */
int randomoplen = 1; /* -O flag disables it */
int seed = 1; /* -S flag */
int mapped_writes = 0; /* -W flag disables */
int fallocate_calls = 0; /* -F flag disables */
int punch_hole_calls = 1; /* -H flag disables */
int clone_calls = 1; /* -C flag disables */
int randomize_striping = 1; /* -U flag disables */
int randomize_parent_overlap = 1;
int mapped_reads = 0; /* -R flag disables it */
int fsxgoodfd = 0;
int o_direct = 0; /* -Z flag */

int num_clones = 0;

int page_size;
int page_mask;
int mmap_mask;

FILE * fsxlogf = NULL; /* when non-NULL, prt() mirrors its output here */
int badoff = -1;
int closeopen = 0;
198
/*
 * Emit a diagnostic on stderr: the "fsx: " tag, then (when fmt is non-NULL)
 * the formatted message followed by ": ", then strerror(code) and a newline.
 */
void
vwarnc(int code, const char *fmt, va_list ap) {
	fputs("fsx: ", stderr);
	if (fmt) {
		vfprintf(stderr, fmt, ap);
		fputs(": ", stderr);
	}
	fprintf(stderr, "%s\n", strerror(code));
}
208
209 void
210 warn(const char * fmt, ...) {
211 va_list ap;
212 va_start(ap, fmt);
213 vwarnc(errno, fmt, ap);
214 va_end(ap);
215 }
216
217 #define BUF_SIZE 1024
218
219 void
220 prt(const char *fmt, ...)
221 {
222 va_list args;
223 char buffer[BUF_SIZE];
224
225 va_start(args, fmt);
226 vsnprintf(buffer, BUF_SIZE, fmt, args);
227 va_end(args);
228 fprintf(stdout, "%s", buffer);
229 if (fsxlogf)
230 fprintf(fsxlogf, "%s", buffer);
231 }
232
233 void
234 prterr(const char *prefix)
235 {
236 prt("%s%s%s\n", prefix, prefix ? ": " : "", strerror(errno));
237 }
238
239 void
240 prterrcode(const char *prefix, int code)
241 {
242 prt("%s%s%s\n", prefix, prefix ? ": " : "", strerror(-code));
243 }
244
/*
 * Report "<msg>: <description>" on stderr, where err is a negative
 * errno-style code that is negated before being passed to strerror().
 */
void
simple_err(const char *msg, int err)
{
	const char *reason = strerror(-err);

	fprintf(stderr, "%s: %s\n", msg, reason);
}
250
/*
 * random
 *
 * A single process-wide Mersenne Twister engine; get_random() returns its
 * next output.
 */
std::mt19937 random_generator;

uint_fast32_t
get_random(void)
{
	const uint_fast32_t next = random_generator();

	return next;
}
261
262 int get_features(uint64_t* features);
263 void replay_imagename(char *buf, size_t len, int clones);
264
265 namespace {
266
267 static const std::string JOURNAL_CLIENT_ID("fsx");
268
269 struct ReplayHandler : public journal::ReplayHandler {
270 journal::Journaler *journaler;
271 journal::Journaler *replay_journaler;
272 Context *on_finish;
273
274 ReplayHandler(journal::Journaler *journaler,
275 journal::Journaler *replay_journaler, Context *on_finish)
276 : journaler(journaler), replay_journaler(replay_journaler),
277 on_finish(on_finish) {
278 }
279
280 void handle_entries_available() override {
281 while (true) {
282 journal::ReplayEntry replay_entry;
283 if (!journaler->try_pop_front(&replay_entry)) {
284 return;
285 }
286
287 replay_journaler->append(0, replay_entry.get_data());
288 }
289 }
290
291 void handle_complete(int r) override {
292 on_finish->complete(r);
293 }
294 };
295
296 int get_image_id(librados::IoCtx &io_ctx, const char *image_name,
297 std::string *image_id) {
298 librbd::RBD rbd;
299 librbd::Image image;
300 int r = rbd.open(io_ctx, image, image_name);
301 if (r < 0) {
302 simple_err("failed to open image", r);
303 return r;
304 }
305
306 rbd_image_info_t info;
307 r = image.stat(info, sizeof(info));
308 if (r < 0) {
309 simple_err("failed to stat image", r);
310 return r;
311 }
312
313 *image_id = std::string(&info.block_name_prefix[strlen(RBD_DATA_PREFIX)]);
314 return 0;
315 }
316
317 int register_journal(rados_ioctx_t ioctx, const char *image_name) {
318 librados::IoCtx io_ctx;
319 librados::IoCtx::from_rados_ioctx_t(ioctx, io_ctx);
320
321 std::string image_id;
322 int r = get_image_id(io_ctx, image_name, &image_id);
323 if (r < 0) {
324 return r;
325 }
326
327 journal::Journaler journaler(io_ctx, image_id, JOURNAL_CLIENT_ID, {},
328 nullptr);
329 r = journaler.register_client(bufferlist());
330 if (r < 0) {
331 simple_err("failed to register journal client", r);
332 return r;
333 }
334 return 0;
335 }
336
337 int unregister_journal(rados_ioctx_t ioctx, const char *image_name) {
338 librados::IoCtx io_ctx;
339 librados::IoCtx::from_rados_ioctx_t(ioctx, io_ctx);
340
341 std::string image_id;
342 int r = get_image_id(io_ctx, image_name, &image_id);
343 if (r < 0) {
344 return r;
345 }
346
347 journal::Journaler journaler(io_ctx, image_id, JOURNAL_CLIENT_ID, {},
348 nullptr);
349 r = journaler.unregister_client();
350 if (r < 0) {
351 simple_err("failed to unregister journal client", r);
352 return r;
353 }
354 return 0;
355 }
356
357 int create_replay_image(rados_ioctx_t ioctx, int order,
358 uint64_t stripe_unit, int stripe_count,
359 const char *replay_image_name,
360 const char *last_replay_image_name) {
361 librados::IoCtx io_ctx;
362 librados::IoCtx::from_rados_ioctx_t(ioctx, io_ctx);
363
364 uint64_t features;
365 int r = get_features(&features);
366 if (r < 0) {
367 return r;
368 }
369
370 librbd::RBD rbd;
371 if (last_replay_image_name == nullptr) {
372 r = rbd.create2(io_ctx, replay_image_name, 0, features, &order);
373 } else {
374 r = rbd.clone2(io_ctx, last_replay_image_name, "snap",
375 io_ctx, replay_image_name, features, &order,
376 stripe_unit, stripe_count);
377 }
378
379 if (r < 0) {
380 simple_err("failed to create replay image", r);
381 return r;
382 }
383
384 return 0;
385 }
386
387 int replay_journal(rados_ioctx_t ioctx, const char *image_name,
388 const char *replay_image_name) {
389 librados::IoCtx io_ctx;
390 librados::IoCtx::from_rados_ioctx_t(ioctx, io_ctx);
391
392 std::string image_id;
393 int r = get_image_id(io_ctx, image_name, &image_id);
394 if (r < 0) {
395 return r;
396 }
397
398 std::string replay_image_id;
399 r = get_image_id(io_ctx, replay_image_name, &replay_image_id);
400 if (r < 0) {
401 return r;
402 }
403
404 journal::Journaler journaler(io_ctx, image_id, JOURNAL_CLIENT_ID, {},
405 nullptr);
406 C_SaferCond init_ctx;
407 journaler.init(&init_ctx);
408 BOOST_SCOPE_EXIT_ALL( (&journaler) ) {
409 journaler.shut_down();
410 };
411
412 r = init_ctx.wait();
413 if (r < 0) {
414 simple_err("failed to initialize journal", r);
415 return r;
416 }
417
418 journal::Journaler replay_journaler(io_ctx, replay_image_id, "", {},
419 nullptr);
420
421 C_SaferCond replay_init_ctx;
422 replay_journaler.init(&replay_init_ctx);
423 BOOST_SCOPE_EXIT_ALL( (&replay_journaler) ) {
424 replay_journaler.shut_down();
425 };
426
427 r = replay_init_ctx.wait();
428 if (r < 0) {
429 simple_err("failed to initialize replay journal", r);
430 return r;
431 }
432
433 replay_journaler.start_append(0);
434
435 C_SaferCond replay_ctx;
436 ReplayHandler replay_handler(&journaler, &replay_journaler,
437 &replay_ctx);
438
439 // copy journal events from source image to replay image
440 journaler.start_replay(&replay_handler);
441 r = replay_ctx.wait();
442
443 journaler.stop_replay();
444
445 C_SaferCond stop_ctx;
446 replay_journaler.stop_append(&stop_ctx);
447 int stop_r = stop_ctx.wait();
448 if (r == 0 && stop_r < 0) {
449 r = stop_r;
450 }
451
452 if (r < 0) {
453 simple_err("failed to replay journal", r);
454 return r;
455 }
456
457 librbd::RBD rbd;
458 librbd::Image image;
459 r = rbd.open(io_ctx, image, replay_image_name);
460 if (r < 0) {
461 simple_err("failed to open replay image", r);
462 return r;
463 }
464
465 // perform an IO op to initiate the journal replay
466 bufferlist bl;
467 r = static_cast<ssize_t>(image.write(0, 0, bl));
468 if (r < 0) {
469 simple_err("failed to write to replay image", r);
470 return r;
471 }
472 return 0;
473 }
474
475 int finalize_journal(rados_ioctx_t ioctx, const char *imagename, int clones,
476 int order, uint64_t stripe_unit, int stripe_count) {
477 char replayimagename[1024];
478 replay_imagename(replayimagename, sizeof(replayimagename), clones);
479
480 char lastreplayimagename[1024];
481 if (clones > 0) {
482 replay_imagename(lastreplayimagename,
483 sizeof(lastreplayimagename), clones - 1);
484 }
485
486 int ret = create_replay_image(ioctx, order, stripe_unit,
487 stripe_count, replayimagename,
488 clones > 0 ? lastreplayimagename :
489 nullptr);
490 if (ret < 0) {
491 exit(EXIT_FAILURE);
492 }
493
494 ret = replay_journal(ioctx, imagename, replayimagename);
495 if (ret < 0) {
496 exit(EXIT_FAILURE);
497 }
498 return 0;
499 }
500
501 } // anonymous namespace
502
503 /*
504 * rbd
505 */
506
/*
 * Per-image state shared by the librbd, krbd and nbd back ends. The krbd_*
 * fields stay NULL / -1 when only librbd is in use.
 */
struct rbd_ctx {
	const char *name; /* image name */
	rbd_image_t image; /* image handle */
	const char *krbd_name; /* image /dev/rbd<id> name */ /* reused for nbd test */
	int krbd_fd; /* image /dev/rbd<id> fd */ /* reused for nbd test */
};

/* Initializer for a closed context: no name, no handle, no device, fd -1. */
#define RBD_CTX_INIT (struct rbd_ctx) { NULL, NULL, NULL, -1}
515
/*
 * Table of back-end entry points. The back-end tables in this file are
 * initialized positionally, so member order here must match them.
 */
struct rbd_operations {
	int (*open)(const char *name, struct rbd_ctx *ctx);
	int (*close)(struct rbd_ctx *ctx);
	ssize_t (*read)(struct rbd_ctx *ctx, uint64_t off, size_t len, char *buf);
	ssize_t (*write)(struct rbd_ctx *ctx, uint64_t off, size_t len, const char *buf);
	int (*flush)(struct rbd_ctx *ctx);
	int (*discard)(struct rbd_ctx *ctx, uint64_t off, uint64_t len);
	int (*get_size)(struct rbd_ctx *ctx, uint64_t *size);
	int (*resize)(struct rbd_ctx *ctx, uint64_t size);
	int (*clone)(struct rbd_ctx *ctx, const char *src_snapname,
		     const char *dst_imagename, int *order, int stripe_unit,
		     int stripe_count);
	int (*flatten)(struct rbd_ctx *ctx);
	/* The two members below are NULL in the krbd and nbd tables. */
	ssize_t (*writesame)(struct rbd_ctx *ctx, uint64_t off, size_t len,
			     const char *buf, size_t data_len);
	ssize_t (*compare_and_write)(struct rbd_ctx *ctx, uint64_t off, size_t len,
				     const char *cmp_buf, const char *buf);
};
534
/* Cluster, pool and image handles shared by the back-end handlers. */
char *pool; /* name of the pool our test image is in */
char *iname; /* name of our test image */
rados_t cluster; /* handle for our test cluster */
rados_ioctx_t ioctx; /* handle for our test pool */
#if defined(WITH_KRBD)
struct krbd_ctx *krbd; /* handle for libkrbd */
#endif
bool skip_partial_discard; /* rbd_skip_partial_discard config value*/
543
544 int get_features(uint64_t* features) {
545 char buf[1024];
546 int r = rados_conf_get(cluster, "rbd_default_features", buf,
547 sizeof(buf));
548 if (r < 0) {
549 simple_err("Could not get rbd_default_features value", r);
550 return r;
551 }
552
553 *features = strtol(buf, NULL, 0);
554
555 if (clone_calls) {
556 *features |= RBD_FEATURE_LAYERING;
557 }
558 if (journal_replay) {
559 *features |= (RBD_FEATURE_EXCLUSIVE_LOCK |
560 RBD_FEATURE_JOURNALING);
561 }
562 return 0;
563 }
564
565 /*
566 * librbd/krbd rbd_operations handlers. Given the rest of fsx.c, no
567 * attempt to do error handling is made in these handlers.
568 */
569
570 int
571 __librbd_open(const char *name, struct rbd_ctx *ctx)
572 {
573 rbd_image_t image;
574 int ret;
575
576 ceph_assert(!ctx->name && !ctx->image &&
577 !ctx->krbd_name && ctx->krbd_fd < 0);
578
579 ret = rbd_open(ioctx, name, &image, NULL);
580 if (ret < 0) {
581 prt("rbd_open(%s) failed\n", name);
582 return ret;
583 }
584
585 ctx->name = strdup(name);
586 ctx->image = image;
587 ctx->krbd_name = NULL;
588 ctx->krbd_fd = -1;
589
590 return 0;
591 }
592
593 int
594 librbd_open(const char *name, struct rbd_ctx *ctx)
595 {
596 return __librbd_open(name, ctx);
597 }
598
599 int
600 __librbd_close(struct rbd_ctx *ctx)
601 {
602 int ret;
603
604 ceph_assert(ctx->name && ctx->image);
605
606 ret = rbd_close(ctx->image);
607 if (ret < 0) {
608 prt("rbd_close(%s) failed\n", ctx->name);
609 return ret;
610 }
611
612 free((void *)ctx->name);
613
614 ctx->name = NULL;
615 ctx->image = NULL;
616
617 return 0;
618 }
619
620 int
621 librbd_close(struct rbd_ctx *ctx)
622 {
623 return __librbd_close(ctx);
624 }
625
626 int
627 librbd_verify_object_map(struct rbd_ctx *ctx)
628 {
629 int n;
630 uint64_t flags;
631 n = rbd_get_flags(ctx->image, &flags);
632 if (n < 0) {
633 prt("rbd_get_flags() failed\n");
634 return n;
635 }
636
637 if ((flags & RBD_FLAG_OBJECT_MAP_INVALID) != 0) {
638 prt("rbd_get_flags() indicates object map is invalid\n");
639 return -EINVAL;
640 }
641 return 0;
642 }
643
644 ssize_t
645 librbd_read(struct rbd_ctx *ctx, uint64_t off, size_t len, char *buf)
646 {
647 ssize_t n;
648
649 n = rbd_read(ctx->image, off, len, buf);
650 if (n < 0)
651 prt("rbd_read(%llu, %zu) failed\n", off, len);
652
653 return n;
654 }
655
656 ssize_t
657 librbd_write(struct rbd_ctx *ctx, uint64_t off, size_t len, const char *buf)
658 {
659 ssize_t n;
660 int ret;
661
662 n = rbd_write(ctx->image, off, len, buf);
663 if (n < 0) {
664 prt("rbd_write(%llu, %zu) failed\n", off, len);
665 return n;
666 }
667
668 ret = librbd_verify_object_map(ctx);
669 if (ret < 0) {
670 return ret;
671 }
672 return n;
673 }
674
675 int
676 librbd_flush(struct rbd_ctx *ctx)
677 {
678 int ret;
679
680 ret = rbd_flush(ctx->image);
681 if (ret < 0) {
682 prt("rbd_flush failed\n");
683 return ret;
684 }
685
686 return librbd_verify_object_map(ctx);
687 }
688
689 int
690 librbd_discard(struct rbd_ctx *ctx, uint64_t off, uint64_t len)
691 {
692 int ret;
693
694 ret = rbd_discard(ctx->image, off, len);
695 if (ret < 0) {
696 prt("rbd_discard(%llu, %llu) failed\n", off, len);
697 return ret;
698 }
699
700 return librbd_verify_object_map(ctx);
701 }
702
703 ssize_t
704 librbd_writesame(struct rbd_ctx *ctx, uint64_t off, size_t len,
705 const char *buf, size_t data_len)
706 {
707 ssize_t n;
708 int ret;
709
710 n = rbd_writesame(ctx->image, off, len, buf, data_len, 0);
711 if (n < 0) {
712 prt("rbd_writesame(%llu, %zu) failed\n", off, len);
713 return n;
714 }
715
716 ret = librbd_verify_object_map(ctx);
717 if (ret < 0) {
718 return ret;
719 }
720 return n;
721 }
722
723 ssize_t
724 librbd_compare_and_write(struct rbd_ctx *ctx, uint64_t off, size_t len,
725 const char *cmp_buf, const char *buf)
726 {
727 ssize_t n;
728 int ret;
729 uint64_t mismatch_off = 0;
730
731 n = rbd_compare_and_write(ctx->image, off, len, cmp_buf, buf, &mismatch_off, 0);
732 if (n == -EINVAL) {
733 return n;
734 } else if (n < 0) {
735 prt("rbd_compare_and_write mismatch(%llu, %zu, %llu) failed\n",
736 off, len, mismatch_off);
737 return n;
738 }
739
740 ret = librbd_verify_object_map(ctx);
741 if (ret < 0) {
742 return ret;
743 }
744 return n;
745
746 }
747
748 int
749 librbd_get_size(struct rbd_ctx *ctx, uint64_t *size)
750 {
751 int ret;
752
753 ret = rbd_get_size(ctx->image, size);
754 if (ret < 0) {
755 prt("rbd_get_size failed\n");
756 return ret;
757 }
758
759 return 0;
760 }
761
762 int
763 __librbd_resize(struct rbd_ctx *ctx, uint64_t size)
764 {
765 int ret;
766
767 ret = rbd_resize(ctx->image, size);
768 if (ret < 0) {
769 prt("rbd_resize(%llu) failed\n", size);
770 return ret;
771 }
772
773 return librbd_verify_object_map(ctx);
774 }
775
776 int
777 librbd_resize(struct rbd_ctx *ctx, uint64_t size)
778 {
779 return __librbd_resize(ctx, size);
780 }
781
782 int
783 __librbd_deep_copy(struct rbd_ctx *ctx, const char *src_snapname,
784 const char *dst_imagename, uint64_t features, int *order,
785 int stripe_unit, int stripe_count) {
786 int ret;
787
788 rbd_image_options_t opts;
789 rbd_image_options_create(&opts);
790 BOOST_SCOPE_EXIT_ALL( (&opts) ) {
791 rbd_image_options_destroy(opts);
792 };
793 ret = rbd_image_options_set_uint64(opts, RBD_IMAGE_OPTION_FEATURES,
794 features);
795 ceph_assert(ret == 0);
796 ret = rbd_image_options_set_uint64(opts, RBD_IMAGE_OPTION_ORDER,
797 *order);
798 ceph_assert(ret == 0);
799 ret = rbd_image_options_set_uint64(opts, RBD_IMAGE_OPTION_STRIPE_UNIT,
800 stripe_unit);
801 ceph_assert(ret == 0);
802 ret = rbd_image_options_set_uint64(opts, RBD_IMAGE_OPTION_STRIPE_COUNT,
803 stripe_count);
804 ceph_assert(ret == 0);
805
806 ret = rbd_snap_set(ctx->image, src_snapname);
807 if (ret < 0) {
808 prt("rbd_snap_set(%s@%s) failed\n", ctx->name, src_snapname);
809 return ret;
810 }
811
812 ret = rbd_deep_copy(ctx->image, ioctx, dst_imagename, opts);
813 if (ret < 0) {
814 prt("rbd_deep_copy(%s@%s -> %s) failed\n",
815 ctx->name, src_snapname, dst_imagename);
816 return ret;
817 }
818
819 ret = rbd_snap_set(ctx->image, "");
820 if (ret < 0) {
821 prt("rbd_snap_set(%s@) failed\n", ctx->name);
822 return ret;
823 }
824
825 rbd_image_t image;
826 ret = rbd_open(ioctx, dst_imagename, &image, nullptr);
827 if (ret < 0) {
828 prt("rbd_open(%s) failed\n", dst_imagename);
829 return ret;
830 }
831
832 ret = rbd_snap_unprotect(image, src_snapname);
833 if (ret < 0) {
834 prt("rbd_snap_unprotect(%s@%s) failed\n", dst_imagename,
835 src_snapname);
836 return ret;
837 }
838
839 ret = rbd_snap_remove(image, src_snapname);
840 if (ret < 0) {
841 prt("rbd_snap_remove(%s@%s) failed\n", dst_imagename,
842 src_snapname);
843 return ret;
844 }
845
846 ret = rbd_close(image);
847 if (ret < 0) {
848 prt("rbd_close(%s) failed\n", dst_imagename);
849 return ret;
850 }
851
852 return 0;
853 }
854
855 int
856 __librbd_clone(struct rbd_ctx *ctx, const char *src_snapname,
857 const char *dst_imagename, int *order, int stripe_unit,
858 int stripe_count)
859 {
860 int ret;
861
862 ret = rbd_snap_create(ctx->image, src_snapname);
863 if (ret < 0) {
864 prt("rbd_snap_create(%s@%s) failed\n", ctx->name,
865 src_snapname);
866 return ret;
867 }
868
869 ret = rbd_snap_protect(ctx->image, src_snapname);
870 if (ret < 0) {
871 prt("rbd_snap_protect(%s@%s) failed\n", ctx->name,
872 src_snapname);
873 return ret;
874 }
875
876 uint64_t features;
877 ret = get_features(&features);
878 if (ret < 0) {
879 return ret;
880 }
881
882 if (deep_copy) {
883 ret = __librbd_deep_copy(ctx, src_snapname, dst_imagename, features,
884 order, stripe_unit, stripe_count);
885 if (ret < 0) {
886 prt("deep_copy(%s@%s -> %s) failed\n", ctx->name,
887 src_snapname, dst_imagename);
888 return ret;
889 }
890 } else {
891 ret = rbd_clone2(ioctx, ctx->name, src_snapname, ioctx,
892 dst_imagename, features, order,
893 stripe_unit, stripe_count);
894 if (ret < 0) {
895 prt("rbd_clone2(%s@%s -> %s) failed\n", ctx->name,
896 src_snapname, dst_imagename);
897 return ret;
898 }
899 }
900
901 return 0;
902 }
903
904 int
905 librbd_clone(struct rbd_ctx *ctx, const char *src_snapname,
906 const char *dst_imagename, int *order, int stripe_unit,
907 int stripe_count)
908 {
909 return __librbd_clone(ctx, src_snapname, dst_imagename, order,
910 stripe_unit, stripe_count);
911 }
912
913 int
914 __librbd_flatten(struct rbd_ctx *ctx)
915 {
916 int ret;
917
918 ret = rbd_flatten(ctx->image);
919 if (ret < 0) {
920 prt("rbd_flatten failed\n");
921 return ret;
922 }
923
924 return librbd_verify_object_map(ctx);
925 }
926
927 int
928 librbd_flatten(struct rbd_ctx *ctx)
929 {
930 return __librbd_flatten(ctx);
931 }
932
/*
 * Operation table for the librbd back end. Entries are positional and
 * follow the member order of struct rbd_operations; every slot is filled.
 */
const struct rbd_operations librbd_operations = {
	librbd_open,			/* open */
	librbd_close,			/* close */
	librbd_read,			/* read */
	librbd_write,			/* write */
	librbd_flush,			/* flush */
	librbd_discard,			/* discard */
	librbd_get_size,		/* get_size */
	librbd_resize,			/* resize */
	librbd_clone,			/* clone */
	librbd_flatten,			/* flatten */
	librbd_writesame,		/* writesame */
	librbd_compare_and_write,	/* compare_and_write */
};
947
948 #if defined(WITH_KRBD)
949 int
950 krbd_open(const char *name, struct rbd_ctx *ctx)
951 {
952 char *devnode;
953 int fd;
954 int ret;
955
956 ret = __librbd_open(name, ctx);
957 if (ret < 0)
958 return ret;
959
960 ret = krbd_map(krbd, pool, "", name, "", "", &devnode);
961 if (ret < 0) {
962 prt("krbd_map(%s) failed\n", name);
963 return ret;
964 }
965
966 fd = open(devnode, O_RDWR | o_direct);
967 if (fd < 0) {
968 ret = -errno;
969 prt("open(%s) failed\n", devnode);
970 return ret;
971 }
972
973 ctx->krbd_name = devnode;
974 ctx->krbd_fd = fd;
975
976 return 0;
977 }
978
979 int
980 krbd_close(struct rbd_ctx *ctx)
981 {
982 int ret;
983
984 ceph_assert(ctx->krbd_name && ctx->krbd_fd >= 0);
985
986 if (close(ctx->krbd_fd) < 0) {
987 ret = -errno;
988 prt("close(%s) failed\n", ctx->krbd_name);
989 return ret;
990 }
991
992 ret = krbd_unmap(krbd, ctx->krbd_name, "");
993 if (ret < 0) {
994 prt("krbd_unmap(%s) failed\n", ctx->krbd_name);
995 return ret;
996 }
997
998 free((void *)ctx->krbd_name);
999
1000 ctx->krbd_name = NULL;
1001 ctx->krbd_fd = -1;
1002
1003 return __librbd_close(ctx);
1004 }
1005 #endif // WITH_KRBD
1006
1007 #if defined(__linux__)
1008 ssize_t
1009 krbd_read(struct rbd_ctx *ctx, uint64_t off, size_t len, char *buf)
1010 {
1011 ssize_t n;
1012
1013 n = pread(ctx->krbd_fd, buf, len, off);
1014 if (n < 0) {
1015 n = -errno;
1016 prt("pread(%llu, %zu) failed\n", off, len);
1017 return n;
1018 }
1019
1020 return n;
1021 }
1022
1023 ssize_t
1024 krbd_write(struct rbd_ctx *ctx, uint64_t off, size_t len, const char *buf)
1025 {
1026 ssize_t n;
1027
1028 n = pwrite(ctx->krbd_fd, buf, len, off);
1029 if (n < 0) {
1030 n = -errno;
1031 prt("pwrite(%llu, %zu) failed\n", off, len);
1032 return n;
1033 }
1034
1035 return n;
1036 }
1037
1038 int
1039 __krbd_flush(struct rbd_ctx *ctx, bool invalidate)
1040 {
1041 int ret;
1042
1043 if (o_direct)
1044 return 0;
1045
1046 /*
1047 * BLKFLSBUF will sync the filesystem on top of the device (we
1048 * don't care about that here, since we write directly to it),
1049 * write out any dirty buffers and invalidate the buffer cache.
1050 * It won't do a hardware cache flush.
1051 *
1052 * fsync() will write out any dirty buffers and do a hardware
1053 * cache flush (which we don't care about either, because for
1054 * krbd it's a noop). It won't try to empty the buffer cache
1055 * nor poke the filesystem before writing out.
1056 *
1057 * Given that, for our purposes, fsync is a flush, while
1058 * BLKFLSBUF is a flush+invalidate.
1059 */
1060 if (invalidate)
1061 ret = ioctl(ctx->krbd_fd, BLKFLSBUF, NULL);
1062 else
1063 ret = fsync(ctx->krbd_fd);
1064 if (ret < 0) {
1065 ret = -errno;
1066 prt("%s failed\n", invalidate ? "BLKFLSBUF" : "fsync");
1067 return ret;
1068 }
1069
1070 return 0;
1071 }
1072
1073 int
1074 krbd_flush(struct rbd_ctx *ctx)
1075 {
1076 return __krbd_flush(ctx, false);
1077 }
1078
1079 int
1080 krbd_discard(struct rbd_ctx *ctx, uint64_t off, uint64_t len)
1081 {
1082 uint64_t range[2] = { off, len };
1083 int ret;
1084
1085 /*
1086 * BLKZEROOUT goes straight to disk and doesn't do anything
1087 * about dirty buffers. This means we need to flush so that
1088 *
1089 * write 0..3M
1090 * discard 1..2M
1091 *
1092 * results in "data 0000 data" rather than "data data data" on
1093 * disk and invalidate so that
1094 *
1095 * discard 1..2M
1096 * read 0..3M
1097 *
1098 * returns "data 0000 data" rather than "data data data" in
1099 * case 1..2M was cached.
1100 *
1101 * Note: These cache coherency issues are supposed to be fixed
1102 * in recent kernels.
1103 */
1104 ret = __krbd_flush(ctx, true);
1105 if (ret < 0)
1106 return ret;
1107
1108 /*
1109 * off and len must be 512-byte aligned, otherwise BLKZEROOUT
1110 * will fail with -EINVAL. This means that -K (enable krbd
1111 * mode) requires -h 512 or similar.
1112 */
1113 if (ioctl(ctx->krbd_fd, BLKZEROOUT, &range) < 0) {
1114 ret = -errno;
1115 prt("BLKZEROOUT(%llu, %llu) failed\n", off, len);
1116 return ret;
1117 }
1118
1119 return 0;
1120 }
1121
1122 int
1123 krbd_get_size(struct rbd_ctx *ctx, uint64_t *size)
1124 {
1125 uint64_t bytes;
1126
1127 if (ioctl(ctx->krbd_fd, BLKGETSIZE64, &bytes) < 0) {
1128 int ret = -errno;
1129 prt("BLKGETSIZE64 failed\n");
1130 return ret;
1131 }
1132
1133 *size = bytes;
1134
1135 return 0;
1136 }
1137
1138 int
1139 krbd_resize(struct rbd_ctx *ctx, uint64_t size)
1140 {
1141 int ret;
1142
1143 ceph_assert(size % truncbdy == 0);
1144
1145 /*
1146 * When krbd detects a size change, it calls revalidate_disk(),
1147 * which ends up calling invalidate_bdev(), which invalidates
1148 * clean pages and does nothing about dirty pages beyond the
1149 * new size. The preceding cache flush makes sure those pages
1150 * are invalidated, which is what we need on shrink so that
1151 *
1152 * write 0..1M
1153 * resize 0
1154 * resize 2M
1155 * read 0..2M
1156 *
1157 * returns "0000 0000" rather than "data 0000".
1158 */
1159 ret = __krbd_flush(ctx, false);
1160 if (ret < 0)
1161 return ret;
1162
1163 return __librbd_resize(ctx, size);
1164 }
1165
1166 int
1167 krbd_clone(struct rbd_ctx *ctx, const char *src_snapname,
1168 const char *dst_imagename, int *order, int stripe_unit,
1169 int stripe_count)
1170 {
1171 int ret;
1172
1173 ret = __krbd_flush(ctx, false);
1174 if (ret < 0)
1175 return ret;
1176
1177 return __librbd_clone(ctx, src_snapname, dst_imagename, order,
1178 stripe_unit, stripe_count);
1179 }
1180
1181 int
1182 krbd_flatten(struct rbd_ctx *ctx)
1183 {
1184 int ret;
1185
1186 ret = __krbd_flush(ctx, false);
1187 if (ret < 0)
1188 return ret;
1189
1190 return __librbd_flatten(ctx);
1191 }
1192 #endif // __linux__
1193
1194 #if defined(WITH_KRBD)
/*
 * Operation table for the krbd back end. Entries are positional and follow
 * the member order of struct rbd_operations. writesame is explicitly NULL;
 * compare_and_write has no initializer and is therefore null as well.
 */
const struct rbd_operations krbd_operations = {
	krbd_open,	/* open */
	krbd_close,	/* close */
	krbd_read,	/* read */
	krbd_write,	/* write */
	krbd_flush,	/* flush */
	krbd_discard,	/* discard */
	krbd_get_size,	/* get_size */
	krbd_resize,	/* resize */
	krbd_clone,	/* clone */
	krbd_flatten,	/* flatten */
	NULL,		/* writesame */
};
1208 #endif // WITH_KRBD
1209
1210 #if defined(__linux__)
1211 int
1212 nbd_open(const char *name, struct rbd_ctx *ctx)
1213 {
1214 int r;
1215 int fd;
1216 char dev[4096];
1217 char *devnode;
1218
1219 SubProcess process("rbd-nbd", SubProcess::KEEP, SubProcess::PIPE,
1220 SubProcess::KEEP);
1221 process.add_cmd_arg("map");
1222 process.add_cmd_arg("--timeout=600");
1223 std::string img;
1224 img.append(pool);
1225 img.append("/");
1226 img.append(name);
1227 process.add_cmd_arg(img.c_str());
1228
1229 r = __librbd_open(name, ctx);
1230 if (r < 0)
1231 return r;
1232
1233 r = process.spawn();
1234 if (r < 0) {
1235 prt("nbd_open failed to run rbd-nbd error: %s\n", process.err().c_str());
1236 return r;
1237 }
1238 r = safe_read(process.get_stdout(), dev, sizeof(dev));
1239 if (r < 0) {
1240 prt("nbd_open failed to get nbd device path\n");
1241 return r;
1242 }
1243 for (int i = 0; i < r; ++i)
1244 if (dev[i] == 10 || dev[i] == 13)
1245 dev[i] = 0;
1246 dev[r] = 0;
1247 r = process.join();
1248 if (r) {
1249 prt("rbd-nbd failed with error: %s", process.err().c_str());
1250 return -EINVAL;
1251 }
1252
1253 devnode = strdup(dev);
1254 if (!devnode)
1255 return -ENOMEM;
1256
1257 fd = open(devnode, O_RDWR | o_direct);
1258 if (fd < 0) {
1259 r = -errno;
1260 prt("open(%s) failed\n", devnode);
1261 return r;
1262 }
1263
1264 ctx->krbd_name = devnode;
1265 ctx->krbd_fd = fd;
1266
1267 return 0;
1268 }
1269
1270 int
1271 nbd_close(struct rbd_ctx *ctx)
1272 {
1273 int r;
1274
1275 ceph_assert(ctx->krbd_name && ctx->krbd_fd >= 0);
1276
1277 if (close(ctx->krbd_fd) < 0) {
1278 r = -errno;
1279 prt("close(%s) failed\n", ctx->krbd_name);
1280 return r;
1281 }
1282
1283 SubProcess process("rbd-nbd");
1284 process.add_cmd_arg("unmap");
1285 process.add_cmd_arg(ctx->krbd_name);
1286
1287 r = process.spawn();
1288 if (r < 0) {
1289 prt("nbd_close failed to run rbd-nbd error: %s\n", process.err().c_str());
1290 return r;
1291 }
1292 r = process.join();
1293 if (r) {
1294 prt("rbd-nbd failed with error: %d", process.err().c_str());
1295 return -EINVAL;
1296 }
1297
1298 free((void *)ctx->krbd_name);
1299
1300 ctx->krbd_name = NULL;
1301 ctx->krbd_fd = -1;
1302
1303 return __librbd_close(ctx);
1304 }
1305
1306 int
1307 nbd_clone(struct rbd_ctx *ctx, const char *src_snapname,
1308 const char *dst_imagename, int *order, int stripe_unit,
1309 int stripe_count)
1310 {
1311 int ret;
1312
1313 ret = __krbd_flush(ctx, false);
1314 if (ret < 0)
1315 return ret;
1316
1317 return __librbd_clone(ctx, src_snapname, dst_imagename, order,
1318 stripe_unit, stripe_count);
1319 }
1320
/*
 * Operations table for the rbd-nbd backend.  Only open, close and clone are
 * nbd specific; every other populated entry reuses a krbd_* handler.
 * NOTE(review): the meaning of each position follows the member order of
 * struct rbd_operations, which is declared outside this section -- confirm
 * there which slot the trailing NULL leaves without a handler.
 */
const struct rbd_operations nbd_operations = {
	nbd_open,
	nbd_close,
	krbd_read,
	krbd_write,
	krbd_flush,
	krbd_discard,
	krbd_get_size,
	krbd_resize,
	nbd_clone,
	krbd_flatten,
	NULL,		/* no handler provided for this slot */
};
1334 #endif // __linux__
1335
1336 #if defined(__FreeBSD__)
1337 int
1338 ggate_open(const char *name, struct rbd_ctx *ctx)
1339 {
1340 int r;
1341 int fd;
1342 char dev[4096];
1343 char *devnode;
1344
1345 SubProcess process("rbd-ggate", SubProcess::KEEP, SubProcess::PIPE,
1346 SubProcess::KEEP);
1347 process.add_cmd_arg("map");
1348 std::string img;
1349 img.append(pool);
1350 img.append("/");
1351 img.append(name);
1352 process.add_cmd_arg(img.c_str());
1353
1354 r = __librbd_open(name, ctx);
1355 if (r < 0) {
1356 return r;
1357 }
1358
1359 r = process.spawn();
1360 if (r < 0) {
1361 prt("ggate_open failed to run rbd-ggate: %s\n",
1362 process.err().c_str());
1363 return r;
1364 }
1365 r = safe_read(process.get_stdout(), dev, sizeof(dev));
1366 if (r < 0) {
1367 prt("ggate_open failed to get ggate device path\n");
1368 return r;
1369 }
1370 for (int i = 0; i < r; ++i) {
1371 if (dev[i] == '\r' || dev[i] == '\n') {
1372 dev[i] = 0;
1373 }
1374 }
1375 dev[r] = 0;
1376 r = process.join();
1377 if (r) {
1378 prt("rbd-ggate failed with error: %s", process.err().c_str());
1379 return -EINVAL;
1380 }
1381
1382 devnode = strdup(dev);
1383 if (!devnode) {
1384 return -ENOMEM;
1385 }
1386
1387 for (int i = 0; i < 100; i++) {
1388 fd = open(devnode, O_RDWR | o_direct);
1389 if (fd >= 0 || errno != ENOENT) {
1390 break;
1391 }
1392 usleep(100000);
1393 }
1394 if (fd < 0) {
1395 r = -errno;
1396 prt("open(%s) failed\n", devnode);
1397 return r;
1398 }
1399
1400 ctx->krbd_name = devnode;
1401 ctx->krbd_fd = fd;
1402
1403 return 0;
1404 }
1405
1406 int
1407 ggate_close(struct rbd_ctx *ctx)
1408 {
1409 int r;
1410
1411 ceph_assert(ctx->krbd_name && ctx->krbd_fd >= 0);
1412
1413 if (close(ctx->krbd_fd) < 0) {
1414 r = -errno;
1415 prt("close(%s) failed\n", ctx->krbd_name);
1416 return r;
1417 }
1418
1419 SubProcess process("rbd-ggate");
1420 process.add_cmd_arg("unmap");
1421 process.add_cmd_arg(ctx->krbd_name);
1422
1423 r = process.spawn();
1424 if (r < 0) {
1425 prt("ggate_close failed to run rbd-nbd: %s\n",
1426 process.err().c_str());
1427 return r;
1428 }
1429 r = process.join();
1430 if (r) {
1431 prt("rbd-ggate failed with error: %d", process.err().c_str());
1432 return -EINVAL;
1433 }
1434
1435 free((void *)ctx->krbd_name);
1436
1437 ctx->krbd_name = NULL;
1438 ctx->krbd_fd = -1;
1439
1440 return __librbd_close(ctx);
1441 }
1442
1443 ssize_t
1444 ggate_read(struct rbd_ctx *ctx, uint64_t off, size_t len, char *buf)
1445 {
1446 ssize_t n;
1447
1448 n = pread(ctx->krbd_fd, buf, len, off);
1449 if (n < 0) {
1450 n = -errno;
1451 prt("pread(%llu, %zu) failed\n", off, len);
1452 return n;
1453 }
1454
1455 return n;
1456 }
1457
1458 ssize_t
1459 ggate_write(struct rbd_ctx *ctx, uint64_t off, size_t len, const char *buf)
1460 {
1461 ssize_t n;
1462
1463 n = pwrite(ctx->krbd_fd, buf, len, off);
1464 if (n < 0) {
1465 n = -errno;
1466 prt("pwrite(%llu, %zu) failed\n", off, len);
1467 return n;
1468 }
1469
1470 return n;
1471 }
1472
1473 int
1474 __ggate_flush(struct rbd_ctx *ctx, bool invalidate)
1475 {
1476 int ret;
1477
1478 if (o_direct) {
1479 return 0;
1480 }
1481
1482 if (invalidate) {
1483 ret = ioctl(ctx->krbd_fd, DIOCGFLUSH, NULL);
1484 } else {
1485 ret = fsync(ctx->krbd_fd);
1486 }
1487 if (ret < 0) {
1488 ret = -errno;
1489 prt("%s failed\n", invalidate ? "DIOCGFLUSH" : "fsync");
1490 return ret;
1491 }
1492
1493 return 0;
1494 }
1495
1496 int
1497 ggate_flush(struct rbd_ctx *ctx)
1498 {
1499 return __ggate_flush(ctx, false);
1500 }
1501
1502 int
1503 ggate_discard(struct rbd_ctx *ctx, uint64_t off, uint64_t len)
1504 {
1505 off_t range[2] = {static_cast<off_t>(off), static_cast<off_t>(len)};
1506 int ret;
1507
1508 ret = __ggate_flush(ctx, true);
1509 if (ret < 0) {
1510 return ret;
1511 }
1512
1513 if (ioctl(ctx->krbd_fd, DIOCGDELETE, &range) < 0) {
1514 ret = -errno;
1515 prt("DIOCGDELETE(%llu, %llu) failed\n", off, len);
1516 return ret;
1517 }
1518
1519 return 0;
1520 }
1521
1522 int
1523 ggate_get_size(struct rbd_ctx *ctx, uint64_t *size)
1524 {
1525 off_t bytes;
1526
1527 if (ioctl(ctx->krbd_fd, DIOCGMEDIASIZE, &bytes) < 0) {
1528 int ret = -errno;
1529 prt("DIOCGMEDIASIZE failed\n");
1530 return ret;
1531 }
1532
1533 *size = bytes;
1534
1535 return 0;
1536 }
1537
1538 int
1539 ggate_resize(struct rbd_ctx *ctx, uint64_t size)
1540 {
1541 int ret;
1542
1543 ceph_assert(size % truncbdy == 0);
1544
1545 ret = __ggate_flush(ctx, false);
1546 if (ret < 0) {
1547 return ret;
1548 }
1549
1550 return __librbd_resize(ctx, size);
1551 }
1552
1553 int
1554 ggate_clone(struct rbd_ctx *ctx, const char *src_snapname,
1555 const char *dst_imagename, int *order, int stripe_unit,
1556 int stripe_count)
1557 {
1558 int ret;
1559
1560 ret = __ggate_flush(ctx, false);
1561 if (ret < 0) {
1562 return ret;
1563 }
1564
1565 return __librbd_clone(ctx, src_snapname, dst_imagename, order,
1566 stripe_unit, stripe_count);
1567 }
1568
1569 int
1570 ggate_flatten(struct rbd_ctx *ctx)
1571 {
1572 int ret;
1573
1574 ret = __ggate_flush(ctx, false);
1575 if (ret < 0) {
1576 return ret;
1577 }
1578
1579 return __librbd_flatten(ctx);
1580 }
1581
/*
 * Operations table for the rbd-ggate (FreeBSD) backend; every populated
 * entry is a ggate_* handler defined above.
 * NOTE(review): the meaning of each position follows the member order of
 * struct rbd_operations, which is declared outside this section -- confirm
 * there which slot the trailing NULL leaves without a handler.
 */
const struct rbd_operations ggate_operations = {
	ggate_open,
	ggate_close,
	ggate_read,
	ggate_write,
	ggate_flush,
	ggate_discard,
	ggate_get_size,
	ggate_resize,
	ggate_clone,
	ggate_flatten,
	NULL,		/* no handler provided for this slot */
};
1595 #endif // __FreeBSD__
1596
/* Context of the image currently under test; the do*() routines pass it to ops. */
struct rbd_ctx ctx = RBD_CTX_INIT;
/*
 * Backend operations table in use; initialised to the librbd backend.
 * NOTE(review): presumably re-pointed when another backend is selected --
 * that code is outside this section.
 */
const struct rbd_operations *ops = &librbd_operations;
1599
1600 static bool rbd_image_has_parent(struct rbd_ctx *ctx)
1601 {
1602 int ret;
1603 rbd_linked_image_spec_t parent_image;
1604 rbd_snap_spec_t parent_snap;
1605
1606 ret = rbd_get_parent(ctx->image, &parent_image, &parent_snap);
1607 if (ret < 0 && ret != -ENOENT) {
1608 prterrcode("rbd_get_parent_info", ret);
1609 exit(1);
1610 }
1611 rbd_linked_image_spec_cleanup(&parent_image);
1612 rbd_snap_spec_cleanup(&parent_snap);
1613
1614 return !ret;
1615 }
1616
1617 /*
1618 * fsx
1619 */
1620
1621 void
1622 log4(int operation, int arg0, int arg1, int arg2)
1623 {
1624 struct log_entry *le;
1625
1626 le = &oplog[logptr];
1627 le->operation = operation;
1628 if (closeopen)
1629 le->operation = ~ le->operation;
1630 le->args[0] = arg0;
1631 le->args[1] = arg1;
1632 le->args[2] = arg2;
1633 logptr++;
1634 logcount++;
1635 if (logptr >= LOGSIZE)
1636 logptr = 0;
1637 }
1638
/*
 * Print the contents of the oplog ring buffer, oldest entry first.
 *
 * When fewer than LOGSIZE operations have been logged the dump covers
 * entries 0..logcount-1; otherwise it starts at logptr (the slot that would
 * be overwritten next) and covers all LOGSIZE slots.  Entries whose range
 * contains badoff are tagged with a marker.
 *
 * Side effects: entries stored bit-inverted by log4() are restored in place,
 * and the global closeopen is overwritten for every entry visited.
 */
void
logdump(void)
{
	int i, count, down;
	struct log_entry *lp;
	const char *falloc_type[3] = {"PAST_EOF", "EXTENDING", "INTERIOR"};

	prt("LOG DUMP (%d total operations):\n", logcount);
	if (logcount < LOGSIZE) {
		i = 0;
		count = logcount;
	} else {
		i = logptr;
		count = LOGSIZE;
	}
	for ( ; count > 0; count--) {
		int opnum;

		/* operation number derived from the slot index and the wrap count */
		opnum = i+1 + (logcount/LOGSIZE)*LOGSIZE;
		prt("%d(%3d mod 256): ", opnum, opnum%256);
		lp = &oplog[i];
		/* a negative code is a bit-inverted one (see log4); undo the inversion */
		if ((closeopen = lp->operation < 0))
			lp->operation = ~ lp->operation;

		switch (lp->operation) {
		case OP_MAPREAD:
			prt("MAPREAD 0x%x thru 0x%x\t(0x%x bytes)",
			    lp->args[0], lp->args[0] + lp->args[1] - 1,
			    lp->args[1]);
			if (badoff >= lp->args[0] && badoff <
						     lp->args[0] + lp->args[1])
				prt("\t***RRRR***");
			break;
		case OP_MAPWRITE:
			prt("MAPWRITE 0x%x thru 0x%x\t(0x%x bytes)",
			    lp->args[0], lp->args[0] + lp->args[1] - 1,
			    lp->args[1]);
			if (badoff >= lp->args[0] && badoff <
						     lp->args[0] + lp->args[1])
				prt("\t******WWWW");
			break;
		case OP_READ:
			/* args: 0 = offset, 1 = size */
			prt("READ 0x%x thru 0x%x\t(0x%x bytes)",
			    lp->args[0], lp->args[0] + lp->args[1] - 1,
			    lp->args[1]);
			if (badoff >= lp->args[0] &&
			    badoff < lp->args[0] + lp->args[1])
				prt("\t***RRRR***");
			break;
		case OP_WRITE:
			/* args: 0 = offset, 1 = size, 2 = file size before the write */
			prt("WRITE 0x%x thru 0x%x\t(0x%x bytes)",
			    lp->args[0], lp->args[0] + lp->args[1] - 1,
			    lp->args[1]);
			if (lp->args[0] > lp->args[2])
				prt(" HOLE");
			else if (lp->args[0] + lp->args[1] > lp->args[2])
				prt(" EXTEND");
			if ((badoff >= lp->args[0] || badoff >=lp->args[2]) &&
			    badoff < lp->args[0] + lp->args[1])
				prt("\t***WWWW");
			break;
		case OP_TRUNCATE:
			/* args: 0 = new size, 1 = old size */
			down = lp->args[0] < lp->args[1];
			prt("TRUNCATE %s\tfrom 0x%x to 0x%x",
			    down ? "DOWN" : "UP", lp->args[1], lp->args[0]);
			/* !down / !!down select the smaller and the larger size */
			if (badoff >= lp->args[!down] &&
			    badoff < lp->args[!!down])
				prt("\t******WWWW");
			break;
		case OP_FALLOCATE:
			/* 0: offset 1: length 2: where alloced */
			prt("FALLOC 0x%x thru 0x%x\t(0x%x bytes) %s",
			    lp->args[0], lp->args[0] + lp->args[1],
			    lp->args[1], falloc_type[lp->args[2]]);
			if (badoff >= lp->args[0] &&
			    badoff < lp->args[0] + lp->args[1])
				prt("\t******FFFF");
			break;
		case OP_PUNCH_HOLE:
			/* args: 0 = offset, 1 = length */
			prt("PUNCH 0x%x thru 0x%x\t(0x%x bytes)",
			    lp->args[0], lp->args[0] + lp->args[1] - 1,
			    lp->args[1]);
			if (badoff >= lp->args[0] && badoff <
						     lp->args[0] + lp->args[1])
				prt("\t******PPPP");
			break;
		case OP_WRITESAME:
			/* args: 0 = offset, 1 = size, 2 = size of the repeated pattern */
			prt("WRITESAME 0x%x thru 0x%x\t(0x%x bytes) data_size 0x%x",
			    lp->args[0], lp->args[0] + lp->args[1] - 1,
			    lp->args[1], lp->args[2]);
			if (badoff >= lp->args[0] &&
			    badoff < lp->args[0] + lp->args[1])
				prt("\t***WSWSWSWS");
			break;
		case OP_COMPARE_AND_WRITE:
			/*
			 * NOTE(review): docompareandwrite() logs 0 in args[2], so
			 * the HOLE test below fires for every non-zero offset.
			 */
			prt("COMPARE_AND_WRITE 0x%x thru 0x%x\t(0x%x bytes)",
			    lp->args[0], lp->args[0] + lp->args[1] - 1,
			    lp->args[1]);
			if (lp->args[0] > lp->args[2])
				prt(" HOLE");
			else if (lp->args[0] + lp->args[1] > lp->args[2])
				prt(" EXTEND");
			if ((badoff >= lp->args[0] || badoff >=lp->args[2]) &&
			    badoff < lp->args[0] + lp->args[1])
				prt("\t***WWWW");
			break;
		case OP_CLONE:
			prt("CLONE");
			break;
		case OP_FLATTEN:
			prt("FLATTEN");
			break;
		case OP_SKIPPED:
			prt("SKIPPED (no operation)");
			break;
		default:
			prt("BOGUS LOG ENTRY (operation code = %d)!",
			    lp->operation);
		}
		if (closeopen)
			prt("\n\t\tCLOSE/OPEN");
		prt("\n");
		/* advance through the ring, wrapping at LOGSIZE */
		i++;
		if (i == LOGSIZE)
			i = 0;
	}
}
1766
1767 void
1768 save_buffer(char *buffer, off_t bufferlength, int fd)
1769 {
1770 off_t ret;
1771 ssize_t byteswritten;
1772
1773 if (fd <= 0 || bufferlength == 0)
1774 return;
1775
1776 if (bufferlength > SSIZE_MAX) {
1777 prt("fsx flaw: overflow in save_buffer\n");
1778 exit(67);
1779 }
1780
1781 ret = lseek(fd, (off_t)0, SEEK_SET);
1782 if (ret == (off_t)-1)
1783 prterr("save_buffer: lseek 0");
1784
1785 byteswritten = write(fd, buffer, (size_t)bufferlength);
1786 if (byteswritten != bufferlength) {
1787 if (byteswritten == -1)
1788 prterr("save_buffer write");
1789 else
1790 warn("save_buffer: short write, 0x%x bytes instead of 0x%llx\n",
1791 (unsigned)byteswritten,
1792 (unsigned long long)bufferlength);
1793 }
1794 }
1795
1796
1797 void
1798 report_failure(int status)
1799 {
1800 logdump();
1801
1802 if (fsxgoodfd) {
1803 if (good_buf) {
1804 save_buffer(good_buf, file_size, fsxgoodfd);
1805 prt("Correct content saved for comparison\n");
1806 prt("(maybe hexdump \"%s\" vs \"%s.fsxgood\")\n",
1807 iname, iname);
1808 }
1809 close(fsxgoodfd);
1810 }
1811 sleep(3); // so the log can flush to disk. KLUDGEY!
1812 exit(status);
1813 }
1814
/*
 * Read the two bytes at cp and cp + 1 as one 16-bit value, with the byte at
 * cp as the high-order byte.  Note that the byte after cp is accessed too.
 */
#define short_at(cp) ((unsigned short)((*((unsigned char *)(cp)) << 8) | \
				       *(((unsigned char *)(cp)) + 1)))
1817
1818 int
1819 fsxcmp(char *good_buf, char *temp_buf, unsigned size)
1820 {
1821 if (!skip_partial_discard) {
1822 return memcmp(good_buf, temp_buf, size);
1823 }
1824
1825 for (unsigned i = 0; i < size; i++) {
1826 if (good_buf[i] != temp_buf[i] && good_buf[i] != 0) {
1827 return good_buf[i] - temp_buf[i];
1828 }
1829 }
1830 return 0;
1831 }
1832
1833 void
1834 check_buffers(char *good_buf, char *temp_buf, unsigned offset, unsigned size)
1835 {
1836 if (fsxcmp(good_buf + offset, temp_buf, size) != 0) {
1837 unsigned i = 0;
1838 unsigned n = 0;
1839
1840 prt("READ BAD DATA: offset = 0x%x, size = 0x%x, fname = %s\n",
1841 offset, size, iname);
1842 prt("OFFSET\tGOOD\tBAD\tRANGE\n");
1843 while (size > 0) {
1844 unsigned char c = good_buf[offset];
1845 unsigned char t = temp_buf[i];
1846 if (c != t) {
1847 if (n < 16) {
1848 unsigned bad = short_at(&temp_buf[i]);
1849 prt("0x%5x\t0x%04x\t0x%04x", offset,
1850 short_at(&good_buf[offset]), bad);
1851 unsigned op = temp_buf[(offset & 1) ? i+1 : i];
1852 prt("\t0x%5x\n", n);
1853 if (op)
1854 prt("operation# (mod 256) for "
1855 "the bad data may be %u\n",
1856 ((unsigned)op & 0xff));
1857 else
1858 prt("operation# (mod 256) for "
1859 "the bad data unknown, check"
1860 " HOLE and EXTEND ops\n");
1861 }
1862 n++;
1863 badoff = offset;
1864 }
1865 offset++;
1866 i++;
1867 size--;
1868 }
1869 report_failure(110);
1870 }
1871 }
1872
1873
1874 void
1875 check_size(void)
1876 {
1877 uint64_t size;
1878 int ret;
1879
1880 ret = ops->get_size(&ctx, &size);
1881 if (ret < 0)
1882 prterrcode("check_size: ops->get_size", ret);
1883
1884 if ((uint64_t)file_size != size) {
1885 prt("Size error: expected 0x%llx stat 0x%llx\n",
1886 (unsigned long long)file_size,
1887 (unsigned long long)size);
1888 report_failure(120);
1889 }
1890 }
1891
1892 #define TRUNC_HACK_SIZE (200ULL << 9) /* 512-byte aligned for krbd */
1893
1894 void
1895 check_trunc_hack(void)
1896 {
1897 uint64_t size;
1898 int ret;
1899
1900 ret = ops->resize(&ctx, 0ULL);
1901 if (ret < 0)
1902 prterrcode("check_trunc_hack: ops->resize pre", ret);
1903
1904 ret = ops->resize(&ctx, TRUNC_HACK_SIZE);
1905 if (ret < 0)
1906 prterrcode("check_trunc_hack: ops->resize actual", ret);
1907
1908 ret = ops->get_size(&ctx, &size);
1909 if (ret < 0)
1910 prterrcode("check_trunc_hack: ops->get_size", ret);
1911
1912 if (size != TRUNC_HACK_SIZE) {
1913 prt("no extend on truncate! not posix!\n");
1914 exit(130);
1915 }
1916
1917 ret = ops->resize(&ctx, 0ULL);
1918 if (ret < 0)
1919 prterrcode("check_trunc_hack: ops->resize post", ret);
1920 }
1921
1922 int
1923 create_image()
1924 {
1925 int r;
1926 int order = 0;
1927 char buf[32];
1928 char client_name[256];
1929
1930 sprintf(client_name, "client.%s", client_id);
1931
1932 r = rados_create2(&cluster, cluster_name, client_name, 0);
1933 if (r < 0) {
1934 simple_err("Could not create cluster handle", r);
1935 return r;
1936 }
1937 rados_conf_parse_env(cluster, NULL);
1938 r = rados_conf_read_file(cluster, NULL);
1939 if (r < 0) {
1940 simple_err("Error reading ceph config file", r);
1941 goto failed_shutdown;
1942 }
1943 r = rados_connect(cluster);
1944 if (r < 0) {
1945 simple_err("Error connecting to cluster", r);
1946 goto failed_shutdown;
1947 }
1948 #if defined(WITH_KRBD)
1949 r = krbd_create_from_context(rados_cct(cluster), &krbd);
1950 if (r < 0) {
1951 simple_err("Could not create libkrbd handle", r);
1952 goto failed_shutdown;
1953 }
1954 #endif
1955
1956 r = rados_pool_create(cluster, pool);
1957 if (r < 0 && r != -EEXIST) {
1958 simple_err("Error creating pool", r);
1959 goto failed_krbd;
1960 }
1961 r = rados_ioctx_create(cluster, pool, &ioctx);
1962 if (r < 0) {
1963 simple_err("Error creating ioctx", r);
1964 goto failed_krbd;
1965 }
1966 rados_application_enable(ioctx, "rbd", 1);
1967
1968 if (clone_calls || journal_replay) {
1969 uint64_t features;
1970 r = get_features(&features);
1971 if (r < 0) {
1972 goto failed_open;
1973 }
1974
1975 r = rbd_create2(ioctx, iname, file_size, features, &order);
1976 } else {
1977 r = rbd_create(ioctx, iname, file_size, &order);
1978 }
1979 if (r < 0) {
1980 simple_err("Error creating image", r);
1981 goto failed_open;
1982 }
1983
1984 if (journal_replay) {
1985 r = register_journal(ioctx, iname);
1986 if (r < 0) {
1987 goto failed_open;
1988 }
1989 }
1990
1991 r = rados_conf_get(cluster, "rbd_skip_partial_discard", buf,
1992 sizeof(buf));
1993 if (r < 0) {
1994 simple_err("Could not get rbd_skip_partial_discard value", r);
1995 goto failed_open;
1996 }
1997 skip_partial_discard = (strcmp(buf, "true") == 0);
1998
1999 return 0;
2000
2001 failed_open:
2002 rados_ioctx_destroy(ioctx);
2003 failed_krbd:
2004 #if defined(WITH_KRBD)
2005 krbd_destroy(krbd);
2006 #endif
2007 failed_shutdown:
2008 rados_shutdown(cluster);
2009 return r;
2010 }
2011
2012 void
2013 doflush(unsigned offset, unsigned size)
2014 {
2015 int ret;
2016
2017 if (o_direct)
2018 return;
2019
2020 ret = ops->flush(&ctx);
2021 if (ret < 0)
2022 prterrcode("doflush: ops->flush", ret);
2023 }
2024
2025 void
2026 doread(unsigned offset, unsigned size)
2027 {
2028 int ret;
2029
2030 offset -= offset % readbdy;
2031 if (o_direct)
2032 size -= size % readbdy;
2033 if (size == 0) {
2034 if (!quiet && testcalls > simulatedopcount && !o_direct)
2035 prt("skipping zero size read\n");
2036 log4(OP_SKIPPED, OP_READ, offset, size);
2037 return;
2038 }
2039 if (size + offset > file_size) {
2040 if (!quiet && testcalls > simulatedopcount)
2041 prt("skipping seek/read past end of file\n");
2042 log4(OP_SKIPPED, OP_READ, offset, size);
2043 return;
2044 }
2045
2046 log4(OP_READ, offset, size, 0);
2047
2048 if (testcalls <= simulatedopcount)
2049 return;
2050
2051 if (!quiet &&
2052 ((progressinterval && testcalls % progressinterval == 0) ||
2053 (debug &&
2054 (monitorstart == -1 ||
2055 (static_cast<long>(offset + size) > monitorstart &&
2056 (monitorend == -1 ||
2057 static_cast<long>(offset) <= monitorend))))))
2058 prt("%lu read\t0x%x thru\t0x%x\t(0x%x bytes)\n", testcalls,
2059 offset, offset + size - 1, size);
2060
2061 ret = ops->read(&ctx, offset, size, temp_buf);
2062 if (ret != (int)size) {
2063 if (ret < 0)
2064 prterrcode("doread: ops->read", ret);
2065 else
2066 prt("short read: 0x%x bytes instead of 0x%x\n",
2067 ret, size);
2068 report_failure(141);
2069 }
2070
2071 check_buffers(good_buf, temp_buf, offset, size);
2072 }
2073
2074
2075 void
2076 check_eofpage(char *s, unsigned offset, char *p, int size)
2077 {
2078 unsigned long last_page, should_be_zero;
2079
2080 if (offset + size <= (file_size & ~page_mask))
2081 return;
2082 /*
2083 * we landed in the last page of the file
2084 * test to make sure the VM system provided 0's
2085 * beyond the true end of the file mapping
2086 * (as required by mmap def in 1996 posix 1003.1)
2087 */
2088 last_page = ((unsigned long)p + (offset & page_mask) + size) & ~page_mask;
2089
2090 for (should_be_zero = last_page + (file_size & page_mask);
2091 should_be_zero < last_page + page_size;
2092 should_be_zero++)
2093 if (*(char *)should_be_zero) {
2094 prt("Mapped %s: non-zero data past EOF (0x%llx) page offset 0x%x is 0x%04x\n",
2095 s, file_size - 1, should_be_zero & page_mask,
2096 short_at(should_be_zero));
2097 report_failure(205);
2098 }
2099 }
2100
2101
2102 void
2103 gendata(char *original_buf, char *good_buf, unsigned offset, unsigned size)
2104 {
2105 while (size--) {
2106 good_buf[offset] = testcalls % 256;
2107 if (offset % 2)
2108 good_buf[offset] += original_buf[offset];
2109 offset++;
2110 }
2111 }
2112
2113
2114 void
2115 dowrite(unsigned offset, unsigned size)
2116 {
2117 ssize_t ret;
2118 off_t newsize;
2119
2120 offset -= offset % writebdy;
2121 if (o_direct)
2122 size -= size % writebdy;
2123 if (size == 0) {
2124 if (!quiet && testcalls > simulatedopcount && !o_direct)
2125 prt("skipping zero size write\n");
2126 log4(OP_SKIPPED, OP_WRITE, offset, size);
2127 return;
2128 }
2129
2130 log4(OP_WRITE, offset, size, file_size);
2131
2132 gendata(original_buf, good_buf, offset, size);
2133 if (file_size < offset + size) {
2134 newsize = ceil(((double)offset + size) / truncbdy) * truncbdy;
2135 if (file_size < newsize)
2136 memset(good_buf + file_size, '\0', newsize - file_size);
2137 file_size = newsize;
2138 if (lite) {
2139 warn("Lite file size bug in fsx!");
2140 report_failure(149);
2141 }
2142 ret = ops->resize(&ctx, newsize);
2143 if (ret < 0) {
2144 prterrcode("dowrite: ops->resize", ret);
2145 report_failure(150);
2146 }
2147 }
2148
2149 if (testcalls <= simulatedopcount)
2150 return;
2151
2152 if (!quiet &&
2153 ((progressinterval && testcalls % progressinterval == 0) ||
2154 (debug &&
2155 (monitorstart == -1 ||
2156 (static_cast<long>(offset + size) > monitorstart &&
2157 (monitorend == -1 ||
2158 static_cast<long>(offset) <= monitorend))))))
2159 prt("%lu write\t0x%x thru\t0x%x\t(0x%x bytes)\n", testcalls,
2160 offset, offset + size - 1, size);
2161
2162 ret = ops->write(&ctx, offset, size, good_buf + offset);
2163 if (ret != (ssize_t)size) {
2164 if (ret < 0)
2165 prterrcode("dowrite: ops->write", ret);
2166 else
2167 prt("short write: 0x%x bytes instead of 0x%x\n",
2168 ret, size);
2169 report_failure(151);
2170 }
2171
2172 if (flush_enabled)
2173 doflush(offset, size);
2174 }
2175
2176
2177 void
2178 dotruncate(unsigned size)
2179 {
2180 int oldsize = file_size;
2181 int ret;
2182
2183 size -= size % truncbdy;
2184 if (size > biggest) {
2185 biggest = size;
2186 if (!quiet && testcalls > simulatedopcount)
2187 prt("truncating to largest ever: 0x%x\n", size);
2188 }
2189
2190 log4(OP_TRUNCATE, size, (unsigned)file_size, 0);
2191
2192 if (size > file_size)
2193 memset(good_buf + file_size, '\0', size - file_size);
2194 else if (size < file_size)
2195 memset(good_buf + size, '\0', file_size - size);
2196 file_size = size;
2197
2198 if (testcalls <= simulatedopcount)
2199 return;
2200
2201 if ((progressinterval && testcalls % progressinterval == 0) ||
2202 (debug && (monitorstart == -1 || monitorend == -1 ||
2203 (long)size <= monitorend)))
2204 prt("%lu trunc\tfrom 0x%x to 0x%x\n", testcalls, oldsize, size);
2205
2206 ret = ops->resize(&ctx, size);
2207 if (ret < 0) {
2208 prterrcode("dotruncate: ops->resize", ret);
2209 report_failure(160);
2210 }
2211 }
2212
2213 void
2214 do_punch_hole(unsigned offset, unsigned length)
2215 {
2216 unsigned end_offset;
2217 int max_offset = 0;
2218 int max_len = 0;
2219 int ret;
2220
2221 offset -= offset % holebdy;
2222 length -= length % holebdy;
2223 if (length == 0) {
2224 if (!quiet && testcalls > simulatedopcount)
2225 prt("skipping zero length punch hole\n");
2226 log4(OP_SKIPPED, OP_PUNCH_HOLE, offset, length);
2227 return;
2228 }
2229
2230 if (file_size <= (loff_t)offset) {
2231 if (!quiet && testcalls > simulatedopcount)
2232 prt("skipping hole punch off the end of the file\n");
2233 log4(OP_SKIPPED, OP_PUNCH_HOLE, offset, length);
2234 return;
2235 }
2236
2237 end_offset = offset + length;
2238
2239 log4(OP_PUNCH_HOLE, offset, length, 0);
2240
2241 if (testcalls <= simulatedopcount)
2242 return;
2243
2244 if ((progressinterval && testcalls % progressinterval == 0) ||
2245 (debug && (monitorstart == -1 || monitorend == -1 ||
2246 (long)end_offset <= monitorend))) {
2247 prt("%lu punch\tfrom 0x%x to 0x%x, (0x%x bytes)\n", testcalls,
2248 offset, offset+length, length);
2249 }
2250
2251 ret = ops->discard(&ctx, (unsigned long long)offset,
2252 (unsigned long long)length);
2253 if (ret < 0) {
2254 prterrcode("do_punch_hole: ops->discard", ret);
2255 report_failure(161);
2256 }
2257
2258 max_offset = offset < file_size ? offset : file_size;
2259 max_len = max_offset + length <= file_size ? length :
2260 file_size - max_offset;
2261 memset(good_buf + max_offset, '\0', max_len);
2262 }
2263
2264 unsigned get_data_size(unsigned size)
2265 {
2266 unsigned i;
2267 unsigned hint;
2268 unsigned max = sqrt((double)size) + 1;
2269 unsigned good = 1;
2270 unsigned curr = good;
2271
2272 hint = get_random() % max;
2273
2274 for (i = 1; i < max && curr < hint; i++) {
2275 if (size % i == 0) {
2276 good = curr;
2277 curr = i;
2278 }
2279 }
2280
2281 if (curr == hint)
2282 good = curr;
2283
2284 return good;
2285 }
2286
2287 void
2288 dowritesame(unsigned offset, unsigned size)
2289 {
2290 ssize_t ret;
2291 off_t newsize;
2292 unsigned buf_off;
2293 unsigned data_size;
2294 int n;
2295
2296 offset -= offset % writebdy;
2297 if (o_direct)
2298 size -= size % writebdy;
2299 if (size == 0) {
2300 if (!quiet && testcalls > simulatedopcount && !o_direct)
2301 prt("skipping zero size writesame\n");
2302 log4(OP_SKIPPED, OP_WRITESAME, offset, size);
2303 return;
2304 }
2305
2306 data_size = get_data_size(size);
2307
2308 log4(OP_WRITESAME, offset, size, data_size);
2309
2310 gendata(original_buf, good_buf, offset, data_size);
2311 if (file_size < offset + size) {
2312 newsize = ceil(((double)offset + size) / truncbdy) * truncbdy;
2313 if (file_size < newsize)
2314 memset(good_buf + file_size, '\0', newsize - file_size);
2315 file_size = newsize;
2316 if (lite) {
2317 warn("Lite file size bug in fsx!");
2318 report_failure(162);
2319 }
2320 ret = ops->resize(&ctx, newsize);
2321 if (ret < 0) {
2322 prterrcode("dowritesame: ops->resize", ret);
2323 report_failure(163);
2324 }
2325 }
2326
2327 for (n = size / data_size, buf_off = data_size; n > 1; n--) {
2328 memcpy(good_buf + offset + buf_off, good_buf + offset, data_size);
2329 buf_off += data_size;
2330 }
2331
2332 if (testcalls <= simulatedopcount)
2333 return;
2334
2335 if (!quiet &&
2336 ((progressinterval && testcalls % progressinterval == 0) ||
2337 (debug &&
2338 (monitorstart == -1 ||
2339 (static_cast<long>(offset + size) > monitorstart &&
2340 (monitorend == -1 ||
2341 static_cast<long>(offset) <= monitorend))))))
2342 prt("%lu writesame\t0x%x thru\t0x%x\tdata_size\t0x%x(0x%x bytes)\n", testcalls,
2343 offset, offset + size - 1, data_size, size);
2344
2345 ret = ops->writesame(&ctx, offset, size, good_buf + offset, data_size);
2346 if (ret != (ssize_t)size) {
2347 if (ret < 0)
2348 prterrcode("dowritesame: ops->writesame", ret);
2349 else
2350 prt("short writesame: 0x%x bytes instead of 0x%x\n",
2351 ret, size);
2352 report_failure(164);
2353 }
2354
2355 if (flush_enabled)
2356 doflush(offset, size);
2357 }
2358
2359 void
2360 docompareandwrite(unsigned offset, unsigned size)
2361 {
2362 int ret;
2363
2364 if (skip_partial_discard) {
2365 if (!quiet && testcalls > simulatedopcount)
2366 prt("compare and write disabled\n");
2367 log4(OP_SKIPPED, OP_COMPARE_AND_WRITE, offset, size);
2368 return;
2369 }
2370
2371 offset -= offset % writebdy;
2372 if (o_direct)
2373 size -= size % writebdy;
2374
2375 if (size == 0) {
2376 if (!quiet && testcalls > simulatedopcount && !o_direct)
2377 prt("skipping zero size read\n");
2378 log4(OP_SKIPPED, OP_READ, offset, size);
2379 return;
2380 }
2381
2382 if (size + offset > file_size) {
2383 if (!quiet && testcalls > simulatedopcount)
2384 prt("skipping seek/compare past end of file\n");
2385 log4(OP_SKIPPED, OP_COMPARE_AND_WRITE, offset, size);
2386 return;
2387 }
2388
2389 memcpy(temp_buf + offset, good_buf + offset, size);
2390 gendata(original_buf, good_buf, offset, size);
2391 log4(OP_COMPARE_AND_WRITE, offset, size, 0);
2392
2393 if (testcalls <= simulatedopcount)
2394 return;
2395
2396 if (!quiet &&
2397 ((progressinterval && testcalls % progressinterval == 0) ||
2398 (debug &&
2399 (monitorstart == -1 ||
2400 (static_cast<long>(offset + size) > monitorstart &&
2401 (monitorend == -1 ||
2402 static_cast<long>(offset) <= monitorend))))))
2403 prt("%lu compareandwrite\t0x%x thru\t0x%x\t(0x%x bytes)\n", testcalls,
2404 offset, offset + size - 1, size);
2405
2406 ret = ops->compare_and_write(&ctx, offset, size, temp_buf + offset,
2407 good_buf + offset);
2408 if (ret != (ssize_t)size) {
2409 if (ret == -EINVAL) {
2410 memcpy(good_buf + offset, temp_buf + offset, size);
2411 return;
2412 }
2413 if (ret < 0)
2414 prterrcode("docompareandwrite: ops->compare_and_write", ret);
2415 else
2416 prt("short write: 0x%x bytes instead of 0x%x\n", ret, size);
2417 report_failure(151);
2418 return;
2419 }
2420
2421 if (flush_enabled)
2422 doflush(offset, size);
2423 }
2424
2425 void clone_filename(char *buf, size_t len, int clones)
2426 {
2427 #if __GNUC__ && __GNUC__ >= 8
2428 #pragma GCC diagnostic push
2429 #pragma GCC diagnostic ignored "-Wformat-truncation"
2430 #endif
2431 snprintf(buf, len, "%s/fsx-%s-parent%d",
2432 dirpath, iname, clones);
2433 #if __GNUC__ && __GNUC__ >= 8
2434 #pragma GCC diagnostic pop
2435 #endif
2436 }
2437
2438 void clone_imagename(char *buf, size_t len, int clones)
2439 {
2440 if (clones > 0)
2441 snprintf(buf, len, "%s-clone%d", iname, clones);
2442 else
2443 strncpy(buf, iname, len);
2444 buf[len - 1] = '\0';
2445 }
2446
2447 void replay_imagename(char *buf, size_t len, int clones)
2448 {
2449 clone_imagename(buf, len, clones);
2450 strncat(buf, "-replay", len - strlen(buf));
2451 buf[len - 1] = '\0';
2452 }
2453
/* Forward declaration: do_clone() calls check_clone(), which is defined after it. */
void check_clone(int clonenum, bool replay_image);
2455
2456 void
2457 do_clone()
2458 {
2459 char filename[1024];
2460 char imagename[1024];
2461 char lastimagename[1024];
2462 int ret, fd;
2463 int order = 0, stripe_unit = 0, stripe_count = 0;
2464 uint64_t newsize = file_size;
2465
2466 log4(OP_CLONE, 0, 0, 0);
2467 ++num_clones;
2468
2469 if (randomize_striping) {
2470 order = 18 + get_random() % 8;
2471 stripe_unit = 1ull << (order - 1 - (get_random() % 8));
2472 stripe_count = 2 + get_random() % 14;
2473 }
2474
2475 prt("%lu clone\t%d order %d su %d sc %d\n", testcalls, num_clones,
2476 order, stripe_unit, stripe_count);
2477
2478 clone_imagename(imagename, sizeof(imagename), num_clones);
2479 clone_imagename(lastimagename, sizeof(lastimagename),
2480 num_clones - 1);
2481 ceph_assert(strcmp(lastimagename, ctx.name) == 0);
2482
2483 ret = ops->clone(&ctx, "snap", imagename, &order, stripe_unit,
2484 stripe_count);
2485 if (ret < 0) {
2486 prterrcode("do_clone: ops->clone", ret);
2487 exit(165);
2488 }
2489
2490 if (randomize_parent_overlap && rbd_image_has_parent(&ctx)) {
2491 int rand = get_random() % 16 + 1; // [1..16]
2492
2493 if (rand < 13) {
2494 uint64_t overlap;
2495
2496 ret = rbd_get_overlap(ctx.image, &overlap);
2497 if (ret < 0) {
2498 prterrcode("do_clone: rbd_get_overlap", ret);
2499 exit(1);
2500 }
2501
2502 if (rand < 10) { // 9/16
2503 newsize = overlap * ((double)rand / 10);
2504 newsize -= newsize % truncbdy;
2505 } else { // 3/16
2506 newsize = 0;
2507 }
2508
2509 ceph_assert(newsize != (uint64_t)file_size);
2510 prt("truncating image %s from 0x%llx (overlap 0x%llx) to 0x%llx\n",
2511 ctx.name, file_size, overlap, newsize);
2512
2513 ret = ops->resize(&ctx, newsize);
2514 if (ret < 0) {
2515 prterrcode("do_clone: ops->resize", ret);
2516 exit(1);
2517 }
2518 } else if (rand < 15) { // 2/16
2519 prt("flattening image %s\n", ctx.name);
2520
2521 ret = ops->flatten(&ctx);
2522 if (ret < 0) {
2523 prterrcode("do_clone: ops->flatten", ret);
2524 exit(1);
2525 }
2526 } else { // 2/16
2527 prt("leaving image %s intact\n", ctx.name);
2528 }
2529 }
2530
2531 clone_filename(filename, sizeof(filename), num_clones);
2532 if ((fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, 0666)) < 0) {
2533 simple_err("do_clone: open", -errno);
2534 exit(162);
2535 }
2536 save_buffer(good_buf, newsize, fd);
2537 if ((ret = close(fd)) < 0) {
2538 simple_err("do_clone: close", -errno);
2539 exit(163);
2540 }
2541
2542 /*
2543 * Close parent.
2544 */
2545 if ((ret = ops->close(&ctx)) < 0) {
2546 prterrcode("do_clone: ops->close", ret);
2547 exit(174);
2548 }
2549
2550 if (journal_replay) {
2551 ret = finalize_journal(ioctx, lastimagename, num_clones - 1,
2552 order, stripe_unit, stripe_count);
2553 if (ret < 0) {
2554 exit(EXIT_FAILURE);
2555 }
2556
2557 ret = register_journal(ioctx, imagename);
2558 if (ret < 0) {
2559 exit(EXIT_FAILURE);
2560 }
2561 }
2562
2563 /*
2564 * Open freshly made clone.
2565 */
2566 if ((ret = ops->open(imagename, &ctx)) < 0) {
2567 prterrcode("do_clone: ops->open", ret);
2568 exit(166);
2569 }
2570
2571 if (num_clones > 1) {
2572 if (journal_replay) {
2573 check_clone(num_clones - 2, true);
2574 }
2575 check_clone(num_clones - 2, false);
2576 }
2577 }
2578
2579 void
2580 check_clone(int clonenum, bool replay_image)
2581 {
2582 char filename[128];
2583 char imagename[128];
2584 int ret, fd;
2585 struct rbd_ctx cur_ctx = RBD_CTX_INIT;
2586 struct stat file_info;
2587 char *good_buf, *temp_buf;
2588
2589 if (replay_image) {
2590 replay_imagename(imagename, sizeof(imagename), clonenum);
2591 } else {
2592 clone_imagename(imagename, sizeof(imagename), clonenum);
2593 }
2594
2595 if ((ret = ops->open(imagename, &cur_ctx)) < 0) {
2596 prterrcode("check_clone: ops->open", ret);
2597 exit(167);
2598 }
2599
2600 clone_filename(filename, sizeof(filename), clonenum + 1);
2601 if ((fd = open(filename, O_RDONLY)) < 0) {
2602 simple_err("check_clone: open", -errno);
2603 exit(168);
2604 }
2605
2606 prt("checking clone #%d, image %s against file %s\n",
2607 clonenum, imagename, filename);
2608 if ((ret = fstat(fd, &file_info)) < 0) {
2609 simple_err("check_clone: fstat", -errno);
2610 exit(169);
2611 }
2612
2613 good_buf = NULL;
2614 ret = posix_memalign((void **)&good_buf,
2615 std::max(writebdy, (int)sizeof(void *)),
2616 file_info.st_size);
2617 if (ret > 0) {
2618 prterrcode("check_clone: posix_memalign(good_buf)", -ret);
2619 exit(96);
2620 }
2621
2622 temp_buf = NULL;
2623 ret = posix_memalign((void **)&temp_buf,
2624 std::max(readbdy, (int)sizeof(void *)),
2625 file_info.st_size);
2626 if (ret > 0) {
2627 prterrcode("check_clone: posix_memalign(temp_buf)", -ret);
2628 exit(97);
2629 }
2630
2631 if ((ret = pread(fd, good_buf, file_info.st_size, 0)) < 0) {
2632 simple_err("check_clone: pread", -errno);
2633 exit(170);
2634 }
2635 if ((ret = ops->read(&cur_ctx, 0, file_info.st_size, temp_buf)) < 0) {
2636 prterrcode("check_clone: ops->read", ret);
2637 exit(171);
2638 }
2639 close(fd);
2640 if ((ret = ops->close(&cur_ctx)) < 0) {
2641 prterrcode("check_clone: ops->close", ret);
2642 exit(174);
2643 }
2644 check_buffers(good_buf, temp_buf, 0, file_info.st_size);
2645
2646 if (!replay_image) {
2647 unlink(filename);
2648 }
2649
2650 free(good_buf);
2651 free(temp_buf);
2652 }
2653
2654 void
2655 writefileimage()
2656 {
2657 ssize_t ret;
2658
2659 ret = ops->write(&ctx, 0, file_size, good_buf);
2660 if (ret != file_size) {
2661 if (ret < 0)
2662 prterrcode("writefileimage: ops->write", ret);
2663 else
2664 prt("short write: 0x%x bytes instead of 0x%llx\n",
2665 ret, (unsigned long long)file_size);
2666 report_failure(172);
2667 }
2668
2669 if (!lite) {
2670 ret = ops->resize(&ctx, file_size);
2671 if (ret < 0) {
2672 prterrcode("writefileimage: ops->resize", ret);
2673 report_failure(173);
2674 }
2675 }
2676 }
2677
2678 void
2679 do_flatten()
2680 {
2681 int ret;
2682
2683 if (!rbd_image_has_parent(&ctx)) {
2684 log4(OP_SKIPPED, OP_FLATTEN, 0, 0);
2685 return;
2686 }
2687 log4(OP_FLATTEN, 0, 0, 0);
2688 prt("%lu flatten\n", testcalls);
2689
2690 ret = ops->flatten(&ctx);
2691 if (ret < 0) {
2692 prterrcode("writefileimage: ops->flatten", ret);
2693 exit(177);
2694 }
2695 }
2696
2697 void
2698 docloseopen(void)
2699 {
2700 char *name;
2701 int ret;
2702
2703 if (testcalls <= simulatedopcount)
2704 return;
2705
2706 name = strdup(ctx.name);
2707
2708 if (debug)
2709 prt("%lu close/open\n", testcalls);
2710
2711 ret = ops->close(&ctx);
2712 if (ret < 0) {
2713 prterrcode("docloseopen: ops->close", ret);
2714 report_failure(180);
2715 }
2716
2717 ret = ops->open(name, &ctx);
2718 if (ret < 0) {
2719 prterrcode("docloseopen: ops->open", ret);
2720 report_failure(181);
2721 }
2722
2723 free(name);
2724 }
2725
/*
 * Fold `off` into [0, size) (or force it to 0 when size is 0), then shorten
 * `len` so that off + len does not run past `size`.  `off` and `len` are
 * modified in place; every argument is evaluated more than once.
 */
#define TRIM_OFF_LEN(off, len, size)					\
do {									\
	(off) = (size) ? (off) % (size) : 0;				\
	if ((unsigned)(off) + (unsigned)(len) > (unsigned)(size))	\
		(len) = (size) - (off);					\
} while (0)
2735
2736 void
2737 test(void)
2738 {
2739 unsigned long offset;
2740 unsigned long size = maxoplen;
2741 unsigned long rv = get_random();
2742 unsigned long op;
2743
2744 if (simulatedopcount > 0 && testcalls == simulatedopcount)
2745 writefileimage();
2746
2747 testcalls++;
2748
2749 if (closeprob)
2750 closeopen = (rv >> 3) < (1u << 28) / (unsigned)closeprob;
2751
2752 if (debugstart > 0 && testcalls >= debugstart)
2753 debug = 1;
2754
2755 if (!quiet && testcalls < simulatedopcount && testcalls % 100000 == 0)
2756 prt("%lu...\n", testcalls);
2757
2758 offset = get_random();
2759 if (randomoplen)
2760 size = get_random() % (maxoplen + 1);
2761
2762 /* calculate appropriate op to run */
2763 if (lite)
2764 op = rv % OP_MAX_LITE;
2765 else
2766 op = rv % OP_MAX_FULL;
2767
2768 switch (op) {
2769 case OP_MAPREAD:
2770 if (!mapped_reads)
2771 op = OP_READ;
2772 break;
2773 case OP_MAPWRITE:
2774 if (!mapped_writes)
2775 op = OP_WRITE;
2776 break;
2777 case OP_FALLOCATE:
2778 if (!fallocate_calls) {
2779 log4(OP_SKIPPED, OP_FALLOCATE, offset, size);
2780 goto out;
2781 }
2782 break;
2783 case OP_PUNCH_HOLE:
2784 if (!punch_hole_calls) {
2785 log4(OP_SKIPPED, OP_PUNCH_HOLE, offset, size);
2786 goto out;
2787 }
2788 break;
2789 case OP_CLONE:
2790 /* clone, 8% chance */
2791 if (!clone_calls || file_size == 0 || get_random() % 100 >= 8) {
2792 log4(OP_SKIPPED, OP_CLONE, 0, 0);
2793 goto out;
2794 }
2795 break;
2796 case OP_FLATTEN:
2797 /* flatten four times as rarely as clone, 2% chance */
2798 if (get_random() % 100 >= 2) {
2799 log4(OP_SKIPPED, OP_FLATTEN, 0, 0);
2800 goto out;
2801 }
2802 break;
2803 case OP_WRITESAME:
2804 /* writesame not implemented */
2805 if (!ops->writesame) {
2806 log4(OP_SKIPPED, OP_WRITESAME, offset, size);
2807 goto out;
2808 }
2809 break;
2810 case OP_COMPARE_AND_WRITE:
2811 /* compare_and_write not implemented */
2812 if (!ops->compare_and_write) {
2813 log4(OP_SKIPPED, OP_COMPARE_AND_WRITE, offset, size);
2814 goto out;
2815 }
2816 break;
2817 }
2818
2819 switch (op) {
2820 case OP_READ:
2821 TRIM_OFF_LEN(offset, size, file_size);
2822 doread(offset, size);
2823 break;
2824
2825 case OP_WRITE:
2826 TRIM_OFF_LEN(offset, size, maxfilelen);
2827 dowrite(offset, size);
2828 break;
2829
2830 case OP_MAPREAD:
2831 TRIM_OFF_LEN(offset, size, file_size);
2832 exit(183);
2833 break;
2834
2835 case OP_MAPWRITE:
2836 TRIM_OFF_LEN(offset, size, maxfilelen);
2837 exit(182);
2838 break;
2839
2840 case OP_TRUNCATE:
2841 if (!style)
2842 size = get_random() % maxfilelen;
2843 dotruncate(size);
2844 break;
2845
2846 case OP_PUNCH_HOLE:
2847 TRIM_OFF_LEN(offset, size, file_size);
2848 do_punch_hole(offset, size);
2849 break;
2850
2851 case OP_WRITESAME:
2852 TRIM_OFF_LEN(offset, size, maxfilelen);
2853 dowritesame(offset, size);
2854 break;
2855 case OP_COMPARE_AND_WRITE:
2856 TRIM_OFF_LEN(offset, size, file_size);
2857 docompareandwrite(offset, size);
2858 break;
2859
2860 case OP_CLONE:
2861 do_clone();
2862 break;
2863
2864 case OP_FLATTEN:
2865 do_flatten();
2866 break;
2867
2868 default:
2869 prterr("test: unknown operation");
2870 report_failure(42);
2871 break;
2872 }
2873
2874 out:
2875 if (sizechecks && testcalls > simulatedopcount)
2876 check_size();
2877 if (closeopen)
2878 docloseopen();
2879 }
2880
2881
2882 void
2883 cleanup(int sig)
2884 {
2885 if (sig)
2886 prt("signal %d\n", sig);
2887 prt("testcalls = %lu\n", testcalls);
2888 exit(sig);
2889 }
2890
2891
/*
 * Print the command-line synopsis and per-option help to stdout, then exit
 * with status 89.  Never returns.
 *
 * The flag list in the synopsis mirrors the option string parsed by main():
 * it includes -g, -G and -M and no longer advertises a non-existent -A.
 * The long options --cluster and --id and the positional argument names are
 * spelled the same way as in the help lines below.
 */
void
usage(void)
{
	fprintf(stdout, "usage: %s",
		"fsx [-dfgjknqxyCFGHKLMORUWZ] [-b opnum] [-c Prob] [-h holebdy] [-l flen] [-m start:end] [-o oplen] [-p progressinterval] [-r readbdy] [-s style] [-t truncbdy] [-w writebdy] [-D startingop] [-N numops] [-P dirpath] [-S seed] [--cluster name] [--id id] poolname imagename\n"
		"\t-b opnum: beginning operation number (default 1)\n"
		"\t-c P: 1 in P chance of file close+open at each op (default infinity)\n"
		"\t-d: debug output for all operations\n"
		"\t-f: flush and invalidate cache after I/O\n"
		"\t-g: deep copy instead of clone\n"
		"\t-h holebdy: 4096 would make discards page aligned (default 1)\n"
		"\t-j: journal replay stress test\n"
		"\t-k: keep data on success (default 0)\n"
		"\t-l flen: the upper bound on file size (default 262144)\n"
		"\t-m startop:endop: monitor (print debug output) specified byte range (default 0:infinity)\n"
		"\t-n: no verifications of file size\n"
		"\t-o oplen: the upper bound on operation size (default 65536)\n"
		"\t-p progressinterval: debug output at specified operation interval\n"
		"\t-q: quieter operation\n"
		"\t-r readbdy: 4096 would make reads page aligned (default 1)\n"
		"\t-s style: 1 gives smaller truncates (default 0)\n"
		"\t-t truncbdy: 4096 would make truncates page aligned (default 1)\n"
		"\t-w writebdy: 4096 would make writes page aligned (default 1)\n"
		"\t-x: preallocate file space before starting, XFS only (default 0)\n"
		"\t-y: synchronize changes to a file\n"

		"\t-C: do not use clone calls\n"
		"\t-D startingop: debug output starting at specified operation\n"
#ifdef FALLOCATE
		"\t-F: Do not use fallocate (preallocation) calls\n"
#endif
#if defined(__FreeBSD__)
		"\t-G: enable rbd-ggate mode (use -L, -r and -w too)\n"
#endif
		"\t-H: do not use punch hole calls\n"
#if defined(WITH_KRBD)
		"\t-K: enable krbd mode (use -t and -h too)\n"
#endif
#if defined(__linux__)
		"\t-M: enable rbd-nbd mode (use -t and -h too)\n"
#endif
		"\t-L: fsxLite - no file creations & no file size changes\n"
		"\t-N numops: total # operations to do (default infinity)\n"
		"\t-O: use oplen (see -o flag) for every op (default random)\n"
		"\t-P dirpath: save .fsxlog and .fsxgood files in dirpath (default ./)\n"
		"\t-R: read() system calls only (mapped reads disabled)\n"
		"\t-S seed: for random # generator (default 1) 0 gets timestamp\n"
		"\t-U: disable randomized striping\n"
		"\t-W: mapped write operations DISabled\n"
		"\t-Z: O_DIRECT (use -R, -W, -r and -w too)\n"
		"\t--cluster name: cluster name\n"
		"\t--id id: client id\n"
		"\tpoolname: this is REQUIRED (no default)\n"
		"\timagename: this is REQUIRED (no default)\n");
	exit(89);
}
2946
2947
/*
 * Parse an integer (base auto-detected by strtol) with an optional
 * one-letter unit suffix.
 *
 *   b/B  multiply by 512
 *   k/K  multiply by 1024
 *   m/M  multiply by 1024*1024
 *   w/W  multiply by 4
 *
 * s: string to parse.
 * e: on return, *e points just past the number and, if one was recognised,
 *    past the suffix as well.
 *
 * Returns the scaled value as an int.
 */
int
getnum(char *s, char **e)
{
	int value;
	int scale = 0;	/* 0 means no recognised suffix follows the digits */

	*e = (char *) 0;
	value = strtol(s, e, 0);
	if (!*e)
		return (value);

	switch (**e) {
	case 'b':
	case 'B':
		scale = 512;
		break;
	case 'k':
	case 'K':
		scale = 1024;
		break;
	case 'm':
	case 'M':
		scale = 1024*1024;
		break;
	case 'w':
	case 'W':
		scale = 4;
		break;
	}

	if (scale) {
		value *= scale;
		*e = *e + 1;	/* step over the suffix character */
	}
	return (value);
}
2980
2981 void
2982 test_fallocate()
2983 {
2984 #ifdef FALLOCATE
2985 if (!lite && fallocate_calls) {
2986 if (fallocate(fd, 0, 0, 1) && errno == EOPNOTSUPP) {
2987 if(!quiet)
2988 warn("main: filesystem does not support fallocate, disabling\n");
2989 fallocate_calls = 0;
2990 } else {
2991 ftruncate(fd, 0);
2992 }
2993 }
2994 #else /* ! FALLOCATE */
2995 fallocate_calls = 0;
2996 #endif
2997
2998 }
2999
3000 void remove_image(rados_ioctx_t ioctx, char *imagename, bool remove_snap,
3001 bool unregister) {
3002 rbd_image_t image;
3003 char errmsg[128];
3004 int ret;
3005
3006 if ((ret = rbd_open(ioctx, imagename, &image, NULL)) < 0) {
3007 sprintf(errmsg, "rbd_open %s", imagename);
3008 prterrcode(errmsg, ret);
3009 report_failure(101);
3010 }
3011 if (remove_snap) {
3012 if ((ret = rbd_snap_unprotect(image, "snap")) < 0) {
3013 sprintf(errmsg, "rbd_snap_unprotect %s@snap",
3014 imagename);
3015 prterrcode(errmsg, ret);
3016 report_failure(102);
3017 }
3018 if ((ret = rbd_snap_remove(image, "snap")) < 0) {
3019 sprintf(errmsg, "rbd_snap_remove %s@snap",
3020 imagename);
3021 prterrcode(errmsg, ret);
3022 report_failure(103);
3023 }
3024 }
3025 if ((ret = rbd_close(image)) < 0) {
3026 sprintf(errmsg, "rbd_close %s", imagename);
3027 prterrcode(errmsg, ret);
3028 report_failure(104);
3029 }
3030
3031 if (unregister &&
3032 (ret = unregister_journal(ioctx, imagename)) < 0) {
3033 report_failure(105);
3034 }
3035
3036 if ((ret = rbd_remove(ioctx, imagename)) < 0) {
3037 sprintf(errmsg, "rbd_remove %s", imagename);
3038 prterrcode(errmsg, ret);
3039 report_failure(106);
3040 }
3041 }
3042
3043 int
3044 main(int argc, char **argv)
3045 {
3046 enum {
3047 LONG_OPT_CLUSTER = 1000,
3048 LONG_OPT_ID = 1001
3049 };
3050
3051 int i, style, ch, ret;
3052 char *endp;
3053 char goodfile[1024];
3054 char logfile[1024];
3055
3056 const char* optstring = "b:c:dfgh:jkl:m:no:p:qr:s:t:w:xyCD:FGHKMLN:OP:RS:UWZ";
3057 const struct option longopts[] = {
3058 {"cluster", 1, NULL, LONG_OPT_CLUSTER},
3059 {"id", 1, NULL, LONG_OPT_ID}};
3060
3061 goodfile[0] = 0;
3062 logfile[0] = 0;
3063
3064 page_size = getpagesize();
3065 page_mask = page_size - 1;
3066 mmap_mask = page_mask;
3067
3068 setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
3069
3070 while ((ch = getopt_long(argc, argv, optstring, longopts, NULL)) != EOF) {
3071 switch (ch) {
3072 case LONG_OPT_CLUSTER:
3073 cluster_name = optarg;
3074 break;
3075 case LONG_OPT_ID:
3076 client_id = optarg;
3077 break;
3078 case 'b':
3079 simulatedopcount = getnum(optarg, &endp);
3080 if (!quiet)
3081 fprintf(stdout, "Will begin at operation %lu\n",
3082 simulatedopcount);
3083 if (simulatedopcount == 0)
3084 usage();
3085 simulatedopcount -= 1;
3086 break;
3087 case 'c':
3088 closeprob = getnum(optarg, &endp);
3089 if (!quiet)
3090 fprintf(stdout,
3091 "Chance of close/open is 1 in %d\n",
3092 closeprob);
3093 if (closeprob <= 0)
3094 usage();
3095 break;
3096 case 'd':
3097 debug = 1;
3098 break;
3099 case 'f':
3100 flush_enabled = 1;
3101 break;
3102 case 'g':
3103 deep_copy = 1;
3104 break;
3105 case 'h':
3106 holebdy = getnum(optarg, &endp);
3107 if (holebdy <= 0)
3108 usage();
3109 break;
3110 case 'j':
3111 journal_replay = true;
3112 break;
3113 case 'k':
3114 keep_on_success = 1;
3115 break;
3116 case 'l':
3117 {
3118 int _num = getnum(optarg, &endp);
3119 if (_num <= 0)
3120 usage();
3121 maxfilelen = _num;
3122 }
3123 break;
3124 case 'm':
3125 monitorstart = getnum(optarg, &endp);
3126 if (monitorstart < 0)
3127 usage();
3128 if (!endp || *endp++ != ':')
3129 usage();
3130 monitorend = getnum(endp, &endp);
3131 if (monitorend < 0)
3132 usage();
3133 if (monitorend == 0)
3134 monitorend = -1; /* aka infinity */
3135 debug = 1;
3136 break;
3137 case 'n':
3138 sizechecks = 0;
3139 break;
3140 case 'o':
3141 maxoplen = getnum(optarg, &endp);
3142 if (maxoplen <= 0)
3143 usage();
3144 break;
3145 case 'p':
3146 progressinterval = getnum(optarg, &endp);
3147 if (progressinterval == 0)
3148 usage();
3149 break;
3150 case 'q':
3151 quiet = 1;
3152 break;
3153 case 'r':
3154 readbdy = getnum(optarg, &endp);
3155 if (readbdy <= 0)
3156 usage();
3157 break;
3158 case 's':
3159 style = getnum(optarg, &endp);
3160 if (style < 0 || style > 1)
3161 usage();
3162 break;
3163 case 't':
3164 truncbdy = getnum(optarg, &endp);
3165 if (truncbdy <= 0)
3166 usage();
3167 break;
3168 case 'w':
3169 writebdy = getnum(optarg, &endp);
3170 if (writebdy <= 0)
3171 usage();
3172 break;
3173 case 'x':
3174 prealloc = 1;
3175 break;
3176 case 'y':
3177 do_fsync = 1;
3178 break;
3179 case 'C':
3180 clone_calls = 0;
3181 break;
3182 case 'D':
3183 debugstart = getnum(optarg, &endp);
3184 if (debugstart < 1)
3185 usage();
3186 break;
3187 case 'F':
3188 fallocate_calls = 0;
3189 break;
3190 #if defined(__FreeBSD__)
3191 case 'G':
3192 prt("rbd-ggate mode enabled\n");
3193 ops = &ggate_operations;
3194 break;
3195 #endif
3196 case 'H':
3197 punch_hole_calls = 0;
3198 break;
3199 #if defined(WITH_KRBD)
3200 case 'K':
3201 prt("krbd mode enabled\n");
3202 ops = &krbd_operations;
3203 break;
3204 #endif
3205 #if defined(__linux__)
3206 case 'M':
3207 prt("rbd-nbd mode enabled\n");
3208 ops = &nbd_operations;
3209 break;
3210 #endif
3211 case 'L':
3212 lite = 1;
3213 break;
3214 case 'N':
3215 numops = getnum(optarg, &endp);
3216 if (numops < 0)
3217 usage();
3218 break;
3219 case 'O':
3220 randomoplen = 0;
3221 break;
3222 case 'P':
3223 strncpy(dirpath, optarg, sizeof(dirpath)-1);
3224 dirpath[sizeof(dirpath)-1] = '\0';
3225 strncpy(goodfile, dirpath, sizeof(goodfile)-1);
3226 goodfile[sizeof(goodfile)-1] = '\0';
3227 if (strlen(goodfile) < sizeof(goodfile)-2) {
3228 strcat(goodfile, "/");
3229 } else {
3230 prt("file name to long\n");
3231 exit(1);
3232 }
3233 strncpy(logfile, dirpath, sizeof(logfile)-1);
3234 logfile[sizeof(logfile)-1] = '\0';
3235 if (strlen(logfile) < sizeof(logfile)-2) {
3236 strcat(logfile, "/");
3237 } else {
3238 prt("file path to long\n");
3239 exit(1);
3240 }
3241 break;
3242 case 'R':
3243 mapped_reads = 0;
3244 if (!quiet)
3245 fprintf(stdout, "mapped reads DISABLED\n");
3246 break;
3247 case 'S':
3248 seed = getnum(optarg, &endp);
3249 if (seed == 0)
3250 seed = std::random_device()() % 10000;
3251 if (!quiet)
3252 fprintf(stdout, "Seed set to %d\n", seed);
3253 if (seed < 0)
3254 usage();
3255 break;
3256 case 'U':
3257 randomize_striping = 0;
3258 break;
3259 case 'W':
3260 mapped_writes = 0;
3261 if (!quiet)
3262 fprintf(stdout, "mapped writes DISABLED\n");
3263 break;
3264 case 'Z':
3265 o_direct = O_DIRECT;
3266 break;
3267 default:
3268 usage();
3269 /* NOTREACHED */
3270 }
3271 }
3272 argc -= optind;
3273 argv += optind;
3274 if (argc != 2)
3275 usage();
3276 pool = argv[0];
3277 iname = argv[1];
3278
3279 signal(SIGHUP, cleanup);
3280 signal(SIGINT, cleanup);
3281 signal(SIGPIPE, cleanup);
3282 signal(SIGALRM, cleanup);
3283 signal(SIGTERM, cleanup);
3284 signal(SIGXCPU, cleanup);
3285 signal(SIGXFSZ, cleanup);
3286 signal(SIGVTALRM, cleanup);
3287 signal(SIGUSR1, cleanup);
3288 signal(SIGUSR2, cleanup);
3289
3290 random_generator.seed(seed);
3291
3292 if (lite) {
3293 file_size = maxfilelen;
3294 }
3295
3296 ret = create_image();
3297 if (ret < 0) {
3298 prterrcode(iname, ret);
3299 exit(90);
3300 }
3301 ret = ops->open(iname, &ctx);
3302 if (ret < 0) {
3303 simple_err("Error opening image", ret);
3304 exit(91);
3305 }
3306 if (!dirpath[0])
3307 strcat(dirpath, ".");
3308 strncat(goodfile, iname, 256);
3309 strcat (goodfile, ".fsxgood");
3310 fsxgoodfd = open(goodfile, O_RDWR|O_CREAT|O_TRUNC, 0666);
3311 if (fsxgoodfd < 0) {
3312 prterr(goodfile);
3313 exit(92);
3314 }
3315 strncat(logfile, iname, 256);
3316 strcat (logfile, ".fsxlog");
3317 fsxlogf = fopen(logfile, "w");
3318 if (fsxlogf == NULL) {
3319 prterr(logfile);
3320 exit(93);
3321 }
3322
3323 original_buf = (char *) malloc(maxfilelen);
3324 for (i = 0; i < (int)maxfilelen; i++)
3325 original_buf[i] = get_random() % 256;
3326
3327 ret = posix_memalign((void **)&good_buf,
3328 std::max(writebdy, (int)sizeof(void *)), maxfilelen);
3329 if (ret > 0) {
3330 if (ret == EINVAL)
3331 prt("writebdy is not a suitable power of two\n");
3332 else
3333 prterrcode("main: posix_memalign(good_buf)", -ret);
3334 exit(94);
3335 }
3336 memset(good_buf, '\0', maxfilelen);
3337
3338 ret = posix_memalign((void **)&temp_buf,
3339 std::max(readbdy, (int)sizeof(void *)), maxfilelen);
3340 if (ret > 0) {
3341 if (ret == EINVAL)
3342 prt("readbdy is not a suitable power of two\n");
3343 else
3344 prterrcode("main: posix_memalign(temp_buf)", -ret);
3345 exit(95);
3346 }
3347 memset(temp_buf, '\0', maxfilelen);
3348
3349 if (lite) { /* zero entire existing file */
3350 ssize_t written;
3351
3352 written = ops->write(&ctx, 0, (size_t)maxfilelen, good_buf);
3353 if (written != (ssize_t)maxfilelen) {
3354 if (written < 0) {
3355 prterrcode(iname, written);
3356 warn("main: error on write");
3357 } else
3358 warn("main: short write, 0x%x bytes instead "
3359 "of 0x%lx\n",
3360 (unsigned)written,
3361 maxfilelen);
3362 exit(98);
3363 }
3364 } else
3365 check_trunc_hack();
3366
3367 //test_fallocate();
3368
3369 while (numops == -1 || numops--)
3370 test();
3371
3372 ret = ops->close(&ctx);
3373 if (ret < 0) {
3374 prterrcode("ops->close", ret);
3375 report_failure(99);
3376 }
3377
3378 if (journal_replay) {
3379 char imagename[1024];
3380 clone_imagename(imagename, sizeof(imagename), num_clones);
3381 ret = finalize_journal(ioctx, imagename, num_clones, 0, 0, 0);
3382 if (ret < 0) {
3383 report_failure(100);
3384 }
3385 }
3386
3387 if (num_clones > 0) {
3388 if (journal_replay) {
3389 check_clone(num_clones - 1, true);
3390 }
3391 check_clone(num_clones - 1, false);
3392 }
3393
3394 if (!keep_on_success) {
3395 while (num_clones >= 0) {
3396 static bool remove_snap = false;
3397
3398 if (journal_replay) {
3399 char replayimagename[1024];
3400 replay_imagename(replayimagename,
3401 sizeof(replayimagename),
3402 num_clones);
3403 remove_image(ioctx, replayimagename,
3404 remove_snap,
3405 false);
3406 }
3407
3408 char clonename[128];
3409 clone_imagename(clonename, 128, num_clones);
3410 remove_image(ioctx, clonename, remove_snap,
3411 journal_replay);
3412
3413 remove_snap = true;
3414 num_clones--;
3415 }
3416 }
3417
3418 prt("All operations completed A-OK!\n");
3419 fclose(fsxlogf);
3420
3421 rados_ioctx_destroy(ioctx);
3422 #if defined(WITH_KRBD)
3423 krbd_destroy(krbd);
3424 #endif
3425 rados_shutdown(cluster);
3426
3427 free(original_buf);
3428 free(good_buf);
3429 free(temp_buf);
3430
3431 exit(0);
3432 return 0;
3433 }