]> git.proxmox.com Git - ceph.git/blob - ceph/src/test/librbd/fsx.cc
import ceph 12.2.12
[ceph.git] / ceph / src / test / librbd / fsx.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:8; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=8 smarttab
3 /*
4 * Copyright (C) 1991, NeXT Computer, Inc. All Rights Reserved.
5 *
6 * File: fsx.cc
7 * Author: Avadis Tevanian, Jr.
8 *
9 * File system exerciser.
10 *
11 * Rewritten 8/98 by Conrad Minshall.
12 *
13 * Small changes to work under Linux -- davej.
14 *
15 * Checks for mmap last-page zero fill.
16 */
17
18 #include <sys/types.h>
19 #include <unistd.h>
20 #include <limits.h>
21 #include <time.h>
22 #include <strings.h>
23 #include <sys/file.h>
24 #include <sys/stat.h>
25 #include <sys/mman.h>
26 #include <linux/fs.h>
27 #include <sys/ioctl.h>
28 #ifdef HAVE_ERR_H
29 #include <err.h>
30 #endif
31 #include <signal.h>
32 #include <stdbool.h>
33 #include <stddef.h>
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <string.h>
37 #include <stdarg.h>
38 #include <assert.h>
39 #include <errno.h>
40 #include <math.h>
41 #include <fcntl.h>
42 #include <random>
43
44 #include "include/intarith.h"
45 #include "include/krbd.h"
46 #include "include/rados/librados.h"
47 #include "include/rados/librados.hpp"
48 #include "include/rbd/librbd.h"
49 #include "include/rbd/librbd.hpp"
50 #include "common/Cond.h"
51 #include "common/SubProcess.h"
52 #include "common/safe_io.h"
53 #include "journal/Journaler.h"
54 #include "journal/ReplayEntry.h"
55 #include "journal/ReplayHandler.h"
56 #include "journal/Settings.h"
57
58 #include <boost/scope_exit.hpp>
59
60 #define NUMPRINTCOLUMNS 32 /* # columns of data to print on each line */
61
62 /*
63 * A log entry is an operation and a bunch of arguments.
64 */
65
struct log_entry {
	int operation;	/* OP_* code; log4() stores it bit-inverted (~) when closeopen is set */
	int args[3];	/* operation-specific arguments; see logdump() for how each op prints them */
};

#define LOGSIZE 1000	/* number of slots in oplog; log4() wraps logptr back to 0 at this value */

struct log_entry oplog[LOGSIZE]; /* the log */
int logptr = 0; /* current position in log */
int logcount = 0; /* total ops */
76
77 /*
78 * The operation matrix is complex due to conditional execution of different
79 * features. Hence when we come to deciding what operation to run, we need to
80 * be careful in how we select the different operations. The active operations
81 * are mapped to numbers as follows:
82 *
83 * lite !lite
84 * READ: 0 0
85 * WRITE: 1 1
86 * MAPREAD: 2 2
87 * MAPWRITE: 3 3
88 * TRUNCATE: - 4
89 * FALLOCATE: - 5
90 * PUNCH HOLE: - 6
91 * WRITESAME: - 7
92 * COMPAREANDWRITE: - 8
93 *
94 * When mapped read/writes are disabled, they are simply converted to normal
95 * reads and writes. When fallocate/fpunch calls are disabled, they are
96 * converted to OP_SKIPPED. Hence OP_SKIPPED needs to have a number higher than
97 * the operation selection matrix, as does the OP_CLOSEOPEN which is an
98 * operation modifier rather than an operation in itself.
99 *
100 * Because of the "lite" version, we also need to have different "maximum
101 * operation" defines to allow the ops to be selected correctly based on the
102 * mode being run.
103 */
104
105 /* common operations */
106 #define OP_READ 0
107 #define OP_WRITE 1
108 #define OP_MAPREAD 2
109 #define OP_MAPWRITE 3
110 #define OP_MAX_LITE 4
111
112 /* !lite operations */
113 #define OP_TRUNCATE 4
114 #define OP_FALLOCATE 5
115 #define OP_PUNCH_HOLE 6
116 #define OP_WRITESAME 7
117 #define OP_COMPARE_AND_WRITE 8
118 /* rbd-specific operations */
119 #define OP_CLONE 9
120 #define OP_FLATTEN 10
121 #define OP_MAX_FULL 11
122
123 /* operation modifiers */
124 #define OP_CLOSEOPEN 100
125 #define OP_SKIPPED 101
126
127 #undef PAGE_SIZE
128 #define PAGE_SIZE getpagesize()
129 #undef PAGE_MASK
130 #define PAGE_MASK (PAGE_SIZE - 1)
131
132
char *original_buf; /* a pointer to the original data */
char *good_buf; /* a pointer to the correct data */
char *temp_buf; /* a pointer to the current data */

char dirpath[1024];

off_t file_size = 0;
off_t biggest = 0;
unsigned long testcalls = 0; /* calls to function "test" */

unsigned long simulatedopcount = 0; /* -b flag */
int closeprob = 0; /* -c flag */
int debug = 0; /* -d flag */
unsigned long debugstart = 0; /* -D flag */
int flush_enabled = 0; /* -f flag */
int holebdy = 1; /* -h flag */
bool journal_replay = false; /* -j flag */
int keep_on_success = 0; /* -k flag */
int do_fsync = 0; /* -y flag */
unsigned long maxfilelen = 256 * 1024; /* -l flag */
int sizechecks = 1; /* -n flag disables them */
int maxoplen = 64 * 1024; /* -o flag */
int quiet = 0; /* -q flag */
unsigned long progressinterval = 0; /* -p flag */
int readbdy = 1; /* -r flag */
int style = 0; /* -s flag */
int prealloc = 0; /* -x flag */
int truncbdy = 1; /* -t flag */ /* krbd_resize() asserts sizes are a multiple of this */
int writebdy = 1; /* -w flag */
long monitorstart = -1; /* -m flag */
long monitorend = -1; /* -m flag */
int lite = 0; /* -L flag */
long numops = -1; /* -N flag */
int randomoplen = 1; /* -O flag disables it */
int seed = 1; /* -S flag */
int mapped_writes = 0; /* -W flag disables */
int fallocate_calls = 0; /* -F flag disables */
int punch_hole_calls = 1; /* -H flag disables */
int clone_calls = 1; /* -C flag disables */
int randomize_striping = 1; /* -U flag disables */
int randomize_parent_overlap = 1; /* NOTE(review): no flag documented here -- confirm how this is toggled */
int mapped_reads = 0; /* -R flag disables it */
int fsxgoodfd = 0;
int o_direct = 0; /* -Z flag */ /* OR'ed into the open(2) flags in krbd_open()/nbd_open() */

int num_clones = 0;

int page_size;
int page_mask;
int mmap_mask;

FILE * fsxlogf = NULL; /* when non-NULL, prt() mirrors its output to this stream */
int badoff = -1; /* offset that logdump() highlights on every logged op whose range covers it */
int closeopen = 0; /* when non-zero, log4() tags the logged operation as CLOSE/OPEN */
187
/*
 * Write a diagnostic to stderr: the "fsx: " tag, then (only when fmt is
 * non-NULL) the formatted message followed by ": ", then the strerror()
 * text for code and a newline.
 */
void
vwarnc(int code, const char *fmt, va_list ap)
{
	fputs("fsx: ", stderr);
	if (fmt) {
		vfprintf(stderr, fmt, ap);
		fputs(": ", stderr);
	}
	fprintf(stderr, "%s\n", strerror(code));
}
197
198 void
199 warn(const char * fmt, ...) {
200 va_list ap;
201 va_start(ap, fmt);
202 vwarnc(errno, fmt, ap);
203 va_end(ap);
204 }
205
206 #define BUF_SIZE 1024
207
208 void
209 prt(const char *fmt, ...)
210 {
211 va_list args;
212 char buffer[BUF_SIZE];
213
214 va_start(args, fmt);
215 vsnprintf(buffer, BUF_SIZE, fmt, args);
216 va_end(args);
217 fprintf(stdout, "%s", buffer);
218 if (fsxlogf)
219 fprintf(fsxlogf, "%s", buffer);
220 }
221
222 void
223 prterr(const char *prefix)
224 {
225 prt("%s%s%s\n", prefix, prefix ? ": " : "", strerror(errno));
226 }
227
228 void
229 prterrcode(const char *prefix, int code)
230 {
231 prt("%s%s%s\n", prefix, prefix ? ": " : "", strerror(-code));
232 }
233
/*
 * Report "<msg>: <error text>" on stderr only.  err is a negative errno
 * value and is negated before the strerror() lookup.
 */
void
simple_err(const char *msg, int err)
{
	const char *reason = strerror(-err);

	fprintf(stderr, "%s: %s\n", msg, reason);
}
239
240 /*
241 * random
242 */
/* Engine behind get_random(); default-constructed here. */
std::mt19937 random_generator;

/* Return the next value drawn from random_generator. */
uint_fast32_t
get_random(void)
{
	const uint_fast32_t next = random_generator();

	return next;
}
250
251 void replay_imagename(char *buf, size_t len, int clones);
252
253 namespace {
254
255 static const std::string JOURNAL_CLIENT_ID("fsx");
256
257 struct ReplayHandler : public journal::ReplayHandler {
258 journal::Journaler *journaler;
259 journal::Journaler *replay_journaler;
260 Context *on_finish;
261
262 ReplayHandler(journal::Journaler *journaler,
263 journal::Journaler *replay_journaler, Context *on_finish)
264 : journaler(journaler), replay_journaler(replay_journaler),
265 on_finish(on_finish) {
266 }
267
268 void get() override {
269 }
270 void put() override {
271 }
272
273 void handle_entries_available() override {
274 while (true) {
275 journal::ReplayEntry replay_entry;
276 if (!journaler->try_pop_front(&replay_entry)) {
277 return;
278 }
279
280 replay_journaler->append(0, replay_entry.get_data());
281 }
282 }
283
284 void handle_complete(int r) override {
285 on_finish->complete(r);
286 }
287 };
288
289 int get_image_id(librados::IoCtx &io_ctx, const char *image_name,
290 std::string *image_id) {
291 librbd::RBD rbd;
292 librbd::Image image;
293 int r = rbd.open(io_ctx, image, image_name);
294 if (r < 0) {
295 simple_err("failed to open image", r);
296 return r;
297 }
298
299 rbd_image_info_t info;
300 r = image.stat(info, sizeof(info));
301 if (r < 0) {
302 simple_err("failed to stat image", r);
303 return r;
304 }
305
306 *image_id = std::string(&info.block_name_prefix[strlen(RBD_DATA_PREFIX)]);
307 return 0;
308 }
309
310 int register_journal(rados_ioctx_t ioctx, const char *image_name) {
311 librados::IoCtx io_ctx;
312 librados::IoCtx::from_rados_ioctx_t(ioctx, io_ctx);
313
314 std::string image_id;
315 int r = get_image_id(io_ctx, image_name, &image_id);
316 if (r < 0) {
317 return r;
318 }
319
320 journal::Journaler journaler(io_ctx, image_id, JOURNAL_CLIENT_ID, {});
321 r = journaler.register_client(bufferlist());
322 if (r < 0) {
323 simple_err("failed to register journal client", r);
324 return r;
325 }
326 return 0;
327 }
328
329 int unregister_journal(rados_ioctx_t ioctx, const char *image_name) {
330 librados::IoCtx io_ctx;
331 librados::IoCtx::from_rados_ioctx_t(ioctx, io_ctx);
332
333 std::string image_id;
334 int r = get_image_id(io_ctx, image_name, &image_id);
335 if (r < 0) {
336 return r;
337 }
338
339 journal::Journaler journaler(io_ctx, image_id, JOURNAL_CLIENT_ID, {});
340 r = journaler.unregister_client();
341 if (r < 0) {
342 simple_err("failed to unregister journal client", r);
343 return r;
344 }
345 return 0;
346 }
347
348 int create_replay_image(rados_ioctx_t ioctx, int order,
349 uint64_t stripe_unit, int stripe_count,
350 const char *replay_image_name,
351 const char *last_replay_image_name) {
352 librados::IoCtx io_ctx;
353 librados::IoCtx::from_rados_ioctx_t(ioctx, io_ctx);
354
355 int r;
356 librbd::RBD rbd;
357 if (last_replay_image_name == nullptr) {
358 r = rbd.create2(io_ctx, replay_image_name, 0,
359 RBD_FEATURES_ALL, &order);
360 } else {
361 r = rbd.clone2(io_ctx, last_replay_image_name, "snap",
362 io_ctx, replay_image_name, RBD_FEATURES_ALL,
363 &order, stripe_unit, stripe_count);
364 }
365
366 if (r < 0) {
367 simple_err("failed to create replay image", r);
368 return r;
369 }
370
371 return 0;
372 }
373
/*
 * Copy every journal entry of image_name into the journal of
 * replay_image_name, then open the replay image and issue a zero-length
 * write to it.
 *
 * Returns 0 on success or a negative error code; each failure is
 * reported via simple_err() before returning.
 */
int replay_journal(rados_ioctx_t ioctx, const char *image_name,
                   const char *replay_image_name) {
  librados::IoCtx io_ctx;
  librados::IoCtx::from_rados_ioctx_t(ioctx, io_ctx);

  std::string image_id;
  int r = get_image_id(io_ctx, image_name, &image_id);
  if (r < 0) {
    return r;
  }

  std::string replay_image_id;
  r = get_image_id(io_ctx, replay_image_name, &replay_image_id);
  if (r < 0) {
    return r;
  }

  // source journal, opened as the "fsx" client
  journal::Journaler journaler(io_ctx, image_id, JOURNAL_CLIENT_ID, {});
  C_SaferCond init_ctx;
  journaler.init(&init_ctx);
  // shut the journaler down when this function's scope is left
  BOOST_SCOPE_EXIT_ALL( (&journaler) ) {
    journaler.shut_down();
  };

  r = init_ctx.wait();
  if (r < 0) {
    simple_err("failed to initialize journal", r);
    return r;
  }

  // destination journal, opened with an empty client id
  journal::Journaler replay_journaler(io_ctx, replay_image_id, "", {});

  C_SaferCond replay_init_ctx;
  replay_journaler.init(&replay_init_ctx);
  // shut the replay journaler down when this function's scope is left
  BOOST_SCOPE_EXIT_ALL( (&replay_journaler) ) {
    replay_journaler.shut_down();
  };

  r = replay_init_ctx.wait();
  if (r < 0) {
    simple_err("failed to initialize replay journal", r);
    return r;
  }

  replay_journaler.start_append(0, 0, 0);

  C_SaferCond replay_ctx;
  ReplayHandler replay_handler(&journaler, &replay_journaler,
                               &replay_ctx);

  // copy journal events from source image to replay image
  journaler.start_replay(&replay_handler);
  r = replay_ctx.wait();

  journaler.stop_replay();

  // always stop appending, even if the replay itself failed
  C_SaferCond stop_ctx;
  replay_journaler.stop_append(&stop_ctx);
  int stop_r = stop_ctx.wait();
  // a replay error takes precedence over a stop_append error
  if (r == 0 && stop_r < 0) {
    r = stop_r;
  }

  if (r < 0) {
    simple_err("failed to replay journal", r);
    return r;
  }

  librbd::RBD rbd;
  librbd::Image image;
  r = rbd.open(io_ctx, image, replay_image_name);
  if (r < 0) {
    simple_err("failed to open replay image", r);
    return r;
  }

  // perform an IO op to initiate the journal replay
  bufferlist bl;
  r = static_cast<ssize_t>(image.write(0, 0, bl));
  if (r < 0) {
    simple_err("failed to write to replay image", r);
    return r;
  }
  return 0;
}
459
460 int finalize_journal(rados_ioctx_t ioctx, const char *imagename, int clones,
461 int order, uint64_t stripe_unit, int stripe_count) {
462 char replayimagename[1024];
463 replay_imagename(replayimagename, sizeof(replayimagename), clones);
464
465 char lastreplayimagename[1024];
466 if (clones > 0) {
467 replay_imagename(lastreplayimagename,
468 sizeof(lastreplayimagename), clones - 1);
469 }
470
471 int ret = create_replay_image(ioctx, order, stripe_unit,
472 stripe_count, replayimagename,
473 clones > 0 ? lastreplayimagename :
474 nullptr);
475 if (ret < 0) {
476 exit(EXIT_FAILURE);
477 }
478
479 ret = replay_journal(ioctx, imagename, replayimagename);
480 if (ret < 0) {
481 exit(EXIT_FAILURE);
482 }
483 return 0;
484 }
485
486 } // anonymous namespace
487
488 /*
489 * rbd
490 */
491
/*
 * State of one opened test image.  name/image are filled in by
 * __librbd_open(); krbd_name/krbd_fd are additionally filled in by
 * krbd_open() and nbd_open().
 */
struct rbd_ctx {
	const char *name; /* image name */ /* strdup'ed in __librbd_open(), freed in __librbd_close() */
	rbd_image_t image; /* image handle */
	const char *krbd_name; /* image /dev/rbd<id> name */ /* reused for nbd test */
	int krbd_fd; /* image /dev/rbd<id> fd */ /* reused for nbd test */
};

/* The "closed" state that __librbd_open() asserts before opening. */
#define RBD_CTX_INIT (struct rbd_ctx) { NULL, NULL, NULL, -1}
500
/*
 * Table of image operations.  Three instances exist in this file
 * (librbd_operations, krbd_operations, nbd_operations).  The handlers
 * in this file return a negative errno-style value on failure.
 */
struct rbd_operations {
	/* open image `name` and fill in *ctx */
	int (*open)(const char *name, struct rbd_ctx *ctx);
	/* release everything open() acquired and reset *ctx */
	int (*close)(struct rbd_ctx *ctx);
	/* read/write len bytes at byte offset off; returns the byte count */
	ssize_t (*read)(struct rbd_ctx *ctx, uint64_t off, size_t len, char *buf);
	ssize_t (*write)(struct rbd_ctx *ctx, uint64_t off, size_t len, const char *buf);
	int (*flush)(struct rbd_ctx *ctx);
	int (*discard)(struct rbd_ctx *ctx, uint64_t off, uint64_t len);
	/* store the current image size in *size */
	int (*get_size)(struct rbd_ctx *ctx, uint64_t *size);
	int (*resize)(struct rbd_ctx *ctx, uint64_t size);
	/* snapshot the image as src_snapname and clone that snapshot to dst_imagename */
	int (*clone)(struct rbd_ctx *ctx, const char *src_snapname,
		     const char *dst_imagename, int *order, int stripe_unit,
		     int stripe_count);
	int (*flatten)(struct rbd_ctx *ctx);
	/* the two slots below are NULL in krbd_operations and nbd_operations */
	ssize_t (*writesame)(struct rbd_ctx *ctx, uint64_t off, size_t len,
			     const char *buf, size_t data_len);
	ssize_t (*compare_and_write)(struct rbd_ctx *ctx, uint64_t off, size_t len,
				     const char *cmp_buf, const char *buf);
};
519
520 char *pool; /* name of the pool our test image is in */
521 char *iname; /* name of our test image */
522 rados_t cluster; /* handle for our test cluster */
523 rados_ioctx_t ioctx; /* handle for our test pool */
524 struct krbd_ctx *krbd; /* handle for libkrbd */
525 bool skip_partial_discard; /* rbd_skip_partial_discard config value*/
526
527 /*
528 * librbd/krbd rbd_operations handlers. Given the rest of fsx.c, no
529 * attempt to do error handling is made in these handlers.
530 */
531
532 int
533 __librbd_open(const char *name, struct rbd_ctx *ctx)
534 {
535 rbd_image_t image;
536 int ret;
537
538 assert(!ctx->name && !ctx->image &&
539 !ctx->krbd_name && ctx->krbd_fd < 0);
540
541 ret = rbd_open(ioctx, name, &image, NULL);
542 if (ret < 0) {
543 prt("rbd_open(%s) failed\n", name);
544 return ret;
545 }
546
547 ctx->name = strdup(name);
548 ctx->image = image;
549 ctx->krbd_name = NULL;
550 ctx->krbd_fd = -1;
551
552 return 0;
553 }
554
555 int
556 librbd_open(const char *name, struct rbd_ctx *ctx)
557 {
558 return __librbd_open(name, ctx);
559 }
560
561 int
562 __librbd_close(struct rbd_ctx *ctx)
563 {
564 int ret;
565
566 assert(ctx->name && ctx->image);
567
568 ret = rbd_close(ctx->image);
569 if (ret < 0) {
570 prt("rbd_close(%s) failed\n", ctx->name);
571 return ret;
572 }
573
574 free((void *)ctx->name);
575
576 ctx->name = NULL;
577 ctx->image = NULL;
578
579 return 0;
580 }
581
582 int
583 librbd_close(struct rbd_ctx *ctx)
584 {
585 return __librbd_close(ctx);
586 }
587
588 int
589 librbd_verify_object_map(struct rbd_ctx *ctx)
590 {
591 int n;
592 uint64_t flags;
593 n = rbd_get_flags(ctx->image, &flags);
594 if (n < 0) {
595 prt("rbd_get_flags() failed\n");
596 return n;
597 }
598
599 if ((flags & RBD_FLAG_OBJECT_MAP_INVALID) != 0) {
600 prt("rbd_get_flags() indicates object map is invalid\n");
601 return -EINVAL;
602 }
603 return 0;
604 }
605
606 ssize_t
607 librbd_read(struct rbd_ctx *ctx, uint64_t off, size_t len, char *buf)
608 {
609 ssize_t n;
610
611 n = rbd_read(ctx->image, off, len, buf);
612 if (n < 0)
613 prt("rbd_read(%llu, %zu) failed\n", off, len);
614
615 return n;
616 }
617
618 ssize_t
619 librbd_write(struct rbd_ctx *ctx, uint64_t off, size_t len, const char *buf)
620 {
621 ssize_t n;
622 int ret;
623
624 n = rbd_write(ctx->image, off, len, buf);
625 if (n < 0) {
626 prt("rbd_write(%llu, %zu) failed\n", off, len);
627 return n;
628 }
629
630 ret = librbd_verify_object_map(ctx);
631 if (ret < 0) {
632 return ret;
633 }
634 return n;
635 }
636
637 int
638 librbd_flush(struct rbd_ctx *ctx)
639 {
640 int ret;
641
642 ret = rbd_flush(ctx->image);
643 if (ret < 0) {
644 prt("rbd_flush failed\n");
645 return ret;
646 }
647
648 return librbd_verify_object_map(ctx);
649 }
650
651 int
652 librbd_discard(struct rbd_ctx *ctx, uint64_t off, uint64_t len)
653 {
654 int ret;
655
656 ret = rbd_discard(ctx->image, off, len);
657 if (ret < 0) {
658 prt("rbd_discard(%llu, %llu) failed\n", off, len);
659 return ret;
660 }
661
662 return librbd_verify_object_map(ctx);
663 }
664
665 ssize_t
666 librbd_writesame(struct rbd_ctx *ctx, uint64_t off, size_t len,
667 const char *buf, size_t data_len)
668 {
669 ssize_t n;
670 int ret;
671
672 n = rbd_writesame(ctx->image, off, len, buf, data_len, 0);
673 if (n < 0) {
674 prt("rbd_writesame(%llu, %zu) failed\n", off, len);
675 return n;
676 }
677
678 ret = librbd_verify_object_map(ctx);
679 if (ret < 0) {
680 return ret;
681 }
682 return n;
683 }
684
685 ssize_t
686 librbd_compare_and_write(struct rbd_ctx *ctx, uint64_t off, size_t len,
687 const char *cmp_buf, const char *buf)
688 {
689 ssize_t n;
690 int ret;
691 uint64_t mismatch_off = 0;
692
693 n = rbd_compare_and_write(ctx->image, off, len, cmp_buf, buf, &mismatch_off, 0);
694 if (n == -EINVAL) {
695 return n;
696 } else if (n < 0) {
697 prt("rbd_compare_and_write mismatch(%llu, %zu, %llu) failed\n",
698 off, len, mismatch_off);
699 return n;
700 }
701
702 ret = librbd_verify_object_map(ctx);
703 if (ret < 0) {
704 return ret;
705 }
706 return n;
707
708 }
709
710 int
711 librbd_get_size(struct rbd_ctx *ctx, uint64_t *size)
712 {
713 rbd_image_info_t info;
714 int ret;
715
716 ret = rbd_stat(ctx->image, &info, sizeof(info));
717 if (ret < 0) {
718 prt("rbd_stat failed\n");
719 return ret;
720 }
721
722 *size = info.size;
723
724 return 0;
725 }
726
727 int
728 __librbd_resize(struct rbd_ctx *ctx, uint64_t size)
729 {
730 int ret;
731
732 ret = rbd_resize(ctx->image, size);
733 if (ret < 0) {
734 prt("rbd_resize(%llu) failed\n", size);
735 return ret;
736 }
737
738 return librbd_verify_object_map(ctx);
739 }
740
741 int
742 librbd_resize(struct rbd_ctx *ctx, uint64_t size)
743 {
744 return __librbd_resize(ctx, size);
745 }
746
747 int
748 __librbd_clone(struct rbd_ctx *ctx, const char *src_snapname,
749 const char *dst_imagename, int *order, int stripe_unit,
750 int stripe_count, bool krbd)
751 {
752 int ret;
753
754 ret = rbd_snap_create(ctx->image, src_snapname);
755 if (ret < 0) {
756 prt("rbd_snap_create(%s@%s) failed\n", ctx->name,
757 src_snapname);
758 return ret;
759 }
760
761 ret = rbd_snap_protect(ctx->image, src_snapname);
762 if (ret < 0) {
763 prt("rbd_snap_protect(%s@%s) failed\n", ctx->name,
764 src_snapname);
765 return ret;
766 }
767
768 uint64_t features = RBD_FEATURES_ALL;
769 if (krbd) {
770 features &= ~(RBD_FEATURE_OBJECT_MAP |
771 RBD_FEATURE_FAST_DIFF |
772 RBD_FEATURE_DEEP_FLATTEN |
773 RBD_FEATURE_JOURNALING);
774 }
775 ret = rbd_clone2(ioctx, ctx->name, src_snapname, ioctx,
776 dst_imagename, features, order,
777 stripe_unit, stripe_count);
778 if (ret < 0) {
779 prt("rbd_clone2(%s@%s -> %s) failed\n", ctx->name,
780 src_snapname, dst_imagename);
781 return ret;
782 }
783
784 return 0;
785 }
786
787 int
788 librbd_clone(struct rbd_ctx *ctx, const char *src_snapname,
789 const char *dst_imagename, int *order, int stripe_unit,
790 int stripe_count)
791 {
792 return __librbd_clone(ctx, src_snapname, dst_imagename, order,
793 stripe_unit, stripe_count, false);
794 }
795
796 int
797 __librbd_flatten(struct rbd_ctx *ctx)
798 {
799 int ret;
800
801 ret = rbd_flatten(ctx->image);
802 if (ret < 0) {
803 prt("rbd_flatten failed\n");
804 return ret;
805 }
806
807 return librbd_verify_object_map(ctx);
808 }
809
810 int
811 librbd_flatten(struct rbd_ctx *ctx)
812 {
813 return __librbd_flatten(ctx);
814 }
815
/*
 * Operation table that routes every operation through librbd.  The
 * initializers are positional and follow the member order of
 * struct rbd_operations.
 */
const struct rbd_operations librbd_operations = {
	librbd_open,			/* open */
	librbd_close,			/* close */
	librbd_read,			/* read */
	librbd_write,			/* write */
	librbd_flush,			/* flush */
	librbd_discard,			/* discard */
	librbd_get_size,		/* get_size */
	librbd_resize,			/* resize */
	librbd_clone,			/* clone */
	librbd_flatten,			/* flatten */
	librbd_writesame,		/* writesame */
	librbd_compare_and_write,	/* compare_and_write */
};
830
831 int
832 krbd_open(const char *name, struct rbd_ctx *ctx)
833 {
834 char *devnode;
835 int fd;
836 int ret;
837
838 ret = __librbd_open(name, ctx);
839 if (ret < 0)
840 return ret;
841
842 ret = krbd_map(krbd, pool, name, "", "", &devnode);
843 if (ret < 0) {
844 prt("krbd_map(%s) failed\n", name);
845 return ret;
846 }
847
848 fd = open(devnode, O_RDWR | o_direct);
849 if (fd < 0) {
850 ret = -errno;
851 prt("open(%s) failed\n", devnode);
852 return ret;
853 }
854
855 ctx->krbd_name = devnode;
856 ctx->krbd_fd = fd;
857
858 return 0;
859 }
860
861 int
862 krbd_close(struct rbd_ctx *ctx)
863 {
864 int ret;
865
866 assert(ctx->krbd_name && ctx->krbd_fd >= 0);
867
868 if (close(ctx->krbd_fd) < 0) {
869 ret = -errno;
870 prt("close(%s) failed\n", ctx->krbd_name);
871 return ret;
872 }
873
874 ret = krbd_unmap(krbd, ctx->krbd_name, "");
875 if (ret < 0) {
876 prt("krbd_unmap(%s) failed\n", ctx->krbd_name);
877 return ret;
878 }
879
880 free((void *)ctx->krbd_name);
881
882 ctx->krbd_name = NULL;
883 ctx->krbd_fd = -1;
884
885 return __librbd_close(ctx);
886 }
887
888 ssize_t
889 krbd_read(struct rbd_ctx *ctx, uint64_t off, size_t len, char *buf)
890 {
891 ssize_t n;
892
893 n = pread(ctx->krbd_fd, buf, len, off);
894 if (n < 0) {
895 n = -errno;
896 prt("pread(%llu, %zu) failed\n", off, len);
897 return n;
898 }
899
900 return n;
901 }
902
903 ssize_t
904 krbd_write(struct rbd_ctx *ctx, uint64_t off, size_t len, const char *buf)
905 {
906 ssize_t n;
907
908 n = pwrite(ctx->krbd_fd, buf, len, off);
909 if (n < 0) {
910 n = -errno;
911 prt("pwrite(%llu, %zu) failed\n", off, len);
912 return n;
913 }
914
915 return n;
916 }
917
918 int
919 __krbd_flush(struct rbd_ctx *ctx, bool invalidate)
920 {
921 int ret;
922
923 if (o_direct)
924 return 0;
925
926 /*
927 * BLKFLSBUF will sync the filesystem on top of the device (we
928 * don't care about that here, since we write directly to it),
929 * write out any dirty buffers and invalidate the buffer cache.
930 * It won't do a hardware cache flush.
931 *
932 * fsync() will write out any dirty buffers and do a hardware
933 * cache flush (which we don't care about either, because for
934 * krbd it's a noop). It won't try to empty the buffer cache
935 * nor poke the filesystem before writing out.
936 *
937 * Given that, for our purposes, fsync is a flush, while
938 * BLKFLSBUF is a flush+invalidate.
939 */
940 if (invalidate)
941 ret = ioctl(ctx->krbd_fd, BLKFLSBUF, NULL);
942 else
943 ret = fsync(ctx->krbd_fd);
944 if (ret < 0) {
945 ret = -errno;
946 prt("%s failed\n", invalidate ? "BLKFLSBUF" : "fsync");
947 return ret;
948 }
949
950 return 0;
951 }
952
953 int
954 krbd_flush(struct rbd_ctx *ctx)
955 {
956 return __krbd_flush(ctx, false);
957 }
958
959 int
960 krbd_discard(struct rbd_ctx *ctx, uint64_t off, uint64_t len)
961 {
962 uint64_t range[2] = { off, len };
963 int ret;
964
965 /*
966 * BLKZEROOUT goes straight to disk and doesn't do anything
967 * about dirty buffers. This means we need to flush so that
968 *
969 * write 0..3M
970 * discard 1..2M
971 *
972 * results in "data 0000 data" rather than "data data data" on
973 * disk and invalidate so that
974 *
975 * discard 1..2M
976 * read 0..3M
977 *
978 * returns "data 0000 data" rather than "data data data" in
979 * case 1..2M was cached.
980 *
981 * Note: These cache coherency issues are supposed to be fixed
982 * in recent kernels.
983 */
984 ret = __krbd_flush(ctx, true);
985 if (ret < 0)
986 return ret;
987
988 /*
989 * off and len must be 512-byte aligned, otherwise BLKZEROOUT
990 * will fail with -EINVAL. This means that -K (enable krbd
991 * mode) requires -h 512 or similar.
992 */
993 if (ioctl(ctx->krbd_fd, BLKZEROOUT, &range) < 0) {
994 ret = -errno;
995 prt("BLKZEROOUT(%llu, %llu) failed\n", off, len);
996 return ret;
997 }
998
999 return 0;
1000 }
1001
1002 int
1003 krbd_get_size(struct rbd_ctx *ctx, uint64_t *size)
1004 {
1005 uint64_t bytes;
1006
1007 if (ioctl(ctx->krbd_fd, BLKGETSIZE64, &bytes) < 0) {
1008 int ret = -errno;
1009 prt("BLKGETSIZE64 failed\n");
1010 return ret;
1011 }
1012
1013 *size = bytes;
1014
1015 return 0;
1016 }
1017
1018 int
1019 krbd_resize(struct rbd_ctx *ctx, uint64_t size)
1020 {
1021 int ret;
1022
1023 assert(size % truncbdy == 0);
1024
1025 /*
1026 * When krbd detects a size change, it calls revalidate_disk(),
1027 * which ends up calling invalidate_bdev(), which invalidates
1028 * clean pages and does nothing about dirty pages beyond the
1029 * new size. The preceding cache flush makes sure those pages
1030 * are invalidated, which is what we need on shrink so that
1031 *
1032 * write 0..1M
1033 * resize 0
1034 * resize 2M
1035 * read 0..2M
1036 *
1037 * returns "0000 0000" rather than "data 0000".
1038 */
1039 ret = __krbd_flush(ctx, false);
1040 if (ret < 0)
1041 return ret;
1042
1043 return __librbd_resize(ctx, size);
1044 }
1045
1046 int
1047 krbd_clone(struct rbd_ctx *ctx, const char *src_snapname,
1048 const char *dst_imagename, int *order, int stripe_unit,
1049 int stripe_count)
1050 {
1051 int ret;
1052
1053 ret = __krbd_flush(ctx, false);
1054 if (ret < 0)
1055 return ret;
1056
1057 return __librbd_clone(ctx, src_snapname, dst_imagename, order,
1058 stripe_unit, stripe_count, true);
1059 }
1060
1061 int
1062 krbd_flatten(struct rbd_ctx *ctx)
1063 {
1064 int ret;
1065
1066 ret = __krbd_flush(ctx, false);
1067 if (ret < 0)
1068 return ret;
1069
1070 return __librbd_flatten(ctx);
1071 }
1072
/*
 * Operation table for images mapped through krbd.  The initializers
 * are positional and follow the member order of struct rbd_operations.
 */
const struct rbd_operations krbd_operations = {
	krbd_open,	/* open */
	krbd_close,	/* close */
	krbd_read,	/* read */
	krbd_write,	/* write */
	krbd_flush,	/* flush */
	krbd_discard,	/* discard */
	krbd_get_size,	/* get_size */
	krbd_resize,	/* resize */
	krbd_clone,	/* clone */
	krbd_flatten,	/* flatten */
	NULL,		/* writesame: no handler; the trailing compare_and_write
			 * member has no initializer and is therefore null too */
};
1086
1087 int
1088 nbd_open(const char *name, struct rbd_ctx *ctx)
1089 {
1090 int r;
1091 int fd;
1092 char dev[4096];
1093 char *devnode;
1094
1095 SubProcess process("rbd-nbd", SubProcess::KEEP, SubProcess::PIPE,
1096 SubProcess::KEEP);
1097 process.add_cmd_arg("map");
1098 std::string img;
1099 img.append(pool);
1100 img.append("/");
1101 img.append(name);
1102 process.add_cmd_arg(img.c_str());
1103
1104 r = __librbd_open(name, ctx);
1105 if (r < 0)
1106 return r;
1107
1108 r = process.spawn();
1109 if (r < 0) {
1110 prt("nbd_open failed to run rbd-nbd error: %s\n", process.err().c_str());
1111 return r;
1112 }
1113 r = safe_read(process.get_stdout(), dev, sizeof(dev));
1114 if (r < 0) {
1115 prt("nbd_open failed to get nbd device path\n");
1116 return r;
1117 }
1118 for (int i = 0; i < r; ++i)
1119 if (dev[i] == 10 || dev[i] == 13)
1120 dev[i] = 0;
1121 dev[r] = 0;
1122 r = process.join();
1123 if (r) {
1124 prt("rbd-nbd failed with error: %s", process.err().c_str());
1125 return -EINVAL;
1126 }
1127
1128 devnode = strdup(dev);
1129 if (!devnode)
1130 return -ENOMEM;
1131
1132 fd = open(devnode, O_RDWR | o_direct);
1133 if (fd < 0) {
1134 r = -errno;
1135 prt("open(%s) failed\n", devnode);
1136 return r;
1137 }
1138
1139 ctx->krbd_name = devnode;
1140 ctx->krbd_fd = fd;
1141
1142 return 0;
1143 }
1144
1145 int
1146 nbd_close(struct rbd_ctx *ctx)
1147 {
1148 int r;
1149
1150 assert(ctx->krbd_name && ctx->krbd_fd >= 0);
1151
1152 if (close(ctx->krbd_fd) < 0) {
1153 r = -errno;
1154 prt("close(%s) failed\n", ctx->krbd_name);
1155 return r;
1156 }
1157
1158 SubProcess process("rbd-nbd");
1159 process.add_cmd_arg("unmap");
1160 process.add_cmd_arg(ctx->krbd_name);
1161
1162 r = process.spawn();
1163 if (r < 0) {
1164 prt("nbd_close failed to run rbd-nbd error: %s\n", process.err().c_str());
1165 return r;
1166 }
1167 r = process.join();
1168 if (r) {
1169 prt("rbd-nbd failed with error: %d", process.err().c_str());
1170 return -EINVAL;
1171 }
1172
1173 free((void *)ctx->krbd_name);
1174
1175 ctx->krbd_name = NULL;
1176 ctx->krbd_fd = -1;
1177
1178 return __librbd_close(ctx);
1179 }
1180
1181 int
1182 nbd_clone(struct rbd_ctx *ctx, const char *src_snapname,
1183 const char *dst_imagename, int *order, int stripe_unit,
1184 int stripe_count)
1185 {
1186 int ret;
1187
1188 ret = __krbd_flush(ctx, false);
1189 if (ret < 0)
1190 return ret;
1191
1192 return __librbd_clone(ctx, src_snapname, dst_imagename, order,
1193 stripe_unit, stripe_count, false);
1194 }
1195
/*
 * Operation table for images mapped through rbd-nbd.  Only open, close
 * and clone are nbd-specific; the remaining handlers are shared with
 * krbd_operations.  The initializers are positional and follow the
 * member order of struct rbd_operations.
 */
const struct rbd_operations nbd_operations = {
	nbd_open,	/* open */
	nbd_close,	/* close */
	krbd_read,	/* read */
	krbd_write,	/* write */
	krbd_flush,	/* flush */
	krbd_discard,	/* discard */
	krbd_get_size,	/* get_size */
	krbd_resize,	/* resize */
	nbd_clone,	/* clone */
	krbd_flatten,	/* flatten */
	NULL,		/* writesame: no handler; the trailing compare_and_write
			 * member has no initializer and is therefore null too */
};
1209
struct rbd_ctx ctx = RBD_CTX_INIT;	/* the test image's context, initially in the closed state */
const struct rbd_operations *ops = &librbd_operations;	/* selected operation table; initialized to the librbd one */
1212
1213 static bool rbd_image_has_parent(struct rbd_ctx *ctx)
1214 {
1215 int ret;
1216
1217 ret = rbd_get_parent_info(ctx->image, NULL, 0, NULL, 0, NULL, 0);
1218 if (ret < 0 && ret != -ENOENT) {
1219 prterrcode("rbd_get_parent_info", ret);
1220 exit(1);
1221 }
1222
1223 return !ret;
1224 }
1225
1226 /*
1227 * fsx
1228 */
1229
1230 void
1231 log4(int operation, int arg0, int arg1, int arg2)
1232 {
1233 struct log_entry *le;
1234
1235 le = &oplog[logptr];
1236 le->operation = operation;
1237 if (closeopen)
1238 le->operation = ~ le->operation;
1239 le->args[0] = arg0;
1240 le->args[1] = arg1;
1241 le->args[2] = arg2;
1242 logptr++;
1243 logcount++;
1244 if (logptr >= LOGSIZE)
1245 logptr = 0;
1246 }
1247
1248 void
1249 logdump(void)
1250 {
1251 int i, count, down;
1252 struct log_entry *lp;
1253 const char *falloc_type[3] = {"PAST_EOF", "EXTENDING", "INTERIOR"};
1254
1255 prt("LOG DUMP (%d total operations):\n", logcount);
1256 if (logcount < LOGSIZE) {
1257 i = 0;
1258 count = logcount;
1259 } else {
1260 i = logptr;
1261 count = LOGSIZE;
1262 }
1263 for ( ; count > 0; count--) {
1264 int opnum;
1265
1266 opnum = i+1 + (logcount/LOGSIZE)*LOGSIZE;
1267 prt("%d(%3d mod 256): ", opnum, opnum%256);
1268 lp = &oplog[i];
1269 if ((closeopen = lp->operation < 0))
1270 lp->operation = ~ lp->operation;
1271
1272 switch (lp->operation) {
1273 case OP_MAPREAD:
1274 prt("MAPREAD 0x%x thru 0x%x\t(0x%x bytes)",
1275 lp->args[0], lp->args[0] + lp->args[1] - 1,
1276 lp->args[1]);
1277 if (badoff >= lp->args[0] && badoff <
1278 lp->args[0] + lp->args[1])
1279 prt("\t***RRRR***");
1280 break;
1281 case OP_MAPWRITE:
1282 prt("MAPWRITE 0x%x thru 0x%x\t(0x%x bytes)",
1283 lp->args[0], lp->args[0] + lp->args[1] - 1,
1284 lp->args[1]);
1285 if (badoff >= lp->args[0] && badoff <
1286 lp->args[0] + lp->args[1])
1287 prt("\t******WWWW");
1288 break;
1289 case OP_READ:
1290 prt("READ 0x%x thru 0x%x\t(0x%x bytes)",
1291 lp->args[0], lp->args[0] + lp->args[1] - 1,
1292 lp->args[1]);
1293 if (badoff >= lp->args[0] &&
1294 badoff < lp->args[0] + lp->args[1])
1295 prt("\t***RRRR***");
1296 break;
1297 case OP_WRITE:
1298 prt("WRITE 0x%x thru 0x%x\t(0x%x bytes)",
1299 lp->args[0], lp->args[0] + lp->args[1] - 1,
1300 lp->args[1]);
1301 if (lp->args[0] > lp->args[2])
1302 prt(" HOLE");
1303 else if (lp->args[0] + lp->args[1] > lp->args[2])
1304 prt(" EXTEND");
1305 if ((badoff >= lp->args[0] || badoff >=lp->args[2]) &&
1306 badoff < lp->args[0] + lp->args[1])
1307 prt("\t***WWWW");
1308 break;
1309 case OP_TRUNCATE:
1310 down = lp->args[0] < lp->args[1];
1311 prt("TRUNCATE %s\tfrom 0x%x to 0x%x",
1312 down ? "DOWN" : "UP", lp->args[1], lp->args[0]);
1313 if (badoff >= lp->args[!down] &&
1314 badoff < lp->args[!!down])
1315 prt("\t******WWWW");
1316 break;
1317 case OP_FALLOCATE:
1318 /* 0: offset 1: length 2: where alloced */
1319 prt("FALLOC 0x%x thru 0x%x\t(0x%x bytes) %s",
1320 lp->args[0], lp->args[0] + lp->args[1],
1321 lp->args[1], falloc_type[lp->args[2]]);
1322 if (badoff >= lp->args[0] &&
1323 badoff < lp->args[0] + lp->args[1])
1324 prt("\t******FFFF");
1325 break;
1326 case OP_PUNCH_HOLE:
1327 prt("PUNCH 0x%x thru 0x%x\t(0x%x bytes)",
1328 lp->args[0], lp->args[0] + lp->args[1] - 1,
1329 lp->args[1]);
1330 if (badoff >= lp->args[0] && badoff <
1331 lp->args[0] + lp->args[1])
1332 prt("\t******PPPP");
1333 break;
1334 case OP_WRITESAME:
1335 prt("WRITESAME 0x%x thru 0x%x\t(0x%x bytes) data_size 0x%x",
1336 lp->args[0], lp->args[0] + lp->args[1] - 1,
1337 lp->args[1], lp->args[2]);
1338 if (badoff >= lp->args[0] &&
1339 badoff < lp->args[0] + lp->args[1])
1340 prt("\t***WSWSWSWS");
1341 break;
1342 case OP_COMPARE_AND_WRITE:
1343 prt("COMPARE_AND_WRITE 0x%x thru 0x%x\t(0x%x bytes)",
1344 lp->args[0], lp->args[0] + lp->args[1] - 1,
1345 lp->args[1]);
1346 if (lp->args[0] > lp->args[2])
1347 prt(" HOLE");
1348 else if (lp->args[0] + lp->args[1] > lp->args[2])
1349 prt(" EXTEND");
1350 if ((badoff >= lp->args[0] || badoff >=lp->args[2]) &&
1351 badoff < lp->args[0] + lp->args[1])
1352 prt("\t***WWWW");
1353 break;
1354 case OP_CLONE:
1355 prt("CLONE");
1356 break;
1357 case OP_FLATTEN:
1358 prt("FLATTEN");
1359 break;
1360 case OP_SKIPPED:
1361 prt("SKIPPED (no operation)");
1362 break;
1363 default:
1364 prt("BOGUS LOG ENTRY (operation code = %d)!",
1365 lp->operation);
1366 }
1367 if (closeopen)
1368 prt("\n\t\tCLOSE/OPEN");
1369 prt("\n");
1370 i++;
1371 if (i == LOGSIZE)
1372 i = 0;
1373 }
1374 }
1375
1376 void
1377 save_buffer(char *buffer, off_t bufferlength, int fd)
1378 {
1379 off_t ret;
1380 ssize_t byteswritten;
1381
1382 if (fd <= 0 || bufferlength == 0)
1383 return;
1384
1385 if (bufferlength > SSIZE_MAX) {
1386 prt("fsx flaw: overflow in save_buffer\n");
1387 exit(67);
1388 }
1389
1390 ret = lseek(fd, (off_t)0, SEEK_SET);
1391 if (ret == (off_t)-1)
1392 prterr("save_buffer: lseek 0");
1393
1394 byteswritten = write(fd, buffer, (size_t)bufferlength);
1395 if (byteswritten != bufferlength) {
1396 if (byteswritten == -1)
1397 prterr("save_buffer write");
1398 else
1399 warn("save_buffer: short write, 0x%x bytes instead of 0x%llx\n",
1400 (unsigned)byteswritten,
1401 (unsigned long long)bufferlength);
1402 }
1403 }
1404
1405
1406 void
1407 report_failure(int status)
1408 {
1409 logdump();
1410
1411 if (fsxgoodfd) {
1412 if (good_buf) {
1413 save_buffer(good_buf, file_size, fsxgoodfd);
1414 prt("Correct content saved for comparison\n");
1415 prt("(maybe hexdump \"%s\" vs \"%s.fsxgood\")\n",
1416 iname, iname);
1417 }
1418 close(fsxgoodfd);
1419 }
1420 sleep(3); // so the log can flush to disk. KLUDGEY!
1421 exit(status);
1422 }
1423
/*
 * Read the two bytes at cp as a big-endian 16-bit value: the byte at cp is
 * the high half, the byte after it the low half.
 */
#define short_at(cp)							\
	((unsigned short)((((unsigned char *)(cp))[0] << 8) |		\
			  ((unsigned char *)(cp))[1]))
1426
1427 int
1428 fsxcmp(char *good_buf, char *temp_buf, unsigned size)
1429 {
1430 if (!skip_partial_discard) {
1431 return memcmp(good_buf, temp_buf, size);
1432 }
1433
1434 for (unsigned i = 0; i < size; i++) {
1435 if (good_buf[i] != temp_buf[i] && good_buf[i] != 0) {
1436 return good_buf[i] - temp_buf[i];
1437 }
1438 }
1439 return 0;
1440 }
1441
1442 void
1443 check_buffers(char *good_buf, char *temp_buf, unsigned offset, unsigned size)
1444 {
1445 if (fsxcmp(good_buf + offset, temp_buf, size) != 0) {
1446 unsigned i = 0;
1447 unsigned n = 0;
1448
1449 prt("READ BAD DATA: offset = 0x%x, size = 0x%x, fname = %s\n",
1450 offset, size, iname);
1451 prt("OFFSET\tGOOD\tBAD\tRANGE\n");
1452 while (size > 0) {
1453 unsigned char c = good_buf[offset];
1454 unsigned char t = temp_buf[i];
1455 if (c != t) {
1456 if (n < 16) {
1457 unsigned bad = short_at(&temp_buf[i]);
1458 prt("0x%5x\t0x%04x\t0x%04x", offset,
1459 short_at(&good_buf[offset]), bad);
1460 unsigned op = temp_buf[(offset & 1) ? i+1 : i];
1461 prt("\t0x%5x\n", n);
1462 if (op)
1463 prt("operation# (mod 256) for "
1464 "the bad data may be %u\n",
1465 ((unsigned)op & 0xff));
1466 else
1467 prt("operation# (mod 256) for "
1468 "the bad data unknown, check"
1469 " HOLE and EXTEND ops\n");
1470 }
1471 n++;
1472 badoff = offset;
1473 }
1474 offset++;
1475 i++;
1476 size--;
1477 }
1478 report_failure(110);
1479 }
1480 }
1481
1482
1483 void
1484 check_size(void)
1485 {
1486 uint64_t size;
1487 int ret;
1488
1489 ret = ops->get_size(&ctx, &size);
1490 if (ret < 0)
1491 prterrcode("check_size: ops->get_size", ret);
1492
1493 if ((uint64_t)file_size != size) {
1494 prt("Size error: expected 0x%llx stat 0x%llx\n",
1495 (unsigned long long)file_size,
1496 (unsigned long long)size);
1497 report_failure(120);
1498 }
1499 }
1500
1501 #define TRUNC_HACK_SIZE (200ULL << 9) /* 512-byte aligned for krbd */
1502
1503 void
1504 check_trunc_hack(void)
1505 {
1506 uint64_t size;
1507 int ret;
1508
1509 ret = ops->resize(&ctx, 0ULL);
1510 if (ret < 0)
1511 prterrcode("check_trunc_hack: ops->resize pre", ret);
1512
1513 ret = ops->resize(&ctx, TRUNC_HACK_SIZE);
1514 if (ret < 0)
1515 prterrcode("check_trunc_hack: ops->resize actual", ret);
1516
1517 ret = ops->get_size(&ctx, &size);
1518 if (ret < 0)
1519 prterrcode("check_trunc_hack: ops->get_size", ret);
1520
1521 if (size != TRUNC_HACK_SIZE) {
1522 prt("no extend on truncate! not posix!\n");
1523 exit(130);
1524 }
1525
1526 ret = ops->resize(&ctx, 0ULL);
1527 if (ret < 0)
1528 prterrcode("check_trunc_hack: ops->resize post", ret);
1529 }
1530
1531 int
1532 create_image()
1533 {
1534 int r;
1535 int order = 0;
1536 char buf[32];
1537
1538 r = rados_create(&cluster, NULL);
1539 if (r < 0) {
1540 simple_err("Could not create cluster handle", r);
1541 return r;
1542 }
1543 rados_conf_parse_env(cluster, NULL);
1544 r = rados_conf_read_file(cluster, NULL);
1545 if (r < 0) {
1546 simple_err("Error reading ceph config file", r);
1547 goto failed_shutdown;
1548 }
1549 r = rados_connect(cluster);
1550 if (r < 0) {
1551 simple_err("Error connecting to cluster", r);
1552 goto failed_shutdown;
1553 }
1554 r = krbd_create_from_context(rados_cct(cluster), &krbd);
1555 if (r < 0) {
1556 simple_err("Could not create libkrbd handle", r);
1557 goto failed_shutdown;
1558 }
1559
1560 r = rados_pool_create(cluster, pool);
1561 if (r < 0 && r != -EEXIST) {
1562 simple_err("Error creating pool", r);
1563 goto failed_krbd;
1564 }
1565 r = rados_ioctx_create(cluster, pool, &ioctx);
1566 if (r < 0) {
1567 simple_err("Error creating ioctx", r);
1568 goto failed_krbd;
1569 }
1570 rados_application_enable(ioctx, "rbd", 1);
1571
1572 if (clone_calls || journal_replay) {
1573 uint64_t features = 0;
1574 if (clone_calls) {
1575 features |= RBD_FEATURE_LAYERING;
1576 }
1577 if (journal_replay) {
1578 features |= (RBD_FEATURE_EXCLUSIVE_LOCK |
1579 RBD_FEATURE_JOURNALING);
1580 }
1581 r = rbd_create2(ioctx, iname, 0, features, &order);
1582 } else {
1583 r = rbd_create(ioctx, iname, 0, &order);
1584 }
1585 if (r < 0) {
1586 simple_err("Error creating image", r);
1587 goto failed_open;
1588 }
1589
1590 if (journal_replay) {
1591 r = register_journal(ioctx, iname);
1592 if (r < 0) {
1593 goto failed_open;
1594 }
1595 }
1596
1597 r = rados_conf_get(cluster, "rbd_skip_partial_discard", buf,
1598 sizeof(buf));
1599 if (r < 0) {
1600 simple_err("Could not get rbd_skip_partial_discard value", r);
1601 goto failed_open;
1602 }
1603 skip_partial_discard = (strcmp(buf, "true") == 0);
1604
1605 return 0;
1606
1607 failed_open:
1608 rados_ioctx_destroy(ioctx);
1609 failed_krbd:
1610 krbd_destroy(krbd);
1611 failed_shutdown:
1612 rados_shutdown(cluster);
1613 return r;
1614 }
1615
1616 void
1617 doflush(unsigned offset, unsigned size)
1618 {
1619 int ret;
1620
1621 if (o_direct)
1622 return;
1623
1624 ret = ops->flush(&ctx);
1625 if (ret < 0)
1626 prterrcode("doflush: ops->flush", ret);
1627 }
1628
1629 void
1630 doread(unsigned offset, unsigned size)
1631 {
1632 int ret;
1633
1634 offset -= offset % readbdy;
1635 if (o_direct)
1636 size -= size % readbdy;
1637 if (size == 0) {
1638 if (!quiet && testcalls > simulatedopcount && !o_direct)
1639 prt("skipping zero size read\n");
1640 log4(OP_SKIPPED, OP_READ, offset, size);
1641 return;
1642 }
1643 if (size + offset > file_size) {
1644 if (!quiet && testcalls > simulatedopcount)
1645 prt("skipping seek/read past end of file\n");
1646 log4(OP_SKIPPED, OP_READ, offset, size);
1647 return;
1648 }
1649
1650 log4(OP_READ, offset, size, 0);
1651
1652 if (testcalls <= simulatedopcount)
1653 return;
1654
1655 if (!quiet &&
1656 ((progressinterval && testcalls % progressinterval == 0) ||
1657 (debug &&
1658 (monitorstart == -1 ||
1659 (static_cast<long>(offset + size) > monitorstart &&
1660 (monitorend == -1 ||
1661 static_cast<long>(offset) <= monitorend))))))
1662 prt("%lu read\t0x%x thru\t0x%x\t(0x%x bytes)\n", testcalls,
1663 offset, offset + size - 1, size);
1664
1665 ret = ops->read(&ctx, offset, size, temp_buf);
1666 if (ret != (int)size) {
1667 if (ret < 0)
1668 prterrcode("doread: ops->read", ret);
1669 else
1670 prt("short read: 0x%x bytes instead of 0x%x\n",
1671 ret, size);
1672 report_failure(141);
1673 }
1674
1675 check_buffers(good_buf, temp_buf, offset, size);
1676 }
1677
1678
1679 void
1680 check_eofpage(char *s, unsigned offset, char *p, int size)
1681 {
1682 unsigned long last_page, should_be_zero;
1683
1684 if (offset + size <= (file_size & ~page_mask))
1685 return;
1686 /*
1687 * we landed in the last page of the file
1688 * test to make sure the VM system provided 0's
1689 * beyond the true end of the file mapping
1690 * (as required by mmap def in 1996 posix 1003.1)
1691 */
1692 last_page = ((unsigned long)p + (offset & page_mask) + size) & ~page_mask;
1693
1694 for (should_be_zero = last_page + (file_size & page_mask);
1695 should_be_zero < last_page + page_size;
1696 should_be_zero++)
1697 if (*(char *)should_be_zero) {
1698 prt("Mapped %s: non-zero data past EOF (0x%llx) page offset 0x%x is 0x%04x\n",
1699 s, file_size - 1, should_be_zero & page_mask,
1700 short_at(should_be_zero));
1701 report_failure(205);
1702 }
1703 }
1704
1705
1706 void
1707 gendata(char *original_buf, char *good_buf, unsigned offset, unsigned size)
1708 {
1709 while (size--) {
1710 good_buf[offset] = testcalls % 256;
1711 if (offset % 2)
1712 good_buf[offset] += original_buf[offset];
1713 offset++;
1714 }
1715 }
1716
1717
1718 void
1719 dowrite(unsigned offset, unsigned size)
1720 {
1721 ssize_t ret;
1722 off_t newsize;
1723
1724 offset -= offset % writebdy;
1725 if (o_direct)
1726 size -= size % writebdy;
1727 if (size == 0) {
1728 if (!quiet && testcalls > simulatedopcount && !o_direct)
1729 prt("skipping zero size write\n");
1730 log4(OP_SKIPPED, OP_WRITE, offset, size);
1731 return;
1732 }
1733
1734 log4(OP_WRITE, offset, size, file_size);
1735
1736 gendata(original_buf, good_buf, offset, size);
1737 if (file_size < offset + size) {
1738 newsize = ceil(((double)offset + size) / truncbdy) * truncbdy;
1739 if (file_size < newsize)
1740 memset(good_buf + file_size, '\0', newsize - file_size);
1741 file_size = newsize;
1742 if (lite) {
1743 warn("Lite file size bug in fsx!");
1744 report_failure(149);
1745 }
1746 ret = ops->resize(&ctx, newsize);
1747 if (ret < 0) {
1748 prterrcode("dowrite: ops->resize", ret);
1749 report_failure(150);
1750 }
1751 }
1752
1753 if (testcalls <= simulatedopcount)
1754 return;
1755
1756 if (!quiet &&
1757 ((progressinterval && testcalls % progressinterval == 0) ||
1758 (debug &&
1759 (monitorstart == -1 ||
1760 (static_cast<long>(offset + size) > monitorstart &&
1761 (monitorend == -1 ||
1762 static_cast<long>(offset) <= monitorend))))))
1763 prt("%lu write\t0x%x thru\t0x%x\t(0x%x bytes)\n", testcalls,
1764 offset, offset + size - 1, size);
1765
1766 ret = ops->write(&ctx, offset, size, good_buf + offset);
1767 if (ret != (ssize_t)size) {
1768 if (ret < 0)
1769 prterrcode("dowrite: ops->write", ret);
1770 else
1771 prt("short write: 0x%x bytes instead of 0x%x\n",
1772 ret, size);
1773 report_failure(151);
1774 }
1775
1776 if (flush_enabled)
1777 doflush(offset, size);
1778 }
1779
1780
1781 void
1782 dotruncate(unsigned size)
1783 {
1784 int oldsize = file_size;
1785 int ret;
1786
1787 size -= size % truncbdy;
1788 if (size > biggest) {
1789 biggest = size;
1790 if (!quiet && testcalls > simulatedopcount)
1791 prt("truncating to largest ever: 0x%x\n", size);
1792 }
1793
1794 log4(OP_TRUNCATE, size, (unsigned)file_size, 0);
1795
1796 if (size > file_size)
1797 memset(good_buf + file_size, '\0', size - file_size);
1798 else if (size < file_size)
1799 memset(good_buf + size, '\0', file_size - size);
1800 file_size = size;
1801
1802 if (testcalls <= simulatedopcount)
1803 return;
1804
1805 if ((progressinterval && testcalls % progressinterval == 0) ||
1806 (debug && (monitorstart == -1 || monitorend == -1 ||
1807 (long)size <= monitorend)))
1808 prt("%lu trunc\tfrom 0x%x to 0x%x\n", testcalls, oldsize, size);
1809
1810 ret = ops->resize(&ctx, size);
1811 if (ret < 0) {
1812 prterrcode("dotruncate: ops->resize", ret);
1813 report_failure(160);
1814 }
1815 }
1816
1817 void
1818 do_punch_hole(unsigned offset, unsigned length)
1819 {
1820 unsigned end_offset;
1821 int max_offset = 0;
1822 int max_len = 0;
1823 int ret;
1824
1825 offset -= offset % holebdy;
1826 length -= length % holebdy;
1827 if (length == 0) {
1828 if (!quiet && testcalls > simulatedopcount)
1829 prt("skipping zero length punch hole\n");
1830 log4(OP_SKIPPED, OP_PUNCH_HOLE, offset, length);
1831 return;
1832 }
1833
1834 if (file_size <= (loff_t)offset) {
1835 if (!quiet && testcalls > simulatedopcount)
1836 prt("skipping hole punch off the end of the file\n");
1837 log4(OP_SKIPPED, OP_PUNCH_HOLE, offset, length);
1838 return;
1839 }
1840
1841 end_offset = offset + length;
1842
1843 log4(OP_PUNCH_HOLE, offset, length, 0);
1844
1845 if (testcalls <= simulatedopcount)
1846 return;
1847
1848 if ((progressinterval && testcalls % progressinterval == 0) ||
1849 (debug && (monitorstart == -1 || monitorend == -1 ||
1850 (long)end_offset <= monitorend))) {
1851 prt("%lu punch\tfrom 0x%x to 0x%x, (0x%x bytes)\n", testcalls,
1852 offset, offset+length, length);
1853 }
1854
1855 ret = ops->discard(&ctx, (unsigned long long)offset,
1856 (unsigned long long)length);
1857 if (ret < 0) {
1858 prterrcode("do_punch_hole: ops->discard", ret);
1859 report_failure(161);
1860 }
1861
1862 max_offset = offset < file_size ? offset : file_size;
1863 max_len = max_offset + length <= file_size ? length :
1864 file_size - max_offset;
1865 memset(good_buf + max_offset, '\0', max_len);
1866 }
1867
1868 unsigned get_data_size(unsigned size)
1869 {
1870 unsigned i;
1871 unsigned hint;
1872 unsigned max = sqrt((double)size) + 1;
1873 unsigned good = 1;
1874 unsigned curr = good;
1875
1876 hint = get_random() % max;
1877
1878 for (i = 1; i < max && curr < hint; i++) {
1879 if (size % i == 0) {
1880 good = curr;
1881 curr = i;
1882 }
1883 }
1884
1885 if (curr == hint)
1886 good = curr;
1887
1888 return good;
1889 }
1890
1891 void
1892 dowritesame(unsigned offset, unsigned size)
1893 {
1894 ssize_t ret;
1895 off_t newsize;
1896 unsigned buf_off;
1897 unsigned data_size;
1898 int n;
1899
1900 offset -= offset % writebdy;
1901 if (o_direct)
1902 size -= size % writebdy;
1903 if (size == 0) {
1904 if (!quiet && testcalls > simulatedopcount && !o_direct)
1905 prt("skipping zero size writesame\n");
1906 log4(OP_SKIPPED, OP_WRITESAME, offset, size);
1907 return;
1908 }
1909
1910 data_size = get_data_size(size);
1911
1912 log4(OP_WRITESAME, offset, size, data_size);
1913
1914 gendata(original_buf, good_buf, offset, data_size);
1915 if (file_size < offset + size) {
1916 newsize = ceil(((double)offset + size) / truncbdy) * truncbdy;
1917 if (file_size < newsize)
1918 memset(good_buf + file_size, '\0', newsize - file_size);
1919 file_size = newsize;
1920 if (lite) {
1921 warn("Lite file size bug in fsx!");
1922 report_failure(162);
1923 }
1924 ret = ops->resize(&ctx, newsize);
1925 if (ret < 0) {
1926 prterrcode("dowritesame: ops->resize", ret);
1927 report_failure(163);
1928 }
1929 }
1930
1931 for (n = size / data_size, buf_off = data_size; n > 1; n--) {
1932 memcpy(good_buf + offset + buf_off, good_buf + offset, data_size);
1933 buf_off += data_size;
1934 }
1935
1936 if (testcalls <= simulatedopcount)
1937 return;
1938
1939 if (!quiet &&
1940 ((progressinterval && testcalls % progressinterval == 0) ||
1941 (debug &&
1942 (monitorstart == -1 ||
1943 (static_cast<long>(offset + size) > monitorstart &&
1944 (monitorend == -1 ||
1945 static_cast<long>(offset) <= monitorend))))))
1946 prt("%lu writesame\t0x%x thru\t0x%x\tdata_size\t0x%x(0x%x bytes)\n", testcalls,
1947 offset, offset + size - 1, data_size, size);
1948
1949 ret = ops->writesame(&ctx, offset, size, good_buf + offset, data_size);
1950 if (ret != (ssize_t)size) {
1951 if (ret < 0)
1952 prterrcode("dowritesame: ops->writesame", ret);
1953 else
1954 prt("short writesame: 0x%x bytes instead of 0x%x\n",
1955 ret, size);
1956 report_failure(164);
1957 }
1958
1959 if (flush_enabled)
1960 doflush(offset, size);
1961 }
1962
1963 void
1964 docompareandwrite(unsigned offset, unsigned size)
1965 {
1966 int ret;
1967
1968 if (skip_partial_discard) {
1969 if (!quiet && testcalls > simulatedopcount)
1970 prt("compare and write disabled\n");
1971 log4(OP_SKIPPED, OP_COMPARE_AND_WRITE, offset, size);
1972 return;
1973 }
1974
1975 offset -= offset % writebdy;
1976 if (o_direct)
1977 size -= size % writebdy;
1978
1979 if (size == 0) {
1980 if (!quiet && testcalls > simulatedopcount && !o_direct)
1981 prt("skipping zero size read\n");
1982 log4(OP_SKIPPED, OP_READ, offset, size);
1983 return;
1984 }
1985
1986 if (size + offset > file_size) {
1987 if (!quiet && testcalls > simulatedopcount)
1988 prt("skipping seek/compare past end of file\n");
1989 log4(OP_SKIPPED, OP_COMPARE_AND_WRITE, offset, size);
1990 return;
1991 }
1992
1993 memcpy(temp_buf + offset, good_buf + offset, size);
1994 gendata(original_buf, good_buf, offset, size);
1995 log4(OP_COMPARE_AND_WRITE, offset, size, 0);
1996
1997 if (testcalls <= simulatedopcount)
1998 return;
1999
2000 if (!quiet &&
2001 ((progressinterval && testcalls % progressinterval == 0) ||
2002 (debug &&
2003 (monitorstart == -1 ||
2004 (static_cast<long>(offset + size) > monitorstart &&
2005 (monitorend == -1 ||
2006 static_cast<long>(offset) <= monitorend))))))
2007 prt("%lu compareandwrite\t0x%x thru\t0x%x\t(0x%x bytes)\n", testcalls,
2008 offset, offset + size - 1, size);
2009
2010 ret = ops->compare_and_write(&ctx, offset, size, temp_buf + offset,
2011 good_buf + offset);
2012 if (ret != (ssize_t)size) {
2013 if (ret == -EINVAL) {
2014 memcpy(good_buf + offset, temp_buf + offset, size);
2015 return;
2016 }
2017 if (ret < 0)
2018 prterrcode("docompareandwrite: ops->compare_and_write", ret);
2019 else
2020 prt("short write: 0x%x bytes instead of 0x%x\n", ret, size);
2021 report_failure(151);
2022 return;
2023 }
2024
2025 if (flush_enabled)
2026 doflush(offset, size);
2027 }
2028
2029 void clone_filename(char *buf, size_t len, int clones)
2030 {
2031 snprintf(buf, len, "%s/fsx-%s-parent%d",
2032 dirpath, iname, clones);
2033 }
2034
2035 void clone_imagename(char *buf, size_t len, int clones)
2036 {
2037 if (clones > 0)
2038 snprintf(buf, len, "%s-clone%d", iname, clones);
2039 else
2040 strncpy(buf, iname, len);
2041 buf[len - 1] = '\0';
2042 }
2043
2044 void replay_imagename(char *buf, size_t len, int clones)
2045 {
2046 clone_imagename(buf, len, clones);
2047 strncat(buf, "-replay", len - strlen(buf));
2048 buf[len - 1] = '\0';
2049 }
2050
2051 void check_clone(int clonenum, bool replay_image);
2052
2053 void
2054 do_clone()
2055 {
2056 char filename[1024];
2057 char imagename[1024];
2058 char lastimagename[1024];
2059 int ret, fd;
2060 int order = 0, stripe_unit = 0, stripe_count = 0;
2061 uint64_t newsize = file_size;
2062
2063 log4(OP_CLONE, 0, 0, 0);
2064 ++num_clones;
2065
2066 if (randomize_striping) {
2067 order = 18 + get_random() % 8;
2068 stripe_unit = 1ull << (order - 1 - (get_random() % 8));
2069 stripe_count = 2 + get_random() % 14;
2070 }
2071
2072 prt("%lu clone\t%d order %d su %d sc %d\n", testcalls, num_clones,
2073 order, stripe_unit, stripe_count);
2074
2075 clone_imagename(imagename, sizeof(imagename), num_clones);
2076 clone_imagename(lastimagename, sizeof(lastimagename),
2077 num_clones - 1);
2078 assert(strcmp(lastimagename, ctx.name) == 0);
2079
2080 ret = ops->clone(&ctx, "snap", imagename, &order, stripe_unit,
2081 stripe_count);
2082 if (ret < 0) {
2083 prterrcode("do_clone: ops->clone", ret);
2084 exit(165);
2085 }
2086
2087 if (randomize_parent_overlap && rbd_image_has_parent(&ctx)) {
2088 int rand = get_random() % 16 + 1; // [1..16]
2089
2090 if (rand < 13) {
2091 uint64_t overlap;
2092
2093 ret = rbd_get_overlap(ctx.image, &overlap);
2094 if (ret < 0) {
2095 prterrcode("do_clone: rbd_get_overlap", ret);
2096 exit(1);
2097 }
2098
2099 if (rand < 10) { // 9/16
2100 newsize = overlap * ((double)rand / 10);
2101 newsize -= newsize % truncbdy;
2102 } else { // 3/16
2103 newsize = 0;
2104 }
2105
2106 assert(newsize != (uint64_t)file_size);
2107 prt("truncating image %s from 0x%llx (overlap 0x%llx) to 0x%llx\n",
2108 ctx.name, file_size, overlap, newsize);
2109
2110 ret = ops->resize(&ctx, newsize);
2111 if (ret < 0) {
2112 prterrcode("do_clone: ops->resize", ret);
2113 exit(1);
2114 }
2115 } else if (rand < 15) { // 2/16
2116 prt("flattening image %s\n", ctx.name);
2117
2118 ret = ops->flatten(&ctx);
2119 if (ret < 0) {
2120 prterrcode("do_clone: ops->flatten", ret);
2121 exit(1);
2122 }
2123 } else { // 2/16
2124 prt("leaving image %s intact\n", ctx.name);
2125 }
2126 }
2127
2128 clone_filename(filename, sizeof(filename), num_clones);
2129 if ((fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, 0666)) < 0) {
2130 simple_err("do_clone: open", -errno);
2131 exit(162);
2132 }
2133 save_buffer(good_buf, newsize, fd);
2134 if ((ret = close(fd)) < 0) {
2135 simple_err("do_clone: close", -errno);
2136 exit(163);
2137 }
2138
2139 /*
2140 * Close parent.
2141 */
2142 if ((ret = ops->close(&ctx)) < 0) {
2143 prterrcode("do_clone: ops->close", ret);
2144 exit(174);
2145 }
2146
2147 if (journal_replay) {
2148 ret = finalize_journal(ioctx, lastimagename, num_clones - 1,
2149 order, stripe_unit, stripe_count);
2150 if (ret < 0) {
2151 exit(EXIT_FAILURE);
2152 }
2153
2154 ret = register_journal(ioctx, imagename);
2155 if (ret < 0) {
2156 exit(EXIT_FAILURE);
2157 }
2158 }
2159
2160 /*
2161 * Open freshly made clone.
2162 */
2163 if ((ret = ops->open(imagename, &ctx)) < 0) {
2164 prterrcode("do_clone: ops->open", ret);
2165 exit(166);
2166 }
2167
2168 if (num_clones > 1) {
2169 if (journal_replay) {
2170 check_clone(num_clones - 2, true);
2171 }
2172 check_clone(num_clones - 2, false);
2173 }
2174 }
2175
2176 void
2177 check_clone(int clonenum, bool replay_image)
2178 {
2179 char filename[128];
2180 char imagename[128];
2181 int ret, fd;
2182 struct rbd_ctx cur_ctx = RBD_CTX_INIT;
2183 struct stat file_info;
2184 char *good_buf, *temp_buf;
2185
2186 if (replay_image) {
2187 replay_imagename(imagename, sizeof(imagename), clonenum);
2188 } else {
2189 clone_imagename(imagename, sizeof(imagename), clonenum);
2190 }
2191
2192 if ((ret = ops->open(imagename, &cur_ctx)) < 0) {
2193 prterrcode("check_clone: ops->open", ret);
2194 exit(167);
2195 }
2196
2197 clone_filename(filename, sizeof(filename), clonenum + 1);
2198 if ((fd = open(filename, O_RDONLY)) < 0) {
2199 simple_err("check_clone: open", -errno);
2200 exit(168);
2201 }
2202
2203 prt("checking clone #%d, image %s against file %s\n",
2204 clonenum, imagename, filename);
2205 if ((ret = fstat(fd, &file_info)) < 0) {
2206 simple_err("check_clone: fstat", -errno);
2207 exit(169);
2208 }
2209
2210 good_buf = NULL;
2211 ret = posix_memalign((void **)&good_buf,
2212 MAX(writebdy, (int)sizeof(void *)),
2213 file_info.st_size);
2214 if (ret > 0) {
2215 prterrcode("check_clone: posix_memalign(good_buf)", -ret);
2216 exit(96);
2217 }
2218
2219 temp_buf = NULL;
2220 ret = posix_memalign((void **)&temp_buf,
2221 MAX(readbdy, (int)sizeof(void *)),
2222 file_info.st_size);
2223 if (ret > 0) {
2224 prterrcode("check_clone: posix_memalign(temp_buf)", -ret);
2225 exit(97);
2226 }
2227
2228 if ((ret = pread(fd, good_buf, file_info.st_size, 0)) < 0) {
2229 simple_err("check_clone: pread", -errno);
2230 exit(170);
2231 }
2232 if ((ret = ops->read(&cur_ctx, 0, file_info.st_size, temp_buf)) < 0) {
2233 prterrcode("check_clone: ops->read", ret);
2234 exit(171);
2235 }
2236 close(fd);
2237 if ((ret = ops->close(&cur_ctx)) < 0) {
2238 prterrcode("check_clone: ops->close", ret);
2239 exit(174);
2240 }
2241 check_buffers(good_buf, temp_buf, 0, file_info.st_size);
2242
2243 if (!replay_image) {
2244 unlink(filename);
2245 }
2246
2247 free(good_buf);
2248 free(temp_buf);
2249 }
2250
2251 void
2252 writefileimage()
2253 {
2254 ssize_t ret;
2255
2256 ret = ops->write(&ctx, 0, file_size, good_buf);
2257 if (ret != file_size) {
2258 if (ret < 0)
2259 prterrcode("writefileimage: ops->write", ret);
2260 else
2261 prt("short write: 0x%x bytes instead of 0x%llx\n",
2262 ret, (unsigned long long)file_size);
2263 report_failure(172);
2264 }
2265
2266 if (!lite) {
2267 ret = ops->resize(&ctx, file_size);
2268 if (ret < 0) {
2269 prterrcode("writefileimage: ops->resize", ret);
2270 report_failure(173);
2271 }
2272 }
2273 }
2274
2275 void
2276 do_flatten()
2277 {
2278 int ret;
2279
2280 if (!rbd_image_has_parent(&ctx)) {
2281 log4(OP_SKIPPED, OP_FLATTEN, 0, 0);
2282 return;
2283 }
2284 log4(OP_FLATTEN, 0, 0, 0);
2285 prt("%lu flatten\n", testcalls);
2286
2287 ret = ops->flatten(&ctx);
2288 if (ret < 0) {
2289 prterrcode("writefileimage: ops->flatten", ret);
2290 exit(177);
2291 }
2292 }
2293
2294 void
2295 docloseopen(void)
2296 {
2297 char *name;
2298 int ret;
2299
2300 if (testcalls <= simulatedopcount)
2301 return;
2302
2303 name = strdup(ctx.name);
2304
2305 if (debug)
2306 prt("%lu close/open\n", testcalls);
2307
2308 ret = ops->close(&ctx);
2309 if (ret < 0) {
2310 prterrcode("docloseopen: ops->close", ret);
2311 report_failure(180);
2312 }
2313
2314 ret = ops->open(name, &ctx);
2315 if (ret < 0) {
2316 prterrcode("docloseopen: ops->open", ret);
2317 report_failure(181);
2318 }
2319
2320 free(name);
2321 }
2322
/*
 * Fit an (off, len) pair into an object of the given size: off is reduced
 * modulo size (or set to 0 when size is 0), and len is cut so that
 * off + len does not go past size.  off and len are modified in place.
 */
#define TRIM_OFF_LEN(off, len, size)					\
do {									\
	if (!(size))							\
		(off) = 0;						\
	else								\
		(off) %= (size);					\
	if ((unsigned)(size) < (unsigned)(off) + (unsigned)(len))	\
		(len) = (size) - (off);					\
} while (0)
2332
/*
 * Run one test step: pick a random operation, offset and size, and
 * dispatch to the matching do*() routine.
 *
 * The order of the get_random() calls below determines which operations a
 * given seed produces, so statements here must not be reordered.
 */
2333 void
2334 test(void)
2335 {
2336 unsigned long offset;
2337 unsigned long size = maxoplen;
2338 unsigned long rv = get_random();
2339 unsigned long op;
2340
/* once the simulated operations are done, push the resulting image out */
2341 if (simulatedopcount > 0 && testcalls == simulatedopcount)
2342 writefileimage();
2343
2344 testcalls++;
2345
/* decide whether this step is followed by a close/open cycle */
2346 if (closeprob)
2347 closeopen = (rv >> 3) < (1u << 28) / (unsigned)closeprob;
2348
2349 if (debugstart > 0 && testcalls >= debugstart)
2350 debug = 1;
2351
2352 if (!quiet && testcalls < simulatedopcount && testcalls % 100000 == 0)
2353 prt("%lu...\n", testcalls);
2354
2355 offset = get_random();
2356 if (randomoplen)
2357 size = get_random() % (maxoplen + 1);
2358
2359 /* calculate appropriate op to run */
2360 if (lite)
2361 op = rv % OP_MAX_LITE;
2362 else
2363 op = rv % OP_MAX_FULL;
2364
/*
 * First pass: substitute or skip operations that are disabled or not
 * supported by the backend.  Skipped operations are logged and jump
 * straight to the common epilogue.
 */
2365 switch (op) {
2366 case OP_MAPREAD:
2367 if (!mapped_reads)
2368 op = OP_READ;
2369 break;
2370 case OP_MAPWRITE:
2371 if (!mapped_writes)
2372 op = OP_WRITE;
2373 break;
2374 case OP_FALLOCATE:
2375 if (!fallocate_calls) {
2376 log4(OP_SKIPPED, OP_FALLOCATE, offset, size);
2377 goto out;
2378 }
2379 break;
2380 case OP_PUNCH_HOLE:
2381 if (!punch_hole_calls) {
2382 log4(OP_SKIPPED, OP_PUNCH_HOLE, offset, size);
2383 goto out;
2384 }
2385 break;
2386 case OP_CLONE:
2387 /* clone, 8% chance */
2388 if (!clone_calls || file_size == 0 || get_random() % 100 >= 8) {
2389 log4(OP_SKIPPED, OP_CLONE, 0, 0);
2390 goto out;
2391 }
2392 break;
2393 case OP_FLATTEN:
2394 /* flatten four times as rarely as clone, 2% chance */
2395 if (get_random() % 100 >= 2) {
2396 log4(OP_SKIPPED, OP_FLATTEN, 0, 0);
2397 goto out;
2398 }
2399 break;
2400 case OP_WRITESAME:
2401 /* writesame not implemented */
2402 if (!ops->writesame) {
2403 log4(OP_SKIPPED, OP_WRITESAME, offset, size);
2404 goto out;
2405 }
2406 break;
2407 case OP_COMPARE_AND_WRITE:
2408 /* compare_and_write not implemented */
2409 if (!ops->compare_and_write) {
2410 log4(OP_SKIPPED, OP_COMPARE_AND_WRITE, offset, size);
2411 goto out;
2412 }
2413 break;
2414 }
2415
/*
 * Second pass: trim offset/size to the current file size (reads, punch,
 * compare-and-write) or to maxfilelen (writes) and run the operation.
 * Mapped reads and writes are not implemented here and exit the program.
 */
2416 switch (op) {
2417 case OP_READ:
2418 TRIM_OFF_LEN(offset, size, file_size);
2419 doread(offset, size);
2420 break;
2421
2422 case OP_WRITE:
2423 TRIM_OFF_LEN(offset, size, maxfilelen);
2424 dowrite(offset, size);
2425 break;
2426
2427 case OP_MAPREAD:
2428 TRIM_OFF_LEN(offset, size, file_size);
2429 exit(183);
2430 break;
2431
2432 case OP_MAPWRITE:
2433 TRIM_OFF_LEN(offset, size, maxfilelen);
2434 exit(182);
2435 break;
2436
2437 case OP_TRUNCATE:
2438 if (!style)
2439 size = get_random() % maxfilelen;
2440 dotruncate(size);
2441 break;
2442
2443 case OP_PUNCH_HOLE:
2444 TRIM_OFF_LEN(offset, size, file_size);
2445 do_punch_hole(offset, size);
2446 break;
2447
2448 case OP_WRITESAME:
2449 TRIM_OFF_LEN(offset, size, maxfilelen);
2450 dowritesame(offset, size);
2451 break;
2452 case OP_COMPARE_AND_WRITE:
2453 TRIM_OFF_LEN(offset, size, file_size);
2454 docompareandwrite(offset, size);
2455 break;
2456
2457 case OP_CLONE:
2458 do_clone();
2459 break;
2460
2461 case OP_FLATTEN:
2462 do_flatten();
2463 break;
2464
2465 default:
2466 prterr("test: unknown operation");
2467 report_failure(42);
2468 break;
2469 }
2470
/* common epilogue: optional size check and optional close/open cycle */
2471 out:
2472 if (sizechecks && testcalls > simulatedopcount)
2473 check_size();
2474 if (closeopen)
2475 docloseopen();
2476 }
2477
2478
2479 void
2480 cleanup(int sig)
2481 {
2482 if (sig)
2483 prt("signal %d\n", sig);
2484 prt("testcalls = %lu\n", testcalls);
2485 exit(sig);
2486 }
2487
2488
/*
 * Print the command-line synopsis and the description of every option to
 * stdout, then exit with status 89.  The -F line is only part of the text
 * when FALLOCATE is defined at build time.
 */
2489 void
2490 usage(void)
2491 {
2492 fprintf(stdout, "usage: %s",
2493 "fsx [-dfjknqxyACFHKLORUWZ] [-b opnum] [-c Prob] [-h holebdy] [-l flen] [-m start:end] [-o oplen] [-p progressinterval] [-r readbdy] [-s style] [-t truncbdy] [-w writebdy] [-D startingop] [-N numops] [-P dirpath] [-S seed] pname iname\n\
2494 -b opnum: beginning operation number (default 1)\n\
2495 -c P: 1 in P chance of file close+open at each op (default infinity)\n\
2496 -d: debug output for all operations\n\
2497 -f: flush and invalidate cache after I/O\n\
2498 -h holebdy: 4096 would make discards page aligned (default 1)\n\
2499 -j: journal replay stress test\n\
2500 -k: keep data on success (default 0)\n\
2501 -l flen: the upper bound on file size (default 262144)\n\
2502 -m startop:endop: monitor (print debug output) specified byte range (default 0:infinity)\n\
2503 -n: no verifications of file size\n\
2504 -o oplen: the upper bound on operation size (default 65536)\n\
2505 -p progressinterval: debug output at specified operation interval\n\
2506 -q: quieter operation\n\
2507 -r readbdy: 4096 would make reads page aligned (default 1)\n\
2508 -s style: 1 gives smaller truncates (default 0)\n\
2509 -t truncbdy: 4096 would make truncates page aligned (default 1)\n\
2510 -w writebdy: 4096 would make writes page aligned (default 1)\n\
2511 -x: preallocate file space before starting, XFS only (default 0)\n\
2512 -y: synchronize changes to a file\n"
2513
2514 " -C: do not use clone calls\n\
2515 -D startingop: debug output starting at specified operation\n"
2516 #ifdef FALLOCATE
2517 " -F: Do not use fallocate (preallocation) calls\n"
2518 #endif
2519 " -H: do not use punch hole calls\n\
2520 -K: enable krbd mode (use -t and -h too)\n\
2521 -M: enable rbd-nbd mode (use -t and -h too)\n\
2522 -L: fsxLite - no file creations & no file size changes\n\
2523 -N numops: total # operations to do (default infinity)\n\
2524 -O: use oplen (see -o flag) for every op (default random)\n\
2525 -P dirpath: save .fsxlog and .fsxgood files in dirpath (default ./)\n\
2526 -R: read() system calls only (mapped reads disabled)\n\
2527 -S seed: for random # generator (default 1) 0 gets timestamp\n\
2528 -U: disable randomized striping\n\
2529 -W: mapped write operations DISabled\n\
2530 -Z: O_DIRECT (use -R, -W, -r and -w too)\n\
2531 poolname: this is REQUIRED (no default)\n\
2532 imagename: this is REQUIRED (no default)\n");
/* NOTE(review): the synopsis names the positionals "pname iname" and omits
 * -M from the flag list, while the descriptions use poolname/imagename and
 * document -M — worth aligning. */
2533 exit(89);
2534 }
2535
2536
/*
 * Parse an integer from the start of `s`, honouring an optional one-letter
 * size suffix.
 *
 * The number is converted by strtol() with base 0, so the usual decimal,
 * 0x-hex and leading-0 octal spellings are accepted.  If the character that
 * follows the number is one of the suffixes below, the value is multiplied
 * accordingly and the suffix is consumed:
 *
 *	b / B	x 512
 *	k / K	x 1024
 *	m / M	x 1024*1024
 *	w / W	x 4
 *
 * s: text to parse.
 * e: out-parameter; on return *e points at the first character that was
 *    not consumed (just past the suffix when one was recognised).
 *
 * Returns the (possibly scaled) value, narrowed to int.
 */
int
getnum(char *s, char **e)
{
	int value;
	int scale;

	*e = NULL;
	value = strtol(s, e, 0);
	if (*e == NULL)
		return value;

	/* Pick the multiplier for a recognised suffix; anything else is left
	 * untouched for the caller to inspect through *e. */
	switch (**e) {
	case 'b':
	case 'B':
		scale = 512;
		break;
	case 'k':
	case 'K':
		scale = 1024;
		break;
	case 'm':
	case 'M':
		scale = 1024 * 1024;
		break;
	case 'w':
	case 'W':
		scale = 4;
		break;
	default:
		return value;
	}

	value *= scale;
	++*e;		/* step over the suffix character */
	return value;
}
2569
2570 void
2571 test_fallocate()
2572 {
2573 #ifdef FALLOCATE
2574 if (!lite && fallocate_calls) {
2575 if (fallocate(fd, 0, 0, 1) && errno == EOPNOTSUPP) {
2576 if(!quiet)
2577 warn("main: filesystem does not support fallocate, disabling\n");
2578 fallocate_calls = 0;
2579 } else {
2580 ftruncate(fd, 0);
2581 }
2582 }
2583 #else /* ! FALLOCATE */
2584 fallocate_calls = 0;
2585 #endif
2586
2587 }
2588
2589 void remove_image(rados_ioctx_t ioctx, char *imagename, bool remove_snap,
2590 bool unregister) {
2591 rbd_image_t image;
2592 char errmsg[128];
2593 int ret;
2594
2595 if ((ret = rbd_open(ioctx, imagename, &image, NULL)) < 0) {
2596 sprintf(errmsg, "rbd_open %s", imagename);
2597 prterrcode(errmsg, ret);
2598 report_failure(101);
2599 }
2600 if (remove_snap) {
2601 if ((ret = rbd_snap_unprotect(image, "snap")) < 0) {
2602 sprintf(errmsg, "rbd_snap_unprotect %s@snap",
2603 imagename);
2604 prterrcode(errmsg, ret);
2605 report_failure(102);
2606 }
2607 if ((ret = rbd_snap_remove(image, "snap")) < 0) {
2608 sprintf(errmsg, "rbd_snap_remove %s@snap",
2609 imagename);
2610 prterrcode(errmsg, ret);
2611 report_failure(103);
2612 }
2613 }
2614 if ((ret = rbd_close(image)) < 0) {
2615 sprintf(errmsg, "rbd_close %s", imagename);
2616 prterrcode(errmsg, ret);
2617 report_failure(104);
2618 }
2619
2620 if (unregister &&
2621 (ret = unregister_journal(ioctx, imagename)) < 0) {
2622 report_failure(105);
2623 }
2624
2625 if ((ret = rbd_remove(ioctx, imagename)) < 0) {
2626 sprintf(errmsg, "rbd_remove %s", imagename);
2627 prterrcode(errmsg, ret);
2628 report_failure(106);
2629 }
2630 }
2631
2632 int
2633 main(int argc, char **argv)
2634 {
2635 int i, style, ch, ret;
2636 char *endp;
2637 char goodfile[1024];
2638 char logfile[1024];
2639
2640 goodfile[0] = 0;
2641 logfile[0] = 0;
2642
2643 page_size = getpagesize();
2644 page_mask = page_size - 1;
2645 mmap_mask = page_mask;
2646
2647 setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
2648
2649 while ((ch = getopt(argc, argv, "b:c:dfh:jkl:m:no:p:qr:s:t:w:xyCD:FHKMLN:OP:RS:UWZ"))
2650 != EOF)
2651 switch (ch) {
2652 case 'b':
2653 simulatedopcount = getnum(optarg, &endp);
2654 if (!quiet)
2655 fprintf(stdout, "Will begin at operation %lu\n",
2656 simulatedopcount);
2657 if (simulatedopcount == 0)
2658 usage();
2659 simulatedopcount -= 1;
2660 break;
2661 case 'c':
2662 closeprob = getnum(optarg, &endp);
2663 if (!quiet)
2664 fprintf(stdout,
2665 "Chance of close/open is 1 in %d\n",
2666 closeprob);
2667 if (closeprob <= 0)
2668 usage();
2669 break;
2670 case 'd':
2671 debug = 1;
2672 break;
2673 case 'f':
2674 flush_enabled = 1;
2675 break;
2676 case 'h':
2677 holebdy = getnum(optarg, &endp);
2678 if (holebdy <= 0)
2679 usage();
2680 break;
2681 case 'j':
2682 journal_replay = true;
2683 break;
2684 case 'k':
2685 keep_on_success = 1;
2686 break;
2687 case 'l':
2688 {
2689 int _num = getnum(optarg, &endp);
2690 if (_num <= 0)
2691 usage();
2692 maxfilelen = _num;
2693 }
2694 break;
2695 case 'm':
2696 monitorstart = getnum(optarg, &endp);
2697 if (monitorstart < 0)
2698 usage();
2699 if (!endp || *endp++ != ':')
2700 usage();
2701 monitorend = getnum(endp, &endp);
2702 if (monitorend < 0)
2703 usage();
2704 if (monitorend == 0)
2705 monitorend = -1; /* aka infinity */
2706 debug = 1;
2707 break;
2708 case 'n':
2709 sizechecks = 0;
2710 break;
2711 case 'o':
2712 maxoplen = getnum(optarg, &endp);
2713 if (maxoplen <= 0)
2714 usage();
2715 break;
2716 case 'p':
2717 progressinterval = getnum(optarg, &endp);
2718 if (progressinterval == 0)
2719 usage();
2720 break;
2721 case 'q':
2722 quiet = 1;
2723 break;
2724 case 'r':
2725 readbdy = getnum(optarg, &endp);
2726 if (readbdy <= 0)
2727 usage();
2728 break;
2729 case 's':
2730 style = getnum(optarg, &endp);
2731 if (style < 0 || style > 1)
2732 usage();
2733 break;
2734 case 't':
2735 truncbdy = getnum(optarg, &endp);
2736 if (truncbdy <= 0)
2737 usage();
2738 break;
2739 case 'w':
2740 writebdy = getnum(optarg, &endp);
2741 if (writebdy <= 0)
2742 usage();
2743 break;
2744 case 'x':
2745 prealloc = 1;
2746 break;
2747 case 'y':
2748 do_fsync = 1;
2749 break;
2750 case 'C':
2751 clone_calls = 0;
2752 break;
2753 case 'D':
2754 debugstart = getnum(optarg, &endp);
2755 if (debugstart < 1)
2756 usage();
2757 break;
2758 case 'F':
2759 fallocate_calls = 0;
2760 break;
2761 case 'H':
2762 punch_hole_calls = 0;
2763 break;
2764 case 'K':
2765 prt("krbd mode enabled\n");
2766 ops = &krbd_operations;
2767 break;
2768 case 'M':
2769 prt("rbd-nbd mode enabled\n");
2770 ops = &nbd_operations;
2771 break;
2772 case 'L':
2773 prt("lite mode not supported for rbd\n");
2774 exit(1);
2775 break;
2776 case 'N':
2777 numops = getnum(optarg, &endp);
2778 if (numops < 0)
2779 usage();
2780 break;
2781 case 'O':
2782 randomoplen = 0;
2783 break;
2784 case 'P':
2785 strncpy(dirpath, optarg, sizeof(dirpath)-1);
2786 dirpath[sizeof(dirpath)-1] = '\0';
2787 strncpy(goodfile, dirpath, sizeof(goodfile)-1);
2788 goodfile[sizeof(goodfile)-1] = '\0';
2789 if (strlen(goodfile) < sizeof(goodfile)-2) {
2790 strcat(goodfile, "/");
2791 } else {
2792 prt("file name to long\n");
2793 exit(1);
2794 }
2795 strncpy(logfile, dirpath, sizeof(logfile)-1);
2796 logfile[sizeof(logfile)-1] = '\0';
2797 if (strlen(logfile) < sizeof(logfile)-2) {
2798 strcat(logfile, "/");
2799 } else {
2800 prt("file path to long\n");
2801 exit(1);
2802 }
2803 break;
2804 case 'R':
2805 mapped_reads = 0;
2806 if (!quiet)
2807 fprintf(stdout, "mapped reads DISABLED\n");
2808 break;
2809 case 'S':
2810 seed = getnum(optarg, &endp);
2811 if (seed == 0)
2812 seed = time(0) % 10000;
2813 if (!quiet)
2814 fprintf(stdout, "Seed set to %d\n", seed);
2815 if (seed < 0)
2816 usage();
2817 break;
2818 case 'U':
2819 randomize_striping = 0;
2820 break;
2821 case 'W':
2822 mapped_writes = 0;
2823 if (!quiet)
2824 fprintf(stdout, "mapped writes DISABLED\n");
2825 break;
2826 case 'Z':
2827 o_direct = O_DIRECT;
2828 break;
2829 default:
2830 usage();
2831 /* NOTREACHED */
2832 }
2833 argc -= optind;
2834 argv += optind;
2835 if (argc != 2)
2836 usage();
2837 pool = argv[0];
2838 iname = argv[1];
2839
2840 signal(SIGHUP, cleanup);
2841 signal(SIGINT, cleanup);
2842 signal(SIGPIPE, cleanup);
2843 signal(SIGALRM, cleanup);
2844 signal(SIGTERM, cleanup);
2845 signal(SIGXCPU, cleanup);
2846 signal(SIGXFSZ, cleanup);
2847 signal(SIGVTALRM, cleanup);
2848 signal(SIGUSR1, cleanup);
2849 signal(SIGUSR2, cleanup);
2850
2851 random_generator.seed(seed);
2852
2853 ret = create_image();
2854 if (ret < 0) {
2855 prterrcode(iname, ret);
2856 exit(90);
2857 }
2858 ret = ops->open(iname, &ctx);
2859 if (ret < 0) {
2860 simple_err("Error opening image", ret);
2861 exit(91);
2862 }
2863 if (!dirpath[0])
2864 strcat(dirpath, ".");
2865 strncat(goodfile, iname, 256);
2866 strcat (goodfile, ".fsxgood");
2867 fsxgoodfd = open(goodfile, O_RDWR|O_CREAT|O_TRUNC, 0666);
2868 if (fsxgoodfd < 0) {
2869 prterr(goodfile);
2870 exit(92);
2871 }
2872 strncat(logfile, iname, 256);
2873 strcat (logfile, ".fsxlog");
2874 fsxlogf = fopen(logfile, "w");
2875 if (fsxlogf == NULL) {
2876 prterr(logfile);
2877 exit(93);
2878 }
2879
2880 original_buf = (char *) malloc(maxfilelen);
2881 for (i = 0; i < (int)maxfilelen; i++)
2882 original_buf[i] = get_random() % 256;
2883
2884 ret = posix_memalign((void **)&good_buf,
2885 MAX(writebdy, (int)sizeof(void *)), maxfilelen);
2886 if (ret > 0) {
2887 if (ret == EINVAL)
2888 prt("writebdy is not a suitable power of two\n");
2889 else
2890 prterrcode("main: posix_memalign(good_buf)", -ret);
2891 exit(94);
2892 }
2893 memset(good_buf, '\0', maxfilelen);
2894
2895 ret = posix_memalign((void **)&temp_buf,
2896 MAX(readbdy, (int)sizeof(void *)), maxfilelen);
2897 if (ret > 0) {
2898 if (ret == EINVAL)
2899 prt("readbdy is not a suitable power of two\n");
2900 else
2901 prterrcode("main: posix_memalign(temp_buf)", -ret);
2902 exit(95);
2903 }
2904 memset(temp_buf, '\0', maxfilelen);
2905
2906 if (lite) { /* zero entire existing file */
2907 ssize_t written;
2908
2909 written = ops->write(&ctx, 0, (size_t)maxfilelen, good_buf);
2910 if (written != (ssize_t)maxfilelen) {
2911 if (written < 0) {
2912 prterrcode(iname, written);
2913 warn("main: error on write");
2914 } else
2915 warn("main: short write, 0x%x bytes instead "
2916 "of 0x%lx\n",
2917 (unsigned)written,
2918 maxfilelen);
2919 exit(98);
2920 }
2921 } else
2922 check_trunc_hack();
2923
2924 //test_fallocate();
2925
2926 while (numops == -1 || numops--)
2927 test();
2928
2929 ret = ops->close(&ctx);
2930 if (ret < 0) {
2931 prterrcode("ops->close", ret);
2932 report_failure(99);
2933 }
2934
2935 if (journal_replay) {
2936 char imagename[1024];
2937 clone_imagename(imagename, sizeof(imagename), num_clones);
2938 ret = finalize_journal(ioctx, imagename, num_clones, 0, 0, 0);
2939 if (ret < 0) {
2940 report_failure(100);
2941 }
2942 }
2943
2944 if (num_clones > 0) {
2945 if (journal_replay) {
2946 check_clone(num_clones - 1, true);
2947 }
2948 check_clone(num_clones - 1, false);
2949 }
2950
2951 if (!keep_on_success) {
2952 while (num_clones >= 0) {
2953 static bool remove_snap = false;
2954
2955 if (journal_replay) {
2956 char replayimagename[1024];
2957 replay_imagename(replayimagename,
2958 sizeof(replayimagename),
2959 num_clones);
2960 remove_image(ioctx, replayimagename,
2961 remove_snap,
2962 false);
2963 }
2964
2965 char clonename[128];
2966 clone_imagename(clonename, 128, num_clones);
2967 remove_image(ioctx, clonename, remove_snap,
2968 journal_replay);
2969
2970 remove_snap = true;
2971 num_clones--;
2972 }
2973 }
2974
2975 prt("All operations completed A-OK!\n");
2976 fclose(fsxlogf);
2977
2978 rados_ioctx_destroy(ioctx);
2979 krbd_destroy(krbd);
2980 rados_shutdown(cluster);
2981
2982 free(original_buf);
2983 free(good_buf);
2984 free(temp_buf);
2985
2986 exit(0);
2987 return 0;
2988 }