]> git.proxmox.com Git - ceph.git/blob - ceph/src/test/librbd/fsx.cc
update sources to v12.2.3
[ceph.git] / ceph / src / test / librbd / fsx.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:8; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=8 smarttab
3 /*
4 * Copyright (C) 1991, NeXT Computer, Inc. All Rights Reserverd.
5 *
6 * File: fsx.cc
7 * Author: Avadis Tevanian, Jr.
8 *
9 * File system exerciser.
10 *
11 * Rewritten 8/98 by Conrad Minshall.
12 *
13 * Small changes to work under Linux -- davej.
14 *
15 * Checks for mmap last-page zero fill.
16 */
17
18 #include <sys/types.h>
19 #include <unistd.h>
20 #include <limits.h>
21 #include <time.h>
22 #include <strings.h>
23 #include <sys/file.h>
24 #include <sys/stat.h>
25 #include <sys/mman.h>
26 #include <linux/fs.h>
27 #include <sys/ioctl.h>
28 #ifdef HAVE_ERR_H
29 #include <err.h>
30 #endif
31 #include <signal.h>
32 #include <stdbool.h>
33 #include <stddef.h>
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <string.h>
37 #include <stdarg.h>
38 #include <assert.h>
39 #include <errno.h>
40 #include <math.h>
41 #include <fcntl.h>
42 #include <random>
43
44 #include "include/intarith.h"
45 #include "include/krbd.h"
46 #include "include/rados/librados.h"
47 #include "include/rados/librados.hpp"
48 #include "include/rbd/librbd.h"
49 #include "include/rbd/librbd.hpp"
50 #include "common/Cond.h"
51 #include "common/SubProcess.h"
52 #include "common/safe_io.h"
53 #include "journal/Journaler.h"
54 #include "journal/ReplayEntry.h"
55 #include "journal/ReplayHandler.h"
56 #include "journal/Settings.h"
57
58 #include <boost/scope_exit.hpp>
59
60 #define NUMPRINTCOLUMNS 32 /* # columns of data to print on each line */
61
/*
 * One slot of the operation log: the operation code together with up to
 * three integer arguments whose meaning depends on the operation.
 */
struct log_entry {
	int	operation;	/* OP_* code; log4() stores it bitwise-complemented when closeopen is set */
	int	args[3];	/* operation-specific arguments */
};
70
71 #define LOGSIZE 1000
72
73 struct log_entry oplog[LOGSIZE]; /* the log */
74 int logptr = 0; /* current position in log */
75 int logcount = 0; /* total ops */
76
77 /*
78 * The operation matrix is complex due to conditional execution of different
79 * features. Hence when we come to deciding what operation to run, we need to
80 * be careful in how we select the different operations. The active operations
81 * are mapped to numbers as follows:
82 *
83 * lite !lite
84 * READ: 0 0
85 * WRITE: 1 1
86 * MAPREAD: 2 2
87 * MAPWRITE: 3 3
88 * TRUNCATE: - 4
89 * FALLOCATE: - 5
90 * PUNCH HOLE: - 6
91 * WRITESAME: - 7
92 * COMPAREANDWRITE: - 8
93 *
94 * When mapped read/writes are disabled, they are simply converted to normal
95 * reads and writes. When fallocate/fpunch calls are disabled, they are
96 * converted to OP_SKIPPED. Hence OP_SKIPPED needs to have a number higher than
97 * the operation selection matrix, as does the OP_CLOSEOPEN which is an
98 * operation modifier rather than an operation in itself.
99 *
100 * Because of the "lite" version, we also need to have different "maximum
101 * operation" defines to allow the ops to be selected correctly based on the
102 * mode being run.
103 */
104
105 /* common operations */
106 #define OP_READ 0
107 #define OP_WRITE 1
108 #define OP_MAPREAD 2
109 #define OP_MAPWRITE 3
110 #define OP_MAX_LITE 4
111
112 /* !lite operations */
113 #define OP_TRUNCATE 4
114 #define OP_FALLOCATE 5
115 #define OP_PUNCH_HOLE 6
116 #define OP_WRITESAME 7
117 #define OP_COMPARE_AND_WRITE 8
118 /* rbd-specific operations */
119 #define OP_CLONE 9
120 #define OP_FLATTEN 10
121 #define OP_MAX_FULL 11
122
123 /* operation modifiers */
124 #define OP_CLOSEOPEN 100
125 #define OP_SKIPPED 101
126
127 #undef PAGE_SIZE
128 #define PAGE_SIZE getpagesize()
129 #undef PAGE_MASK
130 #define PAGE_MASK (PAGE_SIZE - 1)
131
132
133 char *original_buf; /* a pointer to the original data */
134 char *good_buf; /* a pointer to the correct data */
135 char *temp_buf; /* a pointer to the current data */
136
137 char dirpath[1024];
138
139 off_t file_size = 0;
140 off_t biggest = 0;
141 unsigned long testcalls = 0; /* calls to function "test" */
142
143 unsigned long simulatedopcount = 0; /* -b flag */
144 int closeprob = 0; /* -c flag */
145 int debug = 0; /* -d flag */
146 unsigned long debugstart = 0; /* -D flag */
147 int flush_enabled = 0; /* -f flag */
148 int holebdy = 1; /* -h flag */
149 bool journal_replay = false; /* -j flag */
150 int keep_on_success = 0; /* -k flag */
151 int do_fsync = 0; /* -y flag */
152 unsigned long maxfilelen = 256 * 1024; /* -l flag */
153 int sizechecks = 1; /* -n flag disables them */
154 int maxoplen = 64 * 1024; /* -o flag */
155 int quiet = 0; /* -q flag */
156 unsigned long progressinterval = 0; /* -p flag */
157 int readbdy = 1; /* -r flag */
158 int style = 0; /* -s flag */
159 int prealloc = 0; /* -x flag */
160 int truncbdy = 1; /* -t flag */
161 int writebdy = 1; /* -w flag */
162 long monitorstart = -1; /* -m flag */
163 long monitorend = -1; /* -m flag */
164 int lite = 0; /* -L flag */
165 long numops = -1; /* -N flag */
166 int randomoplen = 1; /* -O flag disables it */
167 int seed = 1; /* -S flag */
168 int mapped_writes = 0; /* -W flag disables */
169 int fallocate_calls = 0; /* -F flag disables */
170 int punch_hole_calls = 1; /* -H flag disables */
171 int clone_calls = 1; /* -C flag disables */
172 int randomize_striping = 1; /* -U flag disables */
173 int randomize_parent_overlap = 1;
174 int mapped_reads = 0; /* -R flag disables it */
175 int fsxgoodfd = 0;
176 int o_direct = 0; /* -Z flag */
177
178 int num_clones = 0;
179
180 int page_size;
181 int page_mask;
182 int mmap_mask;
183
184 FILE * fsxlogf = NULL;
185 int badoff = -1;
186 int closeopen = 0;
187
/*
 * Write "fsx: <formatted message>: <strerror(code)>" to stderr.  When fmt
 * is NULL the formatted message and its ": " separator are omitted.
 */
void
vwarnc(int code, const char *fmt, va_list ap)
{
	fputs("fsx: ", stderr);
	if (fmt) {
		vfprintf(stderr, fmt, ap);
		fputs(": ", stderr);
	}
	fprintf(stderr, "%s\n", strerror(code));
}
197
198 void
199 warn(const char * fmt, ...) {
200 va_list ap;
201 va_start(ap, fmt);
202 vwarnc(errno, fmt, ap);
203 va_end(ap);
204 }
205
206 #define BUF_SIZE 1024
207
208 void
209 prt(const char *fmt, ...)
210 {
211 va_list args;
212 char buffer[BUF_SIZE];
213
214 va_start(args, fmt);
215 vsnprintf(buffer, BUF_SIZE, fmt, args);
216 va_end(args);
217 fprintf(stdout, "%s", buffer);
218 if (fsxlogf)
219 fprintf(fsxlogf, "%s", buffer);
220 }
221
222 void
223 prterr(const char *prefix)
224 {
225 prt("%s%s%s\n", prefix, prefix ? ": " : "", strerror(errno));
226 }
227
228 void
229 prterrcode(const char *prefix, int code)
230 {
231 prt("%s%s%s\n", prefix, prefix ? ": " : "", strerror(-code));
232 }
233
/*
 * Print "<msg>: <strerror(-err)>" to stderr; err is expected to be a
 * negative errno-style code since it is negated before the lookup.
 */
void
simple_err(const char *msg, int err)
{
	const char *description = strerror(-err);

	fprintf(stderr, "%s: %s\n", msg, description);
}
239
240 /*
241 * random
242 */
243 std::mt19937 random_generator;
244
245 uint_fast32_t
246 get_random(void)
247 {
248 return random_generator();
249 }
250
251 void replay_imagename(char *buf, size_t len, int clones);
252
253 namespace {
254
255 static const std::string JOURNAL_CLIENT_ID("fsx");
256
257 struct ReplayHandler : public journal::ReplayHandler {
258 journal::Journaler *journaler;
259 journal::Journaler *replay_journaler;
260 Context *on_finish;
261
262 ReplayHandler(journal::Journaler *journaler,
263 journal::Journaler *replay_journaler, Context *on_finish)
264 : journaler(journaler), replay_journaler(replay_journaler),
265 on_finish(on_finish) {
266 }
267
268 void get() override {
269 }
270 void put() override {
271 }
272
273 void handle_entries_available() override {
274 while (true) {
275 journal::ReplayEntry replay_entry;
276 if (!journaler->try_pop_front(&replay_entry)) {
277 return;
278 }
279
280 replay_journaler->append(0, replay_entry.get_data());
281 }
282 }
283
284 void handle_complete(int r) override {
285 on_finish->complete(r);
286 }
287 };
288
289 int get_image_id(librados::IoCtx &io_ctx, const char *image_name,
290 std::string *image_id) {
291 librbd::RBD rbd;
292 librbd::Image image;
293 int r = rbd.open(io_ctx, image, image_name);
294 if (r < 0) {
295 simple_err("failed to open image", r);
296 return r;
297 }
298
299 rbd_image_info_t info;
300 r = image.stat(info, sizeof(info));
301 if (r < 0) {
302 simple_err("failed to stat image", r);
303 return r;
304 }
305
306 *image_id = std::string(&info.block_name_prefix[strlen(RBD_DATA_PREFIX)]);
307 return 0;
308 }
309
310 int register_journal(rados_ioctx_t ioctx, const char *image_name) {
311 librados::IoCtx io_ctx;
312 librados::IoCtx::from_rados_ioctx_t(ioctx, io_ctx);
313
314 std::string image_id;
315 int r = get_image_id(io_ctx, image_name, &image_id);
316 if (r < 0) {
317 return r;
318 }
319
320 journal::Journaler journaler(io_ctx, image_id, JOURNAL_CLIENT_ID, {});
321 r = journaler.register_client(bufferlist());
322 if (r < 0) {
323 simple_err("failed to register journal client", r);
324 return r;
325 }
326 return 0;
327 }
328
329 int unregister_journal(rados_ioctx_t ioctx, const char *image_name) {
330 librados::IoCtx io_ctx;
331 librados::IoCtx::from_rados_ioctx_t(ioctx, io_ctx);
332
333 std::string image_id;
334 int r = get_image_id(io_ctx, image_name, &image_id);
335 if (r < 0) {
336 return r;
337 }
338
339 journal::Journaler journaler(io_ctx, image_id, JOURNAL_CLIENT_ID, {});
340 r = journaler.unregister_client();
341 if (r < 0) {
342 simple_err("failed to unregister journal client", r);
343 return r;
344 }
345 return 0;
346 }
347
348 int create_replay_image(rados_ioctx_t ioctx, int order,
349 uint64_t stripe_unit, int stripe_count,
350 const char *replay_image_name,
351 const char *last_replay_image_name) {
352 librados::IoCtx io_ctx;
353 librados::IoCtx::from_rados_ioctx_t(ioctx, io_ctx);
354
355 int r;
356 librbd::RBD rbd;
357 if (last_replay_image_name == nullptr) {
358 r = rbd.create2(io_ctx, replay_image_name, 0,
359 RBD_FEATURES_ALL, &order);
360 } else {
361 r = rbd.clone2(io_ctx, last_replay_image_name, "snap",
362 io_ctx, replay_image_name, RBD_FEATURES_ALL,
363 &order, stripe_unit, stripe_count);
364 }
365
366 if (r < 0) {
367 simple_err("failed to create replay image", r);
368 return r;
369 }
370
371 return 0;
372 }
373
374 int replay_journal(rados_ioctx_t ioctx, const char *image_name,
375 const char *replay_image_name) {
376 librados::IoCtx io_ctx;
377 librados::IoCtx::from_rados_ioctx_t(ioctx, io_ctx);
378
379 std::string image_id;
380 int r = get_image_id(io_ctx, image_name, &image_id);
381 if (r < 0) {
382 return r;
383 }
384
385 std::string replay_image_id;
386 r = get_image_id(io_ctx, replay_image_name, &replay_image_id);
387 if (r < 0) {
388 return r;
389 }
390
391 journal::Journaler journaler(io_ctx, image_id, JOURNAL_CLIENT_ID, {});
392 C_SaferCond init_ctx;
393 journaler.init(&init_ctx);
394 BOOST_SCOPE_EXIT_ALL( (&journaler) ) {
395 journaler.shut_down();
396 };
397
398 r = init_ctx.wait();
399 if (r < 0) {
400 simple_err("failed to initialize journal", r);
401 return r;
402 }
403
404 journal::Journaler replay_journaler(io_ctx, replay_image_id, "", {});
405
406 C_SaferCond replay_init_ctx;
407 replay_journaler.init(&replay_init_ctx);
408 BOOST_SCOPE_EXIT_ALL( (&replay_journaler) ) {
409 replay_journaler.shut_down();
410 };
411
412 r = replay_init_ctx.wait();
413 if (r < 0) {
414 simple_err("failed to initialize replay journal", r);
415 return r;
416 }
417
418 replay_journaler.start_append(0, 0, 0);
419
420 C_SaferCond replay_ctx;
421 ReplayHandler replay_handler(&journaler, &replay_journaler,
422 &replay_ctx);
423
424 // copy journal events from source image to replay image
425 journaler.start_replay(&replay_handler);
426 r = replay_ctx.wait();
427
428 journaler.stop_replay();
429
430 C_SaferCond stop_ctx;
431 replay_journaler.stop_append(&stop_ctx);
432 int stop_r = stop_ctx.wait();
433 if (r == 0 && stop_r < 0) {
434 r = stop_r;
435 }
436
437 if (r < 0) {
438 simple_err("failed to replay journal", r);
439 return r;
440 }
441
442 librbd::RBD rbd;
443 librbd::Image image;
444 r = rbd.open(io_ctx, image, replay_image_name);
445 if (r < 0) {
446 simple_err("failed to open replay image", r);
447 return r;
448 }
449
450 // perform an IO op to initiate the journal replay
451 bufferlist bl;
452 r = static_cast<ssize_t>(image.write(0, 0, bl));
453 if (r < 0) {
454 simple_err("failed to write to replay image", r);
455 return r;
456 }
457 return 0;
458 }
459
460 int finalize_journal(rados_ioctx_t ioctx, const char *imagename, int clones,
461 int order, uint64_t stripe_unit, int stripe_count) {
462 char replayimagename[1024];
463 replay_imagename(replayimagename, sizeof(replayimagename), clones);
464
465 char lastreplayimagename[1024];
466 if (clones > 0) {
467 replay_imagename(lastreplayimagename,
468 sizeof(lastreplayimagename), clones - 1);
469 }
470
471 int ret = create_replay_image(ioctx, order, stripe_unit,
472 stripe_count, replayimagename,
473 clones > 0 ? lastreplayimagename :
474 nullptr);
475 if (ret < 0) {
476 exit(EXIT_FAILURE);
477 }
478
479 ret = replay_journal(ioctx, imagename, replayimagename);
480 if (ret < 0) {
481 exit(EXIT_FAILURE);
482 }
483 return 0;
484 }
485
486 } // anonymous namespace
487
488 /*
489 * rbd
490 */
491
492 struct rbd_ctx {
493 const char *name; /* image name */
494 rbd_image_t image; /* image handle */
495 const char *krbd_name; /* image /dev/rbd<id> name */ /* reused for nbd test */
496 int krbd_fd; /* image /dev/rbd<id> fd */ /* reused for nbd test */
497 };
498
499 #define RBD_CTX_INIT (struct rbd_ctx) { NULL, NULL, NULL, -1}
500
/*
 * Table of backend entry points.  One instance exists per backend
 * (librbd, krbd, nbd); slots a backend does not implement are left NULL.
 */
struct rbd_operations {
	/* open the named image and fill in ctx */
	int (*open)(const char *name, struct rbd_ctx *ctx);
	/* release everything open() set up */
	int (*close)(struct rbd_ctx *ctx);
	/* read/write len bytes at off */
	ssize_t (*read)(struct rbd_ctx *ctx, uint64_t off, size_t len, char *buf);
	ssize_t (*write)(struct rbd_ctx *ctx, uint64_t off, size_t len, const char *buf);
	int (*flush)(struct rbd_ctx *ctx);
	int (*discard)(struct rbd_ctx *ctx, uint64_t off, uint64_t len);
	/* store the current image size in *size */
	int (*get_size)(struct rbd_ctx *ctx, uint64_t *size);
	int (*resize)(struct rbd_ctx *ctx, uint64_t size);
	/* snapshot the image as src_snapname and clone it to dst_imagename */
	int (*clone)(struct rbd_ctx *ctx, const char *src_snapname,
		     const char *dst_imagename, int *order, int stripe_unit,
		     int stripe_count);
	int (*flatten)(struct rbd_ctx *ctx);
	ssize_t (*writesame)(struct rbd_ctx *ctx, uint64_t off, size_t len,
			     const char *buf, size_t data_len);
	ssize_t (*compare_and_write)(struct rbd_ctx *ctx, uint64_t off, size_t len,
				     const char *cmp_buf, const char *buf);
};
519
520 char *pool; /* name of the pool our test image is in */
521 char *iname; /* name of our test image */
522 rados_t cluster; /* handle for our test cluster */
523 rados_ioctx_t ioctx; /* handle for our test pool */
524 struct krbd_ctx *krbd; /* handle for libkrbd */
525 bool skip_partial_discard; /* rbd_skip_partial_discard config value*/
526
527 /*
528 * librbd/krbd rbd_operations handlers. Given the rest of fsx.c, no
529 * attempt to do error handling is made in these handlers.
530 */
531
532 int
533 __librbd_open(const char *name, struct rbd_ctx *ctx)
534 {
535 rbd_image_t image;
536 int ret;
537
538 assert(!ctx->name && !ctx->image &&
539 !ctx->krbd_name && ctx->krbd_fd < 0);
540
541 ret = rbd_open(ioctx, name, &image, NULL);
542 if (ret < 0) {
543 prt("rbd_open(%s) failed\n", name);
544 return ret;
545 }
546
547 ctx->name = strdup(name);
548 ctx->image = image;
549 ctx->krbd_name = NULL;
550 ctx->krbd_fd = -1;
551
552 return 0;
553 }
554
555 int
556 librbd_open(const char *name, struct rbd_ctx *ctx)
557 {
558 return __librbd_open(name, ctx);
559 }
560
561 int
562 __librbd_close(struct rbd_ctx *ctx)
563 {
564 int ret;
565
566 assert(ctx->name && ctx->image);
567
568 ret = rbd_close(ctx->image);
569 if (ret < 0) {
570 prt("rbd_close(%s) failed\n", ctx->name);
571 return ret;
572 }
573
574 free((void *)ctx->name);
575
576 ctx->name = NULL;
577 ctx->image = NULL;
578
579 return 0;
580 }
581
582 int
583 librbd_close(struct rbd_ctx *ctx)
584 {
585 return __librbd_close(ctx);
586 }
587
588 int
589 librbd_verify_object_map(struct rbd_ctx *ctx)
590 {
591 int n;
592 uint64_t flags;
593 n = rbd_get_flags(ctx->image, &flags);
594 if (n < 0) {
595 prt("rbd_get_flags() failed\n");
596 return n;
597 }
598
599 if ((flags & RBD_FLAG_OBJECT_MAP_INVALID) != 0) {
600 prt("rbd_get_flags() indicates object map is invalid\n");
601 return -EINVAL;
602 }
603 return 0;
604 }
605
606 ssize_t
607 librbd_read(struct rbd_ctx *ctx, uint64_t off, size_t len, char *buf)
608 {
609 ssize_t n;
610
611 n = rbd_read(ctx->image, off, len, buf);
612 if (n < 0)
613 prt("rbd_read(%llu, %zu) failed\n", off, len);
614
615 return n;
616 }
617
618 ssize_t
619 librbd_write(struct rbd_ctx *ctx, uint64_t off, size_t len, const char *buf)
620 {
621 ssize_t n;
622 int ret;
623
624 n = rbd_write(ctx->image, off, len, buf);
625 if (n < 0) {
626 prt("rbd_write(%llu, %zu) failed\n", off, len);
627 return n;
628 }
629
630 ret = librbd_verify_object_map(ctx);
631 if (ret < 0) {
632 return ret;
633 }
634 return n;
635 }
636
637 int
638 librbd_flush(struct rbd_ctx *ctx)
639 {
640 int ret;
641
642 ret = rbd_flush(ctx->image);
643 if (ret < 0) {
644 prt("rbd_flush failed\n");
645 return ret;
646 }
647
648 return librbd_verify_object_map(ctx);
649 }
650
651 int
652 librbd_discard(struct rbd_ctx *ctx, uint64_t off, uint64_t len)
653 {
654 int ret;
655
656 ret = rbd_discard(ctx->image, off, len);
657 if (ret < 0) {
658 prt("rbd_discard(%llu, %llu) failed\n", off, len);
659 return ret;
660 }
661
662 return librbd_verify_object_map(ctx);
663 }
664
665 ssize_t
666 librbd_writesame(struct rbd_ctx *ctx, uint64_t off, size_t len,
667 const char *buf, size_t data_len)
668 {
669 ssize_t n;
670 int ret;
671
672 n = rbd_writesame(ctx->image, off, len, buf, data_len, 0);
673 if (n < 0) {
674 prt("rbd_writesame(%llu, %zu) failed\n", off, len);
675 return n;
676 }
677
678 ret = librbd_verify_object_map(ctx);
679 if (ret < 0) {
680 return ret;
681 }
682 return n;
683 }
684
685 ssize_t
686 librbd_compare_and_write(struct rbd_ctx *ctx, uint64_t off, size_t len,
687 const char *cmp_buf, const char *buf)
688 {
689 ssize_t n;
690 int ret;
691 uint64_t mismatch_off = 0;
692
693 n = rbd_compare_and_write(ctx->image, off, len, cmp_buf, buf, &mismatch_off, 0);
694 if (n == -EINVAL) {
695 return n;
696 } else if (n < 0) {
697 prt("rbd_compare_and_write mismatch(%llu, %zu, %llu) failed\n",
698 off, len, mismatch_off);
699 return n;
700 }
701
702 ret = librbd_verify_object_map(ctx);
703 if (ret < 0) {
704 return ret;
705 }
706 return n;
707
708 }
709
710 int
711 librbd_get_size(struct rbd_ctx *ctx, uint64_t *size)
712 {
713 rbd_image_info_t info;
714 int ret;
715
716 ret = rbd_stat(ctx->image, &info, sizeof(info));
717 if (ret < 0) {
718 prt("rbd_stat failed\n");
719 return ret;
720 }
721
722 *size = info.size;
723
724 return 0;
725 }
726
727 int
728 __librbd_resize(struct rbd_ctx *ctx, uint64_t size)
729 {
730 int ret;
731
732 ret = rbd_resize(ctx->image, size);
733 if (ret < 0) {
734 prt("rbd_resize(%llu) failed\n", size);
735 return ret;
736 }
737
738 return librbd_verify_object_map(ctx);
739 }
740
741 int
742 librbd_resize(struct rbd_ctx *ctx, uint64_t size)
743 {
744 return __librbd_resize(ctx, size);
745 }
746
747 int
748 __librbd_clone(struct rbd_ctx *ctx, const char *src_snapname,
749 const char *dst_imagename, int *order, int stripe_unit,
750 int stripe_count, bool krbd)
751 {
752 int ret;
753
754 ret = rbd_snap_create(ctx->image, src_snapname);
755 if (ret < 0) {
756 prt("rbd_snap_create(%s@%s) failed\n", ctx->name,
757 src_snapname);
758 return ret;
759 }
760
761 ret = rbd_snap_protect(ctx->image, src_snapname);
762 if (ret < 0) {
763 prt("rbd_snap_protect(%s@%s) failed\n", ctx->name,
764 src_snapname);
765 return ret;
766 }
767
768 uint64_t features = RBD_FEATURES_ALL;
769 if (krbd) {
770 features &= ~(RBD_FEATURE_OBJECT_MAP |
771 RBD_FEATURE_FAST_DIFF |
772 RBD_FEATURE_DEEP_FLATTEN |
773 RBD_FEATURE_JOURNALING);
774 }
775 ret = rbd_clone2(ioctx, ctx->name, src_snapname, ioctx,
776 dst_imagename, features, order,
777 stripe_unit, stripe_count);
778 if (ret < 0) {
779 prt("rbd_clone2(%s@%s -> %s) failed\n", ctx->name,
780 src_snapname, dst_imagename);
781 return ret;
782 }
783
784 return 0;
785 }
786
787 int
788 librbd_clone(struct rbd_ctx *ctx, const char *src_snapname,
789 const char *dst_imagename, int *order, int stripe_unit,
790 int stripe_count)
791 {
792 return __librbd_clone(ctx, src_snapname, dst_imagename, order,
793 stripe_unit, stripe_count, false);
794 }
795
796 int
797 __librbd_flatten(struct rbd_ctx *ctx)
798 {
799 int ret;
800
801 ret = rbd_flatten(ctx->image);
802 if (ret < 0) {
803 prt("rbd_flatten failed\n");
804 return ret;
805 }
806
807 return librbd_verify_object_map(ctx);
808 }
809
810 int
811 librbd_flatten(struct rbd_ctx *ctx)
812 {
813 return __librbd_flatten(ctx);
814 }
815
816 const struct rbd_operations librbd_operations = {
817 librbd_open,
818 librbd_close,
819 librbd_read,
820 librbd_write,
821 librbd_flush,
822 librbd_discard,
823 librbd_get_size,
824 librbd_resize,
825 librbd_clone,
826 librbd_flatten,
827 librbd_writesame,
828 librbd_compare_and_write,
829 };
830
831 int
832 krbd_open(const char *name, struct rbd_ctx *ctx)
833 {
834 char *devnode;
835 int fd;
836 int ret;
837
838 ret = __librbd_open(name, ctx);
839 if (ret < 0)
840 return ret;
841
842 ret = krbd_map(krbd, pool, name, "", "", &devnode);
843 if (ret < 0) {
844 prt("krbd_map(%s) failed\n", name);
845 return ret;
846 }
847
848 fd = open(devnode, O_RDWR | o_direct);
849 if (fd < 0) {
850 ret = -errno;
851 prt("open(%s) failed\n", devnode);
852 return ret;
853 }
854
855 ctx->krbd_name = devnode;
856 ctx->krbd_fd = fd;
857
858 return 0;
859 }
860
861 int
862 krbd_close(struct rbd_ctx *ctx)
863 {
864 int ret;
865
866 assert(ctx->krbd_name && ctx->krbd_fd >= 0);
867
868 if (close(ctx->krbd_fd) < 0) {
869 ret = -errno;
870 prt("close(%s) failed\n", ctx->krbd_name);
871 return ret;
872 }
873
874 ret = krbd_unmap(krbd, ctx->krbd_name, "");
875 if (ret < 0) {
876 prt("krbd_unmap(%s) failed\n", ctx->krbd_name);
877 return ret;
878 }
879
880 free((void *)ctx->krbd_name);
881
882 ctx->krbd_name = NULL;
883 ctx->krbd_fd = -1;
884
885 return __librbd_close(ctx);
886 }
887
888 ssize_t
889 krbd_read(struct rbd_ctx *ctx, uint64_t off, size_t len, char *buf)
890 {
891 ssize_t n;
892
893 n = pread(ctx->krbd_fd, buf, len, off);
894 if (n < 0) {
895 n = -errno;
896 prt("pread(%llu, %zu) failed\n", off, len);
897 return n;
898 }
899
900 return n;
901 }
902
903 ssize_t
904 krbd_write(struct rbd_ctx *ctx, uint64_t off, size_t len, const char *buf)
905 {
906 ssize_t n;
907
908 n = pwrite(ctx->krbd_fd, buf, len, off);
909 if (n < 0) {
910 n = -errno;
911 prt("pwrite(%llu, %zu) failed\n", off, len);
912 return n;
913 }
914
915 return n;
916 }
917
918 int
919 __krbd_flush(struct rbd_ctx *ctx, bool invalidate)
920 {
921 int ret;
922
923 if (o_direct)
924 return 0;
925
926 /*
927 * BLKFLSBUF will sync the filesystem on top of the device (we
928 * don't care about that here, since we write directly to it),
929 * write out any dirty buffers and invalidate the buffer cache.
930 * It won't do a hardware cache flush.
931 *
932 * fsync() will write out any dirty buffers and do a hardware
933 * cache flush (which we don't care about either, because for
934 * krbd it's a noop). It won't try to empty the buffer cache
935 * nor poke the filesystem before writing out.
936 *
937 * Given that, for our purposes, fsync is a flush, while
938 * BLKFLSBUF is a flush+invalidate.
939 */
940 if (invalidate)
941 ret = ioctl(ctx->krbd_fd, BLKFLSBUF, NULL);
942 else
943 ret = fsync(ctx->krbd_fd);
944 if (ret < 0) {
945 ret = -errno;
946 prt("%s failed\n", invalidate ? "BLKFLSBUF" : "fsync");
947 return ret;
948 }
949
950 return 0;
951 }
952
953 int
954 krbd_flush(struct rbd_ctx *ctx)
955 {
956 return __krbd_flush(ctx, false);
957 }
958
959 int
960 krbd_discard(struct rbd_ctx *ctx, uint64_t off, uint64_t len)
961 {
962 uint64_t range[2] = { off, len };
963 int ret;
964
965 /*
966 * BLKDISCARD goes straight to disk and doesn't do anything
967 * about dirty buffers. This means we need to flush so that
968 *
969 * write 0..3M
970 * discard 1..2M
971 *
972 * results in "data 0000 data" rather than "data data data" on
973 * disk and invalidate so that
974 *
975 * discard 1..2M
976 * read 0..3M
977 *
978 * returns "data 0000 data" rather than "data data data" in
979 * case 1..2M was cached.
980 */
981 ret = __krbd_flush(ctx, true);
982 if (ret < 0)
983 return ret;
984
985 /*
986 * off and len must be 512-byte aligned, otherwise BLKDISCARD
987 * will fail with -EINVAL. This means that -K (enable krbd
988 * mode) requires -h 512 or similar.
989 */
990 if (ioctl(ctx->krbd_fd, BLKDISCARD, &range) < 0) {
991 ret = -errno;
992 prt("BLKDISCARD(%llu, %llu) failed\n", off, len);
993 return ret;
994 }
995
996 return 0;
997 }
998
999 int
1000 krbd_get_size(struct rbd_ctx *ctx, uint64_t *size)
1001 {
1002 uint64_t bytes;
1003
1004 if (ioctl(ctx->krbd_fd, BLKGETSIZE64, &bytes) < 0) {
1005 int ret = -errno;
1006 prt("BLKGETSIZE64 failed\n");
1007 return ret;
1008 }
1009
1010 *size = bytes;
1011
1012 return 0;
1013 }
1014
1015 int
1016 krbd_resize(struct rbd_ctx *ctx, uint64_t size)
1017 {
1018 int ret;
1019
1020 assert(size % truncbdy == 0);
1021
1022 /*
1023 * When krbd detects a size change, it calls revalidate_disk(),
1024 * which ends up calling invalidate_bdev(), which invalidates
1025 * clean pages and does nothing about dirty pages beyond the
1026 * new size. The preceding cache flush makes sure those pages
1027 * are invalidated, which is what we need on shrink so that
1028 *
1029 * write 0..1M
1030 * resize 0
1031 * resize 2M
1032 * read 0..2M
1033 *
1034 * returns "0000 0000" rather than "data 0000".
1035 */
1036 ret = __krbd_flush(ctx, false);
1037 if (ret < 0)
1038 return ret;
1039
1040 return __librbd_resize(ctx, size);
1041 }
1042
1043 int
1044 krbd_clone(struct rbd_ctx *ctx, const char *src_snapname,
1045 const char *dst_imagename, int *order, int stripe_unit,
1046 int stripe_count)
1047 {
1048 int ret;
1049
1050 ret = __krbd_flush(ctx, false);
1051 if (ret < 0)
1052 return ret;
1053
1054 return __librbd_clone(ctx, src_snapname, dst_imagename, order,
1055 stripe_unit, stripe_count, true);
1056 }
1057
1058 int
1059 krbd_flatten(struct rbd_ctx *ctx)
1060 {
1061 int ret;
1062
1063 ret = __krbd_flush(ctx, false);
1064 if (ret < 0)
1065 return ret;
1066
1067 return __librbd_flatten(ctx);
1068 }
1069
1070 const struct rbd_operations krbd_operations = {
1071 krbd_open,
1072 krbd_close,
1073 krbd_read,
1074 krbd_write,
1075 krbd_flush,
1076 krbd_discard,
1077 krbd_get_size,
1078 krbd_resize,
1079 krbd_clone,
1080 krbd_flatten,
1081 NULL,
1082 };
1083
1084 int
1085 nbd_open(const char *name, struct rbd_ctx *ctx)
1086 {
1087 int r;
1088 int fd;
1089 char dev[4096];
1090 char *devnode;
1091
1092 SubProcess process("rbd-nbd", SubProcess::KEEP, SubProcess::PIPE,
1093 SubProcess::KEEP);
1094 process.add_cmd_arg("map");
1095 std::string img;
1096 img.append(pool);
1097 img.append("/");
1098 img.append(name);
1099 process.add_cmd_arg(img.c_str());
1100
1101 r = __librbd_open(name, ctx);
1102 if (r < 0)
1103 return r;
1104
1105 r = process.spawn();
1106 if (r < 0) {
1107 prt("nbd_open failed to run rbd-nbd error: %s\n", process.err().c_str());
1108 return r;
1109 }
1110 r = safe_read(process.get_stdout(), dev, sizeof(dev));
1111 if (r < 0) {
1112 prt("nbd_open failed to get nbd device path\n");
1113 return r;
1114 }
1115 for (int i = 0; i < r; ++i)
1116 if (dev[i] == 10 || dev[i] == 13)
1117 dev[i] = 0;
1118 dev[r] = 0;
1119 r = process.join();
1120 if (r) {
1121 prt("rbd-nbd failed with error: %s", process.err().c_str());
1122 return -EINVAL;
1123 }
1124
1125 devnode = strdup(dev);
1126 if (!devnode)
1127 return -ENOMEM;
1128
1129 fd = open(devnode, O_RDWR | o_direct);
1130 if (fd < 0) {
1131 r = -errno;
1132 prt("open(%s) failed\n", devnode);
1133 return r;
1134 }
1135
1136 ctx->krbd_name = devnode;
1137 ctx->krbd_fd = fd;
1138
1139 return 0;
1140 }
1141
1142 int
1143 nbd_close(struct rbd_ctx *ctx)
1144 {
1145 int r;
1146
1147 assert(ctx->krbd_name && ctx->krbd_fd >= 0);
1148
1149 if (close(ctx->krbd_fd) < 0) {
1150 r = -errno;
1151 prt("close(%s) failed\n", ctx->krbd_name);
1152 return r;
1153 }
1154
1155 SubProcess process("rbd-nbd");
1156 process.add_cmd_arg("unmap");
1157 process.add_cmd_arg(ctx->krbd_name);
1158
1159 r = process.spawn();
1160 if (r < 0) {
1161 prt("nbd_close failed to run rbd-nbd error: %s\n", process.err().c_str());
1162 return r;
1163 }
1164 r = process.join();
1165 if (r) {
1166 prt("rbd-nbd failed with error: %d", process.err().c_str());
1167 return -EINVAL;
1168 }
1169
1170 free((void *)ctx->krbd_name);
1171
1172 ctx->krbd_name = NULL;
1173 ctx->krbd_fd = -1;
1174
1175 return __librbd_close(ctx);
1176 }
1177
1178 int
1179 nbd_clone(struct rbd_ctx *ctx, const char *src_snapname,
1180 const char *dst_imagename, int *order, int stripe_unit,
1181 int stripe_count)
1182 {
1183 int ret;
1184
1185 ret = __krbd_flush(ctx, false);
1186 if (ret < 0)
1187 return ret;
1188
1189 return __librbd_clone(ctx, src_snapname, dst_imagename, order,
1190 stripe_unit, stripe_count, false);
1191 }
1192
1193 const struct rbd_operations nbd_operations = {
1194 nbd_open,
1195 nbd_close,
1196 krbd_read,
1197 krbd_write,
1198 krbd_flush,
1199 krbd_discard,
1200 krbd_get_size,
1201 krbd_resize,
1202 nbd_clone,
1203 krbd_flatten,
1204 NULL,
1205 };
1206
1207 struct rbd_ctx ctx = RBD_CTX_INIT;
1208 const struct rbd_operations *ops = &librbd_operations;
1209
1210 static bool rbd_image_has_parent(struct rbd_ctx *ctx)
1211 {
1212 int ret;
1213
1214 ret = rbd_get_parent_info(ctx->image, NULL, 0, NULL, 0, NULL, 0);
1215 if (ret < 0 && ret != -ENOENT) {
1216 prterrcode("rbd_get_parent_info", ret);
1217 exit(1);
1218 }
1219
1220 return !ret;
1221 }
1222
1223 /*
1224 * fsx
1225 */
1226
1227 void
1228 log4(int operation, int arg0, int arg1, int arg2)
1229 {
1230 struct log_entry *le;
1231
1232 le = &oplog[logptr];
1233 le->operation = operation;
1234 if (closeopen)
1235 le->operation = ~ le->operation;
1236 le->args[0] = arg0;
1237 le->args[1] = arg1;
1238 le->args[2] = arg2;
1239 logptr++;
1240 logcount++;
1241 if (logptr >= LOGSIZE)
1242 logptr = 0;
1243 }
1244
1245 void
1246 logdump(void)
1247 {
1248 int i, count, down;
1249 struct log_entry *lp;
1250 const char *falloc_type[3] = {"PAST_EOF", "EXTENDING", "INTERIOR"};
1251
1252 prt("LOG DUMP (%d total operations):\n", logcount);
1253 if (logcount < LOGSIZE) {
1254 i = 0;
1255 count = logcount;
1256 } else {
1257 i = logptr;
1258 count = LOGSIZE;
1259 }
1260 for ( ; count > 0; count--) {
1261 int opnum;
1262
1263 opnum = i+1 + (logcount/LOGSIZE)*LOGSIZE;
1264 prt("%d(%3d mod 256): ", opnum, opnum%256);
1265 lp = &oplog[i];
1266 if ((closeopen = lp->operation < 0))
1267 lp->operation = ~ lp->operation;
1268
1269 switch (lp->operation) {
1270 case OP_MAPREAD:
1271 prt("MAPREAD 0x%x thru 0x%x\t(0x%x bytes)",
1272 lp->args[0], lp->args[0] + lp->args[1] - 1,
1273 lp->args[1]);
1274 if (badoff >= lp->args[0] && badoff <
1275 lp->args[0] + lp->args[1])
1276 prt("\t***RRRR***");
1277 break;
1278 case OP_MAPWRITE:
1279 prt("MAPWRITE 0x%x thru 0x%x\t(0x%x bytes)",
1280 lp->args[0], lp->args[0] + lp->args[1] - 1,
1281 lp->args[1]);
1282 if (badoff >= lp->args[0] && badoff <
1283 lp->args[0] + lp->args[1])
1284 prt("\t******WWWW");
1285 break;
1286 case OP_READ:
1287 prt("READ 0x%x thru 0x%x\t(0x%x bytes)",
1288 lp->args[0], lp->args[0] + lp->args[1] - 1,
1289 lp->args[1]);
1290 if (badoff >= lp->args[0] &&
1291 badoff < lp->args[0] + lp->args[1])
1292 prt("\t***RRRR***");
1293 break;
1294 case OP_WRITE:
1295 prt("WRITE 0x%x thru 0x%x\t(0x%x bytes)",
1296 lp->args[0], lp->args[0] + lp->args[1] - 1,
1297 lp->args[1]);
1298 if (lp->args[0] > lp->args[2])
1299 prt(" HOLE");
1300 else if (lp->args[0] + lp->args[1] > lp->args[2])
1301 prt(" EXTEND");
1302 if ((badoff >= lp->args[0] || badoff >=lp->args[2]) &&
1303 badoff < lp->args[0] + lp->args[1])
1304 prt("\t***WWWW");
1305 break;
1306 case OP_TRUNCATE:
1307 down = lp->args[0] < lp->args[1];
1308 prt("TRUNCATE %s\tfrom 0x%x to 0x%x",
1309 down ? "DOWN" : "UP", lp->args[1], lp->args[0]);
1310 if (badoff >= lp->args[!down] &&
1311 badoff < lp->args[!!down])
1312 prt("\t******WWWW");
1313 break;
1314 case OP_FALLOCATE:
1315 /* 0: offset 1: length 2: where alloced */
1316 prt("FALLOC 0x%x thru 0x%x\t(0x%x bytes) %s",
1317 lp->args[0], lp->args[0] + lp->args[1],
1318 lp->args[1], falloc_type[lp->args[2]]);
1319 if (badoff >= lp->args[0] &&
1320 badoff < lp->args[0] + lp->args[1])
1321 prt("\t******FFFF");
1322 break;
1323 case OP_PUNCH_HOLE:
1324 prt("PUNCH 0x%x thru 0x%x\t(0x%x bytes)",
1325 lp->args[0], lp->args[0] + lp->args[1] - 1,
1326 lp->args[1]);
1327 if (badoff >= lp->args[0] && badoff <
1328 lp->args[0] + lp->args[1])
1329 prt("\t******PPPP");
1330 break;
1331 case OP_WRITESAME:
1332 prt("WRITESAME 0x%x thru 0x%x\t(0x%x bytes) data_size 0x%x",
1333 lp->args[0], lp->args[0] + lp->args[1] - 1,
1334 lp->args[1], lp->args[2]);
1335 if (badoff >= lp->args[0] &&
1336 badoff < lp->args[0] + lp->args[1])
1337 prt("\t***WSWSWSWS");
1338 break;
1339 case OP_COMPARE_AND_WRITE:
1340 prt("COMPARE_AND_WRITE 0x%x thru 0x%x\t(0x%x bytes)",
1341 lp->args[0], lp->args[0] + lp->args[1] - 1,
1342 lp->args[1]);
1343 if (lp->args[0] > lp->args[2])
1344 prt(" HOLE");
1345 else if (lp->args[0] + lp->args[1] > lp->args[2])
1346 prt(" EXTEND");
1347 if ((badoff >= lp->args[0] || badoff >=lp->args[2]) &&
1348 badoff < lp->args[0] + lp->args[1])
1349 prt("\t***WWWW");
1350 break;
1351 case OP_CLONE:
1352 prt("CLONE");
1353 break;
1354 case OP_FLATTEN:
1355 prt("FLATTEN");
1356 break;
1357 case OP_SKIPPED:
1358 prt("SKIPPED (no operation)");
1359 break;
1360 default:
1361 prt("BOGUS LOG ENTRY (operation code = %d)!",
1362 lp->operation);
1363 }
1364 if (closeopen)
1365 prt("\n\t\tCLOSE/OPEN");
1366 prt("\n");
1367 i++;
1368 if (i == LOGSIZE)
1369 i = 0;
1370 }
1371 }
1372
1373 void
1374 save_buffer(char *buffer, off_t bufferlength, int fd)
1375 {
1376 off_t ret;
1377 ssize_t byteswritten;
1378
1379 if (fd <= 0 || bufferlength == 0)
1380 return;
1381
1382 if (bufferlength > SSIZE_MAX) {
1383 prt("fsx flaw: overflow in save_buffer\n");
1384 exit(67);
1385 }
1386
1387 ret = lseek(fd, (off_t)0, SEEK_SET);
1388 if (ret == (off_t)-1)
1389 prterr("save_buffer: lseek 0");
1390
1391 byteswritten = write(fd, buffer, (size_t)bufferlength);
1392 if (byteswritten != bufferlength) {
1393 if (byteswritten == -1)
1394 prterr("save_buffer write");
1395 else
1396 warn("save_buffer: short write, 0x%x bytes instead of 0x%llx\n",
1397 (unsigned)byteswritten,
1398 (unsigned long long)bufferlength);
1399 }
1400 }
1401
1402
1403 void
1404 report_failure(int status)
1405 {
1406 logdump();
1407
1408 if (fsxgoodfd) {
1409 if (good_buf) {
1410 save_buffer(good_buf, file_size, fsxgoodfd);
1411 prt("Correct content saved for comparison\n");
1412 prt("(maybe hexdump \"%s\" vs \"%s.fsxgood\")\n",
1413 iname, iname);
1414 }
1415 close(fsxgoodfd);
1416 }
1417 sleep(3); // so the log can flush to disk. KLUDGEY!
1418 exit(status);
1419 }
1420
/* Big-endian 16-bit value formed from the two bytes starting at cp. */
#define short_at(cp)	((unsigned short)((((unsigned char *)(cp))[0] << 8) | \
					  ((unsigned char *)(cp))[1]))
1423
1424 int
1425 fsxcmp(char *good_buf, char *temp_buf, unsigned size)
1426 {
1427 if (!skip_partial_discard) {
1428 return memcmp(good_buf, temp_buf, size);
1429 }
1430
1431 for (unsigned i = 0; i < size; i++) {
1432 if (good_buf[i] != temp_buf[i] && good_buf[i] != 0) {
1433 return good_buf[i] - temp_buf[i];
1434 }
1435 }
1436 return 0;
1437 }
1438
1439 void
1440 check_buffers(char *good_buf, char *temp_buf, unsigned offset, unsigned size)
1441 {
1442 if (fsxcmp(good_buf + offset, temp_buf, size) != 0) {
1443 unsigned i = 0;
1444 unsigned n = 0;
1445
1446 prt("READ BAD DATA: offset = 0x%x, size = 0x%x, fname = %s\n",
1447 offset, size, iname);
1448 prt("OFFSET\tGOOD\tBAD\tRANGE\n");
1449 while (size > 0) {
1450 unsigned char c = good_buf[offset];
1451 unsigned char t = temp_buf[i];
1452 if (c != t) {
1453 if (n < 16) {
1454 unsigned bad = short_at(&temp_buf[i]);
1455 prt("0x%5x\t0x%04x\t0x%04x", offset,
1456 short_at(&good_buf[offset]), bad);
1457 unsigned op = temp_buf[(offset & 1) ? i+1 : i];
1458 prt("\t0x%5x\n", n);
1459 if (op)
1460 prt("operation# (mod 256) for "
1461 "the bad data may be %u\n",
1462 ((unsigned)op & 0xff));
1463 else
1464 prt("operation# (mod 256) for "
1465 "the bad data unknown, check"
1466 " HOLE and EXTEND ops\n");
1467 }
1468 n++;
1469 badoff = offset;
1470 }
1471 offset++;
1472 i++;
1473 size--;
1474 }
1475 report_failure(110);
1476 }
1477 }
1478
1479
1480 void
1481 check_size(void)
1482 {
1483 uint64_t size;
1484 int ret;
1485
1486 ret = ops->get_size(&ctx, &size);
1487 if (ret < 0)
1488 prterrcode("check_size: ops->get_size", ret);
1489
1490 if ((uint64_t)file_size != size) {
1491 prt("Size error: expected 0x%llx stat 0x%llx\n",
1492 (unsigned long long)file_size,
1493 (unsigned long long)size);
1494 report_failure(120);
1495 }
1496 }
1497
1498 #define TRUNC_HACK_SIZE (200ULL << 9) /* 512-byte aligned for krbd */
1499
1500 void
1501 check_trunc_hack(void)
1502 {
1503 uint64_t size;
1504 int ret;
1505
1506 ret = ops->resize(&ctx, 0ULL);
1507 if (ret < 0)
1508 prterrcode("check_trunc_hack: ops->resize pre", ret);
1509
1510 ret = ops->resize(&ctx, TRUNC_HACK_SIZE);
1511 if (ret < 0)
1512 prterrcode("check_trunc_hack: ops->resize actual", ret);
1513
1514 ret = ops->get_size(&ctx, &size);
1515 if (ret < 0)
1516 prterrcode("check_trunc_hack: ops->get_size", ret);
1517
1518 if (size != TRUNC_HACK_SIZE) {
1519 prt("no extend on truncate! not posix!\n");
1520 exit(130);
1521 }
1522
1523 ret = ops->resize(&ctx, 0ULL);
1524 if (ret < 0)
1525 prterrcode("check_trunc_hack: ops->resize post", ret);
1526 }
1527
1528 int
1529 create_image()
1530 {
1531 int r;
1532 int order = 0;
1533 char buf[32];
1534
1535 r = rados_create(&cluster, NULL);
1536 if (r < 0) {
1537 simple_err("Could not create cluster handle", r);
1538 return r;
1539 }
1540 rados_conf_parse_env(cluster, NULL);
1541 r = rados_conf_read_file(cluster, NULL);
1542 if (r < 0) {
1543 simple_err("Error reading ceph config file", r);
1544 goto failed_shutdown;
1545 }
1546 r = rados_connect(cluster);
1547 if (r < 0) {
1548 simple_err("Error connecting to cluster", r);
1549 goto failed_shutdown;
1550 }
1551 r = krbd_create_from_context(rados_cct(cluster), &krbd);
1552 if (r < 0) {
1553 simple_err("Could not create libkrbd handle", r);
1554 goto failed_shutdown;
1555 }
1556
1557 r = rados_pool_create(cluster, pool);
1558 if (r < 0 && r != -EEXIST) {
1559 simple_err("Error creating pool", r);
1560 goto failed_krbd;
1561 }
1562 r = rados_ioctx_create(cluster, pool, &ioctx);
1563 if (r < 0) {
1564 simple_err("Error creating ioctx", r);
1565 goto failed_krbd;
1566 }
1567 rados_application_enable(ioctx, "rbd", 1);
1568
1569 if (clone_calls || journal_replay) {
1570 uint64_t features = 0;
1571 if (clone_calls) {
1572 features |= RBD_FEATURE_LAYERING;
1573 }
1574 if (journal_replay) {
1575 features |= (RBD_FEATURE_EXCLUSIVE_LOCK |
1576 RBD_FEATURE_JOURNALING);
1577 }
1578 r = rbd_create2(ioctx, iname, 0, features, &order);
1579 } else {
1580 r = rbd_create(ioctx, iname, 0, &order);
1581 }
1582 if (r < 0) {
1583 simple_err("Error creating image", r);
1584 goto failed_open;
1585 }
1586
1587 if (journal_replay) {
1588 r = register_journal(ioctx, iname);
1589 if (r < 0) {
1590 goto failed_open;
1591 }
1592 }
1593
1594 r = rados_conf_get(cluster, "rbd_skip_partial_discard", buf,
1595 sizeof(buf));
1596 if (r < 0) {
1597 simple_err("Could not get rbd_skip_partial_discard value", r);
1598 goto failed_open;
1599 }
1600 skip_partial_discard = (strcmp(buf, "true") == 0);
1601
1602 return 0;
1603
1604 failed_open:
1605 rados_ioctx_destroy(ioctx);
1606 failed_krbd:
1607 krbd_destroy(krbd);
1608 failed_shutdown:
1609 rados_shutdown(cluster);
1610 return r;
1611 }
1612
1613 void
1614 doflush(unsigned offset, unsigned size)
1615 {
1616 int ret;
1617
1618 if (o_direct)
1619 return;
1620
1621 ret = ops->flush(&ctx);
1622 if (ret < 0)
1623 prterrcode("doflush: ops->flush", ret);
1624 }
1625
1626 void
1627 doread(unsigned offset, unsigned size)
1628 {
1629 int ret;
1630
1631 offset -= offset % readbdy;
1632 if (o_direct)
1633 size -= size % readbdy;
1634 if (size == 0) {
1635 if (!quiet && testcalls > simulatedopcount && !o_direct)
1636 prt("skipping zero size read\n");
1637 log4(OP_SKIPPED, OP_READ, offset, size);
1638 return;
1639 }
1640 if (size + offset > file_size) {
1641 if (!quiet && testcalls > simulatedopcount)
1642 prt("skipping seek/read past end of file\n");
1643 log4(OP_SKIPPED, OP_READ, offset, size);
1644 return;
1645 }
1646
1647 log4(OP_READ, offset, size, 0);
1648
1649 if (testcalls <= simulatedopcount)
1650 return;
1651
1652 if (!quiet &&
1653 ((progressinterval && testcalls % progressinterval == 0) ||
1654 (debug &&
1655 (monitorstart == -1 ||
1656 (static_cast<long>(offset + size) > monitorstart &&
1657 (monitorend == -1 ||
1658 static_cast<long>(offset) <= monitorend))))))
1659 prt("%lu read\t0x%x thru\t0x%x\t(0x%x bytes)\n", testcalls,
1660 offset, offset + size - 1, size);
1661
1662 ret = ops->read(&ctx, offset, size, temp_buf);
1663 if (ret != (int)size) {
1664 if (ret < 0)
1665 prterrcode("doread: ops->read", ret);
1666 else
1667 prt("short read: 0x%x bytes instead of 0x%x\n",
1668 ret, size);
1669 report_failure(141);
1670 }
1671
1672 check_buffers(good_buf, temp_buf, offset, size);
1673 }
1674
1675
1676 void
1677 check_eofpage(char *s, unsigned offset, char *p, int size)
1678 {
1679 unsigned long last_page, should_be_zero;
1680
1681 if (offset + size <= (file_size & ~page_mask))
1682 return;
1683 /*
1684 * we landed in the last page of the file
1685 * test to make sure the VM system provided 0's
1686 * beyond the true end of the file mapping
1687 * (as required by mmap def in 1996 posix 1003.1)
1688 */
1689 last_page = ((unsigned long)p + (offset & page_mask) + size) & ~page_mask;
1690
1691 for (should_be_zero = last_page + (file_size & page_mask);
1692 should_be_zero < last_page + page_size;
1693 should_be_zero++)
1694 if (*(char *)should_be_zero) {
1695 prt("Mapped %s: non-zero data past EOF (0x%llx) page offset 0x%x is 0x%04x\n",
1696 s, file_size - 1, should_be_zero & page_mask,
1697 short_at(should_be_zero));
1698 report_failure(205);
1699 }
1700 }
1701
1702
1703 void
1704 gendata(char *original_buf, char *good_buf, unsigned offset, unsigned size)
1705 {
1706 while (size--) {
1707 good_buf[offset] = testcalls % 256;
1708 if (offset % 2)
1709 good_buf[offset] += original_buf[offset];
1710 offset++;
1711 }
1712 }
1713
1714
1715 void
1716 dowrite(unsigned offset, unsigned size)
1717 {
1718 ssize_t ret;
1719 off_t newsize;
1720
1721 offset -= offset % writebdy;
1722 if (o_direct)
1723 size -= size % writebdy;
1724 if (size == 0) {
1725 if (!quiet && testcalls > simulatedopcount && !o_direct)
1726 prt("skipping zero size write\n");
1727 log4(OP_SKIPPED, OP_WRITE, offset, size);
1728 return;
1729 }
1730
1731 log4(OP_WRITE, offset, size, file_size);
1732
1733 gendata(original_buf, good_buf, offset, size);
1734 if (file_size < offset + size) {
1735 newsize = ceil(((double)offset + size) / truncbdy) * truncbdy;
1736 if (file_size < newsize)
1737 memset(good_buf + file_size, '\0', newsize - file_size);
1738 file_size = newsize;
1739 if (lite) {
1740 warn("Lite file size bug in fsx!");
1741 report_failure(149);
1742 }
1743 ret = ops->resize(&ctx, newsize);
1744 if (ret < 0) {
1745 prterrcode("dowrite: ops->resize", ret);
1746 report_failure(150);
1747 }
1748 }
1749
1750 if (testcalls <= simulatedopcount)
1751 return;
1752
1753 if (!quiet &&
1754 ((progressinterval && testcalls % progressinterval == 0) ||
1755 (debug &&
1756 (monitorstart == -1 ||
1757 (static_cast<long>(offset + size) > monitorstart &&
1758 (monitorend == -1 ||
1759 static_cast<long>(offset) <= monitorend))))))
1760 prt("%lu write\t0x%x thru\t0x%x\t(0x%x bytes)\n", testcalls,
1761 offset, offset + size - 1, size);
1762
1763 ret = ops->write(&ctx, offset, size, good_buf + offset);
1764 if (ret != (ssize_t)size) {
1765 if (ret < 0)
1766 prterrcode("dowrite: ops->write", ret);
1767 else
1768 prt("short write: 0x%x bytes instead of 0x%x\n",
1769 ret, size);
1770 report_failure(151);
1771 }
1772
1773 if (flush_enabled)
1774 doflush(offset, size);
1775 }
1776
1777
1778 void
1779 dotruncate(unsigned size)
1780 {
1781 int oldsize = file_size;
1782 int ret;
1783
1784 size -= size % truncbdy;
1785 if (size > biggest) {
1786 biggest = size;
1787 if (!quiet && testcalls > simulatedopcount)
1788 prt("truncating to largest ever: 0x%x\n", size);
1789 }
1790
1791 log4(OP_TRUNCATE, size, (unsigned)file_size, 0);
1792
1793 if (size > file_size)
1794 memset(good_buf + file_size, '\0', size - file_size);
1795 else if (size < file_size)
1796 memset(good_buf + size, '\0', file_size - size);
1797 file_size = size;
1798
1799 if (testcalls <= simulatedopcount)
1800 return;
1801
1802 if ((progressinterval && testcalls % progressinterval == 0) ||
1803 (debug && (monitorstart == -1 || monitorend == -1 ||
1804 (long)size <= monitorend)))
1805 prt("%lu trunc\tfrom 0x%x to 0x%x\n", testcalls, oldsize, size);
1806
1807 ret = ops->resize(&ctx, size);
1808 if (ret < 0) {
1809 prterrcode("dotruncate: ops->resize", ret);
1810 report_failure(160);
1811 }
1812 }
1813
1814 void
1815 do_punch_hole(unsigned offset, unsigned length)
1816 {
1817 unsigned end_offset;
1818 int max_offset = 0;
1819 int max_len = 0;
1820 int ret;
1821
1822 offset -= offset % holebdy;
1823 length -= length % holebdy;
1824 if (length == 0) {
1825 if (!quiet && testcalls > simulatedopcount)
1826 prt("skipping zero length punch hole\n");
1827 log4(OP_SKIPPED, OP_PUNCH_HOLE, offset, length);
1828 return;
1829 }
1830
1831 if (file_size <= (loff_t)offset) {
1832 if (!quiet && testcalls > simulatedopcount)
1833 prt("skipping hole punch off the end of the file\n");
1834 log4(OP_SKIPPED, OP_PUNCH_HOLE, offset, length);
1835 return;
1836 }
1837
1838 end_offset = offset + length;
1839
1840 log4(OP_PUNCH_HOLE, offset, length, 0);
1841
1842 if (testcalls <= simulatedopcount)
1843 return;
1844
1845 if ((progressinterval && testcalls % progressinterval == 0) ||
1846 (debug && (monitorstart == -1 || monitorend == -1 ||
1847 (long)end_offset <= monitorend))) {
1848 prt("%lu punch\tfrom 0x%x to 0x%x, (0x%x bytes)\n", testcalls,
1849 offset, offset+length, length);
1850 }
1851
1852 ret = ops->discard(&ctx, (unsigned long long)offset,
1853 (unsigned long long)length);
1854 if (ret < 0) {
1855 prterrcode("do_punch_hole: ops->discard", ret);
1856 report_failure(161);
1857 }
1858
1859 max_offset = offset < file_size ? offset : file_size;
1860 max_len = max_offset + length <= file_size ? length :
1861 file_size - max_offset;
1862 memset(good_buf + max_offset, '\0', max_len);
1863 }
1864
1865 unsigned get_data_size(unsigned size)
1866 {
1867 unsigned i;
1868 unsigned hint;
1869 unsigned max = sqrt((double)size) + 1;
1870 unsigned good = 1;
1871 unsigned curr = good;
1872
1873 hint = get_random() % max;
1874
1875 for (i = 1; i < max && curr < hint; i++) {
1876 if (size % i == 0) {
1877 good = curr;
1878 curr = i;
1879 }
1880 }
1881
1882 if (curr == hint)
1883 good = curr;
1884
1885 return good;
1886 }
1887
1888 void
1889 dowritesame(unsigned offset, unsigned size)
1890 {
1891 ssize_t ret;
1892 off_t newsize;
1893 unsigned buf_off;
1894 unsigned data_size;
1895 int n;
1896
1897 offset -= offset % writebdy;
1898 if (o_direct)
1899 size -= size % writebdy;
1900 if (size == 0) {
1901 if (!quiet && testcalls > simulatedopcount && !o_direct)
1902 prt("skipping zero size writesame\n");
1903 log4(OP_SKIPPED, OP_WRITESAME, offset, size);
1904 return;
1905 }
1906
1907 data_size = get_data_size(size);
1908
1909 log4(OP_WRITESAME, offset, size, data_size);
1910
1911 gendata(original_buf, good_buf, offset, data_size);
1912 if (file_size < offset + size) {
1913 newsize = ceil(((double)offset + size) / truncbdy) * truncbdy;
1914 if (file_size < newsize)
1915 memset(good_buf + file_size, '\0', newsize - file_size);
1916 file_size = newsize;
1917 if (lite) {
1918 warn("Lite file size bug in fsx!");
1919 report_failure(162);
1920 }
1921 ret = ops->resize(&ctx, newsize);
1922 if (ret < 0) {
1923 prterrcode("dowritesame: ops->resize", ret);
1924 report_failure(163);
1925 }
1926 }
1927
1928 for (n = size / data_size, buf_off = data_size; n > 1; n--) {
1929 memcpy(good_buf + offset + buf_off, good_buf + offset, data_size);
1930 buf_off += data_size;
1931 }
1932
1933 if (testcalls <= simulatedopcount)
1934 return;
1935
1936 if (!quiet &&
1937 ((progressinterval && testcalls % progressinterval == 0) ||
1938 (debug &&
1939 (monitorstart == -1 ||
1940 (static_cast<long>(offset + size) > monitorstart &&
1941 (monitorend == -1 ||
1942 static_cast<long>(offset) <= monitorend))))))
1943 prt("%lu writesame\t0x%x thru\t0x%x\tdata_size\t0x%x(0x%x bytes)\n", testcalls,
1944 offset, offset + size - 1, data_size, size);
1945
1946 ret = ops->writesame(&ctx, offset, size, good_buf + offset, data_size);
1947 if (ret != (ssize_t)size) {
1948 if (ret < 0)
1949 prterrcode("dowritesame: ops->writesame", ret);
1950 else
1951 prt("short writesame: 0x%x bytes instead of 0x%x\n",
1952 ret, size);
1953 report_failure(164);
1954 }
1955
1956 if (flush_enabled)
1957 doflush(offset, size);
1958 }
1959
1960 void
1961 docompareandwrite(unsigned offset, unsigned size)
1962 {
1963 int ret;
1964
1965 if (skip_partial_discard) {
1966 if (!quiet && testcalls > simulatedopcount)
1967 prt("compare and write disabled\n");
1968 log4(OP_SKIPPED, OP_COMPARE_AND_WRITE, offset, size);
1969 return;
1970 }
1971
1972 offset -= offset % writebdy;
1973 if (o_direct)
1974 size -= size % writebdy;
1975
1976 if (size == 0) {
1977 if (!quiet && testcalls > simulatedopcount && !o_direct)
1978 prt("skipping zero size read\n");
1979 log4(OP_SKIPPED, OP_READ, offset, size);
1980 return;
1981 }
1982
1983 if (size + offset > file_size) {
1984 if (!quiet && testcalls > simulatedopcount)
1985 prt("skipping seek/compare past end of file\n");
1986 log4(OP_SKIPPED, OP_COMPARE_AND_WRITE, offset, size);
1987 return;
1988 }
1989
1990 memcpy(temp_buf + offset, good_buf + offset, size);
1991 gendata(original_buf, good_buf, offset, size);
1992 log4(OP_COMPARE_AND_WRITE, offset, size, 0);
1993
1994 if (testcalls <= simulatedopcount)
1995 return;
1996
1997 if (!quiet &&
1998 ((progressinterval && testcalls % progressinterval == 0) ||
1999 (debug &&
2000 (monitorstart == -1 ||
2001 (static_cast<long>(offset + size) > monitorstart &&
2002 (monitorend == -1 ||
2003 static_cast<long>(offset) <= monitorend))))))
2004 prt("%lu compareandwrite\t0x%x thru\t0x%x\t(0x%x bytes)\n", testcalls,
2005 offset, offset + size - 1, size);
2006
2007 ret = ops->compare_and_write(&ctx, offset, size, temp_buf + offset,
2008 good_buf + offset);
2009 if (ret != (ssize_t)size) {
2010 if (ret == -EINVAL) {
2011 memcpy(good_buf + offset, temp_buf + offset, size);
2012 return;
2013 }
2014 if (ret < 0)
2015 prterrcode("docompareandwrite: ops->compare_and_write", ret);
2016 else
2017 prt("short write: 0x%x bytes instead of 0x%x\n", ret, size);
2018 report_failure(151);
2019 return;
2020 }
2021
2022 if (flush_enabled)
2023 doflush(offset, size);
2024 }
2025
2026 void clone_filename(char *buf, size_t len, int clones)
2027 {
2028 snprintf(buf, len, "%s/fsx-%s-parent%d",
2029 dirpath, iname, clones);
2030 }
2031
2032 void clone_imagename(char *buf, size_t len, int clones)
2033 {
2034 if (clones > 0)
2035 snprintf(buf, len, "%s-clone%d", iname, clones);
2036 else
2037 strncpy(buf, iname, len);
2038 buf[len - 1] = '\0';
2039 }
2040
2041 void replay_imagename(char *buf, size_t len, int clones)
2042 {
2043 clone_imagename(buf, len, clones);
2044 strncat(buf, "-replay", len - strlen(buf));
2045 buf[len - 1] = '\0';
2046 }
2047
2048 void check_clone(int clonenum, bool replay_image);
2049
2050 void
2051 do_clone()
2052 {
2053 char filename[1024];
2054 char imagename[1024];
2055 char lastimagename[1024];
2056 int ret, fd;
2057 int order = 0, stripe_unit = 0, stripe_count = 0;
2058 uint64_t newsize = file_size;
2059
2060 log4(OP_CLONE, 0, 0, 0);
2061 ++num_clones;
2062
2063 if (randomize_striping) {
2064 order = 18 + get_random() % 8;
2065 stripe_unit = 1ull << (order - 1 - (get_random() % 8));
2066 stripe_count = 2 + get_random() % 14;
2067 }
2068
2069 prt("%lu clone\t%d order %d su %d sc %d\n", testcalls, num_clones,
2070 order, stripe_unit, stripe_count);
2071
2072 clone_imagename(imagename, sizeof(imagename), num_clones);
2073 clone_imagename(lastimagename, sizeof(lastimagename),
2074 num_clones - 1);
2075 assert(strcmp(lastimagename, ctx.name) == 0);
2076
2077 ret = ops->clone(&ctx, "snap", imagename, &order, stripe_unit,
2078 stripe_count);
2079 if (ret < 0) {
2080 prterrcode("do_clone: ops->clone", ret);
2081 exit(165);
2082 }
2083
2084 if (randomize_parent_overlap && rbd_image_has_parent(&ctx)) {
2085 int rand = get_random() % 16 + 1; // [1..16]
2086
2087 if (rand < 13) {
2088 uint64_t overlap;
2089
2090 ret = rbd_get_overlap(ctx.image, &overlap);
2091 if (ret < 0) {
2092 prterrcode("do_clone: rbd_get_overlap", ret);
2093 exit(1);
2094 }
2095
2096 if (rand < 10) { // 9/16
2097 newsize = overlap * ((double)rand / 10);
2098 newsize -= newsize % truncbdy;
2099 } else { // 3/16
2100 newsize = 0;
2101 }
2102
2103 assert(newsize != (uint64_t)file_size);
2104 prt("truncating image %s from 0x%llx (overlap 0x%llx) to 0x%llx\n",
2105 ctx.name, file_size, overlap, newsize);
2106
2107 ret = ops->resize(&ctx, newsize);
2108 if (ret < 0) {
2109 prterrcode("do_clone: ops->resize", ret);
2110 exit(1);
2111 }
2112 } else if (rand < 15) { // 2/16
2113 prt("flattening image %s\n", ctx.name);
2114
2115 ret = ops->flatten(&ctx);
2116 if (ret < 0) {
2117 prterrcode("do_clone: ops->flatten", ret);
2118 exit(1);
2119 }
2120 } else { // 2/16
2121 prt("leaving image %s intact\n", ctx.name);
2122 }
2123 }
2124
2125 clone_filename(filename, sizeof(filename), num_clones);
2126 if ((fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, 0666)) < 0) {
2127 simple_err("do_clone: open", -errno);
2128 exit(162);
2129 }
2130 save_buffer(good_buf, newsize, fd);
2131 if ((ret = close(fd)) < 0) {
2132 simple_err("do_clone: close", -errno);
2133 exit(163);
2134 }
2135
2136 /*
2137 * Close parent.
2138 */
2139 if ((ret = ops->close(&ctx)) < 0) {
2140 prterrcode("do_clone: ops->close", ret);
2141 exit(174);
2142 }
2143
2144 if (journal_replay) {
2145 ret = finalize_journal(ioctx, lastimagename, num_clones - 1,
2146 order, stripe_unit, stripe_count);
2147 if (ret < 0) {
2148 exit(EXIT_FAILURE);
2149 }
2150
2151 ret = register_journal(ioctx, imagename);
2152 if (ret < 0) {
2153 exit(EXIT_FAILURE);
2154 }
2155 }
2156
2157 /*
2158 * Open freshly made clone.
2159 */
2160 if ((ret = ops->open(imagename, &ctx)) < 0) {
2161 prterrcode("do_clone: ops->open", ret);
2162 exit(166);
2163 }
2164
2165 if (num_clones > 1) {
2166 if (journal_replay) {
2167 check_clone(num_clones - 2, true);
2168 }
2169 check_clone(num_clones - 2, false);
2170 }
2171 }
2172
2173 void
2174 check_clone(int clonenum, bool replay_image)
2175 {
2176 char filename[128];
2177 char imagename[128];
2178 int ret, fd;
2179 struct rbd_ctx cur_ctx = RBD_CTX_INIT;
2180 struct stat file_info;
2181 char *good_buf, *temp_buf;
2182
2183 if (replay_image) {
2184 replay_imagename(imagename, sizeof(imagename), clonenum);
2185 } else {
2186 clone_imagename(imagename, sizeof(imagename), clonenum);
2187 }
2188
2189 if ((ret = ops->open(imagename, &cur_ctx)) < 0) {
2190 prterrcode("check_clone: ops->open", ret);
2191 exit(167);
2192 }
2193
2194 clone_filename(filename, sizeof(filename), clonenum + 1);
2195 if ((fd = open(filename, O_RDONLY)) < 0) {
2196 simple_err("check_clone: open", -errno);
2197 exit(168);
2198 }
2199
2200 prt("checking clone #%d, image %s against file %s\n",
2201 clonenum, imagename, filename);
2202 if ((ret = fstat(fd, &file_info)) < 0) {
2203 simple_err("check_clone: fstat", -errno);
2204 exit(169);
2205 }
2206
2207 good_buf = NULL;
2208 ret = posix_memalign((void **)&good_buf,
2209 MAX(writebdy, (int)sizeof(void *)),
2210 file_info.st_size);
2211 if (ret > 0) {
2212 prterrcode("check_clone: posix_memalign(good_buf)", -ret);
2213 exit(96);
2214 }
2215
2216 temp_buf = NULL;
2217 ret = posix_memalign((void **)&temp_buf,
2218 MAX(readbdy, (int)sizeof(void *)),
2219 file_info.st_size);
2220 if (ret > 0) {
2221 prterrcode("check_clone: posix_memalign(temp_buf)", -ret);
2222 exit(97);
2223 }
2224
2225 if ((ret = pread(fd, good_buf, file_info.st_size, 0)) < 0) {
2226 simple_err("check_clone: pread", -errno);
2227 exit(170);
2228 }
2229 if ((ret = ops->read(&cur_ctx, 0, file_info.st_size, temp_buf)) < 0) {
2230 prterrcode("check_clone: ops->read", ret);
2231 exit(171);
2232 }
2233 close(fd);
2234 if ((ret = ops->close(&cur_ctx)) < 0) {
2235 prterrcode("check_clone: ops->close", ret);
2236 exit(174);
2237 }
2238 check_buffers(good_buf, temp_buf, 0, file_info.st_size);
2239
2240 if (!replay_image) {
2241 unlink(filename);
2242 }
2243
2244 free(good_buf);
2245 free(temp_buf);
2246 }
2247
2248 void
2249 writefileimage()
2250 {
2251 ssize_t ret;
2252
2253 ret = ops->write(&ctx, 0, file_size, good_buf);
2254 if (ret != file_size) {
2255 if (ret < 0)
2256 prterrcode("writefileimage: ops->write", ret);
2257 else
2258 prt("short write: 0x%x bytes instead of 0x%llx\n",
2259 ret, (unsigned long long)file_size);
2260 report_failure(172);
2261 }
2262
2263 if (!lite) {
2264 ret = ops->resize(&ctx, file_size);
2265 if (ret < 0) {
2266 prterrcode("writefileimage: ops->resize", ret);
2267 report_failure(173);
2268 }
2269 }
2270 }
2271
2272 void
2273 do_flatten()
2274 {
2275 int ret;
2276
2277 if (!rbd_image_has_parent(&ctx)) {
2278 log4(OP_SKIPPED, OP_FLATTEN, 0, 0);
2279 return;
2280 }
2281 log4(OP_FLATTEN, 0, 0, 0);
2282 prt("%lu flatten\n", testcalls);
2283
2284 ret = ops->flatten(&ctx);
2285 if (ret < 0) {
2286 prterrcode("writefileimage: ops->flatten", ret);
2287 exit(177);
2288 }
2289 }
2290
2291 void
2292 docloseopen(void)
2293 {
2294 char *name;
2295 int ret;
2296
2297 if (testcalls <= simulatedopcount)
2298 return;
2299
2300 name = strdup(ctx.name);
2301
2302 if (debug)
2303 prt("%lu close/open\n", testcalls);
2304
2305 ret = ops->close(&ctx);
2306 if (ret < 0) {
2307 prterrcode("docloseopen: ops->close", ret);
2308 report_failure(180);
2309 }
2310
2311 ret = ops->open(name, &ctx);
2312 if (ret < 0) {
2313 prterrcode("docloseopen: ops->open", ret);
2314 report_failure(181);
2315 }
2316
2317 free(name);
2318 }
2319
/*
 * Clamp an (off, len) pair to [0, size): off is reduced modulo size
 * (or set to 0 when size is 0) and len is cut so that off + len does
 * not exceed size.  off and len are modified in place.
 */
#define TRIM_OFF_LEN(off, len, size)					\
do {									\
	if (!(size))							\
		(off) = 0;						\
	else								\
		(off) %= (size);					\
	if ((unsigned)(size) < (unsigned)(off) + (unsigned)(len))	\
		(len) = (size) - (off);					\
} while (0)
2329
2330 void
2331 test(void)
2332 {
2333 unsigned long offset;
2334 unsigned long size = maxoplen;
2335 unsigned long rv = get_random();
2336 unsigned long op;
2337
2338 if (simulatedopcount > 0 && testcalls == simulatedopcount)
2339 writefileimage();
2340
2341 testcalls++;
2342
2343 if (closeprob)
2344 closeopen = (rv >> 3) < (1u << 28) / (unsigned)closeprob;
2345
2346 if (debugstart > 0 && testcalls >= debugstart)
2347 debug = 1;
2348
2349 if (!quiet && testcalls < simulatedopcount && testcalls % 100000 == 0)
2350 prt("%lu...\n", testcalls);
2351
2352 offset = get_random();
2353 if (randomoplen)
2354 size = get_random() % (maxoplen + 1);
2355
2356 /* calculate appropriate op to run */
2357 if (lite)
2358 op = rv % OP_MAX_LITE;
2359 else
2360 op = rv % OP_MAX_FULL;
2361
2362 switch (op) {
2363 case OP_MAPREAD:
2364 if (!mapped_reads)
2365 op = OP_READ;
2366 break;
2367 case OP_MAPWRITE:
2368 if (!mapped_writes)
2369 op = OP_WRITE;
2370 break;
2371 case OP_FALLOCATE:
2372 if (!fallocate_calls) {
2373 log4(OP_SKIPPED, OP_FALLOCATE, offset, size);
2374 goto out;
2375 }
2376 break;
2377 case OP_PUNCH_HOLE:
2378 if (!punch_hole_calls) {
2379 log4(OP_SKIPPED, OP_PUNCH_HOLE, offset, size);
2380 goto out;
2381 }
2382 break;
2383 case OP_CLONE:
2384 /* clone, 8% chance */
2385 if (!clone_calls || file_size == 0 || get_random() % 100 >= 8) {
2386 log4(OP_SKIPPED, OP_CLONE, 0, 0);
2387 goto out;
2388 }
2389 break;
2390 case OP_FLATTEN:
2391 /* flatten four times as rarely as clone, 2% chance */
2392 if (get_random() % 100 >= 2) {
2393 log4(OP_SKIPPED, OP_FLATTEN, 0, 0);
2394 goto out;
2395 }
2396 break;
2397 case OP_WRITESAME:
2398 /* writesame not implemented */
2399 if (!ops->writesame) {
2400 log4(OP_SKIPPED, OP_WRITESAME, offset, size);
2401 goto out;
2402 }
2403 break;
2404 case OP_COMPARE_AND_WRITE:
2405 /* compare_and_write not implemented */
2406 if (!ops->compare_and_write) {
2407 log4(OP_SKIPPED, OP_COMPARE_AND_WRITE, offset, size);
2408 goto out;
2409 }
2410 break;
2411 }
2412
2413 switch (op) {
2414 case OP_READ:
2415 TRIM_OFF_LEN(offset, size, file_size);
2416 doread(offset, size);
2417 break;
2418
2419 case OP_WRITE:
2420 TRIM_OFF_LEN(offset, size, maxfilelen);
2421 dowrite(offset, size);
2422 break;
2423
2424 case OP_MAPREAD:
2425 TRIM_OFF_LEN(offset, size, file_size);
2426 exit(183);
2427 break;
2428
2429 case OP_MAPWRITE:
2430 TRIM_OFF_LEN(offset, size, maxfilelen);
2431 exit(182);
2432 break;
2433
2434 case OP_TRUNCATE:
2435 if (!style)
2436 size = get_random() % maxfilelen;
2437 dotruncate(size);
2438 break;
2439
2440 case OP_PUNCH_HOLE:
2441 TRIM_OFF_LEN(offset, size, file_size);
2442 do_punch_hole(offset, size);
2443 break;
2444
2445 case OP_WRITESAME:
2446 TRIM_OFF_LEN(offset, size, maxfilelen);
2447 dowritesame(offset, size);
2448 break;
2449 case OP_COMPARE_AND_WRITE:
2450 TRIM_OFF_LEN(offset, size, file_size);
2451 docompareandwrite(offset, size);
2452 break;
2453
2454 case OP_CLONE:
2455 do_clone();
2456 break;
2457
2458 case OP_FLATTEN:
2459 do_flatten();
2460 break;
2461
2462 default:
2463 prterr("test: unknown operation");
2464 report_failure(42);
2465 break;
2466 }
2467
2468 out:
2469 if (sizechecks && testcalls > simulatedopcount)
2470 check_size();
2471 if (closeopen)
2472 docloseopen();
2473 }
2474
2475
2476 void
2477 cleanup(int sig)
2478 {
2479 if (sig)
2480 prt("signal %d\n", sig);
2481 prt("testcalls = %lu\n", testcalls);
2482 exit(sig);
2483 }
2484
2485
/*
 * Print the command line help to stdout and exit with status 89.
 *
 * The text is assembled from adjacent string literals, one option per
 * line, so its layout does not depend on how this source file is
 * indented.
 * NOTE(review): option lines are indented with a tab -- confirm this
 * matches the intended help layout.
 */
void
usage(void)
{
	fprintf(stdout, "usage: %s",
		"fsx [-dfjknqxyACFHKLMORUWZ] [-b opnum] [-c Prob] [-h holebdy] [-l flen] [-m start:end] [-o oplen] [-p progressinterval] [-r readbdy] [-s style] [-t truncbdy] [-w writebdy] [-D startingop] [-N numops] [-P dirpath] [-S seed] pname iname\n"
		"\t-b opnum: beginning operation number (default 1)\n"
		"\t-c P: 1 in P chance of file close+open at each op (default infinity)\n"
		"\t-d: debug output for all operations\n"
		"\t-f: flush and invalidate cache after I/O\n"
		"\t-h holebdy: 4096 would make discards page aligned (default 1)\n"
		"\t-j: journal replay stress test\n"
		"\t-k: keep data on success (default 0)\n"
		"\t-l flen: the upper bound on file size (default 262144)\n"
		"\t-m startop:endop: monitor (print debug output) specified byte range (default 0:infinity)\n"
		"\t-n: no verifications of file size\n"
		"\t-o oplen: the upper bound on operation size (default 65536)\n"
		"\t-p progressinterval: debug output at specified operation interval\n"
		"\t-q: quieter operation\n"
		"\t-r readbdy: 4096 would make reads page aligned (default 1)\n"
		"\t-s style: 1 gives smaller truncates (default 0)\n"
		"\t-t truncbdy: 4096 would make truncates page aligned (default 1)\n"
		"\t-w writebdy: 4096 would make writes page aligned (default 1)\n"
		"\t-x: preallocate file space before starting, XFS only (default 0)\n"
		"\t-y: synchronize changes to a file\n"

		"\t-C: do not use clone calls\n"
		"\t-D startingop: debug output starting at specified operation\n"
#ifdef FALLOCATE
		"\t-F: Do not use fallocate (preallocation) calls\n"
#endif
		"\t-H: do not use punch hole calls\n"
		"\t-K: enable krbd mode (use -t and -h too)\n"
		"\t-M: enable rbd-nbd mode (use -t and -h too)\n"
		"\t-L: fsxLite - no file creations & no file size changes\n"
		"\t-N numops: total # operations to do (default infinity)\n"
		"\t-O: use oplen (see -o flag) for every op (default random)\n"
		"\t-P dirpath: save .fsxlog and .fsxgood files in dirpath (default ./)\n"
		"\t-R: read() system calls only (mapped reads disabled)\n"
		"\t-S seed: for random # generator (default 1) 0 gets timestamp\n"
		"\t-U: disable randomized striping\n"
		"\t-W: mapped write operations DISabled\n"
		"\t-Z: O_DIRECT (use -R, -W, -r and -w too)\n"
		"\tpoolname: this is REQUIRED (no default)\n"
		"\timagename: this is REQUIRED (no default)\n");
	exit(89);
}
2532
2533
/*
 * Parse an integer with an optional one-letter multiplier suffix.
 *
 * s: text to parse.  The numeric base is auto-detected by strtol() (base 0),
 *    so "0x10" and "010" are accepted alongside decimal.
 * e: out-parameter.  On return *e points at the first character that was not
 *    consumed: just past the suffix when one was recognised, otherwise just
 *    past the digits (or at s itself when no digits were found).
 *
 * Recognised suffixes (either case):
 *	b = x512, k = x1024, m = x1024*1024, w = x4
 *
 * Returns the scaled value.  When the number, before or after scaling, does
 * not fit in an int, returns -1 and sets errno to ERANGE instead of
 * truncating or overflowing; errno is left untouched on success.
 */
int
getnum(char *s, char **e)
{
	int saved_errno = errno;
	int out_of_range;
	int factor = 1;
	long parsed;
	long long scaled;

	errno = 0;
	parsed = strtol(s, e, 0);	/* strtol always stores the end pointer in *e */
	out_of_range = (errno == ERANGE ||
			parsed > INT_MAX || parsed < INT_MIN);

	switch (**e) {
	case 'b':
	case 'B':
		factor = 512;
		break;
	case 'k':
	case 'K':
		factor = 1024;
		break;
	case 'm':
	case 'M':
		factor = 1024 * 1024;
		break;
	case 'w':
	case 'W':
		factor = 4;
		break;
	default:
		break;
	}
	/* Every real multiplier is > 1, so this means a suffix was consumed. */
	if (factor != 1)
		*e = *e + 1;

	if (out_of_range) {
		errno = ERANGE;
		return (-1);
	}

	/* parsed fits in an int here, so the product cannot overflow long long. */
	scaled = (long long)parsed * factor;
	if (scaled > INT_MAX || scaled < INT_MIN) {
		errno = ERANGE;
		return (-1);
	}

	errno = saved_errno;
	return ((int)scaled);
}
2566
2567 void
2568 test_fallocate()
2569 {
2570 #ifdef FALLOCATE
2571 if (!lite && fallocate_calls) {
2572 if (fallocate(fd, 0, 0, 1) && errno == EOPNOTSUPP) {
2573 if(!quiet)
2574 warn("main: filesystem does not support fallocate, disabling\n");
2575 fallocate_calls = 0;
2576 } else {
2577 ftruncate(fd, 0);
2578 }
2579 }
2580 #else /* ! FALLOCATE */
2581 fallocate_calls = 0;
2582 #endif
2583
2584 }
2585
2586 void remove_image(rados_ioctx_t ioctx, char *imagename, bool remove_snap,
2587 bool unregister) {
2588 rbd_image_t image;
2589 char errmsg[128];
2590 int ret;
2591
2592 if ((ret = rbd_open(ioctx, imagename, &image, NULL)) < 0) {
2593 sprintf(errmsg, "rbd_open %s", imagename);
2594 prterrcode(errmsg, ret);
2595 report_failure(101);
2596 }
2597 if (remove_snap) {
2598 if ((ret = rbd_snap_unprotect(image, "snap")) < 0) {
2599 sprintf(errmsg, "rbd_snap_unprotect %s@snap",
2600 imagename);
2601 prterrcode(errmsg, ret);
2602 report_failure(102);
2603 }
2604 if ((ret = rbd_snap_remove(image, "snap")) < 0) {
2605 sprintf(errmsg, "rbd_snap_remove %s@snap",
2606 imagename);
2607 prterrcode(errmsg, ret);
2608 report_failure(103);
2609 }
2610 }
2611 if ((ret = rbd_close(image)) < 0) {
2612 sprintf(errmsg, "rbd_close %s", imagename);
2613 prterrcode(errmsg, ret);
2614 report_failure(104);
2615 }
2616
2617 if (unregister &&
2618 (ret = unregister_journal(ioctx, imagename)) < 0) {
2619 report_failure(105);
2620 }
2621
2622 if ((ret = rbd_remove(ioctx, imagename)) < 0) {
2623 sprintf(errmsg, "rbd_remove %s", imagename);
2624 prterrcode(errmsg, ret);
2625 report_failure(106);
2626 }
2627 }
2628
2629 int
2630 main(int argc, char **argv)
2631 {
2632 int i, style, ch, ret;
2633 char *endp;
2634 char goodfile[1024];
2635 char logfile[1024];
2636
2637 goodfile[0] = 0;
2638 logfile[0] = 0;
2639
2640 page_size = getpagesize();
2641 page_mask = page_size - 1;
2642 mmap_mask = page_mask;
2643
2644 setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
2645
2646 while ((ch = getopt(argc, argv, "b:c:dfh:jkl:m:no:p:qr:s:t:w:xyCD:FHKMLN:OP:RS:UWZ"))
2647 != EOF)
2648 switch (ch) {
2649 case 'b':
2650 simulatedopcount = getnum(optarg, &endp);
2651 if (!quiet)
2652 fprintf(stdout, "Will begin at operation %lu\n",
2653 simulatedopcount);
2654 if (simulatedopcount == 0)
2655 usage();
2656 simulatedopcount -= 1;
2657 break;
2658 case 'c':
2659 closeprob = getnum(optarg, &endp);
2660 if (!quiet)
2661 fprintf(stdout,
2662 "Chance of close/open is 1 in %d\n",
2663 closeprob);
2664 if (closeprob <= 0)
2665 usage();
2666 break;
2667 case 'd':
2668 debug = 1;
2669 break;
2670 case 'f':
2671 flush_enabled = 1;
2672 break;
2673 case 'h':
2674 holebdy = getnum(optarg, &endp);
2675 if (holebdy <= 0)
2676 usage();
2677 break;
2678 case 'j':
2679 journal_replay = true;
2680 break;
2681 case 'k':
2682 keep_on_success = 1;
2683 break;
2684 case 'l':
2685 {
2686 int _num = getnum(optarg, &endp);
2687 if (_num <= 0)
2688 usage();
2689 maxfilelen = _num;
2690 }
2691 break;
2692 case 'm':
2693 monitorstart = getnum(optarg, &endp);
2694 if (monitorstart < 0)
2695 usage();
2696 if (!endp || *endp++ != ':')
2697 usage();
2698 monitorend = getnum(endp, &endp);
2699 if (monitorend < 0)
2700 usage();
2701 if (monitorend == 0)
2702 monitorend = -1; /* aka infinity */
2703 debug = 1;
2704 break;
2705 case 'n':
2706 sizechecks = 0;
2707 break;
2708 case 'o':
2709 maxoplen = getnum(optarg, &endp);
2710 if (maxoplen <= 0)
2711 usage();
2712 break;
2713 case 'p':
2714 progressinterval = getnum(optarg, &endp);
2715 if (progressinterval == 0)
2716 usage();
2717 break;
2718 case 'q':
2719 quiet = 1;
2720 break;
2721 case 'r':
2722 readbdy = getnum(optarg, &endp);
2723 if (readbdy <= 0)
2724 usage();
2725 break;
2726 case 's':
2727 style = getnum(optarg, &endp);
2728 if (style < 0 || style > 1)
2729 usage();
2730 break;
2731 case 't':
2732 truncbdy = getnum(optarg, &endp);
2733 if (truncbdy <= 0)
2734 usage();
2735 break;
2736 case 'w':
2737 writebdy = getnum(optarg, &endp);
2738 if (writebdy <= 0)
2739 usage();
2740 break;
2741 case 'x':
2742 prealloc = 1;
2743 break;
2744 case 'y':
2745 do_fsync = 1;
2746 break;
2747 case 'C':
2748 clone_calls = 0;
2749 break;
2750 case 'D':
2751 debugstart = getnum(optarg, &endp);
2752 if (debugstart < 1)
2753 usage();
2754 break;
2755 case 'F':
2756 fallocate_calls = 0;
2757 break;
2758 case 'H':
2759 punch_hole_calls = 0;
2760 break;
2761 case 'K':
2762 prt("krbd mode enabled\n");
2763 ops = &krbd_operations;
2764 break;
2765 case 'M':
2766 prt("rbd-nbd mode enabled\n");
2767 ops = &nbd_operations;
2768 break;
2769 case 'L':
2770 prt("lite mode not supported for rbd\n");
2771 exit(1);
2772 break;
2773 case 'N':
2774 numops = getnum(optarg, &endp);
2775 if (numops < 0)
2776 usage();
2777 break;
2778 case 'O':
2779 randomoplen = 0;
2780 break;
2781 case 'P':
2782 strncpy(dirpath, optarg, sizeof(dirpath)-1);
2783 dirpath[sizeof(dirpath)-1] = '\0';
2784 strncpy(goodfile, dirpath, sizeof(goodfile)-1);
2785 goodfile[sizeof(goodfile)-1] = '\0';
2786 if (strlen(goodfile) < sizeof(goodfile)-2) {
2787 strcat(goodfile, "/");
2788 } else {
2789 prt("file name to long\n");
2790 exit(1);
2791 }
2792 strncpy(logfile, dirpath, sizeof(logfile)-1);
2793 logfile[sizeof(logfile)-1] = '\0';
2794 if (strlen(logfile) < sizeof(logfile)-2) {
2795 strcat(logfile, "/");
2796 } else {
2797 prt("file path to long\n");
2798 exit(1);
2799 }
2800 break;
2801 case 'R':
2802 mapped_reads = 0;
2803 if (!quiet)
2804 fprintf(stdout, "mapped reads DISABLED\n");
2805 break;
2806 case 'S':
2807 seed = getnum(optarg, &endp);
2808 if (seed == 0)
2809 seed = time(0) % 10000;
2810 if (!quiet)
2811 fprintf(stdout, "Seed set to %d\n", seed);
2812 if (seed < 0)
2813 usage();
2814 break;
2815 case 'U':
2816 randomize_striping = 0;
2817 break;
2818 case 'W':
2819 mapped_writes = 0;
2820 if (!quiet)
2821 fprintf(stdout, "mapped writes DISABLED\n");
2822 break;
2823 case 'Z':
2824 o_direct = O_DIRECT;
2825 break;
2826 default:
2827 usage();
2828 /* NOTREACHED */
2829 }
2830 argc -= optind;
2831 argv += optind;
2832 if (argc != 2)
2833 usage();
2834 pool = argv[0];
2835 iname = argv[1];
2836
2837 signal(SIGHUP, cleanup);
2838 signal(SIGINT, cleanup);
2839 signal(SIGPIPE, cleanup);
2840 signal(SIGALRM, cleanup);
2841 signal(SIGTERM, cleanup);
2842 signal(SIGXCPU, cleanup);
2843 signal(SIGXFSZ, cleanup);
2844 signal(SIGVTALRM, cleanup);
2845 signal(SIGUSR1, cleanup);
2846 signal(SIGUSR2, cleanup);
2847
2848 random_generator.seed(seed);
2849
2850 ret = create_image();
2851 if (ret < 0) {
2852 prterrcode(iname, ret);
2853 exit(90);
2854 }
2855 ret = ops->open(iname, &ctx);
2856 if (ret < 0) {
2857 simple_err("Error opening image", ret);
2858 exit(91);
2859 }
2860 if (!dirpath[0])
2861 strcat(dirpath, ".");
2862 strncat(goodfile, iname, 256);
2863 strcat (goodfile, ".fsxgood");
2864 fsxgoodfd = open(goodfile, O_RDWR|O_CREAT|O_TRUNC, 0666);
2865 if (fsxgoodfd < 0) {
2866 prterr(goodfile);
2867 exit(92);
2868 }
2869 strncat(logfile, iname, 256);
2870 strcat (logfile, ".fsxlog");
2871 fsxlogf = fopen(logfile, "w");
2872 if (fsxlogf == NULL) {
2873 prterr(logfile);
2874 exit(93);
2875 }
2876
2877 original_buf = (char *) malloc(maxfilelen);
2878 for (i = 0; i < (int)maxfilelen; i++)
2879 original_buf[i] = get_random() % 256;
2880
2881 ret = posix_memalign((void **)&good_buf,
2882 MAX(writebdy, (int)sizeof(void *)), maxfilelen);
2883 if (ret > 0) {
2884 if (ret == EINVAL)
2885 prt("writebdy is not a suitable power of two\n");
2886 else
2887 prterrcode("main: posix_memalign(good_buf)", -ret);
2888 exit(94);
2889 }
2890 memset(good_buf, '\0', maxfilelen);
2891
2892 ret = posix_memalign((void **)&temp_buf,
2893 MAX(readbdy, (int)sizeof(void *)), maxfilelen);
2894 if (ret > 0) {
2895 if (ret == EINVAL)
2896 prt("readbdy is not a suitable power of two\n");
2897 else
2898 prterrcode("main: posix_memalign(temp_buf)", -ret);
2899 exit(95);
2900 }
2901 memset(temp_buf, '\0', maxfilelen);
2902
2903 if (lite) { /* zero entire existing file */
2904 ssize_t written;
2905
2906 written = ops->write(&ctx, 0, (size_t)maxfilelen, good_buf);
2907 if (written != (ssize_t)maxfilelen) {
2908 if (written < 0) {
2909 prterrcode(iname, written);
2910 warn("main: error on write");
2911 } else
2912 warn("main: short write, 0x%x bytes instead "
2913 "of 0x%lx\n",
2914 (unsigned)written,
2915 maxfilelen);
2916 exit(98);
2917 }
2918 } else
2919 check_trunc_hack();
2920
2921 //test_fallocate();
2922
2923 while (numops == -1 || numops--)
2924 test();
2925
2926 ret = ops->close(&ctx);
2927 if (ret < 0) {
2928 prterrcode("ops->close", ret);
2929 report_failure(99);
2930 }
2931
2932 if (journal_replay) {
2933 char imagename[1024];
2934 clone_imagename(imagename, sizeof(imagename), num_clones);
2935 ret = finalize_journal(ioctx, imagename, num_clones, 0, 0, 0);
2936 if (ret < 0) {
2937 report_failure(100);
2938 }
2939 }
2940
2941 if (num_clones > 0) {
2942 if (journal_replay) {
2943 check_clone(num_clones - 1, true);
2944 }
2945 check_clone(num_clones - 1, false);
2946 }
2947
2948 if (!keep_on_success) {
2949 while (num_clones >= 0) {
2950 static bool remove_snap = false;
2951
2952 if (journal_replay) {
2953 char replayimagename[1024];
2954 replay_imagename(replayimagename,
2955 sizeof(replayimagename),
2956 num_clones);
2957 remove_image(ioctx, replayimagename,
2958 remove_snap,
2959 false);
2960 }
2961
2962 char clonename[128];
2963 clone_imagename(clonename, 128, num_clones);
2964 remove_image(ioctx, clonename, remove_snap,
2965 journal_replay);
2966
2967 remove_snap = true;
2968 num_clones--;
2969 }
2970 }
2971
2972 prt("All operations completed A-OK!\n");
2973 fclose(fsxlogf);
2974
2975 rados_ioctx_destroy(ioctx);
2976 krbd_destroy(krbd);
2977 rados_shutdown(cluster);
2978
2979 free(original_buf);
2980 free(good_buf);
2981 free(temp_buf);
2982
2983 exit(0);
2984 return 0;
2985 }